diff options
author | Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> | 2018-03-12 23:34:10 +0300 |
---|---|---|
committer | Roman Grundkiewicz <rgrundki@exseed.ed.ac.uk> | 2018-03-12 23:34:10 +0300 |
commit | 6d0c75cf48bab913e2c9c52f1c4c6cd0d656005d (patch) | |
tree | 717342edade369af33a771f00a7dd05354ea8afb /src | |
parent | 5f2eedc6e505eecf5bdef474be3e4f7066702fa7 (diff) |
Autoformat files
Diffstat (limited to 'src')
105 files changed, 2111 insertions, 2159 deletions
diff --git a/src/command/marian.cpp b/src/command/marian.cpp index 637506e5..92b7be7a 100644 --- a/src/command/marian.cpp +++ b/src/command/marian.cpp @@ -10,7 +10,6 @@ #include "training/graph_group_async_drop.h" #endif - bool configureMPI(int, char**); int main(int argc, char** argv) { @@ -18,7 +17,7 @@ int main(int argc, char** argv) { auto options = New<Config>(argc, argv); auto devices = options->getDevices(); - + if(options->get<bool>("multi-node")) { ABORT_IF(!configureMPI(argc, argv), "MPI not found."); diff --git a/src/common/config.cpp b/src/common/config.cpp index 9eca1f10..76b84950 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -65,5 +65,4 @@ void Config::AddYamlToNpz(const YAML::Node& yaml, unsigned shape = out.size() + 1; cnpy::npz_save(fName, varName, out.c_str(), &shape, 1, "a"); } - } diff --git a/src/common/config.h b/src/common/config.h index d6749478..68b61a58 100644 --- a/src/common/config.h +++ b/src/common/config.h @@ -76,7 +76,9 @@ public: log(); if(has("version")) - LOG(info, "[config] Model created with Marian {}", get("version").as<std::string>()); + LOG(info, + "[config] Model created with Marian {}", + get("version").as<std::string>()); } Config(const Config& other) : config_(YAML::Clone(other.config_)) {} @@ -110,9 +112,7 @@ public: YAML::Node getModelParameters(); void loadModelParameters(const std::string& name); - const std::vector<DeviceId>& getDevices() { - return devices_; - } + const std::vector<DeviceId>& getDevices() { return devices_; } void save(const std::string& name) { OutputFileStream out(name); diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 31e76f92..4f5b6701 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -2,8 +2,8 @@ #include <boost/algorithm/string.hpp> #include <boost/regex.hpp> #include <set> -#include <string> #include <stdexcept> +#include <string> #if MKL_FOUND //#include <omp.h> @@ -15,9 +15,9 @@ #endif #endif - #include "3rd_party/cnpy/cnpy.h" #include "common/definitions.h" + #include "common/config.h" #include "common/config_parser.h" #include "common/file_stream.h" @@ -146,13 +146,14 @@ bool ConfigParser::has(const std::string& key) const { void ConfigParser::validateOptions() const { if(mode_ == ConfigMode::translating) { - UTIL_THROW_IF2(!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(), + UTIL_THROW_IF2( + !has("vocabs") || get<std::vector<std::string>>("vocabs").empty(), "Translating, but vocabularies are not given!"); for(const auto& modelFile : get<std::vector<std::string>>("models")) { boost::filesystem::path modelPath(modelFile); UTIL_THROW_IF2(!boost::filesystem::exists(modelPath), - "Model file does not exist: " + modelFile); + "Model file does not exist: " + modelFile); } return; @@ -177,9 +178,10 @@ void ConfigParser::validateOptions() const { if(mode_ == ConfigMode::rescoring) { UTIL_THROW_IF2(!boost::filesystem::exists(modelPath), - "Model file does not exist: " + modelPath.string()); + "Model file does not exist: " + modelPath.string()); - UTIL_THROW_IF2(!has("vocabs") || get<std::vector<std::string>>("vocabs").empty(), + UTIL_THROW_IF2( + !has("vocabs") || get<std::vector<std::string>>("vocabs").empty(), "Scoring, but vocabularies are not given!"); return; @@ -193,8 +195,9 @@ void ConfigParser::validateOptions() const { !modelDir.empty() && !boost::filesystem::is_directory(modelDir), "Model directory does not exist"); - UTIL_THROW_IF2(!modelDir.empty() && !(boost::filesystem::status(modelDir).permissions() - & boost::filesystem::owner_write), + UTIL_THROW_IF2(!modelDir.empty() + && !(boost::filesystem::status(modelDir).permissions() + & boost::filesystem::owner_write), "No write permission in model directory"); UTIL_THROW_IF2( @@ -835,7 +838,6 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) { SET_OPTION("transformer-dim-ffn", int); SET_OPTION("transformer-ffn-activation", std::string); - #ifdef CUDNN SET_OPTION("char-stride", int); SET_OPTION("char-highway", int); @@ -976,7 +978,7 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) { SET_OPTION("relative-paths", bool); SET_OPTION("devices", std::vector<std::string>); SET_OPTION("cpu-threads", size_t); - //SET_OPTION("omp-threads", size_t); + // SET_OPTION("omp-threads", size_t); SET_OPTION("mini-batch", int); SET_OPTION("maxi-batch", int); @@ -1021,24 +1023,22 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) { exit(0); } -// @TODO: this should probably be in processOptionDevices() -//#ifdef BLAS_FOUND -// //omp_set_num_threads(vm_["omp-threads"].as<size_t>()); -//#ifdef MKL_FOUND -// mkl_set_num_threads(vm_["omp-threads"].as<size_t>()); -//#endif -//#endif + // @TODO: this should probably be in processOptionDevices() + //#ifdef BLAS_FOUND + // //omp_set_num_threads(vm_["omp-threads"].as<size_t>()); + //#ifdef MKL_FOUND + // mkl_set_num_threads(vm_["omp-threads"].as<size_t>()); + //#endif + //#endif } std::vector<DeviceId> ConfigParser::getDevices() { std::vector<DeviceId> devices; try { - std::string devicesStr = Join(config_["devices"].as<std::vector<std::string>>()); - if(mode_ == ConfigMode::training && get<bool>("multi-node")) { auto parts = Split(devicesStr, ":"); for(size_t i = 1; i < parts.size(); ++i) { @@ -1061,11 +1061,10 @@ std::vector<DeviceId> ConfigParser::getDevices() { if(config_["cpu-threads"].as<size_t>() > 0) { devices.clear(); for(size_t i = 0; i < config_["cpu-threads"].as<size_t>(); ++i) - devices.push_back({i, DeviceType::cpu}); + devices.push_back({i, DeviceType::cpu}); } - } - catch(...) { + } catch(...) { ABORT("Problem parsing devices, please report an issue on github"); } diff --git a/src/common/definitions.h b/src/common/definitions.h index 8fb3bbb6..d7975dc4 100644 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -1,10 +1,10 @@ #pragma once #include <functional> +#include <iostream> #include <memory> #include <string> #include <vector> -#include <iostream> #include "common/logging.h" #include "shape.h" @@ -57,7 +57,6 @@ struct DeviceId { friend bool operator==(DeviceId id1, DeviceId id2) { return id1.no == id2.no && id1.type == id2.type; } - }; class TensorBase; diff --git a/src/common/shape.h b/src/common/shape.h index da98a3b9..5cab14fb 100644 --- a/src/common/shape.h +++ b/src/common/shape.h @@ -1,195 +1,191 @@ #pragma once +#include <algorithm> #include <cstdint> #include <iostream> -#include <string> #include <sstream> +#include <string> #include <vector> -#include <algorithm> #include "common/logging.h" namespace marian { struct Shape { - public: - std::vector<int> shape_; - - public: - Shape() : shape_{1} {} - - Shape(std::initializer_list<int> il) : Shape() { - shape_.resize(il.size()); - std::copy(il.begin(), il.end(), begin()); - } - - void resize(size_t n) { - shape_.resize(n, 1); - } - - const int* data() const { - return shape_.data(); - } - - int* data() { - return shape_.data(); - } - - Shape(const Shape& shape) : Shape() { - shape_.resize(shape.size()); - std::copy(shape.begin(), shape.end(), begin()); - } - - inline void set(int i, int val) { - dim(i) = val; - } - - inline int& dim(int i) { - if(i >= 0) { - ABORT_IF(i >= size(), - "Index {} is out of bounds, shape has {} dimension", i, size()); - return shape_[i]; - } - else { - ABORT_IF((int)size() + i < 0, - "Negative index {} is out of bounds, shape has {} dimension", i, size()); - return shape_[size() + i]; - } - } - - inline const int& dim(int i) const { return const_cast<Shape&>(*this).dim(i); } - - inline int operator[](int i) { return dim(i); } - - inline int operator[](int i) const { return dim(i); } +public: + std::vector<int> shape_; - inline int& back() { return shape_.back(); } +public: + Shape() : shape_{1} {} - inline int stride(int i) const { - std::vector<int> stride(shape_.size(), 1); - for(int j = shape_.size() - 2; j >= 0; --j) - stride[j] = stride[j + 1] * shape_[j + 1]; + Shape(std::initializer_list<int> il) : Shape() { + shape_.resize(il.size()); + std::copy(il.begin(), il.end(), begin()); + } - if(i >= 0) - return stride[i]; - else - return stride[size() + i]; - } - - inline size_t size() const { return shape_.size(); } - - inline int elements() const { - int el = 1; - for(auto s : shape_) - el *= s; - return el; - } - - inline void dims(int i, std::vector<int>& d) const { - d.resize(shape_.size()); - - std::vector<int> stride(shape_.size(), 1); - for(int j = shape_.size() - 2; j >= 0; --j) - stride[j] = stride[j + 1] * shape_[j + 1]; - - for(int j = 0; j < d.size(); ++j) - d[j] = (i / stride[j]) % shape_[j]; - } - - auto begin() -> decltype(shape_.begin()) { return shape_.begin(); } - auto begin() const -> decltype(shape_.begin()) { return shape_.begin(); } - - auto end() -> decltype(shape_.end()) { return shape_.end(); } - auto end() const -> decltype(shape_.end()) { return shape_.end(); } - - auto rbegin() -> decltype(shape_.rbegin()) { return shape_.rbegin(); } - auto rbegin() const -> decltype(shape_.rbegin()) { return shape_.rbegin(); } - - auto rend() -> decltype(shape_.rend()) { return shape_.rend(); } - auto rend() const -> decltype(shape_.rend()) { return shape_.rend(); } + void resize(size_t n) { shape_.resize(n, 1); } - bool operator==(const Shape& other) const { - return size() == other.size() && std::equal(begin(), end(), other.begin()); - } - - bool operator!=(const Shape& other) const { return !(*this == other); } - - std::string toString() const { - std::stringstream strm; - strm << "shape=" << (*this)[0]; - for(int i = 1; i < size(); ++i) - strm << "x" << (*this)[i]; - strm << " size=" << elements() << " (" - << elements() * sizeof(float) << "B)"; - return strm.str(); - } - - friend std::ostream& operator<<(std::ostream& strm, const Shape& shape) { - strm << shape.toString(); - return strm; - } + const int* data() const { return shape_.data(); } - operator std::string() const { - std::stringstream ss; - ss << *this; - return ss.str(); - } + int* data() { return shape_.data(); } - int axis(int ax) { - if(ax < 0) - return size() + ax; - else - return ax; - } + Shape(const Shape& shape) : Shape() { + shape_.resize(shape.size()); + std::copy(shape.begin(), shape.end(), begin()); + } - static Shape broadcast(const std::vector<Shape>& shapes) { - int maxDims = 0; - for(auto& s : shapes) - if(s.size() > maxDims) - maxDims = s.size(); - - Shape shape; - shape.resize(maxDims); - - for(auto& s : shapes) { - for(int i = 0; i < s.size(); ++i) { - ABORT_IF(shape[-i] != s[-i] && shape[-i] != 1 && s[-i] != 1, - "Shapes {} and {} cannot be broadcasted", - (std::string)shape, - (std::string)s); - shape.set(-i, std::max(shape[-i], s[-i])); - } + inline void set(int i, int val) { dim(i) = val; } + + inline int& dim(int i) { + if(i >= 0) { + ABORT_IF(i >= size(), + "Index {} is out of bounds, shape has {} dimension", + i, + size()); + return shape_[i]; + } else { + ABORT_IF((int)size() + i < 0, + "Negative index {} is out of bounds, shape has {} dimension", + i, + size()); + return shape_[size() + i]; + } + } + + inline const int& dim(int i) const { + return const_cast<Shape&>(*this).dim(i); + } + + inline int operator[](int i) { return dim(i); } + + inline int operator[](int i) const { return dim(i); } + + inline int& back() { return shape_.back(); } + + inline int stride(int i) const { + std::vector<int> stride(shape_.size(), 1); + for(int j = shape_.size() - 2; j >= 0; --j) + stride[j] = stride[j + 1] * shape_[j + 1]; + + if(i >= 0) + return stride[i]; + else + return stride[size() + i]; + } + + inline size_t size() const { return shape_.size(); } + + inline int elements() const { + int el = 1; + for(auto s : shape_) + el *= s; + return el; + } + + inline void dims(int i, std::vector<int>& d) const { + d.resize(shape_.size()); + + std::vector<int> stride(shape_.size(), 1); + for(int j = shape_.size() - 2; j >= 0; --j) + stride[j] = stride[j + 1] * shape_[j + 1]; + + for(int j = 0; j < d.size(); ++j) + d[j] = (i / stride[j]) % shape_[j]; + } + + auto begin() -> decltype(shape_.begin()) { return shape_.begin(); } + auto begin() const -> decltype(shape_.begin()) { return shape_.begin(); } + + auto end() -> decltype(shape_.end()) { return shape_.end(); } + auto end() const -> decltype(shape_.end()) { return shape_.end(); } + + auto rbegin() -> decltype(shape_.rbegin()) { return shape_.rbegin(); } + auto rbegin() const -> decltype(shape_.rbegin()) { return shape_.rbegin(); } + + auto rend() -> decltype(shape_.rend()) { return shape_.rend(); } + auto rend() const -> decltype(shape_.rend()) { return shape_.rend(); } + + bool operator==(const Shape& other) const { + return size() == other.size() && std::equal(begin(), end(), other.begin()); + } + + bool operator!=(const Shape& other) const { return !(*this == other); } + + std::string toString() const { + std::stringstream strm; + strm << "shape=" << (*this)[0]; + for(int i = 1; i < size(); ++i) + strm << "x" << (*this)[i]; + strm << " size=" << elements() << " (" << elements() * sizeof(float) + << "B)"; + return strm.str(); + } + + friend std::ostream& operator<<(std::ostream& strm, const Shape& shape) { + strm << shape.toString(); + return strm; + } + + operator std::string() const { + std::stringstream ss; + ss << *this; + return ss.str(); + } + + int axis(int ax) { + if(ax < 0) + return size() + ax; + else + return ax; + } + + static Shape broadcast(const std::vector<Shape>& shapes) { + int maxDims = 0; + for(auto& s : shapes) + if(s.size() > maxDims) + maxDims = s.size(); + + Shape shape; + shape.resize(maxDims); + + for(auto& s : shapes) { + for(int i = 0; i < s.size(); ++i) { + ABORT_IF(shape[-i] != s[-i] && shape[-i] != 1 && s[-i] != 1, + "Shapes {} and {} cannot be broadcasted", + (std::string)shape, + (std::string)s); + shape.set(-i, std::max(shape[-i], s[-i])); } - return shape; - } - - template <typename T> - static Shape broadcast(const std::initializer_list<T>& il) { - return broadcast(std::vector<T>(il)); } - - template <typename T> - static Shape broadcast(const std::vector<T>& nodes) { - int maxDims = 0; - for(auto& n : nodes) - if(n->shape().size() > maxDims) - maxDims = n->shape().size(); - - Shape shape; - shape.resize(maxDims); - - for(auto& node : nodes) { - const Shape& shapen = node->shape(); - for(int i = 1; i <= shapen.size(); ++i) { - ABORT_IF(shape[-i] != shapen[-i] && shape[-i] != 1 && shapen[-i] != 1, - "Shapes {} and {} cannot be broadcasted", - (std::string)shape, - (std::string)shapen); - shape.set(-i, std::max(shape[-i], shapen[-i])); - } + return shape; + } + + template <typename T> + static Shape broadcast(const std::initializer_list<T>& il) { + return broadcast(std::vector<T>(il)); + } + + template <typename T> + static Shape broadcast(const std::vector<T>& nodes) { + int maxDims = 0; + for(auto& n : nodes) + if(n->shape().size() > maxDims) + maxDims = n->shape().size(); + + Shape shape; + shape.resize(maxDims); + + for(auto& node : nodes) { + const Shape& shapen = node->shape(); + for(int i = 1; i <= shapen.size(); ++i) { + ABORT_IF(shape[-i] != shapen[-i] && shape[-i] != 1 && shapen[-i] != 1, + "Shapes {} and {} cannot be broadcasted", + (std::string)shape, + (std::string)shapen); + shape.set(-i, std::max(shape[-i], shapen[-i])); } - return shape; } + return shape; + } }; - } diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h index 0a6f801d..3ddf8752 100644 --- a/src/data/batch_generator.h +++ b/src/data/batch_generator.h @@ -1,11 +1,11 @@ #pragma once +#include <boost/timer/timer.hpp> +#include <condition_variable> #include <deque> #include <functional> -#include <queue> #include <mutex> -#include <condition_variable> -#include <boost/timer/timer.hpp> +#include <queue> #include "common/config.h" #include "data/batch_stats.h" @@ -47,16 +47,17 @@ private: void fillBatches(bool shuffle = true) { typedef typename sample::value_type Item; - auto itemCmp = [](const Item& sa, const Item& sb) { - return sa.size() < sb.size(); - }; + auto itemCmp + = [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); }; auto cmpSrc = [itemCmp](const sample& a, const sample& b) { - return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end(), itemCmp); + return std::lexicographical_compare( + a.begin(), a.end(), b.begin(), b.end(), itemCmp); }; auto cmpTrg = [itemCmp](const sample& a, const sample& b) { - return std::lexicographical_compare(a.rbegin(), a.rend(), b.rbegin(), b.rend(), itemCmp); + return std::lexicographical_compare( + a.rbegin(), a.rend(), b.rbegin(), b.rend(), itemCmp); }; auto cmpNone = [](const sample& a, const sample& b) { return &a < &b; }; @@ -168,9 +169,8 @@ public: operator bool() const { // wait if empty but loading std::unique_lock<std::mutex> lock(loadMutex_); - loadCondition_.wait(lock, [this]{ - return loadReady_ || !bufferedBatches_.empty(); - }); + loadCondition_.wait( + lock, [this] { return loadReady_ || !bufferedBatches_.empty(); }); return !bufferedBatches_.empty(); } @@ -178,15 +178,16 @@ public: BatchPtr next() { { std::unique_lock<std::mutex> lock(loadMutex_); - loadCondition_.wait(lock, [this]{ - return loadReady_ || !bufferedBatches_.empty(); - }); + loadCondition_.wait( + lock, [this] { return loadReady_ || !bufferedBatches_.empty(); }); } ABORT_IF(bufferedBatches_.empty(), "No batches to fetch, run prepare()"); currentBatch_ = bufferedBatches_.front(); - if(loadReady_ && bufferedBatches_.size() <= std::max(options_->get<int>("maxi-batch") / 5, 1)) { + if(loadReady_ + && bufferedBatches_.size() + <= std::max(options_->get<int>("maxi-batch") / 5, 1)) { { std::unique_lock<std::mutex> lock(loadMutex_); loadReady_ = false; diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h index ebd157bc..cb2a1518 100644 --- a/src/data/corpus_base.h +++ b/src/data/corpus_base.h @@ -400,7 +400,6 @@ public: std::cerr << std::endl; } } - }; class CorpusIterator; diff --git a/src/data/corpus_nbest.cpp b/src/data/corpus_nbest.cpp index 9eefe898..105b5acd 100644 --- a/src/data/corpus_nbest.cpp +++ b/src/data/corpus_nbest.cpp @@ -1,7 +1,7 @@ #include <random> -#include "data/corpus_nbest.h" #include "common/utils.h" +#include "data/corpus_nbest.h" namespace marian { namespace data { @@ -10,8 +10,8 @@ CorpusNBest::CorpusNBest(Ptr<Config> options, bool translate /*= false*/) : CorpusBase(options, translate) {} CorpusNBest::CorpusNBest(std::vector<std::string> paths, - std::vector<Ptr<Vocab>> vocabs, - Ptr<Config> options) + std::vector<Ptr<Vocab>> vocabs, + Ptr<Config> options) : CorpusBase(paths, vocabs, options) {} int numFromNbest(const std::string& line) { @@ -19,7 +19,8 @@ int numFromNbest(const std::string& line) { Split(line, fields, " ||| ", true); ABORT_IF(fields.size() < 4, "Too few fields ({}) in line \"{}\", is this a correct n-best list?", - fields.size(), line); + fields.size(), + line); return std::stoi(fields[0]); } @@ -28,7 +29,8 @@ std::string lineFromNbest(const std::string& line) { Split(line, fields, " ||| ", true); ABORT_IF(fields.size() < 4, "Too few fields ({}) in line \"{}\", is this a correct n-best list?", - fields.size(), line); + fields.size(), + line); return fields[1]; } @@ -56,7 +58,8 @@ SentenceTuple CorpusNBest::next() { for(size_t i = 0; i < last; ++i) { if(curr_num > lastNum_) { ABORT_IF(!std::getline((std::istream&)*files_[i], lastLines_[i]), - "Too few lines in input {}", i); + "Too few lines in input {}", + i); } addWordsToSentenceTuple(lastLines_[i], i, tup); } @@ -90,6 +93,5 @@ void CorpusNBest::reset() { files_.emplace_back(new InputFileStream(path)); } } - } } diff --git a/src/data/corpus_sqlite.cpp b/src/data/corpus_sqlite.cpp index 04cbc1b9..9f2d95bb 100644 --- a/src/data/corpus_sqlite.cpp +++ b/src/data/corpus_sqlite.cpp @@ -154,6 +154,5 @@ void CorpusSQLite::restore(Ptr<TrainingState> ts) { reset(); } } - } } diff --git a/src/data/corpus_sqlite.h b/src/data/corpus_sqlite.h index c604edab..2e871805 100644 --- a/src/data/corpus_sqlite.h +++ b/src/data/corpus_sqlite.h @@ -19,7 +19,6 @@ #include <SQLiteCpp/SQLiteCpp.h> #include <SQLiteCpp/sqlite3/sqlite3.h> - static void SQLiteRandomSeed(sqlite3_context* context, int argc, sqlite3_value** argv) { diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp index 07d16c33..196c147c 100644 --- a/src/data/vocab.cpp +++ b/src/data/vocab.cpp @@ -138,8 +138,9 @@ void Vocab::create(const std::string& vocabPath, const std::string& trainPath) { "Specified vocab directory {} does not exist", dir); - ABORT_IF(!dir.empty() && !(boost::filesystem::status(dir).permissions() - & boost::filesystem::owner_write), + ABORT_IF(!dir.empty() + && !(boost::filesystem::status(dir).permissions() + & boost::filesystem::owner_write), "No write permission in vocab directory {}", dir); diff --git a/src/functional/array.h b/src/functional/array.h index 7e415abb..9d187999 100644 --- a/src/functional/array.h +++ b/src/functional/array.h @@ -32,7 +32,5 @@ struct Array { data_[i] = val; } }; - } - } diff --git a/src/functional/floats.h b/src/functional/floats.h index 679c4750..33478843 100644 --- a/src/functional/floats.h +++ b/src/functional/floats.h @@ -4,93 +4,90 @@ #include "functional/operands.h" namespace marian { - namespace functional { - - namespace float2unsigned { - constexpr float abs(float x) { return x < 0 ? -x : x; } - - constexpr int exponent(float x) { - return abs(x) >= 2 ? exponent(x / 2) + 1 : - abs(x) < 1 ? exponent(x * 2) - 1 : - 0; - } - - constexpr float scalbn(float value, int exponent) { - return exponent == 0 ? value : - exponent > 0 ? scalbn(value * 2, exponent - 1) : - scalbn(value / 2, exponent + 1); - } - - constexpr unsigned mantissa(float x, int exp) { - // remove hidden 1 and bias the exponent to get integer - return abs(x) < std::numeric_limits<float>::infinity() ? - scalbn(scalbn(abs(x), -exp) - 1, 23) : 0; - } - - constexpr unsigned to_binary(float x, unsigned sign, int exp) { - return sign * (1u << 31) - + (exp + 127) * (1u << 23) - + mantissa(x, exp); - } - - constexpr unsigned to_binary(float x) { - return x == 0 ? 0 : to_binary(x, x < 0, exponent(x)); - } - - } - - namespace unsigned2float { - - constexpr float sign(unsigned i) { - return (i & (1u << 31)) ? -1.f : 1.f; - } - - constexpr int exponent(unsigned i) { - return int((i >> 23) & 255u) - 127; - } - - constexpr float sig(unsigned i, unsigned shift) { - return ((i >> shift) & 1u) * 1.f / (1u << (23 - shift)) - + (shift > 0 ? sig(i, shift - 1) : 0); - } - - constexpr float powr(int exp) { - return exp > 0 ? 2.f * powr(exp - 1) : 1.f; - } - - constexpr float pow(int exp) { - return exp < 0 ? 1.f / powr(-exp) : powr(exp); - } - - constexpr float from_binary(unsigned i) { - return (1.f + sig(i, 22u)) - * pow(exponent(i)) - * sign(i); - } - } - - constexpr unsigned f2i(float x) { - return float2unsigned::to_binary(x); - } - - constexpr float i2f(float x) { - return unsigned2float::from_binary(x); - } - - template <unsigned V> - struct F { - static constexpr auto value = i2f(V); - static constexpr auto binary = V; - - template <typename ...Args> - __HDI__ constexpr float operator()(Args&&... args) const { - return value; - } - - std::string to_string() { - return "F<" + std::to_string(value) + ">"; - } - }; - +namespace functional { + +namespace float2unsigned { +constexpr float abs(float x) { + return x < 0 ? -x : x; +} + +// clang-format off +constexpr int exponent(float x) { + return abs(x) >= 2 ? exponent(x / 2) + 1 : + abs(x) < 1 ? exponent(x * 2) - 1 : + 0; +} + +constexpr float scalbn(float value, int exponent) { + return exponent == 0 ? value : + exponent > 0 ? scalbn(value * 2, exponent - 1) : + scalbn(value / 2, exponent + 1); +} +// clang-format on + +constexpr unsigned mantissa(float x, int exp) { + // remove hidden 1 and bias the exponent to get integer + return abs(x) < std::numeric_limits<float>::infinity() + ? scalbn(scalbn(abs(x), -exp) - 1, 23) + : 0; +} + +constexpr unsigned to_binary(float x, unsigned sign, int exp) { + return sign * (1u << 31) + (exp + 127) * (1u << 23) + mantissa(x, exp); +} + +constexpr unsigned to_binary(float x) { + return x == 0 ? 0 : to_binary(x, x < 0, exponent(x)); +} +} + +namespace unsigned2float { + +constexpr float sign(unsigned i) { + return (i & (1u << 31)) ? -1.f : 1.f; +} + +constexpr int exponent(unsigned i) { + return int((i >> 23) & 255u) - 127; +} + +constexpr float sig(unsigned i, unsigned shift) { + return ((i >> shift) & 1u) * 1.f / (1u << (23 - shift)) + + (shift > 0 ? sig(i, shift - 1) : 0); +} + +constexpr float powr(int exp) { + return exp > 0 ? 2.f * powr(exp - 1) : 1.f; +} + +constexpr float pow(int exp) { + return exp < 0 ? 1.f / powr(-exp) : powr(exp); +} + +constexpr float from_binary(unsigned i) { + return (1.f + sig(i, 22u)) * pow(exponent(i)) * sign(i); +} +} + +constexpr unsigned f2i(float x) { + return float2unsigned::to_binary(x); +} + +constexpr float i2f(float x) { + return unsigned2float::from_binary(x); +} + +template <unsigned V> +struct F { + static constexpr auto value = i2f(V); + static constexpr auto binary = V; + + template <typename... Args> + __HDI__ constexpr float operator()(Args&&... args) const { + return value; } -}
\ No newline at end of file + + std::string to_string() { return "F<" + std::to_string(value) + ">"; } +}; +} +} diff --git a/src/functional/functional.h b/src/functional/functional.h index c5a174e8..42ebcdeb 100644 --- a/src/functional/functional.h +++ b/src/functional/functional.h @@ -4,25 +4,25 @@ #include "functional/predicates.h" namespace marian { - namespace functional { +namespace functional { - template <int N> - using ref = Assignee<N>; +template <int N> +using ref = Assignee<N>; - static ref<1> _1; - static ref<2> _2; - static ref<3> _3; - static ref<4> _4; - static ref<5> _5; - static ref<6> _6; - static ref<7> _7; - static ref<8> _8; - static ref<9> _9; +static ref<1> _1; +static ref<2> _2; +static ref<3> _3; +static ref<4> _4; +static ref<5> _5; +static ref<6> _6; +static ref<7> _7; +static ref<8> _8; +static ref<9> _9; - static C<0> _0c; - static C<1> _1c; - static C<2> _2c; - static C<-1> _1cneg; - static C<-2> _2cneg; - } +static C<0> _0c; +static C<1> _1c; +static C<2> _2c; +static C<-1> _1cneg; +static C<-2> _2cneg; +} }
\ No newline at end of file diff --git a/src/functional/operands.h b/src/functional/operands.h index f3f3c22e..573770d1 100644 --- a/src/functional/operands.h +++ b/src/functional/operands.h @@ -5,71 +5,69 @@ #include "functional/defs.h" namespace marian { - namespace functional { - - template <class C> - using IsClass = typename std::enable_if<std::is_class<C>::value, C>::type; - - template <int N> - struct Select { - template <typename T, typename ...Args> - __HDI__ static auto apply(T&& arg, Args&&... args) -> decltype(Select<N-1>::apply(args...)) { - return Select<N-1>::apply(args...); - } - }; - - template <> - struct Select<0> { - template <typename T, typename ...Args> - __HDI__ static T apply(T&& arg, Args&&... args) { - return arg; - } - }; +namespace functional { + +template <class C> +using IsClass = typename std::enable_if<std::is_class<C>::value, C>::type; + +template <int N> +struct Select { + template <typename T, typename... Args> + __HDI__ static auto apply(T&& arg, Args&&... args) + -> decltype(Select<N - 1>::apply(args...)) { + return Select<N - 1>::apply(args...); + } +}; + +template <> +struct Select<0> { + template <typename T, typename... Args> + __HDI__ static T apply(T&& arg, Args&&... args) { + return arg; + } +}; /******************************************************************************/ - template <int V> - struct C { - static constexpr auto value = V; +template <int V> +struct C { + static constexpr auto value = V; - template <typename ...Args> - __HDI__ float operator()(Args&&... args) { return V; } + template <typename... Args> + __HDI__ float operator()(Args&&... args) { + return V; + } - std::string to_string() { - return "C<" + std::to_string(V) + ">"; - } - }; + std::string to_string() { return "C<" + std::to_string(V) + ">"; } +}; /******************************************************************************/ - struct Capture { - float value; +struct Capture { + float value; - Capture(float val) : value(val) {}; + Capture(float val) : value(val){}; - template <typename ...Args> - __HDI__ float operator()(Args&&... args) { return value; } + template <typename... Args> + __HDI__ float operator()(Args&&... args) { + return value; + } - std::string to_string() { - return "Cap(" + std::to_string(value) + ")"; - } - }; + std::string to_string() { return "Cap(" + std::to_string(value) + ")"; } +}; /******************************************************************************/ - template <int N> - struct Var { - static constexpr auto index = N; - - template <typename ...Args> - __HDI__ float& operator()(Args&&... args) { - return Select<N-1>::apply(args...); - } - - std::string to_string() { - return "Var<" + std::to_string(N) + ">"; - } - }; +template <int N> +struct Var { + static constexpr auto index = N; + template <typename... Args> + __HDI__ float& operator()(Args&&... args) { + return Select<N - 1>::apply(args...); } + + std::string to_string() { return "Var<" + std::to_string(N) + ">"; } +}; +} } diff --git a/src/functional/predicates.h b/src/functional/predicates.h index eb091d43..41a741bb 100644 --- a/src/functional/predicates.h +++ b/src/functional/predicates.h @@ -4,237 +4,230 @@ #include "functional/operands.h" namespace marian { - namespace functional { - - template <typename Function, typename X> - struct UnaryFunctor { - X x; - - template <class Arg> - UnaryFunctor(Arg a) : x(a) {} - - template <typename ...Args> - __HDI__ float operator()(Args&&... args) { - return Function::apply(x(args...)); - } - - std::string to_string() { - return Function::n() + "<" + x.to_string() + ">"; - } - }; - - template <class Function, class X, class Y> - struct BinaryFunctor { - X x; - Y y; - - template <class Arg1, class Arg2> - BinaryFunctor(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {} - - template <typename ...Args> - __HDI__ float operator()(Args&&... args) { - return Function::apply(x(args...), y(args...)); - } - - std::string to_string() { - return Function::n() + - "<" + x.to_string() + - "," + y.to_string() + ">"; - } - }; - - #define UNARY(name, name2, func) \ - namespace elem { \ - struct name { \ - __HDI__ static float apply(float x) { return func; } \ - static std::string n() { return #name; }\ - }; \ - }\ - template <class X> using name = UnaryFunctor<elem::name, X>;\ - template <typename X>\ - name<IsClass<X>> name2(X x) {\ - return name<X>(x);\ - }\ - static name<Capture> name2(Capture x) {\ - return name<Capture>(x);\ - } - - #define BINARY(name, name2, func) \ - namespace elem { \ - struct name { \ - __HDI__ static float apply(float x, float y) { return func; } \ - static std::string n() { return #name; }\ - }; \ - }\ - template <class X, class Y> using name = BinaryFunctor<elem::name, X, Y>;\ - template <class X, class Y>\ - name<IsClass<X>, IsClass<Y>> name2(X x, Y y) {\ - return name<X, Y>(x, y);\ - }\ - template <class Y>\ - name<Capture, IsClass<Y>> name2(Capture x, Y y) {\ - return name<Capture, Y>(x, y);\ - }\ - template <class X>\ - name<IsClass<X>, Capture> name2(X x, Capture y) {\ - return name<X, Capture>(x, y);\ - } - - UNARY(Tanh, tanh, tanhf(x)); - UNARY(Sin, sin, sinf(x)); - UNARY(Cos, cos, cosf(x)); - UNARY(Tan, tan, tanf(x)); - UNARY(Log, log, logf(x)); - UNARY(Exp, exp, expf(x)); - UNARY(Abs, abs, fabs(x)); - UNARY(Sqrt, sqrt, sqrtf(x)); - UNARY(Neg, operator-, -x); - UNARY(Logit, logit, x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x)))); - - BINARY(Plus, operator+, x + y); - BINARY(Minus, operator-, x - y); - BINARY(Mult, operator*, x * y); - BINARY(Div, operator/, x / y); - - UNARY(Negate, operator!, !x); - BINARY(Eq, operator==, x == y); - BINARY(NEq, operator!=, x != y); - BINARY(Gt, operator>, x > y); - BINARY(Lt, operator<, x < y); - BINARY(Geq, operator>=, x >= y); - BINARY(Leq, operator<=, x <= y); - BINARY(And, operator&&, x && y); - BINARY(Or, operator||, x || y); - - template <typename T> - __HDI__ T sgn(T val) { - return (float(0) < val) - (val < float(0)); - } - - UNARY(Sgn, sgn, sgn(x)); - - BINARY(Pow, pow, pow(x, y)); - - BINARY(Clip, clip, fabs(x) >= y ? sgn(x) * y : x); - - UNARY(sReLU, ReLU, x > 0.f ? x : 0.f); - UNARY(sReLUBack, ReLUback, x > 0.f ? 1.f : 0.f); - BINARY(sPReLU, PReLU, x > 0.f ? x : x * y); - BINARY(sPReLUBack, PReLUback, x > 0.f ? 1.f : y); - - template <class Function, class X, class Y, class Z> - struct TernaryFunctor { - X x; - Y y; - Z z; - - template <class Arg1, class Arg2, class Arg3> - TernaryFunctor(Arg1 arg1, Arg2 arg2, Arg3 arg3) - : x(arg1), y(arg2), z(arg3) {} - - template <typename ...Args> - __HDI__ float operator()(Args&&... args) { - return Function::apply(x(args...), y(args...), z(args...)); - } - }; - - #define TERNARY(name, name2, func) \ - namespace elem { \ - struct name { \ - __HDI__ static float apply(float x, float y, float z) { return func; } \ - }; \ - }\ - template <class X, class Y, class Z> using name = TernaryFunctor<elem::name, X, Y, Z>;\ - template <typename X, typename Y, typename Z>\ - name<IsClass<X>, IsClass<Y>, IsClass<Z>> name2(X x, Y y, Z z) {\ - return name<X, Y, Z>(x, y, z);\ - }\ - template <typename X, typename Z>\ - name<IsClass<X>, Capture, IsClass<Z>> name2(X x, Capture y, Z z) {\ - return name<X, Capture, Z>(x, y, z);\ - }\ - template <typename Y, typename Z>\ - name<Capture, IsClass<Y>, IsClass<Z>> name2(Capture x, Y y, Z z) {\ - return name<Capture, Y, Z>(x, y, z);\ - }\ - template <typename X>\ - name<IsClass<X>, Capture, Capture> name2(X x, Capture y, Capture z) {\ - return name<X, Capture, Capture>(x, y, z);\ - }\ - template <typename Y>\ - name<Capture, IsClass<Y>, Capture> name2(Capture x, Y y, Capture z) {\ - return name<Capture, Y, Capture>(x, y, z);\ - }\ - template <typename Z>\ - name<Capture, Capture, IsClass<Z>> name2(Capture x, Capture y, Z z) {\ - return name<Capture, Capture, Z>(x, y, z);\ - } - - TERNARY(IfThenElse, if_then_else, x ? y : z); - - - - template <class X, class Y> - struct Assign { - X x; - Y y; - - template <class Arg1, class Arg2> - Assign(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {} - - template <typename ...Args> - __HDI__ float operator()(Args&&... args) { - return x(args...) = y(args...); - } - }; - - template <int N> - struct Assignee { - Var<N> var; - - Assignee() {} - Assignee(Var<N> v) : var(v) {} - - template <typename ...Args> - __HDI__ float& operator()(Args&&... args) { - return var(args...); - } - - template <class X> - Assign<Var<N>, IsClass<X>> operator=(X x) { - return Assign<Var<N>, X>(var, x); - } - - Assign<Var<N>, Capture> operator=(Capture x) { - return Assign<Var<N>, Capture>(var, x); - } - - template <class X> - auto operator+=(X x)->decltype(*this = *this + x) { - return *this = *this + x; - } - - template <class X> - auto operator-=(X x)->decltype(*this = *this - x) { - return *this = *this - x; - } - - template <class X> - auto operator*=(X x)->decltype(*this = *this * x) { - return *this = *this * x; - } - - template <class X> - auto operator/=(X x)->decltype(*this = *this / x) { - return *this = *this / x; - } - - std::string to_string() { - return var.to_string(); - } - }; +namespace functional { -/******************************************************************************/ +template <typename Function, typename X> +struct UnaryFunctor { + X x; + + template <class Arg> + UnaryFunctor(Arg a) : x(a) {} + + template <typename... Args> + __HDI__ float operator()(Args&&... args) { + return Function::apply(x(args...)); + } + + std::string to_string() { return Function::n() + "<" + x.to_string() + ">"; } +}; + +template <class Function, class X, class Y> +struct BinaryFunctor { + X x; + Y y; + + template <class Arg1, class Arg2> + BinaryFunctor(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {} + + template <typename... Args> + __HDI__ float operator()(Args&&... args) { + return Function::apply(x(args...), y(args...)); + } + + std::string to_string() { + return Function::n() + "<" + x.to_string() + "," + y.to_string() + ">"; + } +}; + +#define UNARY(name, name2, func) \ + namespace elem { \ + struct name { \ + __HDI__ static float apply(float x) { return func; } \ + static std::string n() { return #name; } \ + }; \ + } \ + template <class X> \ + using name = UnaryFunctor<elem::name, X>; \ + template <typename X> \ + name<IsClass<X>> name2(X x) { \ + return name<X>(x); \ + } \ + static name<Capture> name2(Capture x) { return name<Capture>(x); } + +#define BINARY(name, name2, func) \ + namespace elem { \ + struct name { \ + __HDI__ static float apply(float x, float y) { return func; } \ + static std::string n() { return #name; } \ + }; \ + } \ + template <class X, class Y> \ + using name = BinaryFunctor<elem::name, X, Y>; \ + template <class X, class Y> \ + name<IsClass<X>, IsClass<Y>> name2(X x, Y y) { \ + return name<X, Y>(x, y); \ + } \ + template <class Y> \ + name<Capture, IsClass<Y>> name2(Capture x, Y y) { \ + return name<Capture, Y>(x, y); \ + } \ + template <class X> \ + name<IsClass<X>, Capture> name2(X x, Capture y) { \ + return name<X, Capture>(x, y); \ + } + +UNARY(Tanh, tanh, tanhf(x)); +UNARY(Sin, sin, sinf(x)); +UNARY(Cos, cos, cosf(x)); +UNARY(Tan, tan, tanf(x)); +UNARY(Log, log, logf(x)); +UNARY(Exp, exp, expf(x)); +UNARY(Abs, abs, fabs(x)); +UNARY(Sqrt, sqrt, sqrtf(x)); +UNARY(Neg, operator-, -x); +UNARY(Logit, + logit, + x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x)))); + +BINARY(Plus, operator+, x + y); +BINARY(Minus, operator-, x - y); +BINARY(Mult, operator*, x* y); +BINARY(Div, operator/, x / y); + +UNARY(Negate, operator!, !x); +BINARY(Eq, operator==, x == y); +BINARY(NEq, operator!=, x != y); +BINARY(Gt, operator>, x> y); +BINARY(Lt, operator<, x<y); +BINARY(Geq, operator>=, x >= y); +BINARY(Leq, operator<=, x <= y); +BINARY(And, operator&&, x&& y); +BINARY(Or, operator||, x || y); + +template <typename T> +__HDI__ T sgn(T val) { + return (float(0) < val) - (val < float(0)); +} + +UNARY(Sgn, sgn, sgn(x)); + +BINARY(Pow, pow, pow(x, y)); + +BINARY(Clip, clip, fabs(x) >= y ? sgn(x) * y : x); + +UNARY(sReLU, ReLU, x > 0.f ? x : 0.f); +UNARY(sReLUBack, ReLUback, x > 0.f ? 1.f : 0.f); +BINARY(sPReLU, PReLU, x > 0.f ? x : x * y); +BINARY(sPReLUBack, PReLUback, x > 0.f ? 1.f : y); + +template <class Function, class X, class Y, class Z> +struct TernaryFunctor { + X x; + Y y; + Z z; + + template <class Arg1, class Arg2, class Arg3> + TernaryFunctor(Arg1 arg1, Arg2 arg2, Arg3 arg3) : x(arg1), y(arg2), z(arg3) {} + + template <typename... Args> + __HDI__ float operator()(Args&&... args) { + return Function::apply(x(args...), y(args...), z(args...)); + } +}; + +#define TERNARY(name, name2, func) \ + namespace elem { \ + struct name { \ + __HDI__ static float apply(float x, float y, float z) { return func; } \ + }; \ + } \ + template <class X, class Y, class Z> \ + using name = TernaryFunctor<elem::name, X, Y, Z>; \ + template <typename X, typename Y, typename Z> \ + name<IsClass<X>, IsClass<Y>, IsClass<Z>> name2(X x, Y y, Z z) { \ + return name<X, Y, Z>(x, y, z); \ + } \ + template <typename X, typename Z> \ + name<IsClass<X>, Capture, IsClass<Z>> name2(X x, Capture y, Z z) { \ + return name<X, Capture, Z>(x, y, z); \ + } \ + template <typename Y, typename Z> \ + name<Capture, IsClass<Y>, IsClass<Z>> name2(Capture x, Y y, Z z) { \ + return name<Capture, Y, Z>(x, y, z); \ + } \ + template <typename X> \ + name<IsClass<X>, Capture, Capture> name2(X x, Capture y, Capture z) { \ + return name<X, Capture, Capture>(x, y, z); \ + } \ + template <typename Y> \ + name<Capture, IsClass<Y>, Capture> name2(Capture x, Y y, Capture z) { \ + return name<Capture, Y, Capture>(x, y, z); \ + } \ + template <typename Z> \ + name<Capture, Capture, IsClass<Z>> name2(Capture x, Capture y, Z z) { \ + return name<Capture, Capture, Z>(x, y, z); \ + } +TERNARY(IfThenElse, if_then_else, x ? y : z); + +template <class X, class Y> +struct Assign { + X x; + Y y; + + template <class Arg1, class Arg2> + Assign(Arg1 arg1, Arg2 arg2) : x(arg1), y(arg2) {} + + template <typename... Args> + __HDI__ float operator()(Args&&... args) { + return x(args...) = y(args...); + } +}; + +template <int N> +struct Assignee { + Var<N> var; + + Assignee() {} + Assignee(Var<N> v) : var(v) {} + + template <typename... Args> + __HDI__ float& operator()(Args&&... args) { + return var(args...); + } + + template <class X> + Assign<Var<N>, IsClass<X>> operator=(X x) { + return Assign<Var<N>, X>(var, x); } + + Assign<Var<N>, Capture> operator=(Capture x) { + return Assign<Var<N>, Capture>(var, x); + } + + template <class X> + auto operator+=(X x) -> decltype(*this = *this + x) { + return *this = *this + x; + } + + template <class X> + auto operator-=(X x) -> decltype(*this = *this - x) { + return *this = *this - x; + } + + template <class X> + auto operator*=(X x) -> decltype(*this = *this * x) { + return *this = *this * x; + } + + template <class X> + auto operator/=(X x) -> decltype(*this = *this / x) { + return *this = *this / x; + } + + std::string to_string() { return var.to_string(); } +}; + +/******************************************************************************/ +} } diff --git a/src/functional/shape.h b/src/functional/shape.h index e8781531..3212a3ed 100644 --- a/src/functional/shape.h +++ b/src/functional/shape.h @@ -17,7 +17,6 @@ namespace functional { * @brief Represents the size of each dimension in a tensor. */ - template <const int N> struct ConstantShape { Array<int, N> shape_; @@ -32,10 +31,10 @@ struct ConstantShape { } __HD__ ConstantShape(const ConstantShape& shape) - : shape_(shape.shape_), - stride_(shape.stride_), - bstride_(shape.bstride_), - elements_(shape.elements_) {} + : shape_(shape.shape_), + stride_(shape.stride_), + bstride_(shape.bstride_), + elements_(shape.elements_) {} ConstantShape(const Shape& shape) { size_t filled = shape.size(); @@ -43,7 +42,8 @@ struct ConstantShape { ABORT_IF(filled > N, "Recompile with CONST_SHAPE_DIMS >= " + std::to_string(filled)); - std::copy(shape.shape_.begin(), shape.shape_.end(), shape_.begin() + N - filled); + std::copy( + shape.shape_.begin(), shape.shape_.end(), shape_.begin() + N - filled); if(N - filled) std::fill_n(shape_.begin(), N - filled, 1); updateStrides(); @@ -51,7 +51,6 @@ struct ConstantShape { } __HDI__ void updateStrides() { - stride_[N - 1] = 1; bstride_[N - 1] = shape_[N - 1] == 1 ? 0 : stride_[N - 1]; @@ -73,7 +72,6 @@ struct ConstantShape { updateElements(); } - __HDI__ int dim(int i) { return shape_[i]; } __HDI__ int dim(int i) const { @@ -92,9 +90,7 @@ struct ConstantShape { __HDI__ static constexpr size_t size() { return N; } - __HDI__ int elements() const { - return elements_; - } + __HDI__ int elements() const { return elements_; } __HDI__ int index(const Array<int, N>& d) const { int i = 0; @@ -113,7 +109,7 @@ struct ConstantShape { __HDI__ void dims(int i, Array<int, N>& d) const { for(int j = 0; j < N; ++j) d[j] = (i / stride_[j]) % shape_[j]; - } + } __HDI__ bool operator==(const ConstantShape& other) const { for(int i = 0; i < N; ++i) @@ -128,7 +124,5 @@ struct ConstantShape { }; typedef ConstantShape<CONST_SHAPE_DIMS> Shape; - } - } diff --git a/src/functional/tensor.h b/src/functional/tensor.h index 8c48c11e..43fb8fe0 100644 --- a/src/functional/tensor.h +++ b/src/functional/tensor.h @@ -7,7 +7,7 @@ namespace marian { namespace functional { -template<typename T> +template <typename T> struct Tensor { T* data_; functional::Shape shape_; @@ -15,19 +15,20 @@ struct Tensor { __HD__ Tensor() {} __HD__ Tensor(T* ptr, const functional::Shape& shape) - : data_(ptr), shape_(shape) {} + : data_(ptr), shape_(shape) {} - __H__ Tensor(marian::Tensor t) - : data_(t->data()), shape_(t->shape()) {} + __H__ Tensor(marian::Tensor t) : data_(t->data()), shape_(t->shape()) {} __HDI__ float& operator[](size_t i) { return data_[i]; } __HDI__ const float& operator[](size_t i) const { return data_[i]; } - __HDI__ float& operator[](const functional::Array<int, functional::Shape::size()>& indices) { + __HDI__ float& operator[]( + const functional::Array<int, functional::Shape::size()>& indices) { return data_[shape_.index(indices)]; } - __HDI__ const float& operator[](const functional::Array<int, functional::Shape::size()>& indices) const { + __HDI__ const float& operator[]( + const functional::Array<int, functional::Shape::size()>& indices) const { return data_[shape_.index(indices)]; } @@ -37,6 +38,5 @@ struct Tensor { __HDI__ Shape& shape() { return shape_; } __HDI__ const Shape& shape() const { return shape_; } }; - } }
\ No newline at end of file diff --git a/src/functional/tmp.h b/src/functional/tmp.h index 9d155fa0..65cb49d8 100644 --- a/src/functional/tmp.h +++ b/src/functional/tmp.h @@ -12,82 +12,86 @@ struct FApply {}; template <class Functor> struct FApply<1, Functor> { - __HDI__ static float apply(Functor functor, - functional::Array<functional::Tensor<float>, 1>& in, - const functional::Array<int, 1>& indices) { + __HDI__ static float apply( + Functor functor, + functional::Array<functional::Tensor<float>, 1>& in, + const functional::Array<int, 1>& indices) { return functor(in[0][indices[0]]); } - __HDI__ static float apply(Functor functor, - functional::Array<functional::Tensor<float>, 1>& in, - int index) { + __HDI__ static float apply( + Functor functor, + functional::Array<functional::Tensor<float>, 1>& in, + int index) { return functor(in[0][index]); } }; template <class Functor> struct FApply<2, Functor> { - __HDI__ static float apply(Functor functor, - functional::Array<functional::Tensor<float>, 2>& in, - const functional::Array<int, 2>& indices) { + __HDI__ static float apply( + Functor functor, + functional::Array<functional::Tensor<float>, 2>& in, + const functional::Array<int, 2>& indices) { return functor(in[0][indices[0]], in[1][indices[1]]); } - __HDI__ static float apply(Functor functor, - functional::Array<functional::Tensor<float>, 2>& in, - int index) { + __HDI__ static float apply( + Functor functor, + functional::Array<functional::Tensor<float>, 2>& in, + int index) { return functor(in[0][index], in[1][index]); } }; template <class Functor> struct FApply<3, Functor> { - __HDI__ static float apply(Functor functor, - functional::Array<functional::Tensor<float>, 3>& in, - const functional::Array<int, 3>& indices) { + __HDI__ static float apply( + Functor functor, + functional::Array<functional::Tensor<float>, 3>& in, + const functional::Array<int, 3>& indices) { return functor(in[0][indices[0]], in[1][indices[1]], in[2][indices[2]]); } - __HDI__ static float apply(Functor functor, - functional::Array<functional::Tensor<float>, 3>& in, - int index) { + __HDI__ static float apply( + Functor functor, + functional::Array<functional::Tensor<float>, 3>& in, + int index) { return functor(in[0][index], in[1][index], in[2][index]); } }; - template <class Functor> struct FApply<4, Functor> { - __HDI__ static float apply(Functor functor, - functional::Array<functional::Tensor<float>, 4>& in, - const functional::Array<int, 4>& indices) { + __HDI__ static float apply( + Functor functor, + functional::Array<functional::Tensor<float>, 4>& in, + const functional::Array<int, 4>& indices) { return functor(in[0][indices[0]], in[1][indices[1]], in[2][indices[2]], in[3][indices[3]]); } - __HDI__ static float apply(Functor functor, - functional::Array<functional::Tensor<float>, 4>& in, - int index) { - return functor(in[0][index], - in[1][index], - in[2][index], - in[3][index]); + __HDI__ static float apply( + Functor functor, + functional::Array<functional::Tensor<float>, 4>& in, + int index) { + return functor(in[0][index], in[1][index], in[2][index], in[3][index]); } }; template <size_t K, class Functor> - __HDI__ float apply(Functor functor, - functional::Array<functional::Tensor<float>, K>& in, - const functional::Array<int, K>& indices) { +__HDI__ float apply(Functor functor, + functional::Array<functional::Tensor<float>, K>& in, + const functional::Array<int, K>& indices) { return FApply<K, Functor>::apply(functor, in, indices); } template <size_t K, class Functor> - __HDI__ float apply(Functor functor, - functional::Array<functional::Tensor<float>, K>& in, - int index) { +__HDI__ float apply(Functor functor, + functional::Array<functional::Tensor<float>, K>& in, + int index) { return FApply<K, Functor>::apply(functor, in, index); } @@ -96,11 +100,12 @@ template <size_t K, class Functor> template <size_t n, size_t N, size_t K> struct Loop { template <class Functor> - __HDI__ static float result(Functor functor, - functional::Array<functional::Tensor<float>, K>& in, - const functional::Array<int, K>& pAcc, - const functional::Array<int, N>& length, - const functional::Array<int, N>& dim) { + __HDI__ static float result( + Functor functor, + functional::Array<functional::Tensor<float>, K>& in, + const functional::Array<int, K>& pAcc, + const functional::Array<int, N>& length, + const functional::Array<int, N>& dim) { float sum = 0; functional::Array<int, K> acc; for(int i = 0; i < length[N - n]; ++i) { @@ -116,11 +121,12 @@ struct Loop { template <size_t N, size_t K> struct Loop<1, N, K> { template <class Functor> - __HDI__ static float result(Functor functor, - functional::Array<functional::Tensor<float>, K>& in, - const functional::Array<int, K>& pAcc, - const functional::Array<int, N>& length, - const functional::Array<int, N>& dim) { + __HDI__ static float result( + Functor functor, + functional::Array<functional::Tensor<float>, K>& in, + const functional::Array<int, K>& pAcc, + const functional::Array<int, N>& length, + const functional::Array<int, N>& dim) { float sum = 0; functional::Array<int, K> acc; for(int i = 0; i < length[N - 1]; ++i) { @@ -141,6 +147,5 @@ __HDI__ float loops(Functor functor, functional::Array<int, K> acc = {0}; return Loop<N, N, K>::result(functor, in, acc, length, dim); } - } -}
\ No newline at end of file +} diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index f0ae1ffa..4a0edb34 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -1,5 +1,5 @@ -#include <sstream> #include "graph/expression_graph.h" +#include <sstream> #include "tensors/tensor_operators.h" @@ -18,15 +18,12 @@ void ExpressionGraph::setDevice(DeviceId deviceId) { } Expr ExpressionGraph::dropout(float prob, const Shape& shape) { - return Expression<ConstantNode>(shared_from_this(), - shape, - [prob, this](Tensor t) { - Dropout(t, prob); - }); + return Expression<ConstantNode>( + shared_from_this(), shape, [prob, this](Tensor t) { Dropout(t, prob); }); } void ExpressionGraph::checkNan(Tensor t) { ABORT_IF(throwNaN_, "Not implemented"); - //ABORT_IF(throwNaN_ && IsNan(t), "Tensor has NaN"); + // ABORT_IF(throwNaN_ && IsNan(t), "Tensor has NaN"); } } diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index ea1645ec..c6cd4558 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -215,7 +215,9 @@ public: ABORT_IF(shape != p->shape(), "Requested shape {} for existing parameter '{}' does not match " "original shape {}", - shape, name, p->shape()); + shape, + name, + p->shape()); p->setTrainable(!fixed); add(p); @@ -239,10 +241,8 @@ public: return p; } - Expr constant(const Shape& shape, - const NodeInitializer& init) { - return Expression<ConstantNode>( - shared_from_this(), shape, init); + Expr constant(const Shape& shape, const NodeInitializer& init) { + return Expression<ConstantNode>(shared_from_this(), shape, init); } Expr ones(const Shape& shape) { diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index a1c9faa4..a4a8b079 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -126,7 +126,6 @@ Expr repeat(Expr a, size_t repeats, keywords::axis_k ax) { return concatenate(std::vector<Expr>(repeats, a), ax); } - Expr reshape(Expr a, Shape shape) { return Expression<ReshapeNodeOp>(a, shape); } @@ -165,10 +164,7 @@ Expr flatten(Expr a) { } Expr flatten_2d(Expr a) { - Shape shape = { - a->shape().elements() / a->shape()[-1], - a->shape()[-1] - }; + Shape shape = {a->shape().elements() / a->shape()[-1], a->shape()[-1]}; return Expression<ReshapeNodeOp>(a, shape); } @@ -232,17 +228,16 @@ Expr step(Expr a, int step, int axis) { } Expr cross_entropy(Expr a, Expr b) { - //auto sOrig = a->shape(); - //auto sOut = a->shape(); - //Shape sTemp({sOrig[0] * sOrig[2] * sOrig[3], sOrig[1], 1, 1}); - //sOut.set(1, 1); - //return reshape(Expression<CrossEntropyNodeOp>(reshape(a, sTemp), b), sOut); + // auto sOrig = a->shape(); + // auto sOut = a->shape(); + // Shape sTemp({sOrig[0] * sOrig[2] * sOrig[3], sOrig[1], 1, 1}); + // sOut.set(1, 1); + // return reshape(Expression<CrossEntropyNodeOp>(reshape(a, sTemp), b), sOut); return Expression<CrossEntropyNodeOp>(a, b); } -Expr affine(Expr a, Expr b, Expr c, - bool transA, bool transB, float scalar) { +Expr affine(Expr a, Expr b, Expr c, bool transA, bool transB, float scalar) { std::vector<Expr> nodes = {a, b, c}; return Expression<AffineNodeOp>(nodes, transA, transB, scalar); } @@ -299,6 +294,7 @@ Expr highway(Expr y, Expr x, Expr t) { } Expr highway(const std::string prefix, Expr x) { + // clang-format off size_t outDim = x->shape()[-1]; auto g = mlp::dense(x->graph()) ("prefix", prefix + "_highway_d1") @@ -311,6 +307,7 @@ Expr highway(const std::string prefix, Expr x) { ("activation", mlp::act::ReLU) .construct()->apply(x); return (g * relued) + ((1 - g) * x); + // clang-format on } // Expr batch_norm(Expr x, Expr gamma, Expr beta) { @@ -334,41 +331,26 @@ Expr shift(Expr a, Shape shift) { #ifdef CUDA_FOUND -Expr avg_pooling( - Expr x, - int height, - int width, - int padHeight, - int padWidth, - int strideHeight, - int strideWidth) { - return Expression<PoolingOp>(x, - height, - width, - padHeight, - padWidth, - strideHeight, - strideWidth, - "avg"); -} - -Expr max_pooling( - Expr x, - int height, - int width, - int padHeight, - int padWidth, - int strideHeight, - int strideWidth) -{ - return Expression<PoolingOp>(x, - height, - width, - padHeight, - padWidth, - strideHeight, - strideWidth, - "max"); +Expr avg_pooling(Expr x, + int height, + int width, + int padHeight, + int padWidth, + int strideHeight, + int strideWidth) { + return Expression<PoolingOp>( + x, height, width, padHeight, padWidth, strideHeight, strideWidth, "avg"); +} + +Expr max_pooling(Expr x, + int height, + int width, + int padHeight, + int padWidth, + int strideHeight, + int strideWidth) { + return Expression<PoolingOp>( + x, height, width, padHeight, padWidth, strideHeight, strideWidth, "max"); } Expr convert2cudnnFormat(Expr x) { @@ -377,13 +359,13 @@ Expr convert2cudnnFormat(Expr x) { int embSize = x->shape()[2]; std::vector<size_t> newIndeces; - for (int b = 0; b < numExamples; ++b) { - for (int t = 0; t < numWords; ++t) { + for(int b = 0; b < numExamples; ++b) { + for(int t = 0; t < numWords; ++t) { newIndeces.push_back((t * numExamples) + b); } } - auto xRows = reshape(x, {x->shape()[0] * x ->shape()[1], x->shape()[2]}); + auto xRows = reshape(x, {x->shape()[0] * x->shape()[1], x->shape()[2]}); Shape outShape({numExamples, 1, numWords, embSize}); return reshape(rows(xRows, newIndeces), outShape); @@ -397,8 +379,8 @@ Expr convertFromcudnnFormat(Expr x) { auto reshapedX = reshape(x, {batchDim * sentenceDim, embSize}); std::vector<size_t> newIndeces; - for (int t = 0; t < sentenceDim; ++t) { - for (int b = 0; b < batchDim; ++b) { + for(int t = 0; t < sentenceDim; ++t) { + for(int b = 0; b < batchDim; ++b) { newIndeces.push_back(b * sentenceDim + t); } } @@ -412,5 +394,4 @@ Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) { } #endif - } diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 1145be3c..c637105f 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -110,7 +110,6 @@ Expr mean(Expr a, keywords::axis_k ax = 0); Expr cross_entropy(Expr a, Expr b); - Expr scalar_product(Expr a, Expr b, keywords::axis_k ax = 0); Expr weighted_average(Expr in, Expr weights, keywords::axis_k ax = 0); @@ -161,6 +160,5 @@ Expr max_pooling(Expr x, int strideHeight = 1, int strideWidth = 1); -Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven=false); - +Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven = false); } diff --git a/src/graph/node.cpp b/src/graph/node.cpp index 1c93683c..721cb30f 100644 --- a/src/graph/node.cpp +++ b/src/graph/node.cpp @@ -1,4 +1,5 @@ #include "tensors/backend.h" + #include "graph/expression_graph.h" #include "graph/node.h" diff --git a/src/graph/node.h b/src/graph/node.h index 74af5771..15f223aa 100644 --- a/src/graph/node.h +++ b/src/graph/node.h @@ -33,8 +33,7 @@ protected: public: Node(Ptr<ExpressionGraph> graph, Shape shape) - : graph_(graph), - shape_(shape) {} + : graph_(graph), shape_(shape) {} virtual ~Node() { if(destroy_) { @@ -152,7 +151,7 @@ struct NaryNodeOp : public Node { } NaryNodeOp(const std::vector<Expr>& nodes) - : NaryNodeOp(nodes, nodes[0]->shape()) {} + : NaryNodeOp(nodes, nodes[0]->shape()) {} virtual ~NaryNodeOp() {} diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp index 0d131c61..6650ede6 100644 --- a/src/graph/node_initializers.cpp +++ b/src/graph/node_initializers.cpp @@ -109,9 +109,8 @@ void ortho(Tensor t) { NodeInitializer from_vector(const std::vector<float>& v) { auto vPtr = New<std::vector<float>>(v.begin(), v.end()); - return [vPtr](Tensor t) { - t->set(vPtr->data(), vPtr->data() + vPtr->size()); - }; + return + [vPtr](Tensor t) { t->set(vPtr->data(), vPtr->data() + vPtr->size()); }; } NodeInitializer from_vector(const std::vector<size_t>& v) { @@ -138,9 +137,9 @@ NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np) { // move this somewhere else NodeInitializer from_word2vec(const std::string& file, - int dimVoc, - int dimEmb, - bool normalize /*= false*/) { + int dimVoc, + int dimEmb, + bool normalize /*= false*/) { return [file, dimVoc, dimEmb, normalize](Tensor t) { auto embs = Word2VecReader().read(file, dimVoc, dimEmb); diff --git a/src/graph/node_initializers.h b/src/graph/node_initializers.h index 5b069657..bd74c6c4 100644 --- a/src/graph/node_initializers.h +++ b/src/graph/node_initializers.h @@ -70,9 +70,9 @@ NodeInitializer from_sparse_vector( NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np); NodeInitializer from_word2vec(const std::string& file, - int dimVoc, - int dimEmb, - bool normalize = false); + int dimVoc, + int dimEmb, + bool normalize = false); } } // namespace marian diff --git a/src/graph/node_operators.h b/src/graph/node_operators.h index 8720d0bb..4e97fff3 100644 --- a/src/graph/node_operators.h +++ b/src/graph/node_operators.h @@ -7,11 +7,12 @@ namespace marian { struct ConstantNode : public Node { - ConstantNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init) + ConstantNode(Ptr<ExpressionGraph> graph, + const Shape& shape, + const NodeInitializer& init) : Node(graph, shape), init_(new NodeInitializer(init)), initialized_(false) { - setTrainable(false); } @@ -41,11 +42,13 @@ private: }; struct ParamNode : public Node { - ParamNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init, bool fixed = false) + ParamNode(Ptr<ExpressionGraph> graph, + const Shape& shape, + const NodeInitializer& init, + bool fixed = false) : Node(graph, shape), init_(new NodeInitializer(init)), initialized_(false) { - setTrainable(!fixed); } diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index c9e67cd7..6fc08690 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -16,13 +16,8 @@ private: float scalar_; public: - DotNodeOp(Expr a, - Expr b, - bool transA, - bool transB, - float scalar) - : NaryNodeOp({a, b}, - newShape(a, b, transA, transB)), + DotNodeOp(Expr a, Expr b, bool transA, bool transB, float scalar) + : NaryNodeOp({a, b}, newShape(a, b, transA, transB)), transA_(transA), transB_(transB), scalar_(scalar) {} @@ -49,14 +44,13 @@ public: NodeOps forwardOps() { // C = alpha * dot(op(A), op(B)) - return {NodeOp(Prod( - val_, - child(0)->val(), - child(1)->val(), - transA_, - transB_, - 0.f, - scalar_))}; + return {NodeOp(Prod(val_, + child(0)->val(), + child(1)->val(), + transA_, + transB_, + 0.f, + scalar_))}; } NodeOps backwardOps() { @@ -149,7 +143,7 @@ public: : NaryNodeOp(nodes, newShape(nodes[0], nodes[1], transA, transB)), transA_(transA), transB_(transB), - scalar_(scalar){} + scalar_(scalar) {} Shape newShape(Expr a, Expr b, bool transA, bool transB) { auto shapeA = a->shape(); @@ -171,19 +165,17 @@ public: return outShape; } - NodeOps forwardOps() { using namespace functional; return { - NodeOp(Prod( - val_, - child(0)->val(), - child(1)->val(), - transA_, - transB_, - 0.f, - scalar_); - Add(_1, val_, child(2)->val())) + NodeOp(Prod(val_, + child(0)->val(), + child(1)->val(), + transA_, + transB_, + 0.f, + scalar_); + Add(_1, val_, child(2)->val())) }; } @@ -266,7 +258,6 @@ public: const std::string type() { return "affine"; } }; - class DotBatchedNodeOp : public NaryNodeOp { private: bool transA_; @@ -274,13 +265,8 @@ private: float scalar_; public: - DotBatchedNodeOp(Expr a, - Expr b, - bool transA, - bool transB, - float scalar) - : NaryNodeOp({a, b}, - newShape(a, b, transA, transB)), + DotBatchedNodeOp(Expr a, Expr b, bool transA, bool transB, float scalar) + : NaryNodeOp({a, b}, newShape(a, b, transA, transB)), transA_(transA), transB_(transB), scalar_(scalar) {} @@ -307,14 +293,13 @@ public: NodeOps forwardOps() { // C = alpha * dot(op(A), op(B)) - return {NodeOp(ProdBatched( - val_, - child(0)->val(), - child(1)->val(), - transA_, - transB_, - 0.f, - scalar_))}; + return {NodeOp(ProdBatched(val_, + child(0)->val(), + child(1)->val(), + transA_, + transB_, + 0.f, + scalar_))}; } NodeOps backwardOps() { @@ -325,71 +310,67 @@ public: // to sum gradients from different graph parts if(!transA_ && transB_) - return { - NodeOp(ProdBatched(child(0)->grad(), - adj_, - child(1)->val(), - false, - false, - 1.0, - scalar_)), - NodeOp(ProdBatched(child(1)->grad(), - adj_, - child(0)->val(), - true, - false, - 1.0, - scalar_))}; + return {NodeOp(ProdBatched(child(0)->grad(), + adj_, + child(1)->val(), + false, + false, + 1.0, + scalar_)), + NodeOp(ProdBatched(child(1)->grad(), + adj_, + child(0)->val(), + true, + false, + 1.0, + scalar_))}; if(transA_ && !transB_) - return { - NodeOp(ProdBatched(child(0)->grad(), - child(1)->val(), - adj_, - false, - true, - 1.0, - scalar_)), - NodeOp(ProdBatched(child(1)->grad(), - child(0)->val(), - adj_, - false, - false, - 1.0, - scalar_))}; + return {NodeOp(ProdBatched(child(0)->grad(), + child(1)->val(), + adj_, + false, + true, + 1.0, + scalar_)), + NodeOp(ProdBatched(child(1)->grad(), + child(0)->val(), + adj_, + false, + false, + 1.0, + scalar_))}; if(transA_ && transB_) - return { - NodeOp(ProdBatched(child(0)->grad(), - child(1)->val(), - adj_, - true, - true, - 1.0, - scalar_)), - NodeOp(ProdBatched(child(1)->grad(), - adj_, - child(0)->val(), - true, - true, - 1.0, - scalar_))}; - - return { - NodeOp(ProdBatched(child(0)->grad(), - adj_, - child(1)->val(), - false, - true, - 1.0, - scalar_)), - NodeOp(ProdBatched(child(1)->grad(), - child(0)->val(), - adj_, - true, - false, - 1.0, - scalar_))}; + return {NodeOp(ProdBatched(child(0)->grad(), + child(1)->val(), + adj_, + true, + true, + 1.0, + scalar_)), + NodeOp(ProdBatched(child(1)->grad(), + adj_, + child(0)->val(), + true, + true, + 1.0, + scalar_))}; + + return {NodeOp(ProdBatched(child(0)->grad(), + adj_, + child(1)->val(), + false, + true, + 1.0, + scalar_)), + NodeOp(ProdBatched(child(1)->grad(), + child(0)->val(), + adj_, + true, + false, + 1.0, + scalar_))}; } const std::string type() { return "•"; } @@ -400,8 +381,7 @@ public: struct ScalarProductNodeOp : public NaryNodeOp { template <typename... Args> ScalarProductNodeOp(Expr a, Expr b, Args... args) - : NaryNodeOp({a, b}, newShape(a, b, args...)) { - } + : NaryNodeOp({a, b}, newShape(a, b, args...)) {} template <typename... Args> Shape newShape(Expr a, Expr b, Args... args) { @@ -433,12 +413,9 @@ struct ScalarProductNodeOp : public NaryNodeOp { }; struct ElementBinaryNodeOp : public NaryNodeOp { - ElementBinaryNodeOp(Expr a, Expr b) - : NaryNodeOp({a, b}, newShape(a, b)) {} + ElementBinaryNodeOp(Expr a, Expr b) : NaryNodeOp({a, b}, newShape(a, b)) {} - Shape newShape(Expr a, Expr b) { - return Shape::broadcast({a, b}); - } + Shape newShape(Expr a, Expr b) { return Shape::broadcast({a, b}); } const std::string color() { return "yellow"; } }; @@ -553,8 +530,7 @@ struct DivNodeOp : public ElementBinaryNodeOp { // Cross-entropy node. It computes -b*log(softmax(a)), summing rowwise. struct CrossEntropyNodeOp : public NaryNodeOp { - CrossEntropyNodeOp(Expr a, Expr b) - : NaryNodeOp({a, b}, newShape(a)) {} + CrossEntropyNodeOp(Expr a, Expr b) : NaryNodeOp({a, b}, newShape(a)) {} Shape newShape(Expr a) { Shape shape1 = a->shape(); @@ -578,7 +554,9 @@ struct CrossEntropyNodeOp : public NaryNodeOp { struct ConcatenateNodeOp : public NaryNodeOp { template <typename... Args> ConcatenateNodeOp(const std::vector<Expr>& nodes, Args... args) - : NaryNodeOp(nodes, newShape(nodes, keywords::Get(keywords::axis, 0, args...))) {} + : NaryNodeOp(nodes, + newShape(nodes, keywords::Get(keywords::axis, 0, args...))) { + } Shape newShape(const std::vector<Expr>& nodes, int ax) { Shape shape = nodes.back()->shape(); @@ -730,38 +708,33 @@ struct HighwayNodeOp : public NaryNodeOp { class ConvolutionOp : public NaryNodeOp { public: - ConvolutionOp( - const std::vector<Expr>& nodes, - int hPad = 0, - int wPad = 0, - int hStride = 1, - int wStride = 1) - : NaryNodeOp(nodes), - conv_(nodes[1]->shape(), - nodes[2]->shape(), - hPad, - wPad, - hStride, - wStride) { + ConvolutionOp(const std::vector<Expr>& nodes, + int hPad = 0, + int wPad = 0, + int hStride = 1, + int wStride = 1) + : NaryNodeOp(nodes), + conv_(nodes[1]->shape(), + nodes[2]->shape(), + hPad, + wPad, + hStride, + wStride) { conv_.getOutputShape(nodes[0]->shape(), shape_); } NodeOps forwardOps() { return {NodeOp(conv_.forward( - child(0)->val(), - child(1)->val(), - child(2)->val(), - val_))}; + child(0)->val(), child(1)->val(), child(2)->val(), val_))}; } NodeOps backwardOps() { - return {NodeOp(conv_.backward( - child(0)->val(), - child(0)->grad(), - child(1)->val(), - child(1)->grad(), - child(2)->grad(), - adj_))}; + return {NodeOp(conv_.backward(child(0)->val(), + child(0)->grad(), + child(1)->val(), + child(1)->grad(), + child(2)->grad(), + adj_))}; } const std::string type() { return "layer_convolution"; } @@ -769,5 +742,4 @@ public: protected: ConvolutionWrapper conv_; }; - } diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 0ca2c2a2..8d81a63a 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -12,11 +12,9 @@ namespace marian { struct UnaryNodeOp : public NaryNodeOp { - UnaryNodeOp(Expr a, Shape shape) - : NaryNodeOp({a}, shape) {} + UnaryNodeOp(Expr a, Shape shape) : NaryNodeOp({a}, shape) {} - UnaryNodeOp(Expr a) - : NaryNodeOp({a}, a->shape()) {} + UnaryNodeOp(Expr a) : NaryNodeOp({a}, a->shape()) {} const std::string color() { return "yellow"; } }; @@ -26,9 +24,7 @@ private: float scalar_{0}; public: - ScalarAddNodeOp(Expr a, float scalar) - : UnaryNodeOp(a), - scalar_{scalar} {} + ScalarAddNodeOp(Expr a, float scalar) : UnaryNodeOp(a), scalar_{scalar} {} NodeOps forwardOps() { using namespace functional; @@ -67,8 +63,7 @@ private: float scalar_{0}; public: - ScalarMultNodeOp(Expr a, float scalar) - : UnaryNodeOp(a), scalar_{scalar} {} + ScalarMultNodeOp(Expr a, float scalar) : UnaryNodeOp(a), scalar_{scalar} {} NodeOps forwardOps() { using namespace functional; @@ -210,7 +205,6 @@ struct TanhNodeOp : public NaryNodeOp { const std::string type() { return "tanh"; } }; - struct ReLUNodeOp : public UnaryNodeOp { ReLUNodeOp(Expr a) : UnaryNodeOp(a) {} @@ -262,8 +256,7 @@ struct ReLUNodeOp : public UnaryNodeOp { * \f] */ struct PReLUNodeOp : public UnaryNodeOp { - PReLUNodeOp(float alpha, Expr a) - : UnaryNodeOp(a), alpha_(alpha) {} + PReLUNodeOp(float alpha, Expr a) : UnaryNodeOp(a), alpha_(alpha) {} NodeOps forwardOps() { using namespace functional; @@ -334,11 +327,9 @@ struct SwishNodeOp : public UnaryNodeOp { }; struct SoftmaxNodeOp : public UnaryNodeOp { - SoftmaxNodeOp(Expr a) - : UnaryNodeOp(a), mask_(nullptr) {} + SoftmaxNodeOp(Expr a) : UnaryNodeOp(a), mask_(nullptr) {} - SoftmaxNodeOp(Expr a, Expr mask) - : UnaryNodeOp(a), mask_(mask) {} + SoftmaxNodeOp(Expr a, Expr mask) : UnaryNodeOp(a), mask_(mask) {} Expr mask_; @@ -407,17 +398,18 @@ struct SumNodeOp : public UnaryNodeOp { int ax_; template <typename... Args> - SumNodeOp(Expr a, Args... args) - : UnaryNodeOp(a, newShape(a, args...)) {} + SumNodeOp(Expr a, Args... args) : UnaryNodeOp(a, newShape(a, args...)) {} NodeOps forwardOps() { using namespace functional; - return {NodeOp(Reduce(_1, val_, child(0)->val()))}; } + return {NodeOp(Reduce(_1, val_, child(0)->val()))}; + } NodeOps backwardOps() { using namespace functional; - return {NodeOp(Add(_1, child(0)->grad(), adj_))}; } + return {NodeOp(Add(_1, child(0)->grad(), adj_))}; + } template <class... Args> Shape newShape(Expr a, Args... args) { @@ -456,8 +448,7 @@ struct MeanNodeOp : public UnaryNodeOp { int ax_; template <typename... Args> - MeanNodeOp(Expr a, Args... args) - : UnaryNodeOp(a, newShape(a, args...)) {} + MeanNodeOp(Expr a, Args... args) : UnaryNodeOp(a, newShape(a, args...)) {} NodeOps forwardOps() { using namespace functional; @@ -543,8 +534,7 @@ struct ExpNodeOp : public UnaryNodeOp { struct SqrtNodeOp : public UnaryNodeOp { float epsilon_; - SqrtNodeOp(Expr a, float epsilon) - : UnaryNodeOp(a), epsilon_(epsilon) {} + SqrtNodeOp(Expr a, float epsilon) : UnaryNodeOp(a), epsilon_(epsilon) {} NodeOps forwardOps() { using namespace functional; @@ -614,8 +604,7 @@ struct NegNodeOp : public UnaryNodeOp { struct RowsNodeOp : public UnaryNodeOp { RowsNodeOp(Expr a, const std::vector<size_t>& indeces) - : UnaryNodeOp(a, newShape(a, indeces)), - indices_(indeces) {} + : UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {} NodeOps forwardOps() { // @TODO: solve this with a tensor! @@ -666,8 +655,7 @@ struct RowsNodeOp : public UnaryNodeOp { struct ColsNodeOp : public UnaryNodeOp { ColsNodeOp(Expr a, const std::vector<size_t>& indeces) - : UnaryNodeOp(a, newShape(a, indeces)), - indices_(indeces) {} + : UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {} NodeOps forwardOps() { // @TODO: solve this with a tensor! @@ -716,8 +704,7 @@ struct ColsNodeOp : public UnaryNodeOp { struct SelectNodeOp : public UnaryNodeOp { SelectNodeOp(Expr a, int axis, const std::vector<size_t>& indeces) - : UnaryNodeOp(a, newShape(a, axis, indeces)), - indices_(indeces) {} + : UnaryNodeOp(a, newShape(a, axis, indeces)), indices_(indeces) {} NodeOps forwardOps() { return {NodeOp( @@ -772,8 +759,7 @@ struct TransposeNodeOp : public UnaryNodeOp { std::vector<int> axes_; TransposeNodeOp(Expr a, const std::vector<int>& axes) - : UnaryNodeOp(a, newShape(a, axes)), - axes_{axes} {} + : UnaryNodeOp(a, newShape(a, axes)), axes_{axes} {} NodeOps forwardOps() { return {NodeOp(TransposeND(val_, child(0)->val(), axes_))}; @@ -788,7 +774,7 @@ struct TransposeNodeOp : public UnaryNodeOp { Shape shape = a->shape(); ABORT_IF(shape.size() != axes.size(), - "Shape and transpose axes have different number of dimensions"); + "Shape and transpose axes have different number of dimensions"); for(int i = 0; i < shape.size(); ++i) shape.set(i, a->shape()[axes[i]]); @@ -829,8 +815,7 @@ private: public: template <typename... Args> - ReshapeNodeOp(Expr a, Shape shape) - : UnaryNodeOp(a, shape), reshapee_(a) { + ReshapeNodeOp(Expr a, Shape shape) : UnaryNodeOp(a, shape), reshapee_(a) { Node::destroy_ = false; } @@ -894,9 +879,7 @@ private: public: StepNodeOp(Expr a, int step, int axis) - : UnaryNodeOp(a, newShape(a, axis)), - stepNode_(a), - step_(step) { + : UnaryNodeOp(a, newShape(a, axis)), stepNode_(a), step_(step) { Node::destroy_ = false; } @@ -1056,67 +1039,54 @@ public: padWidth, strideHeight, strideWidth, - mode) { - } + mode) {} NodeOps forwardOps() { return {NodeOp(pooling_.forward(child(0)->val(), val_))}; } NodeOps backwardOps() { - return {NodeOp(pooling_.backward( - child(0)->val(), - child(0)->grad(), - val_, - adj_))}; + return {NodeOp( + pooling_.backward(child(0)->val(), child(0)->grad(), val_, adj_))}; } const std::string type() { return "layer_pooling"; } - protected: PoolingWrapper pooling_; }; class PoolingWithMaskingOp : public UnaryNodeOp { - public: - PoolingWithMaskingOp( Expr x, Expr mask, int width, bool isEven=false) - : UnaryNodeOp(x), - mask_(mask), - width_(width), - isEven_(isEven) - { - auto xShape = x->shape(); - int dimBatch = xShape[0]; - int dimWord = xShape[1]; - int cols = (isEven_) ? xShape[2] - 1 : xShape[2]; - int dimSentence = (cols / width_) + (cols % width_ != 0); - shape_ = {dimBatch, dimWord, dimSentence}; - } +public: + PoolingWithMaskingOp(Expr x, Expr mask, int width, bool isEven = false) + : UnaryNodeOp(x), mask_(mask), width_(width), isEven_(isEven) { + auto xShape = x->shape(); + int dimBatch = xShape[0]; + int dimWord = xShape[1]; + int cols = (isEven_) ? xShape[2] - 1 : xShape[2]; + int dimSentence = (cols / width_) + (cols % width_ != 0); + shape_ = {dimBatch, dimWord, dimSentence}; + } - NodeOps forwardOps() { - return {NodeOp(PoolingWithMaskingForward(val_, + NodeOps forwardOps() { + return {NodeOp(PoolingWithMaskingForward( + val_, child(0)->val(), mask_->val(), width_, isEven_))}; + } + + NodeOps backwardOps() { + return {NodeOp(PoolingWithMaskingBackward(adj_, + child(0)->grad(), child(0)->val(), mask_->val(), width_, isEven_))}; - } - - NodeOps backwardOps() { - return {NodeOp(PoolingWithMaskingBackward(adj_, - child(0)->grad(), - child(0)->val(), - mask_->val(), - width_, - isEven_))}; - } + } - const std::string type() {return "layer_pooling";} + const std::string type() { return "layer_pooling"; } - protected: - Expr mask_; - int width_; - bool isEven_; +protected: + Expr mask_; + int width_; + bool isEven_; }; - } diff --git a/src/layers/constructors.h b/src/layers/constructors.h index 4f76c36c..a95d6253 100644 --- a/src/layers/constructors.h +++ b/src/layers/constructors.h @@ -23,18 +23,18 @@ struct LayerFactory : public Factory { return as<Cast>() != nullptr; } - virtual Ptr<Layer> construct() = 0; + virtual Ptr<Layer> construct() = 0; }; class DenseFactory : public LayerFactory { protected: - //std::vector<std::pair<std::string, std::string>> tiedParams_; + // std::vector<std::pair<std::string, std::string>> tiedParams_; std::vector<std::pair<std::string, std::string>> tiedParamsTransposed_; public: DenseFactory(Ptr<ExpressionGraph> graph) : LayerFactory(graph) {} - //Accumulator<DenseFactory> tie(const std::string& param, + // Accumulator<DenseFactory> tie(const std::string& param, // const std::string& tied) { // tiedParams_.push_back({param, tied}); // return Accumulator<DenseFactory>(*this); @@ -48,21 +48,20 @@ public: Ptr<Layer> construct() { auto dense = New<Dense>(graph_, options_); - //for(auto& p : tiedParams_) + // for(auto& p : tiedParams_) // dense->tie(p.first, p.second); for(auto& p : tiedParamsTransposed_) dense->tie_transposed(p.first, p.second); return dense; } - + DenseFactory clone() { DenseFactory aClone(graph_); aClone.options_->merge(options_); - //aClone.tiedParams_ = tiedParams_; + // aClone.tiedParams_ = tiedParams_; aClone.tiedParamsTransposed_ = tiedParamsTransposed_; return aClone; } - }; typedef Accumulator<DenseFactory> dense; @@ -95,7 +94,6 @@ public: } void push_back(Ptr<Layer> layer) { layers_.push_back(layer); } - }; class MLPFactory : public Factory { @@ -121,7 +119,7 @@ public: layers_.push_back(New<LF>(lf)); return Accumulator<MLPFactory>(*this); } - + MLPFactory clone() { MLPFactory aClone(graph_); aClone.options_->merge(options_); @@ -129,7 +127,6 @@ public: aClone.push_back(lf->clone()); return aClone; } - }; typedef Accumulator<MLPFactory> mlp; diff --git a/src/layers/convolution.cpp b/src/layers/convolution.cpp index 064abedf..eb1b0554 100644 --- a/src/layers/convolution.cpp +++ b/src/layers/convolution.cpp @@ -2,8 +2,7 @@ #include "graph/node_operators_binary.h" namespace marian { -Convolution::Convolution(Ptr<ExpressionGraph> graph) - : Factory(graph) {} +Convolution::Convolution(Ptr<ExpressionGraph> graph) : Factory(graph) {} Expr Convolution::apply(Expr x) { auto prefix = opt<std::string>("prefix"); @@ -13,28 +12,21 @@ Expr Convolution::apply(Expr x) { auto strides = opt<std::pair<int, int>>("strides", std::make_pair(1, 1)); int layerIn = x->shape()[1]; - auto kernel = graph_->param(prefix + "_conv_kernels", - {layerIn, - kernelNum, - kernelDims.first, - kernelDims.second}, - inits::glorot_uniform); + auto kernel + = graph_->param(prefix + "_conv_kernels", + {layerIn, kernelNum, kernelDims.first, kernelDims.second}, + inits::glorot_uniform); - auto bias = graph_->param(prefix + "_conv_bias", - {1, kernelNum, 1, 1}, - inits::zeros); + auto bias = graph_->param( + prefix + "_conv_bias", {1, kernelNum, 1, 1}, inits::zeros); std::vector<Expr> nodes = {x, kernel, bias}; - return Expression<ConvolutionOp>(nodes, - paddings.first, - paddings.second, - strides.first, - strides.second); + return Expression<ConvolutionOp>( + nodes, paddings.first, paddings.second, strides.first, strides.second); } Expr Convolution::apply(const std::vector<Expr>&) { ABORT("Can't apply convolution on many inputs at once"); return nullptr; } - } diff --git a/src/layers/factory.h b/src/layers/factory.h index 32e725c6..62976b68 100644 --- a/src/layers/factory.h +++ b/src/layers/factory.h @@ -33,6 +33,7 @@ public: template <class BaseFactory> class Accumulator : public BaseFactory { typedef BaseFactory Factory; + public: Accumulator() : Factory(nullptr) {} Accumulator(Ptr<ExpressionGraph> graph) : Factory(graph) {} diff --git a/src/layers/generic.h b/src/layers/generic.h index b9c1d100..dbff3a10 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -1,6 +1,7 @@ #pragma once #include "marian.h" + #include "layers/factory.h" namespace marian { @@ -75,11 +76,9 @@ public: if(tiedParams_.count(nameW)) { W = tiedParams_[nameW]; transposeW = true; - } - else { - W = g->param(name + "_" + nameW, - {in->shape()[-1], dim}, - inits::glorot_uniform); + } else { + W = g->param( + name + "_" + nameW, {in->shape()[-1], dim}, inits::glorot_uniform); } Expr b; @@ -87,8 +86,7 @@ public: if(tiedParams_.count(nameB)) b = tiedParams_[nameB]; else - b = g->param( - name + "_" + nameB, {1, dim}, inits::zeros); + b = g->param(name + "_" + nameB, {1, dim}, inits::zeros); params_.push_back(W); params_.push_back(b); @@ -98,19 +96,19 @@ public: auto ln_s = g->param(name + "_ln_s" + std::to_string(i), {1, dim}, inits::from_value(1.f)); - auto ln_b = g->param(name + "_ln_b" + std::to_string(i), - {1, dim}, - inits::zeros); + auto ln_b = g->param( + name + "_ln_b" + std::to_string(i), {1, dim}, inits::zeros); - outputs.push_back( - layer_norm(affine(in, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS)); + outputs.push_back(layer_norm( + affine(in, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS)); } else { auto gamma = g->param(name + "_gamma" + std::to_string(i), {1, dim}, inits::from_value(1.0)); params_.push_back(gamma); - outputs.push_back(layer_norm(dot(in, W, false, transposeW), gamma, b)); + outputs.push_back( + layer_norm(dot(in, W, false, transposeW), gamma, b)); } } else { @@ -147,11 +145,9 @@ public: if(tiedParams_.count(nameW)) { transposeW = true; W = tiedParams_[nameW]; - } - else { - W = g->param(name + "_" + nameW, - {input->shape()[-1], dim}, - inits::glorot_uniform); + } else { + W = g->param( + name + "_" + nameW, {input->shape()[-1], dim}, inits::glorot_uniform); } Expr b; std::string nameB = "b"; @@ -165,16 +161,14 @@ public: Expr out; if(layerNorm) { if(nematusNorm) { - auto ln_s = g->param( - name + "_ln_s", {1, dim}, inits::from_value(1.f)); - auto ln_b - = g->param(name + "_ln_b", {1, dim}, inits::zeros); + auto ln_s = g->param(name + "_ln_s", {1, dim}, inits::from_value(1.f)); + auto ln_b = g->param(name + "_ln_b", {1, dim}, inits::zeros); - out = layer_norm(affine(input, W, b, false, transposeW), - ln_s, ln_b, NEMATUS_LN_EPS); + out = layer_norm( + affine(input, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS); } else { - auto gamma = g->param( - name + "_gamma", {1, dim}, inits::from_value(1.0)); + auto gamma + = g->param(name + "_gamma", {1, dim}, inits::from_value(1.0)); params_.push_back(gamma); out = layer_norm(dot(input, W, false, transposeW), gamma, b); @@ -217,22 +211,18 @@ struct EmbeddingFactory : public Factory { } } - return graph_->param(name, - {dimVoc, dimEmb}, - initFunc, - fixed); + return graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed); } }; typedef Accumulator<EmbeddingFactory> embedding; -static inline -Expr Cost(Expr logits, - Expr indices, - Expr mask, - std::string costType = "cross-entropy", - float smoothing = 0, - Expr weights = nullptr) { +static inline Expr Cost(Expr logits, + Expr indices, + Expr mask, + std::string costType = "cross-entropy", + float smoothing = 0, + Expr weights = nullptr) { using namespace keywords; auto ce = cross_entropy(logits, indices); @@ -255,15 +245,17 @@ Expr Cost(Expr logits, // axes: // - time axis (words): -3 // - batch axis (sentences): -2 - if(costType == "ce-mean" || costType == "cross-entropy") { // sum over words; average over sentences + if(costType == "ce-mean" + || costType + == "cross-entropy") { // sum over words; average over sentences cost = mean(costSum, axis = -2); - } else if(costType == "ce-mean-words") { // average over target tokens + } else if(costType == "ce-mean-words") { // average over target tokens cost = sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2); - } else if(costType == "ce-sum") { // sum over target tokens + } else if(costType == "ce-sum") { // sum over target tokens cost = sum(costSum, axis = -2); - } else if(costType == "perplexity") { // ==exp('ce-mean-words') + } else if(costType == "perplexity") { // ==exp('ce-mean-words') cost = exp(sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2)); - } else if(costType == "ce-rescore") { // sum over words, keep batch axis + } else if(costType == "ce-rescore") { // sum over words, keep batch axis cost = -costSum; } else { // same as ce-mean cost = mean(costSum, axis = -2); diff --git a/src/layers/guided_alignment.h b/src/layers/guided_alignment.h index c353f649..c5dec742 100644 --- a/src/layers/guided_alignment.h +++ b/src/layers/guided_alignment.h @@ -4,20 +4,18 @@ namespace marian { -static inline -Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph, - Ptr<data::CorpusBatch> batch, - Ptr<Options> options, - Expr att) { +static inline Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph, + Ptr<data::CorpusBatch> batch, + Ptr<Options> options, + Expr att) { using namespace keywords; int dimBatch = att->shape()[0]; int dimSrc = att->shape()[2]; int dimTrg = att->shape()[3]; - auto aln = graph->constant( - {dimBatch, 1, dimSrc, dimTrg}, - inits::from_vector(batch->getGuidedAlignment())); + auto aln = graph->constant({dimBatch, 1, dimSrc, dimTrg}, + inits::from_vector(batch->getGuidedAlignment())); std::string guidedCostType = options->get<std::string>("guided-alignment-cost"); diff --git a/src/models/char_s2s.h b/src/models/char_s2s.h index 64d1585f..f0c5de5c 100644 --- a/src/models/char_s2s.h +++ b/src/models/char_s2s.h @@ -2,6 +2,7 @@ #include "marian.h" #include "models/s2s.h" + #include "layers/convolution.h" namespace marian { @@ -29,21 +30,18 @@ public: int dimEmb = opt<int>("dim-emb"); auto convSizes = options_->get<std::vector<int>>("char-conv-filters-num"); - auto convWidths = options_->get<std::vector<int>>("char-conv-filters-widths"); + auto convWidths + = options_->get<std::vector<int>>("char-conv-filters-widths"); int stride = opt<int>("char-stride"); int highwayNum = opt<int>("char-highway"); auto conved = CharConvPooling( - prefix_ + "conv_pooling", - dimEmb, - convWidths, - convSizes, - stride) - (batchEmbeddings, batchMask); + prefix_ + "conv_pooling", dimEmb, convWidths, convSizes, stride)( + batchEmbeddings, batchMask); auto inHighway = conved; - for (int i = 0; i < highwayNum; ++i) { - inHighway = highway(prefix_ +"_" + std::to_string(i), inHighway); + for(int i = 0; i < highwayNum; ++i) { + inHighway = highway(prefix_ + "_" + std::to_string(i), inHighway); } Expr stridedMask = getStridedMask(graph, batch, stride); @@ -52,24 +50,26 @@ public: return New<EncoderState>(context, stridedMask, batch); } + protected: - Expr getStridedMask(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch, + Expr getStridedMask(Ptr<ExpressionGraph> graph, + Ptr<data::CorpusBatch> batch, int stride) { auto subBatch = (*batch)[batchIndex_]; int dimBatch = subBatch->batchSize(); std::vector<float> strided; - for (size_t wordIdx = 0; wordIdx < subBatch->mask().size(); wordIdx += stride * dimBatch) { - for (size_t j = wordIdx; j < wordIdx + dimBatch; ++j) { + for(size_t wordIdx = 0; wordIdx < subBatch->mask().size(); + wordIdx += stride * dimBatch) { + for(size_t j = wordIdx; j < wordIdx + dimBatch; ++j) { strided.push_back(subBatch->mask()[j]); } } int dimWords = strided.size() / dimBatch; - auto stridedMask = graph->constant({dimWords, dimBatch, 1}, - inits::from_vector(strided)); + auto stridedMask + = graph->constant({dimWords, dimBatch, 1}, inits::from_vector(strided)); return stridedMask; } }; - } diff --git a/src/models/encdec.h b/src/models/encdec.h index 39229953..dc38fef1 100644 --- a/src/models/encdec.h +++ b/src/models/encdec.h @@ -1,6 +1,7 @@ #pragma once #include "marian.h" + #include "layers/generic.h" #include "layers/guided_alignment.h" #include "model_base.h" @@ -30,8 +31,8 @@ protected: auto batchEmbeddings = reshape(chosenEmbeddings, {dimWords, dimBatch, dimEmb}); - auto batchMask = graph->constant( - {dimWords, dimBatch, 1}, inits::from_vector(subBatch->mask())); + auto batchMask = graph->constant({dimWords, dimBatch, 1}, + inits::from_vector(subBatch->mask())); return std::make_tuple(batchEmbeddings, batchMask); } @@ -129,7 +130,8 @@ public: virtual void selectEmbeddings(Ptr<ExpressionGraph> graph, Ptr<DecoderState> state, const std::vector<size_t>& embIdx, - int dimBatch, int dimBeam) { + int dimBatch, + int dimBeam) { using namespace keywords; int dimTrgEmb = opt<int>("dim-emb"); @@ -149,12 +151,10 @@ public: Expr selectedEmbs; if(embIdx.empty()) { - selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb}, - inits::zeros); + selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb}, inits::zeros); } else { selectedEmbs = rows(yEmb, embIdx); - selectedEmbs - = reshape(selectedEmbs, {dimBeam, 1, dimBatch, dimTrgEmb}); + selectedEmbs = reshape(selectedEmbs, {dimBeam, 1, dimBatch, dimTrgEmb}); } state->setTargetEmbeddings(selectedEmbs); } @@ -174,14 +174,16 @@ public: virtual void selectEmbeddings(Ptr<ExpressionGraph> graph, Ptr<DecoderState> state, const std::vector<size_t>&, - int dimBatch, int beamSize) + int dimBatch, + int beamSize) = 0; virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph, Ptr<DecoderState>, const std::vector<size_t>&, const std::vector<size_t>&, - int dimBatch, int beamSize) + int dimBatch, + int beamSize) = 0; virtual Ptr<DecoderState> step(Ptr<ExpressionGraph>, Ptr<DecoderState>) = 0; @@ -224,7 +226,8 @@ protected: decoder["mini-batch"] = opt<size_t>("valid-mini-batch"); decoder["maxi-batch"] = opt<size_t>("valid-mini-batch") > 1 ? 100 : 1; - decoder["maxi-batch-sort"] = opt<size_t>("valid-mini-batch") > 1 ? "trg" : "none"; + decoder["maxi-batch-sort"] + = opt<size_t>("valid-mini-batch") > 1 ? "trg" : "none"; decoder["relative-paths"] = false; @@ -320,8 +323,10 @@ public: Ptr<DecoderState> state, const std::vector<size_t>& hypIndices, const std::vector<size_t>& embIndices, - int dimBatch, int beamSize) { - auto selectedState = hypIndices.empty() ? state : state->select(hypIndices, beamSize); + int dimBatch, + int beamSize) { + auto selectedState + = hypIndices.empty() ? state : state->select(hypIndices, beamSize); selectEmbeddings(graph, selectedState, embIndices, dimBatch, beamSize); selectedState->setSingleStep(true); auto nextState = step(graph, selectedState); @@ -332,7 +337,8 @@ public: virtual void selectEmbeddings(Ptr<ExpressionGraph> graph, Ptr<DecoderState> state, const std::vector<size_t>& embIdx, - int dimBatch, int beamSize) { + int dimBatch, + int beamSize) { decoders_[0]->selectEmbeddings(graph, state, embIdx, dimBatch, beamSize); } @@ -366,17 +372,12 @@ public: int dimBatch = batch->size(); int dimWords = sentenceWeighting ? 1 : batch->back()->batchWidth(); - weights = graph->constant( - {1, dimWords, dimBatch, 1}, - inits::from_vector(batch->getDataWeights())); + weights = graph->constant({1, dimWords, dimBatch, 1}, + inits::from_vector(batch->getDataWeights())); } - auto cost = Cost(nextState->getProbs(), - trgIdx, - trgMask, - costType, - ls, - weights); + auto cost + = Cost(nextState->getProbs(), trgIdx, trgMask, costType, ls, weights); if(options_->has("guided-alignment") && !inference_) { auto alignments = decoders_[0]->getAlignments(); @@ -428,7 +429,7 @@ public: do { size_t current = (start + end) / 2; - //std::cerr << i << " " << current << std::endl; + // std::cerr << i << " " << current << std::endl; auto batch = data::CorpusBatch::fakeBatch(lengths, current, options_); build(graph, batch); fits = graph->fits(); @@ -436,8 +437,7 @@ public: if(fits) { stats->add(batch, multiplier); start = current + 1; - } - else { + } else { end = current - 1; } } while(end - start > step); diff --git a/src/models/hardatt.h b/src/models/hardatt.h index 85aa12a2..ff49ab05 100644 --- a/src/models/hardatt.h +++ b/src/models/hardatt.h @@ -1,9 +1,10 @@ #pragma once #include "marian.h" + #include "layers/generic.h" -#include "rnn/types.h" #include "rnn/attention_constructors.h" +#include "rnn/types.h" #include <numeric> @@ -21,13 +22,16 @@ public: : DecoderState(states, probs, encStates), attentionIndices_(attentionIndices) {} - virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, int beamSize) { + virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, + int beamSize) { std::vector<size_t> selectedAttentionIndices; for(auto i : selIdx) selectedAttentionIndices.push_back(attentionIndices_[i]); - return New<DecoderStateHardAtt>( - states_.select(selIdx, beamSize), probs_, encStates_, selectedAttentionIndices); + return New<DecoderStateHardAtt>(states_.select(selIdx, beamSize), + probs_, + encStates_, + selectedAttentionIndices); } virtual void setAttentionIndices( @@ -85,11 +89,11 @@ public: Expr start; if(!meanContexts.empty()) { // apply single layer network to mean to map into decoder space - auto mlp = mlp::mlp(graph) // - .push_back(mlp::dense(graph) // - ("prefix", prefix_ + "_ff_state") // - ("dim", opt<int>("dim-rnn")) // - ("activation", (int)mlp::act::tanh)// + auto mlp = mlp::mlp(graph) // + .push_back(mlp::dense(graph) // + ("prefix", prefix_ + "_ff_state") // + ("dim", opt<int>("dim-rnn")) // + ("activation", (int)mlp::act::tanh) // ("layer-normalization", opt<bool>("layer-normalization"))); start = mlp->apply(meanContexts); @@ -133,7 +137,8 @@ public: int dimBeam = trgEmbeddings->shape()[-4]; if(dropoutTrg) { - trgEmbeddings = dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1}); + trgEmbeddings + = dropout(trgEmbeddings, dropoutTrg, {dimTrgWords, dimBatch, 1}); } auto flatContext = reshape(context, {dimBatch * dimSrcWords, dimContext}); diff --git a/src/models/model_base.h b/src/models/model_base.h index 3f077df7..b000af27 100644 --- a/src/models/model_base.h +++ b/src/models/model_base.h @@ -1,7 +1,7 @@ #pragma once -#include "marian.h" #include <string> +#include "marian.h" namespace marian { namespace models { diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 2cd8b631..ea7f809b 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -1,12 +1,13 @@ #include "marian.h" + #include "models/model_factory.h" -#include "models/s2s.h" -#include "models/transformer.h" -#include "models/hardatt.h" #include "models/amun.h" -#include "models/nematus.h" #include "models/encdec.h" +#include "models/hardatt.h" +#include "models/nematus.h" +#include "models/s2s.h" +#include "models/transformer.h" #ifdef CUDNN #include "models/char_s2s.h" diff --git a/src/models/model_factory.h b/src/models/model_factory.h index 409afe3d..0d3f4c89 100644 --- a/src/models/model_factory.h +++ b/src/models/model_factory.h @@ -1,8 +1,9 @@ #pragma once #include "marian.h" -#include "layers/factory.h" + #include "encdec.h" +#include "layers/factory.h" namespace marian { diff --git a/src/models/s2s.h b/src/models/s2s.h index a9528b28..4b9da7bf 100644 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -1,9 +1,10 @@ #pragma once #include "marian.h" + #include "layers/constructors.h" -#include "rnn/constructors.h" #include "rnn/attention_constructors.h" +#include "rnn/constructors.h" namespace marian { diff --git a/src/models/states.h b/src/models/states.h index f1f3c704..e2635c87 100644 --- a/src/models/states.h +++ b/src/models/states.h @@ -48,8 +48,10 @@ public: virtual Expr getProbs() { return probs_; } virtual void setProbs(Expr probs) { probs_ = probs; } - virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, int beamSize) { - return New<DecoderState>(states_.select(selIdx, beamSize), probs_, encStates_); + virtual Ptr<DecoderState> select(const std::vector<size_t>& selIdx, + int beamSize) { + return New<DecoderState>( + states_.select(selIdx, beamSize), probs_, encStates_); } virtual const rnn::States& getStates() { return states_; } diff --git a/src/models/transformer.h b/src/models/transformer.h index 02478caa..ac20228c 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -1,11 +1,12 @@ #pragma once #include "marian.h" -#include "layers/factory.h" + +#include "encdec.h" #include "layers/constructors.h" +#include "layers/factory.h" #include "model_base.h" #include "model_factory.h" -#include "encdec.h" namespace marian { @@ -34,8 +35,8 @@ public: } // shared across batch entries - auto signal = graph->constant({dimWords, 1, dimEmb}, - inits::from_vector(vPos)); + auto signal + = graph->constant({dimWords, 1, dimEmb}, inits::from_vector(vPos)); return input + signal; } @@ -47,15 +48,14 @@ public: for(int i = 0; i < length; ++i) for(int j = 0; j <= i; ++j) vMask[i * length + j] = 1.f; - return graph->constant({1, length, length}, - inits::from_vector(vMask)); + return graph->constant({1, length, length}, inits::from_vector(vMask)); } Expr InverseMask(Expr mask) { // convert 0/1 mask to transformer style -inf mask auto ms = mask->shape(); mask = (1 - mask) * -99999999.f; - return reshape(mask, {ms[-3], 1, ms[-2], ms[-1]}) ; + return reshape(mask, {ms[-3], 1, ms[-2], ms[-1]}); } Expr SplitHeads(Expr input, int dimHeads) { @@ -135,18 +135,17 @@ public: if(op == 'h') { auto Wh = graph->param( prefix + "_Wh", {dimModel, dimModel}, inits::glorot_uniform); - auto bh - = graph->param(prefix + "_bh", {1, dimModel}, inits::zeros); + auto bh = graph->param(prefix + "_bh", {1, dimModel}, inits::zeros); auto t = affine(prevInput, Wh, bh); output = highway(output, prevInput, t); } // layer normalization if(op == 'n') { - auto scale = graph->param( - prefix + "_ln_scale", {1, dimModel}, inits::ones); - auto bias = graph->param( - prefix + "_ln_bias", {1, dimModel}, inits::zeros); + auto scale + = graph->param(prefix + "_ln_scale", {1, dimModel}, inits::ones); + auto bias + = graph->param(prefix + "_ln_bias", {1, dimModel}, inits::zeros); output = layer_norm(output, scale, bias, 1e-6); } } @@ -219,17 +218,13 @@ public: if(i > 0) prefixProj += "_enc" + std::to_string(i + 1); - auto Wk = graph->param(prefixProj + "_Wk", - {dimModel, dimModel}, - inits::glorot_uniform); - auto bk = graph->param( - prefixProj + "_bk", {1, dimModel}, inits::zeros); + auto Wk = graph->param( + prefixProj + "_Wk", {dimModel, dimModel}, inits::glorot_uniform); + auto bk = graph->param(prefixProj + "_bk", {1, dimModel}, inits::zeros); - auto Wv = graph->param(prefixProj + "_Wv", - {dimModel, dimModel}, - inits::glorot_uniform); - auto bv = graph->param( - prefixProj + "_bv", {1, dimModel}, inits::zeros); + auto Wv = graph->param( + prefixProj + "_Wv", {dimModel, dimModel}, inits::glorot_uniform); + auto bv = graph->param(prefixProj + "_bv", {1, dimModel}, inits::zeros); auto kh = affine(keys[i], Wk, bk); auto vh = affine(values[i], Wv, bv); @@ -254,8 +249,8 @@ public: int dimAtt = output->shape()[-1]; - auto Wo = graph->param( - prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform); + auto Wo + = graph->param(prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform); auto bo = graph->param(prefix + "_bo", {1, dimOut}, inits::zeros); output = affine(output, Wo, bo); @@ -468,11 +463,12 @@ public: std::vector<Ptr<EncoderState>> &encStates) : DecoderState(states, probs, encStates) {} - virtual Ptr<DecoderState> select(const std::vector<size_t> &selIdx, int beamSize) { + virtual Ptr<DecoderState> select(const std::vector<size_t> &selIdx, + int beamSize) { rnn::States selectedStates; int dimDepth = states_[0].output->shape()[-1]; - int dimTime = states_[0].output->shape()[-2]; + int dimTime = states_[0].output->shape()[-2]; int dimBatch = selIdx.size() / beamSize; std::vector<size_t> selIdx2; @@ -553,7 +549,7 @@ public: decoderMask = reshape(TransposeTimeBatch(decoderMask), {1, dimBatch, 1, dimTrgWords}); selfMask = selfMask * decoderMask; - //if(dimBeam > 1) + // if(dimBeam > 1) // selfMask = repeat(selfMask, dimBeam, axis = -4); } @@ -586,7 +582,8 @@ public: for(int i = 1; i <= opt<int>("dec-depth"); ++i) { auto values = query; if(prevDecoderStates.size() > 0) - values = concatenate({prevDecoderStates[i - 1].output, query}, axis = -2); + values + = concatenate({prevDecoderStates[i - 1].output, query}, axis = -2); decoderStates.push_back({values, nullptr}); diff --git a/src/optimizers/clippers.cpp b/src/optimizers/clippers.cpp index ea2a9b30..0ed21f91 100644 --- a/src/optimizers/clippers.cpp +++ b/src/optimizers/clippers.cpp @@ -15,5 +15,4 @@ void Norm::clip(Tensor t) { if(l2Norm >= c_) Element(_1 = (c_ / l2Norm) * _1, t); } - } diff --git a/src/optimizers/optimizers.cpp b/src/optimizers/optimizers.cpp index 3841cbcc..d532d2c9 100644 --- a/src/optimizers/optimizers.cpp +++ b/src/optimizers/optimizers.cpp @@ -58,7 +58,8 @@ void Adagrad::load(const std::string& name, // extract data into vectors if(name == "adagrad_gt") { vGt.resize(totalSize); - std::copy((float*)np->data(), (float*)np->data() + totalSize, vGt.begin()); + std::copy( + (float*)np->data(), (float*)np->data() + totalSize, vGt.begin()); } } @@ -178,11 +179,13 @@ void Adam::load(const std::string& name, // extract data into vectors if(name == "adam_mt") { vMt.resize(totalSize); - std::copy((float*)np->data(), (float*)np->data() + totalSize, vMt.begin()); + std::copy( + (float*)np->data(), (float*)np->data() + totalSize, vMt.begin()); } if(name == "adam_vt") { vVt.resize(totalSize); - std::copy((float*)np->data(), (float*)np->data() + totalSize, vVt.begin()); + std::copy( + (float*)np->data(), (float*)np->data() + totalSize, vVt.begin()); } } diff --git a/src/optimizers/optimizers.h b/src/optimizers/optimizers.h index ea51c625..41fe2404 100644 --- a/src/optimizers/optimizers.h +++ b/src/optimizers/optimizers.h @@ -1,8 +1,8 @@ #pragma once +#include <algorithm> #include <map> #include <memory> -#include <algorithm> #include "common/config.h" #include "graph/expression_graph.h" diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h index 5ba409f3..8e9cd95e 100644 --- a/src/rescorer/rescorer.h +++ b/src/rescorer/rescorer.h @@ -42,9 +42,11 @@ private: public: Rescore(Ptr<Config> options) : options_(options), - corpus_(options_->get<bool>("n-best") ? - std::static_pointer_cast<CorpusBase>(New<CorpusNBest>(options_)) : - std::static_pointer_cast<CorpusBase>(New<Corpus>(options_))) { + corpus_( + options_->get<bool>("n-best") + ? std::static_pointer_cast<CorpusBase>( + New<CorpusNBest>(options_)) + : std::static_pointer_cast<CorpusBase>(New<Corpus>(options_))) { corpus_->prepare(); auto devices = options_->getDevices(); @@ -66,12 +68,12 @@ public: models_.resize(graphs_.size()); ThreadPool pool(graphs_.size(), graphs_.size()); for(int i = 0; i < graphs_.size(); ++i) { - - pool.enqueue([=](int j) { - models_[j] = New<Model>(temp); - models_[j]->load(graphs_[j], modelFile); - }, i); - + pool.enqueue( + [=](int j) { + models_[j] = New<Model>(temp); + models_[j]->load(graphs_[j], modelFile); + }, + i); } } @@ -81,9 +83,10 @@ public: auto batchGenerator = New<BatchGenerator<CorpusBase>>(corpus_, options_); batchGenerator->prepare(false); - Ptr<ScoreCollector> output = options_->get<bool>("n-best") ? - std::static_pointer_cast<ScoreCollector>(New<ScoreCollectorNBest>(options_)) : - New<ScoreCollector>(); + Ptr<ScoreCollector> output = options_->get<bool>("n-best") + ? std::static_pointer_cast<ScoreCollector>( + New<ScoreCollectorNBest>(options_)) + : New<ScoreCollector>(); bool summarize = options_->has("summary"); std::string summary diff --git a/src/rescorer/score_collector.h b/src/rescorer/score_collector.h index e828f18f..3144e3fb 100644 --- a/src/rescorer/score_collector.h +++ b/src/rescorer/score_collector.h @@ -13,8 +13,7 @@ namespace marian { class ScoreCollector { public: - ScoreCollector() - : nextId_(0), outStrm_(new OutputFileStream(std::cout)) {}; + ScoreCollector() : nextId_(0), outStrm_(new OutputFileStream(std::cout)){}; virtual void Write(long id, const std::string& message) { boost::mutex::scoped_lock lock(mutex_); @@ -52,9 +51,7 @@ public: } } - virtual void Write(long id, float value) { - Write(id, std::to_string(value)); - } + virtual void Write(long id, float value) { Write(id, std::to_string(value)); } protected: long nextId_{0}; @@ -86,7 +83,9 @@ public: ScoreCollectorNBest(const ScoreCollectorNBest&) = delete; - std::string addToNBest(const std::string nbest, const std::string feature, float score) { + std::string addToNBest(const std::string nbest, + const std::string feature, + float score) { std::vector<std::string> fields; Split(nbest, fields, "|||"); std::stringstream ss; @@ -96,13 +95,15 @@ public: } virtual void Write(long id, float score) { - std::string line; { boost::mutex::scoped_lock lock(mutex_); auto iter = buffer_.find(id); if(iter == buffer_.end()) { - ABORT_IF(lastRead_ >= id, "Entry {} < {} already read but not in buffer", id, lastRead_); + ABORT_IF(lastRead_ >= id, + "Entry {} < {} already read but not in buffer", + id, + lastRead_); std::string line; while(lastRead_ < id && std::getline((std::istream&)*file_, line)) { lastRead_++; @@ -116,6 +117,5 @@ public: ScoreCollector::Write(id, addToNBest(line, fname_, score)); } - }; } diff --git a/src/rnn/attention_constructors.h b/src/rnn/attention_constructors.h index 30e93481..046e8ce8 100644 --- a/src/rnn/attention_constructors.h +++ b/src/rnn/attention_constructors.h @@ -1,10 +1,11 @@ #pragma once #include "marian.h" + #include "layers/factory.h" -#include "rnn/types.h" -#include "rnn/constructors.h" #include "rnn/attention.h" +#include "rnn/constructors.h" +#include "rnn/types.h" namespace marian { namespace rnn { diff --git a/src/rnn/cells.cpp b/src/rnn/cells.cpp index 8b38780f..26cd3e75 100644 --- a/src/rnn/cells.cpp +++ b/src/rnn/cells.cpp @@ -52,8 +52,7 @@ Expr gruOps(const std::vector<Expr>& nodes, bool final) { /******************************************************************************/ struct LSTMCellNodeOp : public NaryNodeOp { - LSTMCellNodeOp(const std::vector<Expr>& nodes) - : NaryNodeOp(nodes) {} + LSTMCellNodeOp(const std::vector<Expr>& nodes) : NaryNodeOp(nodes) {} NodeOps forwardOps() { std::vector<Tensor> inputs; @@ -89,8 +88,7 @@ struct LSTMCellNodeOp : public NaryNodeOp { }; struct LSTMOutputNodeOp : public NaryNodeOp { - LSTMOutputNodeOp(const std::vector<Expr>& nodes) - : NaryNodeOp(nodes) {} + LSTMOutputNodeOp(const std::vector<Expr>& nodes) : NaryNodeOp(nodes) {} NodeOps forwardOps() { std::vector<Tensor> inputs; diff --git a/src/rnn/cells.h b/src/rnn/cells.h index 5750bda4..bcfa4a90 100644 --- a/src/rnn/cells.h +++ b/src/rnn/cells.h @@ -43,8 +43,7 @@ public: {dimInput, dimState}, inits::glorot_uniform); - b_ = graph->param( - prefix + "_b", {1, dimState}, inits::zeros); + b_ = graph->param(prefix + "_b", {1, dimState}, inits::zeros); if(dropout_ > 0.0f) { if(dimInput) @@ -520,8 +519,7 @@ public: {dimInput, 4 * dimState}, inits::glorot_uniform); - b_ = graph->param( - prefix + "_b", {1, 4 * dimState}, inits::zeros); + b_ = graph->param(prefix + "_b", {1, 4 * dimState}, inits::zeros); if(dropout_ > 0.0f) { if(dimInput) diff --git a/src/rnn/constructors.h b/src/rnn/constructors.h index 41879892..638464c8 100644 --- a/src/rnn/constructors.h +++ b/src/rnn/constructors.h @@ -1,7 +1,7 @@ #pragma once -#include "marian.h" #include "layers/factory.h" +#include "marian.h" #include "rnn/rnn.h" namespace marian { diff --git a/src/rnn/rnn.h b/src/rnn/rnn.h index 4f1be340..6664b48f 100644 --- a/src/rnn/rnn.h +++ b/src/rnn/rnn.h @@ -1,9 +1,9 @@ #pragma once -#include "marian.h" #include "layers/generic.h" -#include "rnn/types.h" +#include "marian.h" #include "rnn/cells.h" +#include "rnn/types.h" #include <algorithm> #include <chrono> @@ -74,10 +74,9 @@ private: j = timeSteps - i - 1; std::vector<Expr> steps(xWs.size()); - std::transform(xWs.begin(), - xWs.end(), - steps.begin(), - [j](Expr e) { return step(e, j, -3); }); + std::transform(xWs.begin(), xWs.end(), steps.begin(), [j](Expr e) { + return step(e, j, -3); + }); if(mask) state = cell_->applyState(steps, state, step(mask, j, -3)); diff --git a/src/tensors/allocator.h b/src/tensors/allocator.h index 6781afa7..df23fd06 100644 --- a/src/tensors/allocator.h +++ b/src/tensors/allocator.h @@ -9,8 +9,8 @@ #include <vector> #include "common/definitions.h" -#include "tensors/memory_piece.h" #include "tensors/device.h" +#include "tensors/memory_piece.h" namespace marian { @@ -92,8 +92,8 @@ private: gaps_.swap(oldGaps); for(auto gap : oldGaps) - gaps_.insert( - Gap(device_->data() + std::distance(oldData, gap.data()), gap.size())); + gaps_.insert(Gap(device_->data() + std::distance(oldData, gap.data()), + gap.size())); insertGap(Gap(device_->data() + oldSize, add)); std::unordered_map<uint8_t*, Ptr<MemoryPiece>> oldAllocated; @@ -141,7 +141,10 @@ private: } public: - Allocator(DeviceId deviceId, size_t bytes, size_t step, size_t alignment = 256) + Allocator(DeviceId deviceId, + size_t bytes, + size_t step, + size_t alignment = 256) : device_(DispatchDevice(deviceId, alignment)), step_(step), available_(0), diff --git a/src/tensors/backend.cpp b/src/tensors/backend.cpp index 05a70b18..e0165cbb 100644 --- a/src/tensors/backend.cpp +++ b/src/tensors/backend.cpp @@ -16,5 +16,4 @@ Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed) { #endif return New<cpu::Backend>(deviceId, seed); } - } diff --git a/src/tensors/backend.h b/src/tensors/backend.h index c69c3c09..3cd51ce1 100644 --- a/src/tensors/backend.h +++ b/src/tensors/backend.h @@ -10,8 +10,7 @@ protected: size_t seed_; public: - Backend(DeviceId deviceId, size_t seed) - : deviceId_(deviceId), seed_(seed) {} + Backend(DeviceId deviceId, size_t seed) : deviceId_(deviceId), seed_(seed) {} virtual DeviceId getDevice() { return deviceId_; }; virtual void setDevice() = 0; @@ -19,5 +18,4 @@ public: }; Ptr<Backend> BackendByDevice(DeviceId deviceId, size_t seed); - } diff --git a/src/tensors/cpu/add.h b/src/tensors/cpu/add.h index f3c93294..228edf89 100644 --- a/src/tensors/cpu/add.h +++ b/src/tensors/cpu/add.h @@ -5,24 +5,22 @@ #pragma once -#include "tensors/tensor.h" #include "functional/functional.h" #include "functional/shape.h" -#include "functional/tmp.h" #include "functional/tensor.h" +#include "functional/tmp.h" +#include "tensors/tensor.h" namespace marian { namespace cpu { - template <size_t K, class Functor> void gAddGeneric(Functor functor, const functional::Shape full, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, float scale = 1.0) { - int outLength = out.shape().elements(); bool same = outLength == full.elements(); for(int i = 0; i < K; ++i) @@ -73,7 +71,6 @@ void gAddReduce(Functor functor, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, float scale = 1.0) { - int rows = full.elements() / full.back(); int cols = full.back(); @@ -100,12 +97,8 @@ void gAddReduce(Functor functor, } } -template <class Functor, class ...Tensors> -void Add(Functor functor, - float scale, - marian::Tensor out, - Tensors... tensors) { - +template <class Functor, class... Tensors> +void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { auto full = marian::Shape::broadcast({out, tensors...}); int length = out->shape().elements(); @@ -113,7 +106,7 @@ void Add(Functor functor, constexpr size_t K = sizeof...(Tensors); functional::Tensor<float> gOut = out; - functional::Array<functional::Tensor<float>, K> gIns = {tensors ...}; + functional::Array<functional::Tensor<float>, K> gIns = {tensors...}; if(full.back() != 1 && out->shape().back() == 1) { size_t m = full.elements() / length; @@ -128,8 +121,5 @@ void Add(Functor functor, cpu::gAddGeneric(functor, full, gOut, gIns, scale); } } - - } - } diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 0c667820..9b55ad83 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -15,17 +15,13 @@ private: public: Backend(DeviceId deviceId, size_t seed) - : marian::Backend(deviceId, seed), - gen_(seed_) {} + : marian::Backend(deviceId, seed), gen_(seed_) {} - void setDevice() { } + void setDevice() {} void synchronize() {} - std::default_random_engine& getRandomGenerator() { - return gen_; - } + std::default_random_engine& getRandomGenerator() { return gen_; } }; - } } diff --git a/src/tensors/cpu/device.cpp b/src/tensors/cpu/device.cpp index 95f09815..985b00f5 100644 --- a/src/tensors/cpu/device.cpp +++ b/src/tensors/cpu/device.cpp @@ -1,31 +1,31 @@ -#include <iostream> #include "tensors/device.h" +#include <iostream> #include <stdlib.h> namespace marian { namespace cpu { - Device::~Device() { - free(data_); - data_ = nullptr; - size_ = 0; - } +Device::~Device() { + free(data_); + data_ = nullptr; + size_ = 0; +} - void Device::reserve(size_t size) { - size = align(size); - ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0"); +void Device::reserve(size_t size) { + size = align(size); + ABORT_IF(size < size_ || size == 0, + "New size must be larger than old size and larger than 0"); - if(data_) { - uint8_t *temp = static_cast<uint8_t*>(aligned_alloc(alignment_, size)); - std::copy(data_, data_ + size_, temp); - free(data_); - data_ = temp; - } else { - data_ = static_cast<uint8_t*>(aligned_alloc(alignment_, size)); - } - size_ = size; + if(data_) { + uint8_t *temp = static_cast<uint8_t *>(aligned_alloc(alignment_, size)); + std::copy(data_, data_ + size_, temp); + free(data_); + data_ = temp; + } else { + data_ = static_cast<uint8_t *>(aligned_alloc(alignment_, size)); } - + size_ = size; +} } } diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp index 6187cf3d..478eefc9 100644 --- a/src/tensors/cpu/dropout.cpp +++ b/src/tensors/cpu/dropout.cpp @@ -1,19 +1,20 @@ #include <algorithm> #include <random> -#include "tensors/tensor_operators.h" #include "tensors/cpu/backend.h" +#include "tensors/tensor_operators.h" namespace marian { - namespace cpu { - - void Dropout(Tensor tensor, float p) { - auto cpuBackend = std::static_pointer_cast<cpu::Backend>(tensor->getBackend()); - auto &gen = cpuBackend->getRandomGenerator(); - std::bernoulli_distribution dist(1.f - p); - std::generate(tensor->data(), tensor->data() + tensor->size(), - [&]() { return dist(gen) / (1.f - p); }); - } +namespace cpu { - } +void Dropout(Tensor tensor, float p) { + auto cpuBackend + = std::static_pointer_cast<cpu::Backend>(tensor->getBackend()); + auto &gen = cpuBackend->getRandomGenerator(); + std::bernoulli_distribution dist(1.f - p); + std::generate(tensor->data(), tensor->data() + tensor->size(), [&]() { + return dist(gen) / (1.f - p); + }); +} +} } diff --git a/src/tensors/cpu/element.h b/src/tensors/cpu/element.h index 1f38bf00..210b9a6c 100644 --- a/src/tensors/cpu/element.h +++ b/src/tensors/cpu/element.h @@ -13,12 +13,11 @@ namespace cpu { template <size_t K, bool broadcast, class Functor> void gElement(Functor functor, functional::Array<functional::Tensor<float>, K> tensors) { - int length = tensors[0].shape().elements(); functional::Array<int, functional::Shape::size()> dims; functional::Array<int, K> indices; - #pragma omp parallel for simd +#pragma omp parallel for simd for(int index = 0; index < length; ++index) { indices.fill(index); if(broadcast) { @@ -30,8 +29,8 @@ void gElement(Functor functor, } } -template <class Functor, class ...Tensors> -void Element(Functor functor, marian::Tensor out, Tensors ...tensors) { +template <class Functor, class... Tensors> +void Element(Functor functor, marian::Tensor out, Tensors... tensors) { constexpr size_t K = sizeof...(tensors) + 1; functional::Array<functional::Tensor<float>, K> gTensors = {out, tensors...}; @@ -46,6 +45,5 @@ void Element(Functor functor, marian::Tensor out, Tensors ...tensors) { else cpu::gElement<K, false>(functor, gTensors); } - } } diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp index 927d8048..5766e640 100644 --- a/src/tensors/cpu/prod.cpp +++ b/src/tensors/cpu/prod.cpp @@ -1,15 +1,15 @@ -/* All or part of this file was contributed by Intel under license: - * Copyright (C) 2017-2018 Intel Corporation - * SPDX-License-Identifier: MIT - */ +/* All or part of this file was contributed by Intel under license: + * Copyright (C) 2017-2018 Intel Corporation + * SPDX-License-Identifier: MIT + */ -#include "tensors/tensor.h" #include "tensors/cpu/backend.h" +#include "tensors/tensor.h" #if MKL_FOUND #include <mkl.h> #else -#if BLAS_FOUND +#if BLAS_FOUND #include <cblas.h> #endif #endif @@ -25,7 +25,6 @@ void Prod(marian::Tensor C, bool transB, float beta, float scalar) { - #if BLAS_FOUND float alpha = scalar; @@ -46,19 +45,20 @@ void Prod(marian::Tensor C, if(transB) ldc = B->shape().elements() / B->shape()[-1]; - cblas_sgemm( - CblasColMajor, - transB ? CblasTrans : CblasNoTrans, - transA ? CblasTrans : CblasNoTrans, - n, m, k, - alpha, - B->data(), - ldb, - A->data(), - lda, - beta, - C->data(), - ldc); + cblas_sgemm(CblasColMajor, + transB ? CblasTrans : CblasNoTrans, + transA ? CblasTrans : CblasNoTrans, + n, + m, + k, + alpha, + B->data(), + ldb, + A->data(), + lda, + beta, + C->data(), + ldc); #else ABORT("Not implemented!"); #endif @@ -73,7 +73,7 @@ void ProdBatched(marian::Tensor C, float scalar) { #if BLAS_FOUND float alpha = scalar; - + size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]); size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]); @@ -95,33 +95,34 @@ void ProdBatched(marian::Tensor C, ldc = B->shape()[-2]; auto opA = transA ? CblasTrans : CblasNoTrans; - auto opB = transB ? CblasTrans : CblasNoTrans; - + auto opB = transB ? CblasTrans : CblasNoTrans; + auto strideB = batchB == 1 ? 0 : n * k; auto strideA = batchA == 1 ? 0 : m * k; auto strideC = n * m; - + int steps = std::max(batchA, batchB); - + int offsetA = 0; int offsetB = 0; int offsetC = 0; - + for(int i = 0; i < steps; ++i) { - cblas_sgemm( - CblasColMajor, - opB, - opA, - n, m, k, - alpha, - B->data() + offsetB, - ldb, - A->data() + offsetA, - lda, - beta, - C->data() + offsetC, - ldc); - + cblas_sgemm(CblasColMajor, + opB, + opA, + n, + m, + k, + alpha, + B->data() + offsetB, + ldb, + A->data() + offsetA, + lda, + beta, + C->data() + offsetC, + ldc); + offsetA += strideA; offsetB += strideB; offsetC += strideC; @@ -130,6 +131,5 @@ void ProdBatched(marian::Tensor C, ABORT("Not implemented!"); #endif } - } } diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 087ad022..14097085 100644 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -44,13 +44,13 @@ void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) { } inline void gInsertCols(float* out, - const float* in, - size_t rows, - size_t cols, - size_t cols_out, - size_t cols_in, - size_t offset_out, - size_t offset_in) { + const float* in, + size_t rows, + size_t cols, + size_t cols_out, + size_t cols_in, + size_t offset_out, + size_t offset_in) { for(int j = 0; j < rows; ++j) { float* rowOut = out + j * cols_out + offset_out; const float* rowIn = in + j * cols_in + offset_in; @@ -68,9 +68,10 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) { for(auto in : inputs) { ABORT_IF(rows != in->shape().elements() / in->shape().back(), - "First dimension must be equal"); + "First dimension must be equal"); int cols_in = in->shape().back(); - cpu::gInsertCols(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0); + cpu::gInsertCols( + out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0); offset += cols_in; } } @@ -88,11 +89,10 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) { int cols_in = in->shape().back(); for(auto out : outputs) { ABORT_IF(rows != out->shape().elements() / out->shape().back(), - "First dimension must be equal"); + "First dimension must be equal"); int cols_out = out->shape().back(); - cpu::gInsertCols(out->data(), in->data(), - rows, cols_out, cols_out, cols_in, - 0, offset); + cpu::gInsertCols( + out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset); offset += cols_out; } } @@ -158,24 +158,24 @@ void Softmax(Tensor out_, Tensor in_, Tensor mask_) { int rows = out_->shape().elements() / out_->shape().back(); int cols = out_->shape().back(); - for (int j = 0; j < rows; ++j) { - float* so = out + j*cols; - const float* sp = in + j*cols; - const float* mp = mask ? mask + j*cols : nullptr; + for(int j = 0; j < rows; ++j) { + float* so = out + j * cols; + const float* sp = in + j * cols; + const float* mp = mask ? mask + j * cols : nullptr; float max = sp[0]; - for (int i = 1; i < cols; ++i) { + for(int i = 1; i < cols; ++i) { max = std::max(max, sp[i]); } float sum = 0.f; - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { float ex = !mask || mp[i] ? std::exp(sp[i] - max) : 0.f; so[i] = ex; sum += ex; } - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { so[i] /= sum; } } @@ -188,24 +188,24 @@ void LogSoftmax(Tensor out_, Tensor in_) { int rows = out_->shape().elements() / out_->shape().back(); int cols = out_->shape().back(); - for (int j = 0; j < rows; ++j) { + for(int j = 0; j < rows; ++j) { float* so = out + j * cols; - const float* sp = in + j*cols; + const float* sp = in + j * cols; float max = sp[0]; - for (int i = 1; i < cols; ++i) { + for(int i = 1; i < cols; ++i) { max = std::max(max, sp[i]); } float sum = 0.f; - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { float sm = sp[i] - max; float ex = std::exp(sm); so[i] = sm; sum += ex; } - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { so[i] -= std::log(sum); } } @@ -219,17 +219,17 @@ void SoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) { const float* adj = adj_->data(); const float* val = val_->data(); - for (size_t j = 0; j < rows; ++j) { - float* gradRow = grad + j*cols; - const float* adjRow = adj + j*cols; - const float* valRow = val + j*cols; + for(size_t j = 0; j < rows; ++j) { + float* gradRow = grad + j * cols; + const float* adjRow = adj + j * cols; + const float* valRow = val + j * cols; float sum = 0.f; - for (size_t i = 0; i < cols; ++i) { + for(size_t i = 0; i < cols; ++i) { sum += valRow[i] * adjRow[i]; } - for (size_t i = 0; i < cols; ++i) { + for(size_t i = 0; i < cols; ++i) { gradRow[i] += valRow[i] * (adjRow[i] - sum); } } @@ -243,62 +243,68 @@ void LogSoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) { const float* adj = adj_->data(); const float* val = val_->data(); - for (int j = 0; j < rows; ++j) { - float* gradRow = grad + j*cols; - const float* adjRow = adj + j*cols; - const float* valRow = val + j*cols; + for(int j = 0; j < rows; ++j) { + float* gradRow = grad + j * cols; + const float* adjRow = adj + j * cols; + const float* valRow = val + j * cols; float sum = 0.f; - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { sum += adjRow[i]; } - for (int i = 0; i < cols; ++i) { - gradRow[i] += adjRow[i] - sum*std::exp(valRow[i]); + for(int i = 0; i < cols; ++i) { + gradRow[i] += adjRow[i] - sum * std::exp(valRow[i]); } } } -void CopyRows(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) { +void CopyRows(Tensor out_, + const Tensor in_, + const std::vector<size_t>& indices) { size_t cols = in_->shape()[1]; size_t rows = indices.size(); float* out = out_->data(); const float* in = in_->data(); - #pragma omp parallel for - for (int j = 0; j < rows; ++j) { +#pragma omp parallel for + for(int j = 0; j < rows; ++j) { size_t dst = j; size_t src = indices[j]; - float* rowOut = out + dst*cols; - const float* rowIn = in + src*cols; + float* rowOut = out + dst * cols; + const float* rowIn = in + src * cols; std::copy(rowIn, rowIn + cols, rowOut); } } -void PasteRows(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) { +void PasteRows(Tensor out_, + const Tensor in_, + const std::vector<size_t>& indices) { size_t cols = in_->shape()[-1]; size_t rows = indices.size(); float* out = out_->data(); const float* in = in_->data(); - for (int j = 0; j < rows; ++j) { - size_t dst = indices[j]; // not a permutation - may alias, unlike PasteCols + for(int j = 0; j < rows; ++j) { + size_t dst = indices[j]; // not a permutation - may alias, unlike PasteCols size_t src = j; - float* rowOut = out + dst*cols; - const float* rowIn = in + src*cols; + float* rowOut = out + dst * cols; + const float* rowIn = in + src * cols; - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { rowOut[i] += rowIn[i]; } } } -void CopyCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) { +void CopyCols(Tensor out_, + const Tensor in_, + const std::vector<size_t>& indices) { size_t rows = in_->shape().elements() / in_->shape()[-1]; size_t colsIn = in_->shape()[-1]; size_t colsOut = indices.size(); @@ -306,18 +312,20 @@ void CopyCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) float* out = out_->data(); const float* in = in_->data(); - #pragma omp parallel for - for (int j = 0; j < rows; ++j) { - const float* rowIn = in + j*colsIn; - float* rowOut = out + j*colsOut; +#pragma omp parallel for + for(int j = 0; j < rows; ++j) { + const float* rowIn = in + j * colsIn; + float* rowOut = out + j * colsOut; - for (int i = 0; i < colsOut; ++i) { + for(int i = 0; i < colsOut; ++i) { rowOut[i] = rowIn[indices[i]]; } } } -void PasteCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices) { +void PasteCols(Tensor out_, + const Tensor in_, + const std::vector<size_t>& indices) { size_t rows = out_->shape().elements() / out_->shape()[-1]; size_t colsOut = out_->shape()[-1]; size_t colsIn = indices.size(); @@ -328,12 +336,12 @@ void PasteCols(Tensor out_, const Tensor in_, const std::vector<size_t>& indices /* n.b. Unlike PasteRows, currently appears safe to assume indices[i] is a * permutation i.e. no racy aliases, and no need to sum vs. just assign. */ - for (int j = 0; j < rows; ++j) { - const float* rowIn = in + j*colsIn; - float* rowOut = out + j*colsOut; + for(int j = 0; j < rows; ++j) { + const float* rowIn = in + j * colsIn; + float* rowOut = out + j * colsOut; // @TODO: should this be a sum? - for (int i = 0; i < colsIn; ++i) { + for(int i = 0; i < colsIn; ++i) { rowOut[indices[i]] = rowIn[i]; } } @@ -367,8 +375,8 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) { const float* b = inputs[3]->data(); const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr; - #pragma omp parallel for - for (int j = 0; j < rows; ++j) { +#pragma omp parallel for + for(int j = 0; j < rows; ++j) { float m = !mask || mask[j]; float* rowOut = out + j * cols; const float* rowState = state + j * cols; @@ -376,8 +384,8 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) { const float* xWrow = xW + j * cols * 3; const float* sUrow = sU + j * cols * 3; - #pragma omp simd - for (int i = 0; i < cols; ++i) { +#pragma omp simd + for(int i = 0; i < cols; ++i) { // @TODO: stable logit float r = stableLogit(xWrow[i] + sUrow[i] + b[i]); @@ -417,8 +425,8 @@ void GRUFastBackward(std::vector<Tensor> outputs, const float* mask = inputs.size() > 4 ? inputs[4]->data() : 0; const float* adj = adj_->data(); - #pragma omp parallel - for (int j = 0; j < rows; ++j) { +#pragma omp parallel + for(int j = 0; j < rows; ++j) { float m = !mask || mask[j]; float* rowOutState = outState + j * cols; @@ -430,8 +438,8 @@ void GRUFastBackward(std::vector<Tensor> outputs, const float* rowSU = sU + j * cols * 3; const float* rowAdj = adj + j * cols; - #pragma omp for simd nowait - for (int i = 0; i < cols; ++i) { +#pragma omp for simd nowait + for(int i = 0; i < cols; ++i) { int k = i + cols; int l = i + 2 * cols; @@ -446,10 +454,11 @@ void GRUFastBackward(std::vector<Tensor> outputs, float adj = rowAdj[i]; - float t = (1-z)*(1-h*h); + float t = (1 - z) * (1 - h * h); // df/ds - if(outState) rowOutState[i] += (m * z - m + 1) * adj; + if(outState) + rowOutState[i] += (m * z - m + 1) * adj; // df/d(xW_r) ... float dfdxW_r = m * r * (1 - r) * t * adj; @@ -457,20 +466,28 @@ void GRUFastBackward(std::vector<Tensor> outputs, dfdxW_r *= rowSU[l] + b[l]; else dfdxW_r *= rowSU[l]; - if(outXW) rowOutXW[i] += dfdxW_r; - if(outSU) rowOutSU[i] += dfdxW_r; - if(outB) outB[i] += dfdxW_r; + if(outXW) + rowOutXW[i] += dfdxW_r; + if(outSU) + rowOutSU[i] += dfdxW_r; + if(outB) + outB[i] += dfdxW_r; // df/d(xW_z) ... float dfdxW_z = m * (1 - z) * z * (rowState[i] - h) * adj; - if(outXW) rowOutXW[k] += dfdxW_z; - if(outSU) rowOutSU[k] += dfdxW_z; - if(outB) outB[k] += dfdxW_z; + if(outXW) + rowOutXW[k] += dfdxW_z; + if(outSU) + rowOutSU[k] += dfdxW_z; + if(outB) + outB[k] += dfdxW_z; // df/d(xW_x) ... float dfdxW_x = m * t * adj; - if(outXW) rowOutXW[l] += dfdxW_x; - if(outSU) rowOutSU[l] += dfdxW_x * r; + if(outXW) + rowOutXW[l] += dfdxW_x; + if(outSU) + rowOutSU[l] += dfdxW_x * r; if(outB) if(final) outB[l] += dfdxW_x * r; @@ -490,18 +507,18 @@ void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) { int rows = inShape.elements() / inShape.back(); int cols = inShape.back(); - #pragma omp parallel for - for (int j = 0; j < rows; ++j) { - const float* sp = in + j*cols; +#pragma omp parallel for + for(int j = 0; j < rows; ++j) { + const float* sp = in + j * cols; float max = sp[0]; - #pragma omp simd reduction(max:max) - for (int i = 1; i < cols; ++i) { +#pragma omp simd reduction(max : max) + for(int i = 1; i < cols; ++i) { max = std::max(max, sp[i]); } float sum = 0.f; - #pragma omp simd reduction(+:sum) - for (int i = 0; i < cols; ++i) { +#pragma omp simd reduction(+ : sum) + for(int i = 0; i < cols; ++i) { sum += std::exp(sp[i] - max); } @@ -512,7 +529,10 @@ void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) { } } -void CrossEntropyPickBackward(Tensor out_, Tensor adj_, Tensor a, Tensor pick_) { +void CrossEntropyPickBackward(Tensor out_, + Tensor adj_, + Tensor a, + Tensor pick_) { float* out = out_->data(); Shape& outShape = out_->shape(); const float* adj = adj_->data(); @@ -522,23 +542,23 @@ void CrossEntropyPickBackward(Tensor out_, Tensor adj_, Tensor a, Tensor pick_) int rows = outShape.elements() / outShape.back(); int cols = outShape.back(); - #pragma omp parallel for - for (int j = 0; j < rows; ++j) { - const float* sp = in + j*cols; - float* so = out + j*cols; +#pragma omp parallel for + for(int j = 0; j < rows; ++j) { + const float* sp = in + j * cols; + float* so = out + j * cols; float max = sp[0]; - for (int i = 1; i < cols; ++i) { + for(int i = 1; i < cols; ++i) { max = std::max(max, sp[i]); } float sum = 0.f; - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { sum += std::exp(sp[i] - max); } // cross-entropy - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { float sub = (float)(i == (int)pick[j]); so[i] += adj[j] * (std::exp(sp[i] - max) / sum - sub); } @@ -549,8 +569,8 @@ float L2Norm(Tensor in) { float sum = 0.f; size_t size = in->size(); const float* data = in->data(); - #pragma omp parallel for simd reduction(+:sum) - for (size_t i = 0; i < size; ++i) { +#pragma omp parallel for simd reduction(+ : sum) + for(size_t i = 0; i < size; ++i) { sum += data[i] * data[i]; } return std::sqrt(sum); @@ -570,15 +590,15 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) { int rows = m; int cols = k; - #pragma omp parallel for - for (size_t j = 0; j < rows; ++j) { +#pragma omp parallel for + for(size_t j = 0; j < rows; ++j) { const float* vaRow = va; const float* ctxRow = ctx + (j % (b * t)) * cols; const float* stateRow = state + ((j / (b * t)) * b + j % b) * cols; float sum = 0.f; - #pragma omp simd reduction(+:sum) - for (size_t i = 0; i < cols; ++i) { +#pragma omp simd reduction(+ : sum) + for(size_t i = 0; i < cols; ++i) { float z = ctxRow[i] + stateRow[i]; sum += std::tanh(z) * vaRow[i]; } @@ -587,8 +607,12 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) { } } -void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_, - Tensor va_, Tensor context_, Tensor state_, +void AttBack(Tensor gVa_, + Tensor gContext_, + Tensor gState_, + Tensor va_, + Tensor context_, + Tensor state_, Tensor adj_) { float* gVa = gVa_->data(); float* gContext = gContext_->data(); @@ -603,8 +627,8 @@ void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_, size_t k = context_->shape()[-1]; size_t n = context_->shape()[-2]; - #pragma omp parallel for reduction(+:gState[:n*k], gVa[:k]) - for (size_t j = 0; j < m; ++j) { +#pragma omp parallel for reduction(+ : gState[ : n* k], gVa[ : k]) + for(size_t j = 0; j < m; ++j) { float* gcRow = gContext + j * k; float* gsRow = gState + (j % n) * k; @@ -613,8 +637,8 @@ void AttBack(Tensor gVa_, Tensor gContext_, Tensor gState_, float adj_j = adj[j]; - #pragma omp simd - for (size_t i = 0; i < k; ++i) { +#pragma omp simd + for(size_t i = 0; i < k; ++i) { float z = cRow[i] + sRow[i]; float t = std::tanh(z); @@ -642,31 +666,31 @@ void LayerNormalization(Tensor out_, int rows = in_->shape().elements() / in_->shape().back(); int cols = in_->shape().back(); - #pragma omp parallel for - for (int j = 0; j < rows; ++j) { - float* so = out + j*cols; - const float* sp = in + j*cols; +#pragma omp parallel for + for(int j = 0; j < rows; ++j) { + float* so = out + j * cols; + const float* sp = in + j * cols; float sum = 0.f; - #pragma omp simd reduction(+:sum) - for (int i = 0; i < cols; ++i) { +#pragma omp simd reduction(+ : sum) + for(int i = 0; i < cols; ++i) { sum += sp[i]; } float mean = sum / cols; float sqSum = 0.f; - #pragma omp simd reduction(+:sqSum) - for (int i = 0; i < cols; ++i) { +#pragma omp simd reduction(+ : sqSum) + for(int i = 0; i < cols; ++i) { float ex = sp[i] - mean; - sqSum += ex*ex; + sqSum += ex * ex; } float sigma = std::sqrt(eps + sqSum / cols); - #pragma omp simd - for (int i = 0; i < cols; ++i) { +#pragma omp simd + for(int i = 0; i < cols; ++i) { float t = alpha[i] * ((sp[i] - mean) / sigma); - if (beta != nullptr) { + if(beta != nullptr) { t += beta[i]; } @@ -696,36 +720,36 @@ void LayerNormalizationGrad(Tensor gradX_, size_t rows = y_->shape().elements() / y_->shape()[-1]; size_t cols = y_->shape()[-1]; - if (beta) { - #pragma omp parallel for reduction(+:gradGamma[:cols], gradBeta[:cols]) - for (size_t j = 0; j < rows; ++j) { - const float* xRow = x + j*cols; - const float* yRow = y + j*cols; - const float* adjRow = adj + j*cols; - float* gradXRow = gradX + j*cols; + if(beta) { +#pragma omp parallel for reduction(+ : gradGamma[ : cols], gradBeta[ : cols]) + for(size_t j = 0; j < rows; ++j) { + const float* xRow = x + j * cols; + const float* yRow = y + j * cols; + const float* adjRow = adj + j * cols; + float* gradXRow = gradX + j * cols; float sum_x = 0.f; float sum_adj = 0.f; float sum_adj_x = 0.f; float sum_sqr = 0.f; - #pragma omp simd reduction(+:sum_x, sum_adj_x, sum_adj) - for (size_t i = 0; i < cols; ++i) { +#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj) + for(size_t i = 0; i < cols; ++i) { sum_x += xRow[i]; sum_adj_x += adjRow[i] * (yRow[i] - (beta ? beta[i] : 0.f)) / gamma[i]; sum_adj += adjRow[i]; } float mean = sum_x / cols; - #pragma omp simd reduction(+:sum_sqr) - for (size_t i = 0; i < cols; ++i) { +#pragma omp simd reduction(+ : sum_sqr) + for(size_t i = 0; i < cols; ++i) { float ex = xRow[i] - mean; - sum_sqr += ex*ex; + sum_sqr += ex * ex; } float sigma = std::sqrt(eps + sum_sqr / cols); - #pragma omp simd - for (size_t i = 0; i < cols; ++i) { +#pragma omp simd + for(size_t i = 0; i < cols; ++i) { float grad_x = 0.f; float x_hat = (yRow[i] - beta[i]) / gamma[i]; grad_x += cols * adjRow[i]; @@ -739,35 +763,35 @@ void LayerNormalizationGrad(Tensor gradX_, } } } else { - #pragma omp parallel for reduction(+:gradGamma[:cols]) - for (size_t j = 0; j < rows; ++j) { - const float* xRow = x + j*cols; - const float* yRow = y + j*cols; - const float* adjRow = adj + j*cols; - float* gradXRow = gradX + j*cols; +#pragma omp parallel for reduction(+ : gradGamma[ : cols]) + for(size_t j = 0; j < rows; ++j) { + const float* xRow = x + j * cols; + const float* yRow = y + j * cols; + const float* adjRow = adj + j * cols; + float* gradXRow = gradX + j * cols; float sum_x = 0.f; float sum_adj = 0.f; float sum_adj_x = 0.f; float sum_sqr = 0.f; - #pragma omp simd reduction(+:sum_x, sum_adj_x, sum_adj) - for (size_t i = 0; i < cols; ++i) { +#pragma omp simd reduction(+ : sum_x, sum_adj_x, sum_adj) + for(size_t i = 0; i < cols; ++i) { sum_x += xRow[i]; sum_adj_x += adjRow[i] * (yRow[i] - (beta ? beta[i] : 0.f)) / gamma[i]; sum_adj += adjRow[i]; } float mean = sum_x / cols; - #pragma omp simd reduction(+:sum_sqr) - for (size_t i = 0; i < cols; ++i) { +#pragma omp simd reduction(+ : sum_sqr) + for(size_t i = 0; i < cols; ++i) { float ex = xRow[i] - mean; - sum_sqr += ex*ex; + sum_sqr += ex * ex; } float sigma = std::sqrt(eps + sum_sqr / cols); - #pragma omp simd - for (size_t i = 0; i < cols; ++i) { +#pragma omp simd + for(size_t i = 0; i < cols; ++i) { float grad_x = 0.f; float x_hat = yRow[i] / gamma[i]; grad_x += cols * adjRow[i]; @@ -794,9 +818,9 @@ void Shift(Tensor out_, Tensor in_, marian::Shape shift, bool invert) { const float* in = in_->data(); int length = out_->shape().elements(); - #pragma omp parallel for - for (int i = 0; i < length; ++i) { - if (i - offset < 0 || i - offset >= length) { +#pragma omp parallel for + for(int i = 0; i < length; ++i) { + if(i - offset < 0 || i - offset >= length) { out[i] = 0.f; } else { out[i] = in[i - offset]; @@ -808,7 +832,7 @@ void SetSparse(float* out, const std::vector<size_t>& indices, const std::vector<float>& values) { int length = indices.size(); - for (int index = 0; index < length; ++index) { + for(int index = 0; index < length; ++index) { out[indices[index]] = values[index]; } } @@ -824,26 +848,26 @@ void LSTMCellForward(Tensor out_, std::vector<Tensor> inputs) { const float* b = inputs[3]->data(); const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr; - for (int j = 0; j < rows; ++j) { + for(int j = 0; j < rows; ++j) { float m = !mask || mask[j]; - float* rowOut = out + j*cols; - const float* rowCell = cell + j*cols; + float* rowOut = out + j * cols; + const float* rowCell = cell + j * cols; - const float* xWrow = xW + j*cols*4; - const float* sUrow = sU + j*cols*4; + const float* xWrow = xW + j * cols * 4; + const float* sUrow = sU + j * cols * 4; - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]); - int l = i + 2*cols; + int l = i + 2 * cols; float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]); - float cout = gf*rowCell[i] + gi*gc; - rowOut[i] = m*cout + (1-m)*rowCell[i]; + float cout = gf * rowCell[i] + gi * gc; + rowOut[i] = m * cout + (1 - m) * rowCell[i]; } } } @@ -858,15 +882,15 @@ void LSTMOutputForward(Tensor out_, std::vector<Tensor> inputs) { const float* sU = inputs[2]->data(); const float* b = inputs[3]->data(); - for (int j = 0; j <rows; ++j) { - float* rowOut = out + j*cols; - const float* rowCell = cell + j*cols; + for(int j = 0; j < rows; ++j) { + float* rowOut = out + j * cols; + const float* rowCell = cell + j * cols; - const float* xWrow = xW + j*cols*4; - const float* sUrow = sU + j*cols*4; + const float* xWrow = xW + j * cols * 4; + const float* sUrow = sU + j * cols * 4; - for (int i = 0; i < cols; ++i) { - int k = i + 3*cols; + for(int i = 0; i < cols; ++i) { + int k = i + 3 * cols; float go = stableLogit(xWrow[k] + sUrow[k] + b[k]); rowOut[i] = go * std::tanh(rowCell[i]); @@ -893,52 +917,70 @@ void LSTMCellBackward(std::vector<Tensor> outputs, const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr; const float* adj = adj_->data(); - for (int j = 0; j <rows; ++j) { + for(int j = 0; j < rows; ++j) { float m = !mask || mask[j]; - float* rowOutCell = outCell + j*cols; - float* rowOutXW = outXW + j*cols*4; - float* rowOutSU = outSU + j*cols*4; + float* rowOutCell = outCell + j * cols; + float* rowOutXW = outXW + j * cols * 4; + float* rowOutSU = outSU + j * cols * 4; - const float* rowCell = cell + j*cols; - const float* xWrow = xW + j*cols*4; - const float* sUrow = sU + j*cols*4; + const float* rowCell = cell + j * cols; + const float* xWrow = xW + j * cols * 4; + const float* sUrow = sU + j * cols * 4; - const float* rowAdj = adj + j*cols; + const float* rowAdj = adj + j * cols; - for (int i = 0; i < cols; ++i) { + for(int i = 0; i < cols; ++i) { float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]); - int l = i + 2*cols; + int l = i + 2 * cols; float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]); float adj = rowAdj[i]; // dc/dx_{t-1} - if (outCell) { - rowOutCell[i] += (m*gf - m + 1)*adj; + if(outCell) { + rowOutCell[i] += (m * gf - m + 1) * adj; } // dc/d(b_f) = dc/d(xW_f) ... - float dcdxf = m*rowCell[i] * gf*(1-gf) * adj; - if (outXW) { rowOutXW[i] += dcdxf; } - if (outSU) { rowOutSU[i] += dcdxf; } - if (outB) { outB[i] += dcdxf; } + float dcdxf = m * rowCell[i] * gf * (1 - gf) * adj; + if(outXW) { + rowOutXW[i] += dcdxf; + } + if(outSU) { + rowOutSU[i] += dcdxf; + } + if(outB) { + outB[i] += dcdxf; + } // dc/d(b_i) ... - float dcdb_i = m * gc * gi*(1-gi) * adj; - if (outXW) { rowOutXW[k] += dcdb_i; } - if (outSU) { rowOutSU[k] += dcdb_i; } - if (outB) { outB[k] += dcdb_i; } + float dcdb_i = m * gc * gi * (1 - gi) * adj; + if(outXW) { + rowOutXW[k] += dcdb_i; + } + if(outSU) { + rowOutSU[k] += dcdb_i; + } + if(outB) { + outB[k] += dcdb_i; + } // dc/d(b_c) ... - float dcdxc = m * gi * (1 - gc*gc) * adj; - if (outXW) { rowOutXW[l] += dcdxc; } - if (outSU) { rowOutSU[l] += dcdxc; } - if (outB) { outB[l] += dcdxc; } + float dcdxc = m * gi * (1 - gc * gc) * adj; + if(outXW) { + rowOutXW[l] += dcdxc; + } + if(outSU) { + rowOutSU[l] += dcdxc; + } + if(outB) { + outB[l] += dcdxc; + } } } } @@ -961,19 +1003,19 @@ void LSTMOutputBackward(std::vector<Tensor> outputs, const float* adj = adj_->data(); - for (int j = 0; j < rows; ++j) { - float* rowOutCell = outCell + j*cols; - float* rowOutXW = outXW + j*cols*4; - float* rowOutSU = outSU + j*cols*4; + for(int j = 0; j < rows; ++j) { + float* rowOutCell = outCell + j * cols; + float* rowOutXW = outXW + j * cols * 4; + float* rowOutSU = outSU + j * cols * 4; - const float* rowCell = cell + j*cols; - const float* xWrow = xW + j*cols*4; - const float* sUrow = sU + j*cols*4; + const float* rowCell = cell + j * cols; + const float* xWrow = xW + j * cols * 4; + const float* sUrow = sU + j * cols * 4; - const float* rowAdj = adj + j*cols; + const float* rowAdj = adj + j * cols; - for (int i = 0; i < cols; ++i) { - int k = i + 3*cols; + for(int i = 0; i < cols; ++i) { + int k = i + 3 * cols; float go = stableLogit(xWrow[k] + sUrow[k] + b[k]); float t = std::tanh(rowCell[i]); @@ -981,15 +1023,21 @@ void LSTMOutputBackward(std::vector<Tensor> outputs, float adj = rowAdj[i]; // dc/dc_{t-1} - if (outCell) { - rowOutCell[i] += go * (1 - t*t) * adj; + if(outCell) { + rowOutCell[i] += go * (1 - t * t) * adj; } // dc/d(b_o) = dc/d(xW_f) ... - float dcdxo = t * go*(1-go) * adj; - if (outXW) { rowOutXW[k] += dcdxo; } - if (outSU) { rowOutSU[k] += dcdxo; } - if (outB) { outB[k] += dcdxo; } + float dcdxo = t * go * (1 - go) * adj; + if(outXW) { + rowOutXW[k] += dcdxo; + } + if(outSU) { + rowOutSU[k] += dcdxo; + } + if(outB) { + outB[k] += dcdxo; + } } } } @@ -1027,6 +1075,5 @@ void PoolingWithMaskingBackward(Tensor adj, bool isEven) { ABORT("Not implemented!"); } - } } // namespace marian diff --git a/src/tensors/device.h b/src/tensors/device.h index 74d68b3d..f8707634 100644 --- a/src/tensors/device.h +++ b/src/tensors/device.h @@ -23,7 +23,7 @@ public: Device(DeviceId deviceId, size_t alignment = 256) : deviceId_(deviceId), data_(0), size_(0), alignment_(alignment) {} - virtual ~Device() {}; + virtual ~Device(){}; virtual void reserve(size_t size) = 0; @@ -35,30 +35,31 @@ public: }; namespace gpu { - class Device : public marian::Device { - public: - Device(DeviceId deviceId, size_t alignment = 256) +class Device : public marian::Device { +public: + Device(DeviceId deviceId, size_t alignment = 256) : marian::Device(deviceId, alignment) {} - ~Device(); + ~Device(); - void reserve(size_t size); - }; + void reserve(size_t size); +}; } namespace cpu { - class Device : public marian::Device { - public: - Device(DeviceId deviceId, size_t alignment = 256) +class Device : public marian::Device { +public: + Device(DeviceId deviceId, size_t alignment = 256) : marian::Device(deviceId, alignment) {} - ~Device(); + ~Device(); - void reserve(size_t size); - }; + void reserve(size_t size); +}; } -static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 256) { +static inline Ptr<Device> DispatchDevice(DeviceId deviceId, + size_t alignment = 256) { #ifdef CUDA_FOUND if(deviceId.type == DeviceType::gpu) return New<gpu::Device>(deviceId, alignment); @@ -71,5 +72,4 @@ static inline Ptr<Device> DispatchDevice(DeviceId deviceId, size_t alignment = 2 return New<cpu::Device>(deviceId, alignment); #endif } - } diff --git a/src/tensors/dispatch.h b/src/tensors/dispatch.h index 750cfff5..7c120e6f 100644 --- a/src/tensors/dispatch.h +++ b/src/tensors/dispatch.h @@ -2,204 +2,250 @@ #ifdef CUDA_FOUND -#define DISPATCH1(Function, Arg1) \ - namespace gpu { \ - void Function(Arg1); \ - } \ - namespace cpu { \ - void Function(Arg1); \ - } \ - void Function(Arg1 arg1) { \ +#define DISPATCH1(Function, Arg1) \ + namespace gpu { \ + void Function(Arg1); \ + } \ + namespace cpu { \ + void Function(Arg1); \ + } \ + void Function(Arg1 arg1) { \ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ - gpu::Function(arg1); \ - else \ - cpu::Function(arg1); \ - } - -#define DISPATCH2(Function, Arg1, Arg2) \ - namespace gpu { \ - void Function(Arg1, Arg2); \ - } \ - namespace cpu { \ - void Function(Arg1, Arg2); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2) { \ + gpu::Function(arg1); \ + else \ + cpu::Function(arg1); \ + } + +#define DISPATCH2(Function, Arg1, Arg2) \ + namespace gpu { \ + void Function(Arg1, Arg2); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2); \ + } \ + static inline void Function(Arg1 arg1, Arg2 arg2) { \ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ - gpu::Function(arg1, arg2); \ - else \ - cpu::Function(arg1, arg2); \ - } - -#define DISPATCH3(Function, Arg1, Arg2, Arg3) \ - namespace gpu { \ - void Function(Arg1, Arg2, Arg3); \ - } \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3); \ - } \ + gpu::Function(arg1, arg2); \ + else \ + cpu::Function(arg1, arg2); \ + } + +#define DISPATCH3(Function, Arg1, Arg2, Arg3) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3); \ + } \ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \ - if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ - gpu::Function(arg1, arg2, arg3); \ - else \ - cpu::Function(arg1, arg2, arg3); \ - } - -#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \ - namespace gpu { \ - void Function(Arg1, Arg2, Arg3, Arg4); \ - } \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4); \ - } \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3); \ + else \ + cpu::Function(arg1, arg2, arg3); \ + } + +#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4); \ + } \ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4); \ + } + +#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \ + } \ + static inline void Function( \ + Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ - gpu::Function(arg1, arg2, arg3, arg4); \ - else \ - cpu::Function(arg1, arg2, arg3, arg4); \ - } - -#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \ - namespace gpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \ - } \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \ - if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ - gpu::Function(arg1, arg2, arg3, arg4, arg5); \ - else \ - cpu::Function(arg1, arg2, arg3, arg4, arg5); \ - } - -#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \ - namespace gpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \ - } \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \ - if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ - gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \ - else \ - cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \ + gpu::Function(arg1, arg2, arg3, arg4, arg5); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5); \ + } + +#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \ + } \ + static inline void Function( \ + Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \ } #define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \ - namespace gpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \ - } \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \ - if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ - gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \ - else \ - cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \ + } \ + static inline void Function(Arg1 arg1, \ + Arg2 arg2, \ + Arg3 arg3, \ + Arg4 arg4, \ + Arg5 arg5, \ + Arg6 arg6, \ + Arg7 arg7) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \ } #define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \ - namespace gpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \ - } \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \ - if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ - gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \ - else \ - cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \ - } - -#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \ - namespace gpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \ - } \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \ - if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \ + } \ + static inline void Function(Arg1 arg1, \ + Arg2 arg2, \ + Arg3 arg3, \ + Arg4 arg4, \ + Arg5 arg5, \ + Arg6 arg6, \ + Arg7 arg7, \ + Arg8 arg8) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \ + } + +#define DISPATCH9( \ + Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \ + } \ + static inline void Function(Arg1 arg1, \ + Arg2 arg2, \ + Arg3 arg3, \ + Arg4 arg4, \ + Arg5 arg5, \ + Arg6 arg6, \ + Arg7 arg7, \ + Arg8 arg8, \ + Arg9 arg9) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \ - else \ + else \ cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \ } #else #define DISPATCH1(Function, Arg1) \ - namespace cpu { \ - void Function(Arg1); \ - } \ - void Function(Arg1 arg1) { \ - cpu::Function(arg1); \ - } - -#define DISPATCH2(Function, Arg1, Arg2) \ - namespace cpu { \ - void Function(Arg1, Arg2); \ - } \ + namespace cpu { \ + void Function(Arg1); \ + } \ + void Function(Arg1 arg1) { cpu::Function(arg1); } + +#define DISPATCH2(Function, Arg1, Arg2) \ + namespace cpu { \ + void Function(Arg1, Arg2); \ + } \ static inline void Function(Arg1 arg1, Arg2 arg2) { \ - cpu::Function(arg1, arg2); \ + cpu::Function(arg1, arg2); \ } -#define DISPATCH3(Function, Arg1, Arg2, Arg3) \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3); \ - } \ +#define DISPATCH3(Function, Arg1, Arg2, Arg3) \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3); \ + } \ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \ - cpu::Function(arg1, arg2, arg3); \ + cpu::Function(arg1, arg2, arg3); \ } -#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4); \ - } \ +#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4); \ + } \ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \ - cpu::Function(arg1, arg2, arg3, arg4); \ + cpu::Function(arg1, arg2, arg3, arg4); \ } -#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \ - cpu::Function(arg1, arg2, arg3, arg4, arg5); \ +#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \ + } \ + static inline void Function( \ + Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \ + cpu::Function(arg1, arg2, arg3, arg4, arg5); \ } -#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \ - cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \ +#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \ + } \ + static inline void Function( \ + Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \ } #define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \ - cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \ + } \ + static inline void Function(Arg1 arg1, \ + Arg2 arg2, \ + Arg3 arg3, \ + Arg4 arg4, \ + Arg5 arg5, \ + Arg6 arg6, \ + Arg7 arg7) { \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \ } #define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \ - cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \ - } - -#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \ - namespace cpu { \ - void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \ - } \ - static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \ - cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \ + } \ + static inline void Function(Arg1 arg1, \ + Arg2 arg2, \ + Arg3 arg3, \ + Arg4 arg4, \ + Arg5 arg5, \ + Arg6 arg6, \ + Arg7 arg7, \ + Arg8 arg8) { \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \ + } + +#define DISPATCH9( \ + Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \ + } \ + static inline void Function(Arg1 arg1, \ + Arg2 arg2, \ + Arg3 arg3, \ + Arg4 arg4, \ + Arg5 arg5, \ + Arg6 arg6, \ + Arg7 arg7, \ + Arg8 arg8, \ + Arg9 arg9) { \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \ } #endif diff --git a/src/tensors/gpu/add.cu b/src/tensors/gpu/add.cu index 84679313..1acb5b54 100644 --- a/src/tensors/gpu/add.cu +++ b/src/tensors/gpu/add.cu @@ -9,8 +9,8 @@ #include "functional/functional.h" #include "functional/shape.h" -#include "functional/tmp.h" #include "functional/tensor.h" +#include "functional/tmp.h" namespace marian { @@ -22,7 +22,6 @@ __global__ void gAddGeneric(Functor functor, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, float scale = 1.0) { - int outLength = out.shape().elements(); bool same = outLength == full.elements(); for(int i = 0; i < K; ++i) @@ -37,14 +36,12 @@ __global__ void gAddGeneric(Functor functor, for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < outLength) { - if(same) { out[index] += functional::apply(functor, ins, index) * scale; } else { out.shape().dims(index, dims); out[index] += functional::loops(functor, ins, len, dims) * scale; } - } } } @@ -81,7 +78,6 @@ __global__ void gAddReduce(Functor functor, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, float scale = 1.0) { - int rows = full.elements() / full.back(); int cols = full.back(); @@ -133,12 +129,8 @@ __global__ void gAddReduce(Functor functor, } } -template <class Functor, class ...Tensors> -void Add(Functor functor, - float scale, - marian::Tensor out, - Tensors... tensors) { - +template <class Functor, class... Tensors> +void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { cudaSetDevice(out->getDevice().no); auto full = marian::Shape::broadcast({out, tensors...}); @@ -148,7 +140,7 @@ void Add(Functor functor, constexpr size_t K = sizeof...(Tensors); functional::Tensor<float> gOut = out; - functional::Array<functional::Tensor<float>, K> gIns = {tensors ...}; + functional::Array<functional::Tensor<float>, K> gIns = {tensors...}; if(full.back() != 1 && out->shape().back() == 1) { size_t m = full.elements() / length; @@ -180,6 +172,5 @@ void Add(Functor functor, } #include "tensors/gpu/add.inc" - } } diff --git a/src/tensors/gpu/add.h b/src/tensors/gpu/add.h index 18ed60ad..d6ab4c4e 100644 --- a/src/tensors/gpu/add.h +++ b/src/tensors/gpu/add.h @@ -6,11 +6,7 @@ namespace marian { namespace gpu { -template <class Functor, class ...Tensors> -void Add(Functor functor, - float scale, - marian::Tensor out, - Tensors... tensors); - +template <class Functor, class... Tensors> +void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors); } } diff --git a/src/tensors/gpu/algorithm.cu b/src/tensors/gpu/algorithm.cu index 4fe5f9f4..d5f5b3ec 100644 --- a/src/tensors/gpu/algorithm.cu +++ b/src/tensors/gpu/algorithm.cu @@ -4,45 +4,49 @@ #include "tensors/gpu/cuda_helpers.h" namespace marian { - namespace gpu { - template <typename T> - void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) { - CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); - CudaCopy(begin, end, dest); - CUDA_CHECK(cudaStreamSynchronize(0)); - } - - template void copy<float>(Ptr<Backend> backend, const float* begin, const float* end, float* dest); - template void copy<int>(Ptr<Backend> backend, const int* begin, const int* end, int* dest); - - - __global__ void gFill(float *d_in, int size, float val) { - for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) { - int index = bid + threadIdx.x + blockDim.x * blockIdx.x; - if(index < size) { - d_in[index] = val; - } - } - } +namespace gpu { +template <typename T> +void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest) { + CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); + CudaCopy(begin, end, dest); + CUDA_CHECK(cudaStreamSynchronize(0)); +} - void fill(Ptr<Backend> backend, float* begin, float* end, float value) { - CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); - int size = end - begin; - int threads = std::min(512, size); - int blocks = (size / threads) + (size % threads != 0); - gFill<<<blocks, threads>>>(begin, size, value); - CUDA_CHECK(cudaStreamSynchronize(0)); +template void copy<float>(Ptr<Backend> backend, + const float* begin, + const float* end, + float* dest); +template void copy<int>(Ptr<Backend> backend, + const int* begin, + const int* end, + int* dest); + +__global__ void gFill(float* d_in, int size, float val) { + for(int bid = 0; bid < size; bid += blockDim.x * gridDim.x) { + int index = bid + threadIdx.x + blockDim.x * blockIdx.x; + if(index < size) { + d_in[index] = val; } + } +} - void setSparse(Ptr<Backend> backend, - const std::vector<size_t>& keys, - const std::vector<float>& values, - float* data) { - CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); - ABORT("no SetSparse"); - //gpu::SetSparse(data, keys, values); - CUDA_CHECK(cudaStreamSynchronize(0)); - } +void fill(Ptr<Backend> backend, float* begin, float* end, float value) { + CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); + int size = end - begin; + int threads = std::min(512, size); + int blocks = (size / threads) + (size % threads != 0); + gFill<<<blocks, threads>>>(begin, size, value); + CUDA_CHECK(cudaStreamSynchronize(0)); +} - } +void setSparse(Ptr<Backend> backend, + const std::vector<size_t>& keys, + const std::vector<float>& values, + float* data) { + CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); + ABORT("no SetSparse"); + // gpu::SetSparse(data, keys, values); + CUDA_CHECK(cudaStreamSynchronize(0)); +} +} } diff --git a/src/tensors/gpu/algorithm.h b/src/tensors/gpu/algorithm.h index 003189d1..ac7925d1 100644 --- a/src/tensors/gpu/algorithm.h +++ b/src/tensors/gpu/algorithm.h @@ -3,12 +3,15 @@ #include "tensors/backend.h" namespace marian { - namespace gpu { - template <typename T> - void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest); - - void fill(Ptr<Backend> backend, float* begin, float* end, float value); - - void setSparse(Ptr<Backend> backend, const std::vector<size_t>&, const std::vector<float>&, float*); - } +namespace gpu { +template <typename T> +void copy(Ptr<Backend> backend, const T* begin, const T* end, T* dest); + +void fill(Ptr<Backend> backend, float* begin, float* end, float value); + +void setSparse(Ptr<Backend> backend, + const std::vector<size_t>&, + const std::vector<float>&, + float*); +} } diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h index 41b66f75..5e7e7036 100644 --- a/src/tensors/gpu/backend.h +++ b/src/tensors/gpu/backend.h @@ -25,13 +25,9 @@ public: setHandles(); } - void setDevice() { - cudaSetDevice(deviceId_.no); - } + void setDevice() { cudaSetDevice(deviceId_.no); } - void synchronize() { - cudaStreamSynchronize(0); - } + void synchronize() { cudaStreamSynchronize(0); } cublasHandle_t getCublasHandle() { return cublasHandle_; } @@ -41,13 +37,11 @@ private: cublasHandle_t cublasHandle_; curandGenerator_t curandGenerator_; - void setHandles() { cublasHandle_ = create_handle(); curandGenerator_ = createCurandGenerator(); } - curandGenerator_t createCurandGenerator() { cudaSetDevice(deviceId_.no); curandGenerator_t generator; @@ -67,6 +61,5 @@ private: return cublasHandle; } }; - } } diff --git a/src/tensors/gpu/cuda_helpers.h b/src/tensors/gpu/cuda_helpers.h index a5bef04b..d939b996 100644 --- a/src/tensors/gpu/cuda_helpers.h +++ b/src/tensors/gpu/cuda_helpers.h @@ -13,9 +13,8 @@ const int MAX_BLOCKS = 65535; #define CUDA_CHECK(ans) \ { gpuAssert((ans), __FILE__, __LINE__); } - inline void gpuAssert(cudaError_t code, - const char *file, + const char* file, int line, bool abort = true) { if(code != cudaSuccess) { @@ -26,8 +25,8 @@ inline void gpuAssert(cudaError_t code, template <typename T> void CudaCopy(const T* start, const T* end, T* dest) { - CUDA_CHECK(cudaMemcpy((void*)dest, (void*)start, (end - start) * sizeof(T), - cudaMemcpyDefault)); + CUDA_CHECK(cudaMemcpy( + (void*)dest, (void*)start, (end - start) * sizeof(T), cudaMemcpyDefault)); } #define CUSPARSE_CHECK(x) \ diff --git a/src/tensors/gpu/device.cu b/src/tensors/gpu/device.cu index e15b80d7..d66761e7 100644 --- a/src/tensors/gpu/device.cu +++ b/src/tensors/gpu/device.cu @@ -7,34 +7,34 @@ namespace marian { namespace gpu { - Device::~Device() { - cudaSetDevice(deviceId_.no); - if(data_) { - CUDA_CHECK(cudaFree(data_)); - } - cudaDeviceSynchronize(); +Device::~Device() { + cudaSetDevice(deviceId_.no); + if(data_) { + CUDA_CHECK(cudaFree(data_)); } + cudaDeviceSynchronize(); +} - void Device::reserve(size_t size) { - size = align(size); - cudaSetDevice(deviceId_.no); - - ABORT_IF(size < size_ || size == 0, "New size must be larger than old size and larger than 0"); - - if(data_) { - // Allocate memory by going through host memory - uint8_t *temp = new uint8_t[size_]; - CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(data_)); - CUDA_CHECK(cudaMalloc(&data_, size)); - CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice)); - delete[] temp; - } else { - CUDA_CHECK(cudaMalloc(&data_, size)); - } - - size_ = size; +void Device::reserve(size_t size) { + size = align(size); + cudaSetDevice(deviceId_.no); + + ABORT_IF(size < size_ || size == 0, + "New size must be larger than old size and larger than 0"); + + if(data_) { + // Allocate memory by going through host memory + uint8_t *temp = new uint8_t[size_]; + CUDA_CHECK(cudaMemcpy(temp, data_, size_, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(data_)); + CUDA_CHECK(cudaMalloc(&data_, size)); + CUDA_CHECK(cudaMemcpy(data_, temp, size_, cudaMemcpyHostToDevice)); + delete[] temp; + } else { + CUDA_CHECK(cudaMalloc(&data_, size)); } + size_ = size; +} } } diff --git a/src/tensors/gpu/dropout.cu b/src/tensors/gpu/dropout.cu index 02a38446..dc82e49d 100644 --- a/src/tensors/gpu/dropout.cu +++ b/src/tensors/gpu/dropout.cu @@ -22,31 +22,29 @@ } \ } while(0) - namespace marian { - namespace gpu { - - __global__ void gScale(float* data, int n, float p) { - int index = threadIdx.x + blockIdx.x * blockDim.x; +namespace gpu { - while(index < n) { - data[index] = (data[index] < p) / p; - index += gridDim.x * blockDim.x; - } - } +__global__ void gScale(float* data, int n, float p) { + int index = threadIdx.x + blockIdx.x * blockDim.x; - void Dropout(Tensor tensor, float p) { - auto gpuBackend = std::static_pointer_cast<gpu::Backend>(tensor->getBackend()); - curandGenerator_t gen = gpuBackend->getCurandGenerator(); - int n = tensor->size(); - CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n)); - - int numThreads = std::min(n, 512); - int numBlocks = n / numThreads + (n % numThreads != 0); + while(index < n) { + data[index] = (data[index] < p) / p; + index += gridDim.x * blockDim.x; + } +} - gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p); - } +void Dropout(Tensor tensor, float p) { + auto gpuBackend + = std::static_pointer_cast<gpu::Backend>(tensor->getBackend()); + curandGenerator_t gen = gpuBackend->getCurandGenerator(); + int n = tensor->size(); + CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n)); + int numThreads = std::min(n, 512); + int numBlocks = n / numThreads + (n % numThreads != 0); - } + gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p); +} +} } diff --git a/src/tensors/gpu/element.cu b/src/tensors/gpu/element.cu index b8d6cef9..f5c5fbe7 100644 --- a/src/tensors/gpu/element.cu +++ b/src/tensors/gpu/element.cu @@ -1,6 +1,5 @@ -
-
#include "tensors/gpu/element.h"
+
#include "tensors/gpu/cuda_helpers.h"
#include "functional/array.h"
#include "functional/tensor.h"
@@ -11,9 +10,9 @@ namespace marian { namespace gpu {
template <size_t K, bool broadcast, class Functor>
-__global__ void gElement(Functor functor,
- functional::Array<functional::Tensor<float>, K> tensors) {
-
+__global__ void gElement(
+ Functor functor,
+ functional::Array<functional::Tensor<float>, K> tensors) {
int length = tensors[0].shape().elements();
functional::Array<int, functional::Shape::size()> dims;
functional::Array<int, K> indices;
@@ -21,7 +20,6 @@ __global__ void gElement(Functor functor, for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
-
indices.fill(index);
if(broadcast) {
@@ -35,8 +33,8 @@ __global__ void gElement(Functor functor, }
}
-template <class Functor, class ...Tensors>
-void Element(Functor functor, Tensor out, Tensors ...tensors) {
+template <class Functor, class... Tensors>
+void Element(Functor functor, Tensor out, Tensors... tensors) {
cudaSetDevice(out->getDevice().no);
constexpr size_t K = sizeof...(tensors) + 1;
@@ -57,8 +55,5 @@ void Element(Functor functor, Tensor out, Tensors ...tensors) { }
#include "tensors/gpu/element.inc"
-
-
}
}
-
diff --git a/src/tensors/gpu/element.h b/src/tensors/gpu/element.h index 0e7eb162..b13f625f 100644 --- a/src/tensors/gpu/element.h +++ b/src/tensors/gpu/element.h @@ -5,8 +5,7 @@ namespace marian { namespace gpu { -template <class Functor, class ...Tensors> -void Element(Functor functor, Tensor out, Tensors ...tensors); - +template <class Functor, class... Tensors> +void Element(Functor functor, Tensor out, Tensors... tensors); } } diff --git a/src/tensors/gpu/prod.cu b/src/tensors/gpu/prod.cu index 38278ec5..57ff2007 100644 --- a/src/tensors/gpu/prod.cu +++ b/src/tensors/gpu/prod.cu @@ -38,10 +38,11 @@ void Prod(marian::Tensor C, cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle(); + auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend()) + ->getCublasHandle(); #if CUDA_VERSION >= 9000 - //cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH); +// cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH); #endif cublasSgemm(cublasHandle, @@ -59,7 +60,7 @@ void Prod(marian::Tensor C, C->data(), ldc); #if CUDA_VERSION >= 9000 - //cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH); +// cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH); #endif } @@ -96,10 +97,11 @@ void ProdBatched(marian::Tensor C, cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle(); + auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend()) + ->getCublasHandle(); #if CUDA_VERSION >= 9000 - //cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH); +// cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH); #endif cublasSgemmStridedBatched(cublasHandle, opB, @@ -120,10 +122,8 @@ void ProdBatched(marian::Tensor C, n * m, std::max(batchA, batchB)); #if CUDA_VERSION >= 9000 - //cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH); +// cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH); #endif } - - } } diff --git a/src/tensors/gpu/prod.h b/src/tensors/gpu/prod.h index db9b62d8..968e631f 100644 --- a/src/tensors/gpu/prod.h +++ b/src/tensors/gpu/prod.h @@ -21,6 +21,5 @@ void ProdBatched(marian::Tensor C, bool transB, float beta = 0, float scalar = 1); - } } diff --git a/src/tensors/gpu/sparse.h b/src/tensors/gpu/sparse.h index cffb398e..3dd30126 100644 --- a/src/tensors/gpu/sparse.h +++ b/src/tensors/gpu/sparse.h @@ -2,8 +2,8 @@ #include <cusparse_v2.h> #include "common/definitions.h" -#include "tensors/tensor.h" #include "kernels/cuda_helpers.h" +#include "tensors/tensor.h" namespace marian { diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 7e64c954..1160bdd1 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -28,11 +28,11 @@ __device__ inline float stableLogit(float x) { } bool IsNan(Tensor in) { - //cudaSetDevice(in->getDevice().no); - //thrust::device_ptr<float> begin = thrust::device_pointer_cast(in->data()); - //thrust::device_ptr<float> end + // cudaSetDevice(in->getDevice().no); + // thrust::device_ptr<float> begin = thrust::device_pointer_cast(in->data()); + // thrust::device_ptr<float> end // = thrust::device_pointer_cast(in->data() + in->size()); - //return thrust::transform_reduce( + // return thrust::transform_reduce( // begin, end, isnan_test(), 0, thrust::plus<bool>()); return false; } @@ -93,10 +93,9 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) { for(auto in : inputs) { ABORT_IF(rows != in->shape().elements() / in->shape().back(), - "First dimension must be equal"); + "First dimension must be equal"); int cols_in = in->shape().back(); - int blocks = std::min(MAX_BLOCKS, rows); int threads = std::min(MAX_THREADS, cols_in); @@ -122,7 +121,7 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) { int cols_in = in->shape().back(); for(auto out : outputs) { ABORT_IF(rows != out->shape().elements() / out->shape().back(), - "First dimension must be equal"); + "First dimension must be equal"); int cols_out = out->shape().back(); int blocks = std::min(MAX_BLOCKS, rows); @@ -166,10 +165,10 @@ void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) { SplitCont(outputs, in, ax); } -__global__ void gTransposeND(functional::Tensor<float> out, - const functional::Tensor<float> in, - const functional::Array<int, functional::Shape::size()> permute) { - +__global__ void gTransposeND( + functional::Tensor<float> out, + const functional::Tensor<float> in, + const functional::Array<int, functional::Shape::size()> permute) { constexpr size_t N = functional::Shape::size(); functional::Array<int, N> oDims; functional::Array<int, N> pDims; @@ -458,7 +457,6 @@ void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val) { int m = grad->shape().elements() / grad->shape().back(); int k = grad->shape().back(); - int blocks = std::min(MAX_BLOCKS, m); int threads = std::min(MAX_THREADS, k); int shared = sizeof(float) * threads * 2; @@ -784,7 +782,9 @@ void Select(Tensor out, int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); auto mp_indices = allocator->alloc<size_t>(indices.size()); - CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>()); + CudaCopy(indices.data(), + indices.data() + indices.size(), + mp_indices->data<size_t>()); int axisGPU = axis + functional::Shape::size() - out->shape().size(); gSelect<<<blocks, threads>>>(out->data(), @@ -810,7 +810,9 @@ void Insert(Tensor out, int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); auto mp_indices = allocator->alloc<size_t>(indices.size()); - CudaCopy(indices.data(), indices.data() + indices.size(), mp_indices->data<size_t>()); + CudaCopy(indices.data(), + indices.data() + indices.size(), + mp_indices->data<size_t>()); int axisGPU = axis + functional::Shape::size() - out->shape().size(); gInsert<<<blocks, threads>>>(out->data(), @@ -1174,19 +1176,18 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) { out->data(), out->shape(), adj->data(), a->data(), pick->data()); } - float L2Norm(Tensor in) { - cudaSetDevice(in->getDevice().no); int size = in->shape().elements(); int threads = std::min(MAX_THREADS, size); - int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0)); + int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0)); uint8_t* data; cudaMalloc(&data, blocks * sizeof(float)); - Tensor out(new TensorBase( - New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend())); + Tensor out(new TensorBase(New<MemoryPiece>(data, blocks * sizeof(float)), + {1, blocks}, + in->getBackend())); using namespace functional; ReduceAll(_1 * _1, out, in); @@ -1203,7 +1204,7 @@ __global__ void gAtt(float* out, int m, // total rows (batch x time x beam) int k, // depth int b, // batch size - int t // time of ctx + int t // time of ctx ) { int rows = m; int cols = k; @@ -1255,14 +1256,8 @@ void Att(Tensor out, Tensor va, Tensor context, Tensor state) { int threads = std::min(MAX_THREADS, (int)k); int shared = sizeof(float) * threads * 2; - gAtt<<<blocks, threads, shared>>>(out->data(), - va->data(), - context->data(), - state->data(), - m, - k, - b, - t); + gAtt<<<blocks, threads, shared>>>( + out->data(), va->data(), context->data(), state->data(), m, k, b, t); } __global__ void gAttBack(float* gVa, @@ -1576,7 +1571,6 @@ __global__ void gShift(float* out, const float* in, int length, int offset) { } void Shift(Tensor out, Tensor in, marian::Shape shift, bool invert) { - ABORT_IF(in->shape().size() != shift.size(), "bad dimensions"); int offset = 0; @@ -2006,21 +2000,22 @@ __global__ void gMaxPoolingForward(float* out, int lastWidth) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= outRows * outCols) return; + if(tid >= outRows * outCols) + return; int rowId = tid / outRows; int colId = tid % outRows; float* b = in + (rowId * inCols) + (colId * width); - float* localMask = mask + (rowId / numKernels) * maskCols + colId * width; + float* localMask = mask + (rowId / numKernels) * maskCols + colId * width; - if (colId == outRows - 1) { + if(colId == outRows - 1) { width = lastWidth; } float currentMax = b[0] * localMask[0]; - for (int i = 1; i < width; ++i) { - if (b[i] * localMask[i] > currentMax) { + for(int i = 1; i < width; ++i) { + if(b[i] * localMask[i] > currentMax) { currentMax = b[i] * localMask[i]; } } @@ -2045,15 +2040,20 @@ void PoolingWithMaskingForward(Tensor out, int outRows = outShape[2]; int outCols = outShape[0] * outShape[1]; - int lastWidth = ((inCols - isEven) % width == 0) - ? width - : (inCols - isEven) % width; + int lastWidth + = ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width; - gMaxPoolingForward<<<blocks, threads>>>( - out->data(), outRows, outCols, - in->data(), inRows, inCols, - mask->data(), outShape[1], mask->shape()[2], - width, lastWidth); + gMaxPoolingForward<<<blocks, threads>>>(out->data(), + outRows, + outCols, + in->data(), + inRows, + inCols, + mask->data(), + outShape[1], + mask->shape()[2], + width, + lastWidth); } __global__ void gMaxPoolingBackward(float* adj, @@ -2067,30 +2067,31 @@ __global__ void gMaxPoolingBackward(float* adj, int numKernels, int maskCols, int width, - int lastWidth) -{ + int lastWidth) { int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid >= adjRows * adjCols) return; + if(tid >= adjRows * adjCols) + return; int rowId = tid / adjRows; int colId = tid % adjRows; float* b = in + (rowId * inCols) + (colId * width); - if (colId == adjRows - 1) { + if(colId == adjRows - 1) { width = lastWidth; } float* localMask = mask + (rowId / numKernels) * maskCols + colId * width; size_t currentMaxIdx = 0; - for (int i = 1; i < width; ++i) { - if (b[i] * localMask[i] > b[currentMaxIdx] * localMask[currentMaxIdx]) { + for(int i = 1; i < width; ++i) { + if(b[i] * localMask[i] > b[currentMaxIdx] * localMask[currentMaxIdx]) { currentMaxIdx = i; } } - adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx] += adj[rowId + (colId * adjCols)]; + adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx] + += adj[rowId + (colId * adjCols)]; } void PoolingWithMaskingBackward(Tensor adj, @@ -2111,16 +2112,21 @@ void PoolingWithMaskingBackward(Tensor adj, int adjRows = adjShape[2]; int adjCols = adjShape[0] * adjShape[1]; - int lastWidth = ((inCols - isEven) % width == 0) - ? width - : (inCols - isEven) % width; - - gMaxPoolingBackward<<<blocks, threads>>>( - adj->data(), adjRows, adjCols, - in->data(), adjIn->data(), inRows, inCols, - mask->data(), adjShape[1], mask->shape()[2], - width, lastWidth); + int lastWidth + = ((inCols - isEven) % width == 0) ? width : (inCols - isEven) % width; + + gMaxPoolingBackward<<<blocks, threads>>>(adj->data(), + adjRows, + adjCols, + in->data(), + adjIn->data(), + inRows, + inCols, + mask->data(), + adjShape[1], + mask->shape()[2], + width, + lastWidth); } - } } // namespace marian diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h index c60caa91..9789428e 100644 --- a/src/tensors/tensor.h +++ b/src/tensors/tensor.h @@ -78,7 +78,7 @@ public: #endif } - void get(std::vector<float> &v) { + void get(std::vector<float>& v) { v.resize(size()); if(backend_->getDevice().type == DeviceType::cpu) { std::copy(data(), data() + size(), v.data()); @@ -101,9 +101,7 @@ public: #endif } - void set(const std::vector<float> &v) { - set(v.data(), v.data() + v.size()); - } + void set(const std::vector<float>& v) { set(v.data(), v.data() + v.size()); } void set(float value) { if(backend_->getDevice().type == DeviceType::cpu) { @@ -116,8 +114,7 @@ public: #endif } - void setSparse(const std::vector<size_t> &k, - const std::vector<float> &v) { + void setSparse(const std::vector<size_t>& k, const std::vector<float>& v) { if(backend_->getDevice().type == DeviceType::cpu) { for(int i = 0; i < k.size(); ++i) data()[k[i]] = v[i]; @@ -130,8 +127,8 @@ public: } void copyFrom(Tensor in) { - if(in->getBackend()->getDevice().type == DeviceType::cpu && - backend_->getDevice().type == DeviceType::cpu) { + if(in->getBackend()->getDevice().type == DeviceType::cpu + && backend_->getDevice().type == DeviceType::cpu) { std::copy(in->data(), in->data() + in->size(), data()); } #ifdef CUDA_FOUND @@ -167,7 +164,6 @@ public: disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols); if(disp) { - if(dims.back() == 0) { bool par = true; std::vector<std::string> p; @@ -182,9 +178,7 @@ public: strm << " "; } - strm << std::setw(12) - << values[i] - << " "; + strm << std::setw(12) << values[i] << " "; if(dims.back() + 1 == shape().back()) { for(int j = dims.size() - 1; j >= 0; --j) { @@ -214,9 +208,7 @@ public: strm << std::endl; return strm.str(); } - }; typedef std::shared_ptr<TensorBase> Tensor; - } diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h index 7086b97b..164fd12b 100644 --- a/src/tensors/tensor_operators.h +++ b/src/tensors/tensor_operators.h @@ -21,53 +21,47 @@ namespace marian { - template <class Functor, class ...Tensors> - void Element(Functor functor, marian::Tensor out, Tensors ...tensors) { +template <class Functor, class... Tensors> +void Element(Functor functor, marian::Tensor out, Tensors... tensors) { #ifdef CUDA_FOUND - if(out->getBackend()->getDevice().type == DeviceType::gpu) - gpu::Element(functor, out, tensors...); - else + if(out->getBackend()->getDevice().type == DeviceType::gpu) + gpu::Element(functor, out, tensors...); + else #endif - cpu::Element(functor, out, tensors...); - } - - template <class Functor, class ...Tensors> - void Add(Functor functor, - float scale, - marian::Tensor out, - Tensors... tensors) { + cpu::Element(functor, out, tensors...); +} + +template <class Functor, class... Tensors> +void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { #ifdef CUDA_FOUND - if(out->getBackend()->getDevice().type == DeviceType::gpu) - gpu::Add(functor, scale, out, tensors...); - else + if(out->getBackend()->getDevice().type == DeviceType::gpu) + gpu::Add(functor, scale, out, tensors...); + else #endif - cpu::Add(functor, scale, out, tensors...); - } - - template <class Functor, class ...Tensors> - void Add(Functor functor, - marian::Tensor out, - Tensors... tensors) { - Add(functor, 1, out, tensors...); - } - - template <class Functor, class ...Tensors> - void Reduce(Functor functor, - float scale, - marian::Tensor out, - Tensors... tensors) { - out->set(0); - Add(functor, scale, out, tensors...); - } - - template <class Functor, class ...Tensors> - void Reduce(Functor functor, - marian::Tensor out, - Tensors... tensors) { - out->set(0); - Add(functor, out, tensors...); - } + cpu::Add(functor, scale, out, tensors...); +} + +template <class Functor, class... Tensors> +void Add(Functor functor, marian::Tensor out, Tensors... tensors) { + Add(functor, 1, out, tensors...); +} + +template <class Functor, class... Tensors> +void Reduce(Functor functor, + float scale, + marian::Tensor out, + Tensors... tensors) { + out->set(0); + Add(functor, scale, out, tensors...); +} + +template <class Functor, class... Tensors> +void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) { + out->set(0); + Add(functor, out, tensors...); +} +// clang-format off DISPATCH7(Prod, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float) DISPATCH7(ProdBatched, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float) @@ -86,26 +80,34 @@ namespace marian { DISPATCH4(Shift, marian::Tensor, marian::Tensor, marian::Shape, bool) DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int) +// clang-format on #ifdef CUDA_FOUND - namespace gpu { - void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax); - } +namespace gpu { +void Deconcatenate(std::vector<marian::Tensor>& outputs, + const marian::Tensor in, + int ax); +} #endif - namespace cpu { - void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax); - } +namespace cpu { +void Deconcatenate(std::vector<marian::Tensor>& outputs, + const marian::Tensor in, + int ax); +} - static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) { +static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, + const marian::Tensor in, + int ax) { #ifdef CUDA_FOUND - if(in->getBackend()->getDevice().type == DeviceType::gpu) - gpu::Deconcatenate(outputs, in, ax); - else + if(in->getBackend()->getDevice().type == DeviceType::gpu) + gpu::Deconcatenate(outputs, in, ax); + else #endif - cpu::Deconcatenate(outputs, in, ax); - } + cpu::Deconcatenate(outputs, in, ax); +} +// clang-format off DISPATCH5(LayerNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float) DISPATCH9(LayerNormalizationGrad, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float) @@ -120,113 +122,116 @@ namespace marian { DISPATCH5(Select, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>) DISPATCH5(Insert, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>) - DISPATCH2(LSTMCellForward, marian::Tensor, std::vector<marian::Tensor>) DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector<marian::Tensor>); +// clang-format on #ifdef CUDA_FOUND - namespace gpu { - void LSTMCellBackward(std::vector<marian::Tensor> outputs, - std::vector<marian::Tensor> inputs, - marian::Tensor adj); - } +namespace gpu { +void LSTMCellBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj); +} #endif - namespace cpu { - void LSTMCellBackward(std::vector<marian::Tensor> outputs, - std::vector<marian::Tensor> inputs, - marian::Tensor adj); - } +namespace cpu { +void LSTMCellBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj); +} - static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs, - std::vector<marian::Tensor> inputs, - marian::Tensor adj) { +static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj) { #ifdef CUDA_FOUND - if(adj->getBackend()->getDevice().type == DeviceType::gpu) - gpu::LSTMCellBackward(outputs, inputs, adj); - else + if(adj->getBackend()->getDevice().type == DeviceType::gpu) + gpu::LSTMCellBackward(outputs, inputs, adj); + else #endif - cpu::LSTMCellBackward(outputs, inputs, adj); - } + cpu::LSTMCellBackward(outputs, inputs, adj); +} #ifdef CUDA_FOUND - namespace gpu { - void LSTMOutputBackward(std::vector<marian::Tensor> outputs, - std::vector<marian::Tensor> inputs, - marian::Tensor adj); - } +namespace gpu { +void LSTMOutputBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj); +} #endif - namespace cpu { - void LSTMOutputBackward(std::vector<marian::Tensor> outputs, - std::vector<marian::Tensor> inputs, - marian::Tensor adj); - } +namespace cpu { +void LSTMOutputBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj); +} - static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs, - std::vector<marian::Tensor> inputs, - marian::Tensor adj) { +static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj) { #ifdef CUDA_FOUND - if(adj->getBackend()->getDevice().type == DeviceType::gpu) - gpu::LSTMOutputBackward(outputs, inputs, adj); - else + if(adj->getBackend()->getDevice().type == DeviceType::gpu) + gpu::LSTMOutputBackward(outputs, inputs, adj); + else #endif - cpu::LSTMOutputBackward(outputs, inputs, adj); - } + cpu::LSTMOutputBackward(outputs, inputs, adj); +} - DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool) +DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool) #ifdef CUDA_FOUND - namespace gpu { - void GRUFastBackward(std::vector<marian::Tensor> outputs, - std::vector<marian::Tensor> inputs, - marian::Tensor adj, - bool final); - } +namespace gpu { +void GRUFastBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj, + bool final); +} #endif - namespace cpu { - void GRUFastBackward(std::vector<marian::Tensor> outputs, - std::vector<marian::Tensor> inputs, - marian::Tensor adj, - bool final); - } - - static inline void GRUFastBackward(std::vector<marian::Tensor> outputs, - std::vector<marian::Tensor> inputs, - marian::Tensor adj, - bool final = false) { +namespace cpu { +void GRUFastBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj, + bool final); +} + +static inline void GRUFastBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj, + bool final = false) { #ifdef CUDA_FOUND - if(adj->getBackend()->getDevice().type == DeviceType::gpu) - gpu::GRUFastBackward(outputs, inputs, adj, final); - else + if(adj->getBackend()->getDevice().type == DeviceType::gpu) + gpu::GRUFastBackward(outputs, inputs, adj, final); + else #endif - cpu::GRUFastBackward(outputs, inputs, adj, final); - } + cpu::GRUFastBackward(outputs, inputs, adj, final); +} +// clang-format off DISPATCH4(Att, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor) DISPATCH7(AttBack, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor) +// clang-format on #ifdef CUDA_FOUND - namespace gpu { - float L2Norm(marian::Tensor in); - } +namespace gpu { +float L2Norm(marian::Tensor in); +} #endif - namespace cpu { - float L2Norm(marian::Tensor in); - } +namespace cpu { +float L2Norm(marian::Tensor in); +} - static inline float L2Norm(marian::Tensor in) { +static inline float L2Norm(marian::Tensor in) { #ifdef CUDA_FOUND - if(in->getBackend()->getDevice().type == DeviceType::gpu) - return gpu::L2Norm(in); - else + if(in->getBackend()->getDevice().type == DeviceType::gpu) + return gpu::L2Norm(in); + else #endif - return cpu::L2Norm(in); - } + return cpu::L2Norm(in); +} +// clang-format off DISPATCH5(PoolingWithMaskingForward, marian::Tensor, marian::Tensor, marian::Tensor, int, bool) DISPATCH6(PoolingWithMaskingBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, int, bool) - +// clang-format on } diff --git a/src/training/graph_group_async.cpp b/src/training/graph_group_async.cpp index bee719ad..ee3b7d34 100644 --- a/src/training/graph_group_async.cpp +++ b/src/training/graph_group_async.cpp @@ -104,7 +104,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) { totalSize -= __size__; Tensor param; - Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend()); + Ptr<TensorAllocator> allocator + = New<TensorAllocator>(graph->getBackend()); allocator->reserveExact(__size__ * sizeof(float)); allocator->allocate(param, {1, __size__}); paramsAlloc_.push_back(allocator); @@ -122,7 +123,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) { int __size__ = std::min(shardSize_, totalSize); totalSize -= __size__; Tensor grad_; - Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(graph->getBackend()); + Ptr<TensorAllocator> allocator_ + = New<TensorAllocator>(graph->getBackend()); allocator_->reserveExact(__size__ * sizeof(float)); allocator_->allocate(grad_, {1, __size__}); @@ -139,7 +141,8 @@ void AsyncGraphGroup::init(Ptr<data::Batch> batch) { int __size__ = std::min(shardSize_, totalSize); totalSize -= __size__; Tensor paramAvg; - Ptr<TensorAllocator> allocator = New<TensorAllocator>(graph->getBackend()); + Ptr<TensorAllocator> allocator + = New<TensorAllocator>(graph->getBackend()); allocator->reserveExact(__size__ * sizeof(float)); allocator->allocate(paramAvg, {1, __size__}); @@ -230,7 +233,8 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) { scheduler_->update(cost, batch); if(scheduler_->saving() || scheduler_->validating()) { - // Wait with validation or saving until all other threads are done with update. + // Wait with validation or saving until all other threads are done with + // update. // We want to reuse the graphs for validation, so they need to be in // a safe state. pool_->wait_for_others(lock); diff --git a/src/training/graph_group_async.h b/src/training/graph_group_async.h index 94291dee..af311798 100644 --- a/src/training/graph_group_async.h +++ b/src/training/graph_group_async.h @@ -70,7 +70,6 @@ public: movingAvg_{options_->get<float>("exponential-smoothing") > 0}, mvDecay_{options_->get<float>("exponential-smoothing")}, tau_{options_->get<size_t>("optimizer-delay")} { - pool_.reset(new ThreadPool(devices_.size(), devices_.size())); for(auto device : devices_) { diff --git a/src/training/graph_group_multinode.cpp b/src/training/graph_group_multinode.cpp index b80952a8..a09db862 100644 --- a/src/training/graph_group_multinode.cpp +++ b/src/training/graph_group_multinode.cpp @@ -150,7 +150,8 @@ void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() { size_t modelSize = clientGraphs_[0]->params()->vals()->size(); for(int client = 0; client < devices_.size(); client++) { // Communication overlap buffer (for grads + params) - Tensor commOverlapBuffer = newTensor(modelSize, clientGraphs_[client]->getBackend()); + Tensor commOverlapBuffer + = newTensor(modelSize, clientGraphs_[client]->getBackend()); commOverlapBuffer->copyFrom(clientGraphs_[0]->params()->vals()); clientCommOverlapBuffersGPU_.push_back(commOverlapBuffer); // Gradients local sum buffer @@ -206,11 +207,13 @@ void MultiNodeGraphGroup::calculateShardSizes() { void MultiNodeGraphGroup::initShardGpuTensors() { size_t offset = 0; for(int shard = 0; shard < devices_.size(); shard++) { - Tensor gpuParams = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()); + Tensor gpuParams + = newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend()); gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor( offset, shardSizes_[shard])); shardParams_.push_back(gpuParams); - shardGrads_.push_back(newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend())); + shardGrads_.push_back( + newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend())); } } diff --git a/src/training/graph_group_singleton.cpp b/src/training/graph_group_singleton.cpp index 509e50c6..528fff8f 100644 --- a/src/training/graph_group_singleton.cpp +++ b/src/training/graph_group_singleton.cpp @@ -1,5 +1,5 @@ -#include "tensors/tensor_operators.h" #include "training/graph_group_singleton.h" +#include "tensors/tensor_operators.h" #include "functional/functional.h" namespace marian { @@ -15,7 +15,8 @@ void SingletonGraph::updateMovingAverage(Tensor mvAvgParams, Tensor params, size_t batches) { using namespace functional; - float decay = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10)); + float decay + = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10)); Element(_1 = ((1.f - decay) * _1) + (decay * _2), mvAvgParams, params); } @@ -59,8 +60,7 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) { if(mvAvg_) { mvAvgGraph_->reuseWorkspace(graph_); scheduler_->validate({mvAvgGraph_}); - } - else { + } else { scheduler_->validate({graph_}); } } diff --git a/src/training/graph_group_singleton.h b/src/training/graph_group_singleton.h index 5f658bfb..11b9cbc8 100644 --- a/src/training/graph_group_singleton.h +++ b/src/training/graph_group_singleton.h @@ -29,7 +29,6 @@ public: : GraphGroup(options), mvAvg_{options_->get<float>("exponential-smoothing") > 0}, mvDecay_{options_->get<float>("exponential-smoothing")} { - auto deviceId = options_->getDevices()[0]; graph_ = New<ExpressionGraph>(); graph_->setDevice(deviceId); diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp index e169b6ed..685ba172 100644 --- a/src/training/graph_group_sync.cpp +++ b/src/training/graph_group_sync.cpp @@ -1,6 +1,6 @@ #include "training/graph_group_sync.h" -#include "tensors/tensor_operators.h" #include "functional/functional.h" +#include "tensors/tensor_operators.h" namespace marian { @@ -17,7 +17,8 @@ void SyncGraphGroup::updateMovingAverage(Tensor paramsAvg, Tensor params, size_t batches) { using namespace functional; - float decay = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10)); + float decay + = std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10)); Element(_1 = ((1.f - decay) * _1) + (decay * _2), paramsAvg, params); } @@ -135,10 +136,10 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) { int size = params_[idx]->size(); int i = 0; - float div = devices_.size(); // no. of GPUs + float div = devices_.size(); // no. of GPUs // do not average gradients if cost type is sum. - if (options_->get<std::string>("cost-type") == "ce-sum") { + if(options_->get<std::string>("cost-type") == "ce-sum") { div = 1; } @@ -176,7 +177,7 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) { float cost = 0; for(auto c : costs) cost += c; - if (options_->get<std::string>("cost-type") != "ce-sum") { + if(options_->get<std::string>("cost-type") != "ce-sum") { cost = cost / costs.size(); } diff --git a/src/training/graph_group_sync.h b/src/training/graph_group_sync.h index 11c7d9f4..f4bc1ad4 100644 --- a/src/training/graph_group_sync.h +++ b/src/training/graph_group_sync.h @@ -43,7 +43,6 @@ public: devices_{options_->getDevices()}, movingAvg_{options_->get<float>("exponential-smoothing") > 0}, mvDecay_{options_->get<float>("exponential-smoothing")} { - for(auto device : devices_) { auto graph = New<ExpressionGraph>(); graph->setDevice(device); diff --git a/src/training/scheduler.h b/src/training/scheduler.h index ea2c3f01..5e817cc5 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -100,7 +100,8 @@ public: return (state_->batches % options_->get<size_t>("save-freq") == 0); } - void validate(const std::vector<Ptr<ExpressionGraph>>& graphs, bool final = false) { + void validate(const std::vector<Ptr<ExpressionGraph>>& graphs, + bool final = false) { if(state_->validated || (state_->batches % options_->get<size_t>("valid-freq") != 0 && !final)) diff --git a/src/training/sparse_tensor.cu b/src/training/sparse_tensor.cu index cea5b655..78da9668 100644 --- a/src/training/sparse_tensor.cu +++ b/src/training/sparse_tensor.cu @@ -7,8 +7,8 @@ #include "tensors/tensor.h" #include "tensors/tensor_operators.h" #include "training/sparse_tensor.h" -#include "tensors/gpu/cuda_helpers.h" +#include "tensors/gpu/cuda_helpers.h" namespace marian { @@ -48,10 +48,10 @@ __global__ void gFindSubtensor(int* indices, } SparseTensorBase::SparseTensorBase(int capacity, Ptr<Backend> backend) -: backend_(backend), capacity_(capacity) { + : backend_(backend), capacity_(capacity) { ABORT_IF(backend_->getDevice().type == DeviceType::cpu, - "Gradient dropping is currently not implemented for CPU usage"); - + "Gradient dropping is currently not implemented for CPU usage"); + cudaSetDevice(backend_->getDevice().no); CUDA_CHECK(cudaMalloc(&data_, sizeof(float) * capacity)); CUDA_CHECK(cudaMalloc(&indices_, sizeof(int) * capacity)); @@ -64,7 +64,7 @@ SparseTensorBase::SparseTensorBase(float* data, int* indices, int size, Ptr<Backend> backend) -: backend_(backend) { + : backend_(backend) { data_ = data; indices_ = indices; size_ = size; @@ -98,10 +98,10 @@ void SparseTensorBase::copyFrom(float* data, size_ = size; if(size == 0) return; - + ABORT_IF(backend_->getDevice().type == DeviceType::cpu, - "Gradient dropping is currently not implemented for CPU usage"); - + "Gradient dropping is currently not implemented for CPU usage"); + cudaSetDevice(backend_->getDevice().no); cudaMemcpy(data_, data, size * sizeof(float), cudaMemcpyDefault); @@ -128,7 +128,7 @@ void SparseTensorBase::setSize(int size) { void SparseTensorBase::toDense(Tensor t, int offset) { ABORT_IF(backend_->getDevice().type == DeviceType::cpu, "Gradient dropping is currently not implemented for CPU usage"); - + cudaSetDevice(backend_->getDevice().no); int threads = 512; int blocks = 1 + size_ / threads; @@ -152,8 +152,8 @@ std::shared_ptr<SparseTensorBase> SparseTensorBase::subtensor(int pos, int size, int idx) { ABORT_IF(backend_->getDevice().type == DeviceType::cpu, - "Gradient dropping is currently not implemented for CPU usage"); - + "Gradient dropping is currently not implemented for CPU usage"); + cudaSetDevice(backend_->getDevice().no); cudaStreamSynchronize(0); int* start = gstart_ + idx; diff --git a/src/training/training.h b/src/training/training.h index 7216c36f..31473f00 100644 --- a/src/training/training.h +++ b/src/training/training.h @@ -45,7 +45,6 @@ public: if((options_->has("valid-sets") || options_->has("valid-script-path")) && options_->get<size_t>("valid-freq") > 0) { - for(auto validator : Validators(dataset->getVocabs(), options_)) scheduler->addValidator(validator); } diff --git a/src/training/validator.h b/src/training/validator.h index 295bfa49..7e448e3d 100644 --- a/src/training/validator.h +++ b/src/training/validator.h @@ -4,9 +4,9 @@ #include <cstdlib> #include <limits> +#include "3rd_party/threadpool.h" #include "common/config.h" #include "common/utils.h" -#include "3rd_party/threadpool.h" #include "data/batch_generator.h" #include "data/corpus.h" #include "graph/expression_graph.h" @@ -25,8 +25,7 @@ namespace marian { class ValidatorBase : public TrainingObserver { public: ValidatorBase(bool lowerIsBetter) - : lowerIsBetter_(lowerIsBetter), - lastBest_{initScore()} {} + : lowerIsBetter_(lowerIsBetter), lastBest_{initScore()} {} virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) = 0; virtual std::string type() = 0; @@ -98,7 +97,8 @@ protected: Ptr<data::BatchGenerator<DataSet>>) = 0; - void updateStalled(const std::vector<Ptr<ExpressionGraph>>& graphs, float val) { + void updateStalled(const std::vector<Ptr<ExpressionGraph>>& graphs, + float val) { if((lowerIsBetter_ && lastBest_ > val) || (!lowerIsBetter_ && lastBest_ < val)) { stalled_ = 0; @@ -226,7 +226,6 @@ public: TranslationValidator(std::vector<Ptr<Vocab>> vocabs, Ptr<Config> options) : Validator(vocabs, options, false), quiet_(options_->get<bool>("quiet-translation")) { - Ptr<Options> opts = New<Options>(); opts->merge(options); opts->set("inference", true); @@ -314,7 +313,8 @@ public: scorer = scorers[id % graphs.size()]; } - auto search = New<BeamSearch>(options_, std::vector<Ptr<Scorer>>{scorer}); + auto search + = New<BeamSearch>(options_, std::vector<Ptr<Scorer>>{scorer}); auto histories = search->search(graph, batch); for(auto history : histories) { diff --git a/src/translator/beam_search.h b/src/translator/beam_search.h index cfe94f59..12df46dc 100644 --- a/src/translator/beam_search.h +++ b/src/translator/beam_search.h @@ -34,21 +34,20 @@ public: std::vector<Ptr<ScorerState>>& states, size_t beamSize, bool first) { - Beams newBeams(beams.size()); for(int i = 0; i < keys.size(); ++i) { - int embIdx = keys[i] % vocabSize; - int beamIdx = i / beamSize; + int embIdx = keys[i] % vocabSize; + int beamIdx = i / beamSize; if(newBeams[beamIdx].size() < beams[beamIdx].size()) { auto& beam = beams[beamIdx]; auto& newBeam = newBeams[beamIdx]; int hypIdx = keys[i] / vocabSize; - float cost = costs[i]; + float cost = costs[i]; - int hypIdxTrans = (hypIdx / beamSize) + - (hypIdx % beamSize) * beams.size(); + int hypIdxTrans + = (hypIdx / beamSize) + (hypIdx % beamSize) * beams.size(); if(first) hypIdxTrans = hypIdx; @@ -78,7 +77,7 @@ public: Beams pruneBeam(const Beams& beams) { Beams newBeams; - for(auto beam: beams) { + for(auto beam : beams) { Beam newBeam; for(auto hyp : beam) { if(hyp->GetWord() > 0) { @@ -90,9 +89,7 @@ public: return newBeams; } - Histories search(Ptr<ExpressionGraph> graph, - Ptr<data::CorpusBatch> batch) { - + Histories search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) { int dimBatch = batch->size(); Histories histories; for(int i = 0; i < dimBatch; ++i) { @@ -140,8 +137,7 @@ public: Expr prevCosts; if(first) { // no cost - prevCosts = graph->constant({1, 1, 1, 1}, - inits::from_value(0)); + prevCosts = graph->constant({1, 1, 1, 1}, inits::from_value(0)); } else { std::vector<float> beamCosts; @@ -155,8 +151,7 @@ public: hypIndices.push_back(hyp->GetPrevStateIndex()); embIndices.push_back(hyp->GetWord()); beamCosts.push_back(hyp->GetCost()); - } - else { + } else { hypIndices.push_back(0); embIndices.push_back(0); beamCosts.push_back(-9999); @@ -164,9 +159,8 @@ public: } } - prevCosts - = graph->constant({(int)localBeamSize, 1, dimBatch, 1}, - inits::from_vector(beamCosts)); + prevCosts = graph->constant({(int)localBeamSize, 1, dimBatch, 1}, + inits::from_vector(beamCosts)); } //********************************************************************** @@ -174,10 +168,12 @@ public: auto totalCosts = prevCosts; for(int i = 0; i < scorers_.size(); ++i) { - states[i] = scorers_[i]->step(graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize); + states[i] = scorers_[i]->step( + graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize); if(scorers_[i]->getWeight() != 1.f) - totalCosts = totalCosts + scorers_[i]->getWeight() * states[i]->getProbs(); + totalCosts + = totalCosts + scorers_[i]->getWeight() * states[i]->getProbs(); else totalCosts = totalCosts + states[i]->getProbs(); } @@ -207,12 +203,14 @@ public: nth->getNBestList(beamSizes, totalCosts->val(), outCosts, outKeys, first); int dimTrgVoc = totalCosts->shape()[-1]; - beams = toHyps(outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first); + beams = toHyps( + outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first); auto prunedBeams = pruneBeam(beams); for(int i = 0; i < dimBatch; ++i) { if(!beams[i].empty()) { - final = final || histories[i]->size() >= 3 * batch->front()->batchWidth(); + final = final + || histories[i]->size() >= 3 * batch->front()->batchWidth(); histories[i]->Add(beams[i], prunedBeams[i].empty() || final); } } diff --git a/src/translator/helpers.cpp b/src/translator/helpers.cpp index f112e405..6f48d218 100644 --- a/src/translator/helpers.cpp +++ b/src/translator/helpers.cpp @@ -18,7 +18,7 @@ void SetColumn(Tensor in_, size_t col, float value) { int nColumns = in_->shape()[-1]; float* in = in_->data(); - for (int rowNumber = 0; rowNumber < nRows; ++rowNumber) { + for(int rowNumber = 0; rowNumber < nRows; ++rowNumber) { int index = col + rowNumber * nColumns; in[index] = value; } @@ -31,7 +31,6 @@ void suppressUnk(Expr probs) { void suppressWord(Expr probs, Word id) { SetColumn(probs->val(), id, std::numeric_limits<float>::lowest()); } - } void suppressUnk(Expr probs) { @@ -55,5 +54,4 @@ void suppressWord(Expr probs, Word id) { } #endif } - } diff --git a/src/translator/helpers.cu b/src/translator/helpers.cu index 7672e515..8971db44 100644 --- a/src/translator/helpers.cu +++ b/src/translator/helpers.cu @@ -44,6 +44,5 @@ void suppressUnk(Expr probs) { void suppressWord(Expr probs, Word id) { SetColumn(probs->val(), id, std::numeric_limits<float>::lowest()); } - } } diff --git a/src/translator/helpers.h b/src/translator/helpers.h index bee596f3..ef9118b7 100644 --- a/src/translator/helpers.h +++ b/src/translator/helpers.h @@ -14,7 +14,6 @@ namespace cpu { void suppressUnk(Expr probs); void suppressWord(Expr probs, Word id); - } namespace gpu { @@ -22,11 +21,9 @@ namespace gpu { void suppressUnk(Expr probs); void suppressWord(Expr probs, Word id); - } void suppressUnk(Expr probs); void suppressWord(Expr probs, Word id); - } diff --git a/src/translator/history.h b/src/translator/history.h index 0070b723..6ed0f019 100644 --- a/src/translator/history.h +++ b/src/translator/history.h @@ -27,7 +27,8 @@ public: if(beam[j]->GetWord() == 0 || last) { float cost = beam[j]->GetCost() / LengthPenalty(history_.size()); topHyps_.push({history_.size(), j, cost}); - //std::cerr << "Add " << history_.size() << " " << j << " " << cost << std::endl; + // std::cerr << "Add " << history_.size() << " " << j << " " << cost + // << std::endl; } } history_.push_back(beam); @@ -44,14 +45,14 @@ public: size_t start = bestHypCoord.i; size_t j = bestHypCoord.j; - //float c = bestHypCoord.cost; - //std::cerr << "h: " << start << " " << j << " " << c << std::endl; + // float c = bestHypCoord.cost; + // std::cerr << "h: " << start << " " << j << " " << c << std::endl; Words targetWords; Ptr<Hypothesis> bestHyp = history_[start][j]; while(bestHyp->GetPrevHyp() != nullptr) { targetWords.push_back(bestHyp->GetWord()); - //std::cerr << bestHyp->GetWord() << " " << bestHyp << std::endl; + // std::cerr << bestHyp->GetWord() << " " << bestHyp << std::endl; bestHyp = bestHyp->GetPrevHyp(); } diff --git a/src/translator/nth_element.cpp b/src/translator/nth_element.cpp index 2730adfb..9416ea0e 100644 --- a/src/translator/nth_element.cpp +++ b/src/translator/nth_element.cpp @@ -3,11 +3,11 @@ * SPDX-License-Identifier: MIT */ +#include "translator/nth_element.h" #include <algorithm> #include <iterator> #include <limits> #include <numeric> -#include "translator/nth_element.h" namespace marian { @@ -18,8 +18,8 @@ NthElementCPU::NthElementCPU(size_t maxBeamSize, size_t maxBatchSize) { } void NthElementCPU::getNBestList(float* probs, - const std::vector<int>& batchFirstElementIdxs, - const std::vector<int>& cumulativeBeamSizes) { + const std::vector<int>& batchFirstElementIdxs, + const std::vector<int>& cumulativeBeamSizes) { /* For each batch, select the top N elements, where N is the beam size for * this batch. Locally record these elements (their current value and index * in 'probs') before updating each element to a large negative value, such @@ -31,16 +31,19 @@ void NthElementCPU::getNBestList(float* probs, std::iota(idxs.begin(), idxs.end(), 0); int numBatches = batchFirstElementIdxs.size() - 1; - for (int batchIdx = 0; batchIdx < numBatches; ++batchIdx) { + for(int batchIdx = 0; batchIdx < numBatches; ++batchIdx) { int pos = cumulativeBeamSizes[batchIdx]; - int beamSize = cumulativeBeamSizes[batchIdx+1] - pos; + int beamSize = cumulativeBeamSizes[batchIdx + 1] - pos; - std::vector<int>::iterator begin = idxs.begin() + batchFirstElementIdxs[batchIdx]; + std::vector<int>::iterator begin + = idxs.begin() + batchFirstElementIdxs[batchIdx]; std::vector<int>::iterator middle = begin + beamSize; - std::vector<int>::iterator end = idxs.begin() + batchFirstElementIdxs[batchIdx+1]; - std::partial_sort(begin, middle, end, [=](int a, int b) { return probs[a] > probs[b]; }); + std::vector<int>::iterator end + = idxs.begin() + batchFirstElementIdxs[batchIdx + 1]; + std::partial_sort( + begin, middle, end, [=](int a, int b) { return probs[a] > probs[b]; }); - while (begin != middle) { + while(begin != middle) { int idx = *begin++; h_res_idx[pos] = idx; h_res[pos] = probs[idx]; @@ -51,32 +54,38 @@ void NthElementCPU::getNBestList(float* probs, } void NthElementCPU::getNBestList(const std::vector<size_t>& beamSizes, - Tensor probs, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, - const bool isFirst) { + Tensor probs, + std::vector<float>& outCosts, + std::vector<unsigned>& outKeys, + const bool isFirst) { std::vector<int> cumulativeBeamSizes(beamSizes.size() + 1, 0); std::vector<int> batchFirstElementIdxs(beamSizes.size() + 1, 0); size_t vocabSize = probs->shape()[-1]; - for (size_t i = 0; i < beamSizes.size(); ++i) { - cumulativeBeamSizes[i+1] = cumulativeBeamSizes[i] + beamSizes[i]; - batchFirstElementIdxs[i+1] += (isFirst ? i + 1 : cumulativeBeamSizes[i+1]) * vocabSize; + for(size_t i = 0; i < beamSizes.size(); ++i) { + cumulativeBeamSizes[i + 1] = cumulativeBeamSizes[i] + beamSizes[i]; + batchFirstElementIdxs[i + 1] + += (isFirst ? i + 1 : cumulativeBeamSizes[i + 1]) * vocabSize; } getNBestList(probs->data(), batchFirstElementIdxs, cumulativeBeamSizes); GetPairs(cumulativeBeamSizes.back(), outKeys, outCosts); } -void NthElementCPU::GetPairs(size_t number, std::vector<unsigned>& outKeys, - std::vector<float>& outValues) { - std::copy(h_res_idx.begin(), h_res_idx.begin() + number, std::back_inserter(outKeys)); - std::copy(h_res.begin(), h_res.begin() + number, std::back_inserter(outValues)); +void NthElementCPU::GetPairs(size_t number, + std::vector<unsigned>& outKeys, + std::vector<float>& outValues) { + std::copy(h_res_idx.begin(), + h_res_idx.begin() + number, + std::back_inserter(outKeys)); + std::copy( + h_res.begin(), h_res.begin() + number, std::back_inserter(outValues)); lastN = number; } void NthElementCPU::getValueByKey(std::vector<float>& out, float* d_in) { - for (size_t i = 0; i < lastN; ++i) { + for(size_t i = 0; i < lastN; ++i) { out[i] = d_in[h_res_idx[i]]; } } - } diff --git a/src/translator/nth_element.cu b/src/translator/nth_element.cu index f44a6290..fda5855f 100644 --- a/src/translator/nth_element.cu +++ b/src/translator/nth_element.cu @@ -271,8 +271,8 @@ __global__ void gGetValueByKey(float* d_in, float* d_out, int* indeces, int n) { } NthElementGPU::NthElementGPU(size_t maxBeamSize, - size_t maxBatchSize, - DeviceId deviceId) + size_t maxBatchSize, + DeviceId deviceId) : deviceId_(deviceId), NUM_BLOCKS(std::min( 500, @@ -321,10 +321,9 @@ NthElementGPU::~NthElementGPU() { CUDA_CHECK(cudaFree(d_cumBeamSizes)); } - void NthElementGPU::getNBestList(float* probs, - const std::vector<int>& batchFirstElementIdxs, - const std::vector<int>& cummulatedBeamSizes) { + const std::vector<int>& batchFirstElementIdxs, + const std::vector<int>& cummulatedBeamSizes) { cudaSetDevice(deviceId_.no); CUDA_CHECK(cudaMemcpyAsync(d_batchPosition, batchFirstElementIdxs.data(), @@ -359,10 +358,10 @@ void NthElementGPU::getNBestList(float* probs, } void NthElementGPU::getNBestList(const std::vector<size_t>& beamSizes, - Tensor Probs, - std::vector<float>& outCosts, - std::vector<unsigned>& outKeys, - const bool isFirst) { + Tensor Probs, + std::vector<float>& outCosts, + std::vector<unsigned>& outKeys, + const bool isFirst) { cudaSetDevice(deviceId_.no); std::vector<int> cummulatedBeamSizes(beamSizes.size() + 1, 0); @@ -381,8 +380,8 @@ void NthElementGPU::getNBestList(const std::vector<size_t>& beamSizes, } void NthElementGPU::GetPairs(size_t number, - std::vector<unsigned>& outKeys, - std::vector<float>& outValues) { + std::vector<unsigned>& outKeys, + std::vector<float>& outValues) { cudaSetDevice(deviceId_.no); CUDA_CHECK(cudaMemcpyAsync(h_res, d_res, @@ -417,5 +416,4 @@ void NthElementGPU::getValueByKey(std::vector<float>& out, float* d_in) { /* stream_ */ 0)); CUDA_CHECK(cudaStreamSynchronize(/* stream_ */ 0)); } - } diff --git a/src/translator/nth_element.h b/src/translator/nth_element.h index c65d1114..a2485021 100644 --- a/src/translator/nth_element.h +++ b/src/translator/nth_element.h @@ -16,18 +16,21 @@ struct NthElement { virtual ~NthElement() {} virtual void getNBestList(float* probs, - const std::vector<int>& batchFirstElementIdxs, - const std::vector<int>& cummulatedBeamSizes) = 0; + const std::vector<int>& batchFirstElementIdxs, + const std::vector<int>& cummulatedBeamSizes) + = 0; virtual void getNBestList(const std::vector<size_t>& beamSizes, - Tensor Probs, - std::vector<float>& outCosts, - std::vector<unsigned>& outKeys, - const bool isFirst = false) = 0; + Tensor Probs, + std::vector<float>& outCosts, + std::vector<unsigned>& outKeys, + const bool isFirst = false) + = 0; virtual void GetPairs(size_t number, - std::vector<unsigned>& outKeys, - std::vector<float>& outValues) = 0; + std::vector<unsigned>& outKeys, + std::vector<float>& outValues) + = 0; virtual void getValueByKey(std::vector<float>& out, float* d_in) = 0; }; @@ -105,5 +108,4 @@ private: int* d_cumBeamSizes; size_t lastN; }; - } diff --git a/src/translator/printer.h b/src/translator/printer.h index 969fe705..1f9ed787 100644 --- a/src/translator/printer.h +++ b/src/translator/printer.h @@ -14,12 +14,11 @@ void Printer(Ptr<Config> options, Ptr<History> history, OStream& best1, OStream& bestn) { - bool reverse = options->get<bool>("right-left"); if(options->has("n-best") && options->get<bool>("n-best")) { const auto& nbl = history->NBest(options->get<size_t>("beam-size")); - + for(size_t i = 0; i < nbl.size(); ++i) { const auto& result = nbl[i]; const auto& words = std::get<0>(result); @@ -49,9 +48,9 @@ void Printer(Ptr<Config> options, } auto bestTranslation = history->Top(); - - std::string translation = Join((*vocab)(std::get<0>(bestTranslation)), - " ", reverse); + + std::string translation + = Join((*vocab)(std::get<0>(bestTranslation)), " ", reverse); best1 << translation << std::flush; } } diff --git a/src/translator/scorers.h b/src/translator/scorers.h index 94bda6e7..da7b20ad 100644 --- a/src/translator/scorers.h +++ b/src/translator/scorers.h @@ -34,7 +34,8 @@ public: Ptr<ScorerState>, const std::vector<size_t>&, const std::vector<size_t>&, - int dimBatch, int beamSize) + int dimBatch, + int beamSize) = 0; virtual void init(Ptr<ExpressionGraph> graph) {} @@ -90,12 +91,13 @@ public: Ptr<ScorerState> state, const std::vector<size_t>& hypIndices, const std::vector<size_t>& embIndices, - int dimBatch, int beamSize) { + int dimBatch, + int beamSize) { graph->switchParams(getName()); auto wrappedState = std::dynamic_pointer_cast<ScorerWrapperState>(state)->getState(); - return New<ScorerWrapperState>( - encdec_->step(graph, wrappedState, hypIndices, embIndices, dimBatch, beamSize)); + return New<ScorerWrapperState>(encdec_->step( + graph, wrappedState, hypIndices, embIndices, dimBatch, beamSize)); } }; @@ -132,8 +134,7 @@ public: p[0] = 0; p[2] = 0; - penalties_ = graph->constant({1, dimVocab_}, - inits::from_vector(p)); + penalties_ = graph->constant({1, dimVocab_}, inits::from_vector(p)); return New<WordPenaltyState>(dimVocab_, penalties_); } @@ -141,7 +142,8 @@ public: Ptr<ScorerState> state, const std::vector<size_t>& hypIndices, const std::vector<size_t>& embIndices, - int dimBatch, int beamSize) { + int dimBatch, + int beamSize) { return state; } }; @@ -168,8 +170,7 @@ public: p[i] = 0; p[2] = 0; - penalties_ = graph->constant({1, dimVocab_}, - inits::from_vector(p)); + penalties_ = graph->constant({1, dimVocab_}, inits::from_vector(p)); return New<WordPenaltyState>(dimVocab_, penalties_); } @@ -177,7 +178,8 @@ public: Ptr<ScorerState> state, const std::vector<size_t>& hypIndices, const std::vector<size_t>& embIndices, - int dimBatch, int beamSize) { + int dimBatch, + int beamSize) { return state; } }; diff --git a/src/translator/translator.h b/src/translator/translator.h index 43dbf16a..0cb0da01 100644 --- a/src/translator/translator.h +++ b/src/translator/translator.h @@ -37,7 +37,7 @@ public: ThreadPool threadPool(devices.size(), devices.size()); scorers_.resize(devices.size()); graphs_.resize(devices.size()); - + size_t id = 0; for(auto device : devices) { auto task = [&](DeviceId device, size_t id) { @@ -61,13 +61,13 @@ public: data::BatchGenerator<data::Corpus> bg(corpus_, options_); auto devices = options_->getDevices(); - + ThreadPool threadPool(devices.size(), devices.size()); size_t batchId = 0; auto collector = New<OutputCollector>(); if(options_->get<bool>("quiet-translation")) - collector->setPrintingStrategy(New<QuietPrinting>()); + collector->setPrintingStrategy(New<QuietPrinting>()); bg.prepare(false); |