diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2018-02-27 03:18:29 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2018-02-27 03:18:29 +0300 |
commit | df29f16826c692458e0f30e697c4636e351b45aa (patch) | |
tree | 73488b490618a31fd30213e1831713965de3793c | |
parent | 205c8ca9b949b3bd5c590c2a6cd12f144fc0702b (diff) | |
parent | 3027e801b7047897302266d7d1937c4475fdb49b (diff) |
Merge ../marian.hieu
95 files changed, 1142 insertions, 1127 deletions
diff --git a/contrib/other-builds/amunmt/.project b/contrib/other-builds/amunmt/.project index 106dac4a..413c6001 100644 --- a/contrib/other-builds/amunmt/.project +++ b/contrib/other-builds/amunmt/.project @@ -141,9 +141,9 @@ <locationURI>PARENT-3-PROJECT_LOC/src/amun/common/base_matrix.cpp</locationURI> </link> <link> - <name>src/amun/common/base_matrix.h</name> + <name>src/amun/common/base_tensor.h</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/common/base_matrix.h</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/common/base_tensor.h</locationURI> </link> <link> <name>src/amun/common/beam.cpp</name> @@ -1291,11 +1291,6 @@ <locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/decoder/encoder_decoder_state.h</locationURI> </link> <link> - <name>src/amun/cpu/dl4mt/decoder.cpp</name> - <type>1</type> - <locationURI>null:/decoder.cpp</locationURI> - </link> - <link> <name>src/amun/cpu/dl4mt/decoder.h</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/dl4mt/decoder.h</locationURI> @@ -1346,16 +1341,6 @@ <locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/dl4mt/model.h</locationURI> </link> <link> - <name>src/amun/cpu/mblas/matrix.cpp</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/mblas/matrix.cpp</locationURI> - </link> - <link> - <name>src/amun/cpu/mblas/matrix.h</name> - <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/mblas/matrix.h</locationURI> - </link> - <link> <name>src/amun/cpu/mblas/phoenix_functions.cpp</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/mblas/phoenix_functions.cpp</locationURI> @@ -1371,6 +1356,16 @@ <locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/mblas/simd_math_prims.h</locationURI> </link> <link> + <name>src/amun/cpu/mblas/tensor.cpp</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/mblas/tensor.cpp</locationURI> + </link> + <link> + <name>src/amun/cpu/mblas/tensor.h</name> + <type>1</type> + 
<locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/mblas/tensor.h</locationURI> + </link> + <link> <name>src/amun/cpu/nematus/decoder.h</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/src/amun/cpu/nematus/decoder.h</locationURI> @@ -1516,6 +1511,21 @@ <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/decoder/language_model.h</locationURI> </link> <link> + <name>src/amun/gpu/dl4mt/cell.h</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/dl4mt/cell.h</locationURI> + </link> + <link> + <name>src/amun/gpu/dl4mt/cellstate.cpp</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/dl4mt/cellstate.cpp</locationURI> + </link> + <link> + <name>src/amun/gpu/dl4mt/cellstate.h</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/dl4mt/cellstate.h</locationURI> + </link> + <link> <name>src/amun/gpu/dl4mt/decoder.h</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/dl4mt/decoder.h</locationURI> @@ -1561,6 +1571,11 @@ <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/dl4mt/model.h</locationURI> </link> <link> + <name>src/amun/gpu/dl4mt/multiplicative.h</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/dl4mt/multiplicative.h</locationURI> + </link> + <link> <name>src/amun/gpu/mblas/handles.cu</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/handles.cu</locationURI> @@ -1571,49 +1586,49 @@ <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/handles.h</locationURI> </link> <link> - <name>src/amun/gpu/mblas/matrix.cu</name> + <name>src/amun/gpu/mblas/nth_element.cu</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/matrix.cu</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/nth_element.cu</locationURI> </link> <link> - <name>src/amun/gpu/mblas/matrix.h</name> + <name>src/amun/gpu/mblas/nth_element.h</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/matrix.h</locationURI> + 
<locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/nth_element.h</locationURI> </link> <link> - <name>src/amun/gpu/mblas/matrix_functions.cu</name> + <name>src/amun/gpu/mblas/nth_element_kernels.cu</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/matrix_functions.cu</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/nth_element_kernels.cu</locationURI> </link> <link> - <name>src/amun/gpu/mblas/matrix_functions.h</name> + <name>src/amun/gpu/mblas/nth_element_kernels.h</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/matrix_functions.h</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/nth_element_kernels.h</locationURI> </link> <link> - <name>src/amun/gpu/mblas/matrix_wrapper.h</name> + <name>src/amun/gpu/mblas/tensor.cu</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/matrix_wrapper.h</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/tensor.cu</locationURI> </link> <link> - <name>src/amun/gpu/mblas/nth_element.cu</name> + <name>src/amun/gpu/mblas/tensor.h</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/nth_element.cu</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/tensor.h</locationURI> </link> <link> - <name>src/amun/gpu/mblas/nth_element.h</name> + <name>src/amun/gpu/mblas/tensor_functions.cu</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/nth_element.h</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/tensor_functions.cu</locationURI> </link> <link> - <name>src/amun/gpu/mblas/nth_element_kernels.cu</name> + <name>src/amun/gpu/mblas/tensor_functions.h</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/nth_element_kernels.cu</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/tensor_functions.h</locationURI> </link> <link> - <name>src/amun/gpu/mblas/nth_element_kernels.h</name> + 
<name>src/amun/gpu/mblas/tensor_wrapper.h</name> <type>1</type> - <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/nth_element_kernels.h</locationURI> + <locationURI>PARENT-3-PROJECT_LOC/src/amun/gpu/mblas/tensor_wrapper.h</locationURI> </link> <link> <name>src/amun/gpu/mblas/thrust_functions.h</name> diff --git a/src/amun/CMakeLists.txt b/src/amun/CMakeLists.txt index bf53e6d5..6145cbda 100644 --- a/src/amun/CMakeLists.txt +++ b/src/amun/CMakeLists.txt @@ -9,8 +9,8 @@ list(APPEND SOURCES "${CMAKE_CURRENT_BINARY_DIR}/common/git_version.cpp") add_library(cpumode OBJECT - cpu/mblas/matrix.cpp cpu/mblas/phoenix_functions.cpp + cpu/mblas/tensor.cpp cpu/decoder/encoder_decoder.cpp cpu/decoder/encoder_decoder_state.cpp cpu/decoder/encoder_decoder_loader.cpp @@ -43,7 +43,6 @@ add_library(cpumode OBJECT add_library(libcommon OBJECT ${CMAKE_CURRENT_BINARY_DIR}/common/git_version.cpp common/base_best_hyps.cpp - common/base_matrix.cpp common/config.cpp common/exception.cpp common/filter.cpp @@ -64,6 +63,7 @@ add_library(libcommon OBJECT common/utils.cpp common/vocab.cpp common/factor_vocab.cpp + common/base_tensor.cpp common/translation_task.cpp ) @@ -89,10 +89,10 @@ cuda_add_executable( gpu/dl4mt/gru.cu gpu/dl4mt/model.cu gpu/mblas/handles.cu - gpu/mblas/matrix.cu - gpu/mblas/matrix_functions.cu gpu/mblas/nth_element.cu gpu/mblas/nth_element_kernels.cu + gpu/mblas/tensor.cu + gpu/mblas/tensor_functions.cu gpu/npz_converter.cu gpu/types-gpu.cu @@ -112,10 +112,10 @@ cuda_add_library(python SHARED gpu/decoder/encoder_decoder_loader.cu gpu/decoder/encoder_decoder_state.cu gpu/mblas/handles.cu - gpu/mblas/matrix.cu - gpu/mblas/matrix_functions.cu gpu/mblas/nth_element.cu gpu/mblas/nth_element_kernels.cu + gpu/mblas/tensor.cu + gpu/mblas/tensor_functions.cu gpu/dl4mt/encoder.cu gpu/dl4mt/gru.cu gpu/dl4mt/model.cu diff --git a/src/amun/common/base_best_hyps.cpp b/src/amun/common/base_best_hyps.cpp index a5221b8f..052337a8 100644 --- a/src/amun/common/base_best_hyps.cpp +++ 
b/src/amun/common/base_best_hyps.cpp @@ -5,7 +5,7 @@ using namespace std; namespace amunmt { -BestHypsBase::BestHypsBase(const God &god) +BaseBestHyps::BaseBestHyps(const God &god) : god_(god), forbidUNK_(!god.Get<bool>("allow-unk")), isInputFiltered_(god.Get<std::vector<std::string>>("softmax-filter").size()), diff --git a/src/amun/common/base_best_hyps.h b/src/amun/common/base_best_hyps.h index f643cbb8..8b720ef0 100644 --- a/src/amun/common/base_best_hyps.h +++ b/src/amun/common/base_best_hyps.h @@ -9,14 +9,12 @@ namespace amunmt { -class God; - -class BestHypsBase +class BaseBestHyps { public: - BestHypsBase(const God &god); + BaseBestHyps(const God &god); - BestHypsBase(const BestHypsBase&) = delete; + BaseBestHyps(const BaseBestHyps&) = delete; virtual void CalcBeam( const Beam& prevHyps, @@ -34,6 +32,6 @@ class BestHypsBase }; -typedef std::shared_ptr<BestHypsBase> BestHypsBasePtr; +typedef std::shared_ptr<BaseBestHyps> BaseBestHypsPtr; } diff --git a/src/amun/common/base_matrix.cpp b/src/amun/common/base_tensor.cpp index 4324dee5..56105978 100644 --- a/src/amun/common/base_matrix.cpp +++ b/src/amun/common/base_tensor.cpp @@ -1,11 +1,11 @@ #include <sstream> -#include "base_matrix.h" +#include "base_tensor.h" using namespace std; namespace amunmt { -unsigned BaseMatrix::size() const { +unsigned BaseTensor::size() const { unsigned ret = dim(0); for (unsigned i = 1; i < SHAPE_SIZE; ++i) { ret *= dim(i); @@ -14,7 +14,7 @@ unsigned BaseMatrix::size() const { return ret; } -std::string BaseMatrix::Debug(unsigned detailed) const +std::string BaseTensor::Debug(unsigned detailed) const { std::stringstream strm; strm << dim(0) << "x" << dim(1) << "x" << dim(2) << "x" << dim(3) << "=" << size(); diff --git a/src/amun/common/base_matrix.h b/src/amun/common/base_tensor.h index 068ac873..13159159 100644 --- a/src/amun/common/base_matrix.h +++ b/src/amun/common/base_tensor.h @@ -9,17 +9,10 @@ namespace amunmt { const unsigned SHAPE_SIZE = 4; -class Hypothesis; -typedef 
std::shared_ptr<Hypothesis> HypothesisPtr; -typedef std::vector<HypothesisPtr> Beam; - -class Scorer; -typedef std::shared_ptr<Scorer> ScorerPtr; - -class BaseMatrix { +class BaseTensor { public: - BaseMatrix() {} - virtual ~BaseMatrix() {} + BaseTensor() {} + virtual ~BaseTensor() {} virtual unsigned dim(unsigned i) const = 0; diff --git a/src/amun/common/god.cpp b/src/amun/common/god.cpp index b4574835..130985b9 100644 --- a/src/amun/common/god.cpp +++ b/src/amun/common/god.cpp @@ -270,7 +270,7 @@ std::vector<ScorerPtr> God::GetScorers(const DeviceInfo &deviceInfo) const { return scorers; } -BestHypsBasePtr God::GetBestHyps(const DeviceInfo &deviceInfo) const { +BaseBestHypsPtr God::GetBestHyps(const DeviceInfo &deviceInfo) const { if (deviceInfo.deviceType == CPUDevice) { return cpuLoaders_.begin()->second->GetBestHyps(*this, deviceInfo); } diff --git a/src/amun/common/god.h b/src/amun/common/god.h index 9153ecf5..e489ba98 100644 --- a/src/amun/common/god.h +++ b/src/amun/common/god.h @@ -61,7 +61,7 @@ class God { std::shared_ptr<const Filter> GetFilter() const; - BestHypsBasePtr GetBestHyps(const DeviceInfo &deviceInfo) const; + BaseBestHypsPtr GetBestHyps(const DeviceInfo &deviceInfo) const; std::vector<ScorerPtr> GetScorers(const DeviceInfo &deviceInfo) const; std::vector<std::string> GetScorerNames() const; diff --git a/src/amun/common/loader.h b/src/amun/common/loader.h index 5f4940f3..55b39fc6 100644 --- a/src/amun/common/loader.h +++ b/src/amun/common/loader.h @@ -29,7 +29,7 @@ class Loader { } virtual ScorerPtr NewScorer(const God &god, const DeviceInfo &deviceInfo) const = 0; - virtual BestHypsBasePtr GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const = 0; + virtual BaseBestHypsPtr GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const = 0; const std::string& GetName() const { return name_; diff --git a/src/amun/common/scorer.h b/src/amun/common/scorer.h index 7ed6bd7f..8c3a46a8 100644 --- a/src/amun/common/scorer.h +++ 
b/src/amun/common/scorer.h @@ -5,12 +5,17 @@ #include "common/hypothesis.h" #include "common/sentence.h" -#include "common/base_matrix.h" +#include "common/base_tensor.h" #include "yaml-cpp/node/node.h" namespace amunmt { +class God; class Sentences; +class Hypothesis; +typedef std::shared_ptr<Hypothesis> HypothesisPtr; +typedef std::vector<HypothesisPtr> Beam; + class State { public: @@ -63,9 +68,9 @@ class Scorer { return name_; } - virtual BaseMatrix& GetProbs() = 0; + virtual BaseTensor& GetProbs() = 0; virtual void *GetNBest() = 0; // hack - need to return matrix<NthOut> but NthOut contain cuda code - virtual const BaseMatrix *GetBias() const = 0; + virtual const BaseTensor *GetBias() const = 0; protected: const God &god_; diff --git a/src/amun/common/search.cpp b/src/amun/common/search.cpp index 70b2e4ae..6bfb704e 100644 --- a/src/amun/common/search.cpp +++ b/src/amun/common/search.cpp @@ -5,7 +5,7 @@ #include "common/history.h" #include "common/histories.h" #include "common/filter.h" -#include "common/base_matrix.h" +#include "common/base_tensor.h" #ifdef CUDA #include <cuda.h> diff --git a/src/amun/common/search.h b/src/amun/common/search.h index 4aec6068..c03a54bf 100644 --- a/src/amun/common/search.h +++ b/src/amun/common/search.h @@ -41,7 +41,7 @@ class Search { const unsigned maxBeamSize_; bool normalizeScore_; Words filterIndices_; - BestHypsBasePtr bestHyps_; + BaseBestHypsPtr bestHyps_; std::vector<unsigned> activeCount_; void BatchStats(); diff --git a/src/amun/cpu/decoder/best_hyps.h b/src/amun/cpu/decoder/best_hyps.h index 461246c0..1621b681 100644 --- a/src/amun/cpu/decoder/best_hyps.h +++ b/src/amun/cpu/decoder/best_hyps.h @@ -6,7 +6,7 @@ #include "common/scorer.h" #include "common/god.h" #include "common/exception.h" -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" #include "cpu/decoder/encoder_decoder.h" namespace amunmt { @@ -22,11 +22,11 @@ struct ProbCompare { const float* data_; }; -class BestHyps : public BestHypsBase +class 
BestHyps : public BaseBestHyps { public: BestHyps(const God &god) - : BestHypsBase(god) + : BaseBestHyps(god) {} void CalcBeam( diff --git a/src/amun/cpu/decoder/encoder_decoder.h b/src/amun/cpu/decoder/encoder_decoder.h index 25e28f4e..f26ae952 100644 --- a/src/amun/cpu/decoder/encoder_decoder.h +++ b/src/amun/cpu/decoder/encoder_decoder.h @@ -3,7 +3,7 @@ #include <yaml-cpp/yaml.h> #include "common/scorer.h" -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" #include "cpu/decoder/encoder_decoder_state.h" namespace amunmt { @@ -22,8 +22,8 @@ class CPUEncoderDecoderBase : public Scorer { virtual State* NewState() const; - virtual void GetAttention(mblas::Matrix& Attention) = 0; - virtual mblas::Matrix& GetAttention() = 0; + virtual void GetAttention(mblas::Tensor& Attention) = 0; + virtual mblas::Tensor& GetAttention() = 0; virtual void *GetNBest() { @@ -31,14 +31,14 @@ class CPUEncoderDecoderBase : public Scorer { return nullptr; } - virtual const BaseMatrix *GetBias() const + virtual const BaseTensor *GetBias() const { assert(false); return nullptr; } protected: - mblas::Matrix SourceContext_; + mblas::Tensor SourceContext_; }; diff --git a/src/amun/cpu/decoder/encoder_decoder_loader.cpp b/src/amun/cpu/decoder/encoder_decoder_loader.cpp index 55d7b65f..f29ed9c0 100644 --- a/src/amun/cpu/decoder/encoder_decoder_loader.cpp +++ b/src/amun/cpu/decoder/encoder_decoder_loader.cpp @@ -43,8 +43,8 @@ ScorerPtr EncoderDecoderLoader::NewScorer(const God &god, const DeviceInfo&) con tab, *dl4mtModels_[0])); } -BestHypsBasePtr EncoderDecoderLoader::GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const { - return BestHypsBasePtr(new CPU::BestHyps(god)); +BaseBestHypsPtr EncoderDecoderLoader::GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const { + return BaseBestHypsPtr(new CPU::BestHyps(god)); } } diff --git a/src/amun/cpu/decoder/encoder_decoder_loader.h b/src/amun/cpu/decoder/encoder_decoder_loader.h index 8a90599c..89814dd0 100644 --- 
a/src/amun/cpu/decoder/encoder_decoder_loader.h +++ b/src/amun/cpu/decoder/encoder_decoder_loader.h @@ -28,7 +28,7 @@ class EncoderDecoderLoader : public Loader { virtual void Load(const God& god); virtual ScorerPtr NewScorer(const God &god, const DeviceInfo &deviceInfo) const; - BestHypsBasePtr GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const; + BaseBestHypsPtr GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const; private: std::vector<std::unique_ptr<dl4mt::Weights>> dl4mtModels_; diff --git a/src/amun/cpu/decoder/encoder_decoder_state.cpp b/src/amun/cpu/decoder/encoder_decoder_state.cpp index 85112993..647f8c6c 100644 --- a/src/amun/cpu/decoder/encoder_decoder_state.cpp +++ b/src/amun/cpu/decoder/encoder_decoder_state.cpp @@ -14,19 +14,19 @@ std::string EncoderDecoderState::Debug(unsigned verbosity) const return CPU::mblas::Debug(states_); } -CPU::mblas::Matrix& EncoderDecoderState::GetStates() { +CPU::mblas::Tensor& EncoderDecoderState::GetStates() { return states_; } -CPU::mblas::Matrix& EncoderDecoderState::GetEmbeddings() { +CPU::mblas::Tensor& EncoderDecoderState::GetEmbeddings() { return embeddings_; } -const CPU::mblas::Matrix& EncoderDecoderState::GetStates() const { +const CPU::mblas::Tensor& EncoderDecoderState::GetStates() const { return states_; } -const CPU::mblas::Matrix& EncoderDecoderState::GetEmbeddings() const { +const CPU::mblas::Tensor& EncoderDecoderState::GetEmbeddings() const { return embeddings_; } diff --git a/src/amun/cpu/decoder/encoder_decoder_state.h b/src/amun/cpu/decoder/encoder_decoder_state.h index d0ebe53a..be3cd028 100644 --- a/src/amun/cpu/decoder/encoder_decoder_state.h +++ b/src/amun/cpu/decoder/encoder_decoder_state.h @@ -2,7 +2,7 @@ #include <vector> -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" #include "common/scorer.h" namespace amunmt { @@ -15,15 +15,15 @@ class EncoderDecoderState : public State { virtual std::string Debug(unsigned verbosity = 1) const; - CPU::mblas::Matrix& 
GetStates(); - const CPU::mblas::Matrix& GetStates() const; + CPU::mblas::Tensor& GetStates(); + const CPU::mblas::Tensor& GetStates() const; - CPU::mblas::Matrix& GetEmbeddings(); - const CPU::mblas::Matrix& GetEmbeddings() const; + CPU::mblas::Tensor& GetEmbeddings(); + const CPU::mblas::Tensor& GetEmbeddings() const; private: - CPU::mblas::Matrix states_; - CPU::mblas::Matrix embeddings_; + CPU::mblas::Tensor states_; + CPU::mblas::Tensor embeddings_; }; } // namespace CPU diff --git a/src/amun/cpu/dl4mt/decoder.h b/src/amun/cpu/dl4mt/decoder.h index add290cc..d9bbf3c3 100644 --- a/src/amun/cpu/dl4mt/decoder.h +++ b/src/amun/cpu/dl4mt/decoder.h @@ -1,6 +1,6 @@ #pragma once -#include "../mblas/matrix.h" +#include "../mblas/tensor.h" #include "model.h" #include "gru.h" #include "common/god.h" @@ -18,13 +18,13 @@ class Decoder { : w_(model) {} - void Lookup(mblas::Matrix& Rows, const std::vector<unsigned>& ids) { + void Lookup(mblas::Tensor& Rows, const std::vector<unsigned>& ids) { using namespace mblas; std::vector<unsigned> tids = ids; for(auto&& id : tids) if(id >= w_.E_.rows()) id = 1; - Rows = Assemble<byRow, Matrix>(w_.E_, tids); + Rows = Assemble<byRow, Tensor>(w_.E_, tids); } size_t GetCols() { @@ -46,14 +46,14 @@ class Decoder { RNNHidden(const Weights1& initModel, const Weights2& gruModel) : w_(initModel), gru_(gruModel) {} - void InitializeState(mblas::Matrix& State, - const mblas::Matrix& SourceContext, + void InitializeState(mblas::Tensor& State, + const mblas::Tensor& SourceContext, const size_t batchSize = 1) { using namespace mblas; // Calculate mean of source context, rowwise // Repeat mean batchSize times by broadcasting - Temp1_ = Mean<byRow, Matrix>(SourceContext); + Temp1_ = Mean<byRow, Tensor>(SourceContext); Temp2_.resize(batchSize, SourceContext.columns()); Temp2_ = 0.0f; AddBiasVector<byRow>(Temp2_, Temp1_); @@ -69,9 +69,9 @@ class Decoder { State = blaze::forEach(State, Tanh()); } - void GetNextState(mblas::Matrix& NextState, - const 
mblas::Matrix& State, - const mblas::Matrix& Context) { + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Context) { gru_.GetNextState(NextState, State, Context); } @@ -79,8 +79,8 @@ class Decoder { const Weights1& w_; const GRU<Weights2> gru_; - mblas::Matrix Temp1_; - mblas::Matrix Temp2_; + mblas::Tensor Temp1_; + mblas::Tensor Temp2_; }; ////////////////////////////////////////////////////////////// @@ -90,9 +90,9 @@ class Decoder { RNNFinal(const Weights& model) : gru_(model) {} - void GetNextState(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Context) { + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Context) { gru_.GetNextState(NextState, State, Context); } @@ -110,7 +110,7 @@ class Decoder { V_ = blaze::trans(blaze::row(w_.V_, 0)); } - void Init(const mblas::Matrix& SourceContext) { + void Init(const mblas::Tensor& SourceContext) { using namespace mblas; SCU_ = SourceContext * w_.U_; if (w_.Gamma_1_.rows()) { @@ -119,9 +119,9 @@ class Decoder { AddBiasVector<byRow>(SCU_, w_.B_); } - void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext, - const mblas::Matrix& HiddenState, - const mblas::Matrix& SourceContext) { + void GetAlignedSourceContext(mblas::Tensor& AlignedSourceContext, + const mblas::Tensor& HiddenState, + const mblas::Tensor& SourceContext) { using namespace mblas; Temp2_ = HiddenState * w_.W_; @@ -129,7 +129,7 @@ class Decoder { LayerNormalization(Temp2_, w_.Gamma_2_); } - Temp1_ = Broadcast<Matrix>(Tanh(), SCU_, Temp2_); + Temp1_ = Broadcast<Tensor>(Tanh(), SCU_, Temp2_); A_.resize(Temp1_.rows(), 1); blaze::column(A_, 0) = Temp1_ * V_; @@ -145,21 +145,21 @@ class Decoder { AlignedSourceContext = A_ * SourceContext; } - void GetAttention(mblas::Matrix& Attention) { + void GetAttention(mblas::Tensor& Attention) { Attention = A_; } - mblas::Matrix& GetAttention() { + mblas::Tensor& GetAttention() { return 
A_; } private: const Weights& w_; - mblas::Matrix SCU_; - mblas::Matrix Temp1_; - mblas::Matrix Temp2_; - mblas::Matrix A_; + mblas::Tensor SCU_; + mblas::Tensor Temp1_; + mblas::Tensor Temp2_; + mblas::Tensor A_; mblas::ColumnVector V_; }; @@ -173,9 +173,9 @@ class Decoder { {} void GetProbs(mblas::ArrayMatrix& Probs, - const mblas::Matrix& State, - const mblas::Matrix& Embedding, - const mblas::Matrix& AlignedSourceContext) { + const mblas::Tensor& State, + const mblas::Tensor& Embedding, + const mblas::Tensor& AlignedSourceContext) { using namespace mblas; @@ -212,20 +212,20 @@ class Decoder { void Filter(const std::vector<unsigned>& ids) { filtered_ = true; using namespace mblas; - FilteredW4_ = Assemble<byColumn, Matrix>(w_.W4_, ids); - FilteredB4_ = Assemble<byColumn, Matrix>(w_.B4_, ids); + FilteredW4_ = Assemble<byColumn, Tensor>(w_.W4_, ids); + FilteredB4_ = Assemble<byColumn, Tensor>(w_.B4_, ids); } private: const Weights& w_; bool filtered_; - mblas::Matrix FilteredW4_; - mblas::Matrix FilteredB4_; + mblas::Tensor FilteredW4_; + mblas::Tensor FilteredB4_; - mblas::Matrix T1_; - mblas::Matrix T2_; - mblas::Matrix T3_; + mblas::Tensor T1_; + mblas::Tensor T2_; + mblas::Tensor T3_; }; public: @@ -237,10 +237,10 @@ class Decoder { softmax_(model.decSoftmax_) {} - void Decode(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Embeddings, - const mblas::Matrix& SourceContext) { + void Decode(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Embeddings, + const mblas::Tensor& SourceContext) { GetHiddenState(HiddenState_, State, Embeddings); GetAlignedSourceContext(AlignedSourceContext_, HiddenState_, SourceContext); GetNextState(NextState, HiddenState_, AlignedSourceContext_); @@ -251,20 +251,20 @@ class Decoder { return Probs_; } - void EmptyState(mblas::Matrix& State, - const mblas::Matrix& SourceContext, + void EmptyState(mblas::Tensor& State, + const mblas::Tensor& SourceContext, size_t batchSize = 1) 
{ rnn1_.InitializeState(State, SourceContext, batchSize); attention_.Init(SourceContext); } - void EmptyEmbedding(mblas::Matrix& Embedding, + void EmptyEmbedding(mblas::Tensor& Embedding, size_t batchSize = 1) { Embedding.resize(batchSize, embeddings_.GetCols()); Embedding = 0.0f; } - void Lookup(mblas::Matrix& Embedding, + void Lookup(mblas::Tensor& Embedding, const std::vector<unsigned>& w) { embeddings_.Lookup(Embedding, w); } @@ -273,11 +273,11 @@ class Decoder { softmax_.Filter(ids); } - void GetAttention(mblas::Matrix& attention) { + void GetAttention(mblas::Tensor& attention) { attention_.GetAttention(attention); } - mblas::Matrix& GetAttention() { + mblas::Tensor& GetAttention() { return attention_.GetAttention(); } @@ -287,34 +287,34 @@ class Decoder { private: - void GetHiddenState(mblas::Matrix& HiddenState, - const mblas::Matrix& PrevState, - const mblas::Matrix& Embedding) { + void GetHiddenState(mblas::Tensor& HiddenState, + const mblas::Tensor& PrevState, + const mblas::Tensor& Embedding) { rnn1_.GetNextState(HiddenState, PrevState, Embedding); } - void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext, - const mblas::Matrix& HiddenState, - const mblas::Matrix& SourceContext) { + void GetAlignedSourceContext(mblas::Tensor& AlignedSourceContext, + const mblas::Tensor& HiddenState, + const mblas::Tensor& SourceContext) { attention_.GetAlignedSourceContext(AlignedSourceContext, HiddenState, SourceContext); } - void GetNextState(mblas::Matrix& State, - const mblas::Matrix& HiddenState, - const mblas::Matrix& AlignedSourceContext) { + void GetNextState(mblas::Tensor& State, + const mblas::Tensor& HiddenState, + const mblas::Tensor& AlignedSourceContext) { rnn2_.GetNextState(State, HiddenState, AlignedSourceContext); } - void GetProbs(const mblas::Matrix& State, - const mblas::Matrix& Embedding, - const mblas::Matrix& AlignedSourceContext) { + void GetProbs(const mblas::Tensor& State, + const mblas::Tensor& Embedding, + const mblas::Tensor& 
AlignedSourceContext) { softmax_.GetProbs(Probs_, State, Embedding, AlignedSourceContext); } private: - mblas::Matrix HiddenState_; - mblas::Matrix AlignedSourceContext_; + mblas::Tensor HiddenState_; + mblas::Tensor AlignedSourceContext_; mblas::ArrayMatrix Probs_; Embeddings<Weights::Embeddings> embeddings_; diff --git a/src/amun/cpu/dl4mt/encoder.cpp b/src/amun/cpu/dl4mt/encoder.cpp index 8887fdb9..05eb7971 100644 --- a/src/amun/cpu/dl4mt/encoder.cpp +++ b/src/amun/cpu/dl4mt/encoder.cpp @@ -7,15 +7,15 @@ namespace CPU { namespace dl4mt { void Encoder::Encode(const std::vector<unsigned>& words, - mblas::Matrix& context) { - std::vector<mblas::Matrix> embeddedWords; + mblas::Tensor& context) { + std::vector<mblas::Tensor> embeddedWords; context.resize(words.size(), forwardRnn_.GetStateLength() + backwardRnn_.GetStateLength()); for(auto& w : words) { embeddedWords.emplace_back(); - mblas::Matrix &embed = embeddedWords.back(); + mblas::Tensor &embed = embeddedWords.back(); embeddings_.Lookup(embed, w); //cerr << "embed=" << embed.Debug(true) << endl; } diff --git a/src/amun/cpu/dl4mt/encoder.h b/src/amun/cpu/dl4mt/encoder.h index efa6e3da..40ebd1f0 100644 --- a/src/amun/cpu/dl4mt/encoder.h +++ b/src/amun/cpu/dl4mt/encoder.h @@ -1,6 +1,6 @@ #pragma once -#include "../mblas/matrix.h" +#include "../mblas/tensor.h" #include "../dl4mt/model.h" #include "../dl4mt/gru.h" @@ -19,7 +19,7 @@ class Encoder { : w_(model) {} - void Lookup(mblas::Matrix& Row, size_t i) { + void Lookup(mblas::Tensor& Row, size_t i) { size_t len = w_.E_.columns(); if(i < w_.E_.rows()) Row = blaze::submatrix(w_.E_, i, 0, 1, len); @@ -43,15 +43,15 @@ class Encoder { State_ = 0.0f; } - void GetNextState(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Embd) { + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Embd) { gru_.GetNextState(NextState, State, Embd); } template <class It> void Encode(It it, It end, - mblas::Matrix& 
Context, bool invert) { + mblas::Tensor& Context, bool invert) { InitializeState(); size_t n = std::distance(it, end); @@ -76,7 +76,7 @@ class Encoder { // Model matrices const GRU<Weights> gru_; - mblas::Matrix State_; + mblas::Tensor State_; }; ///////////////////////////////////////////////////////////////// @@ -88,7 +88,7 @@ class Encoder { {} void Encode(const std::vector<unsigned>& words, - mblas::Matrix& context); + mblas::Tensor& context); private: Embeddings<Weights::Embeddings> embeddings_; diff --git a/src/amun/cpu/dl4mt/encoder_decoder.cpp b/src/amun/cpu/dl4mt/encoder_decoder.cpp index 73e32d2f..0644182c 100644 --- a/src/amun/cpu/dl4mt/encoder_decoder.cpp +++ b/src/amun/cpu/dl4mt/encoder_decoder.cpp @@ -60,17 +60,17 @@ void EncoderDecoder::AssembleBeamState(const State& in, const EDState& edIn = in.get<EDState>(); EDState& edOut = out.get<EDState>(); - edOut.GetStates() = mblas::Assemble<mblas::byRow, mblas::Matrix>(edIn.GetStates(), beamStateIds); + edOut.GetStates() = mblas::Assemble<mblas::byRow, mblas::Tensor>(edIn.GetStates(), beamStateIds); decoder_->Lookup(edOut.GetEmbeddings(), beamWords); } -void EncoderDecoder::GetAttention(mblas::Matrix& Attention) { +void EncoderDecoder::GetAttention(mblas::Tensor& Attention) { decoder_->GetAttention(Attention); } -mblas::Matrix& EncoderDecoder::GetAttention() { +mblas::Tensor& EncoderDecoder::GetAttention() { return decoder_->GetAttention(); } @@ -85,7 +85,7 @@ void EncoderDecoder::Filter(const std::vector<unsigned>& filterIds) { } -BaseMatrix& EncoderDecoder::GetProbs() { +BaseTensor& EncoderDecoder::GetProbs() { return decoder_->GetProbs(); } diff --git a/src/amun/cpu/dl4mt/encoder_decoder.h b/src/amun/cpu/dl4mt/encoder_decoder.h index 538ed833..69846373 100644 --- a/src/amun/cpu/dl4mt/encoder_decoder.h +++ b/src/amun/cpu/dl4mt/encoder_decoder.h @@ -4,7 +4,7 @@ #include <yaml-cpp/yaml.h> #include "cpu/decoder/encoder_decoder.h" -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" #include 
"cpu/dl4mt/model.h" #include "cpu/dl4mt/encoder.h" #include "cpu/dl4mt/decoder.h" @@ -43,12 +43,12 @@ class EncoderDecoder : public CPUEncoderDecoderBase { const Beam& beam, State& out); - void GetAttention(mblas::Matrix& Attention); - mblas::Matrix& GetAttention(); + void GetAttention(mblas::Tensor& Attention); + mblas::Tensor& GetAttention(); unsigned GetVocabSize() const; - BaseMatrix& GetProbs(); + BaseTensor& GetProbs(); void Filter(const std::vector<unsigned>& filterIds); diff --git a/src/amun/cpu/dl4mt/gru.h b/src/amun/cpu/dl4mt/gru.h index 72e3e19b..7089396e 100644 --- a/src/amun/cpu/dl4mt/gru.h +++ b/src/amun/cpu/dl4mt/gru.h @@ -1,5 +1,5 @@ #pragma once -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" namespace amunmt { namespace CPU { @@ -11,13 +11,13 @@ class GRU { GRU(const Weights& model) : w_(model) { using namespace mblas; - WWx_ = Concat<byColumn, Matrix>(w_.W_, w_.Wx_); - UUx_ = Concat<byColumn, Matrix>(w_.U_, w_.Ux_); + WWx_ = Concat<byColumn, Tensor>(w_.W_, w_.Wx_); + UUx_ = Concat<byColumn, Tensor>(w_.U_, w_.Ux_); } - void GetNextState(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Context) const { + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Context) const { RUH_ = Context * WWx_; if (w_.Gamma_1_.rows()) { LayerNormalization(RUH_, w_.Gamma_1_); @@ -33,8 +33,8 @@ class GRU { ElementwiseOps(NextState, State); } - void ElementwiseOps(mblas::Matrix& NextState, - const mblas::Matrix& State) const { + void ElementwiseOps(mblas::Tensor& NextState, + const mblas::Tensor& State) const { using namespace mblas; using namespace blaze; @@ -78,12 +78,12 @@ class GRU { private: // Model matrices const Weights& w_; - mutable mblas::Matrix WWx_; - mutable mblas::Matrix UUx_; + mutable mblas::Tensor WWx_; + mutable mblas::Tensor UUx_; // reused to avoid allocation - mutable mblas::Matrix RUH_; - mutable mblas::Matrix Temp_; + mutable mblas::Tensor RUH_; + mutable 
mblas::Tensor Temp_; }; } diff --git a/src/amun/cpu/dl4mt/model.cpp b/src/amun/cpu/dl4mt/model.cpp index a0d190e7..7a660f6e 100644 --- a/src/amun/cpu/dl4mt/model.cpp +++ b/src/amun/cpu/dl4mt/model.cpp @@ -25,7 +25,7 @@ Weights::GRU::GRU(const NpzConverter& model, const std::vector<std::string> &key Gamma_1_(model[keys.at(6)]), Gamma_2_(model[keys.at(7)]) { - const_cast<mblas::Matrix&>(Bx2_) = 0.0f; + const_cast<mblas::Tensor&>(Bx2_) = 0.0f; } ////////////////////////////////////////////////////////////////////////////// @@ -47,7 +47,7 @@ Weights::DecGRU2::DecGRU2(const NpzConverter& model) Gamma_1_(model["decoder_cell2_gamma1"]), Gamma_2_(model["decoder_cell2_gamma2"]) { - const_cast<mblas::Matrix&>(Bx1_) = 0.0f; + const_cast<mblas::Tensor&>(Bx1_) = 0.0f; } Weights::DecAttention::DecAttention(const NpzConverter& model) diff --git a/src/amun/cpu/dl4mt/model.h b/src/amun/cpu/dl4mt/model.h index 87299b87..cfe6ae25 100644 --- a/src/amun/cpu/dl4mt/model.h +++ b/src/amun/cpu/dl4mt/model.h @@ -5,7 +5,7 @@ #include <string> #include "cpu/npz_converter.h" -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" namespace amunmt { namespace CPU { @@ -19,21 +19,21 @@ struct Weights { Embeddings(const NpzConverter& model, const std::string &key); Embeddings(const NpzConverter& model, const std::vector<std::pair<std::string, bool>> keys); - const mblas::Matrix E_; + const mblas::Tensor E_; }; struct GRU { GRU(const NpzConverter& model, const std::vector<std::string> &keys); - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix Wx_; - const mblas::Matrix Bx1_; - const mblas::Matrix Bx2_; - const mblas::Matrix Ux_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor Wx_; + const mblas::Tensor Bx1_; + const mblas::Tensor Bx2_; + const mblas::Tensor Ux_; + const mblas::Tensor Gamma_1_; + const mblas::Tensor Gamma_2_; }; 
////////////////////////////////////////////////////////////////////////////// @@ -41,51 +41,51 @@ struct Weights { struct DecInit { DecInit(const NpzConverter& model); - const mblas::Matrix Wi_; - const mblas::Matrix Bi_; - const mblas::Matrix Gamma_; + const mblas::Tensor Wi_; + const mblas::Tensor Bi_; + const mblas::Tensor Gamma_; }; struct DecGRU2 { DecGRU2(const NpzConverter& model); - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix Wx_; - const mblas::Matrix Bx2_; - const mblas::Matrix Bx1_; - const mblas::Matrix Ux_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor Wx_; + const mblas::Tensor Bx2_; + const mblas::Tensor Bx1_; + const mblas::Tensor Ux_; + const mblas::Tensor Gamma_1_; + const mblas::Tensor Gamma_2_; }; struct DecAttention { DecAttention(const NpzConverter& model); - const mblas::Matrix V_; - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix C_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor V_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor C_; + const mblas::Tensor Gamma_1_; + const mblas::Tensor Gamma_2_; }; struct DecSoftmax { DecSoftmax(const NpzConverter& model); - const mblas::Matrix W1_; - const mblas::Matrix B1_; - const mblas::Matrix W2_; - const mblas::Matrix B2_; - const mblas::Matrix W3_; - const mblas::Matrix B3_; - const mblas::Matrix W4_; - const mblas::Matrix B4_; - const mblas::Matrix Gamma_0_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor W1_; + const mblas::Tensor B1_; + const mblas::Tensor W2_; + const mblas::Tensor B2_; + const mblas::Tensor W3_; + const mblas::Tensor B3_; + const mblas::Tensor W4_; + const mblas::Tensor B4_; + const mblas::Tensor Gamma_0_; + const mblas::Tensor 
Gamma_1_; + const mblas::Tensor Gamma_2_; }; ////////////////////////////////////////////////////////////////////////////// diff --git a/src/amun/cpu/mblas/matrix.cpp b/src/amun/cpu/mblas/tensor.cpp index 2ee0bbb7..6c385be0 100644 --- a/src/amun/cpu/mblas/matrix.cpp +++ b/src/amun/cpu/mblas/tensor.cpp @@ -1,5 +1,5 @@ #include <boost/iterator/permutation_iterator.hpp> -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" #include "cpu/mblas/simd_math_prims.h" #include "common/god.h" #include "common/hypothesis.h" diff --git a/src/amun/cpu/mblas/matrix.h b/src/amun/cpu/mblas/tensor.h index 93e1f29a..7104085a 100644 --- a/src/amun/cpu/mblas/matrix.h +++ b/src/amun/cpu/mblas/tensor.h @@ -7,7 +7,7 @@ #include <blaze/Math.h> #include "phoenix_functions.h" -#include "common/base_matrix.h" +#include "common/base_tensor.h" #include "common/exception.h" namespace amunmt { @@ -19,16 +19,16 @@ typedef blaze::DynamicVector<float, blaze::rowVector> Vector; typedef blaze::DynamicVector<float, blaze::columnVector> ColumnVector; ////////////////////////////////////////////////////////////////////////////////////////////// -class Matrix : public BaseMatrix, public blaze::DynamicMatrix<float, blaze::rowMajor> +class Tensor : public BaseTensor, public blaze::DynamicMatrix<float, blaze::rowMajor> { public: typedef blaze::DynamicMatrix<float, blaze::rowMajor> Parent; - Matrix() + Tensor() : Parent() {} - Matrix(unsigned rows, unsigned cols) + Tensor(unsigned rows, unsigned cols) : Parent(rows, cols) {} @@ -59,7 +59,7 @@ public: ////////////////////////////////////////////////////////////////////////////////////////////// template <typename T, bool SO = blaze::rowMajor> -class BlazeMatrix : public BaseMatrix, public blaze::CustomMatrix<T, blaze::unaligned, +class BlazeMatrix : public BaseTensor, public blaze::CustomMatrix<T, blaze::unaligned, blaze::unpadded, blaze::rowMajor> { public: @@ -208,7 +208,7 @@ MT& AddBiasVector(MT& m, const VT& b) { return m; } -//Matrix& Swap(Matrix& 
Out, Matrix& In); +//Tensor& Swap(Tensor& Out, Tensor& In); template <class MT> void Reshape(MT& m, unsigned rows, unsigned cols) { diff --git a/src/amun/cpu/nematus/decoder.h b/src/amun/cpu/nematus/decoder.h index 6c7c5aac..ab979ae7 100644 --- a/src/amun/cpu/nematus/decoder.h +++ b/src/amun/cpu/nematus/decoder.h @@ -1,6 +1,6 @@ #pragma once -#include "../mblas/matrix.h" +#include "../mblas/tensor.h" #include "model.h" #include "gru.h" #include "transition.h" @@ -19,7 +19,7 @@ class Decoder { : w_(model) {} - void Lookup(mblas::Matrix& Rows, const std::vector<unsigned>& ids) { + void Lookup(mblas::Tensor& Rows, const std::vector<unsigned>& ids) { using namespace mblas; std::vector<unsigned> tids = ids; for (auto&& id : tids) { @@ -27,7 +27,7 @@ class Decoder { id = 1; } } - Rows = Assemble<byRow, Matrix>(w_.E_, tids); + Rows = Assemble<byRow, Tensor>(w_.E_, tids); } size_t GetCols() { @@ -52,15 +52,15 @@ class Decoder { {} void InitializeState( - mblas::Matrix& State, - const mblas::Matrix& SourceContext, + mblas::Tensor& State, + const mblas::Tensor& SourceContext, const size_t batchSize = 1) { using namespace mblas; // Calculate mean of source context, rowwise // Repeat mean batchSize times by broadcasting - Temp1_ = Mean<byRow, Matrix>(SourceContext); + Temp1_ = Mean<byRow, Tensor>(SourceContext); Temp2_.resize(batchSize, SourceContext.columns()); Temp2_ = 0.0f; @@ -78,9 +78,9 @@ class Decoder { // std::cerr << std::endl; } - void GetNextState(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Context) { + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Context) { gru_.GetNextState(NextState, State, Context); } @@ -88,8 +88,8 @@ class Decoder { const Weights1& w_; const GRU<Weights2> gru_; - mblas::Matrix Temp1_; - mblas::Matrix Temp2_; + mblas::Tensor Temp1_; + mblas::Tensor Temp2_; }; ////////////////////////////////////////////////////////////// @@ -102,9 +102,9 @@ class Decoder { {} 
void GetNextState( - mblas::Matrix& nextState, - const mblas::Matrix& state, - const mblas::Matrix& context) + mblas::Tensor& nextState, + const mblas::Tensor& state, + const mblas::Tensor& context) { gru_.GetNextState(nextState, state, context); transition_.GetNextState(nextState); @@ -128,7 +128,7 @@ class Decoder { V_ = blaze::trans(blaze::row(w_.V_, 0)); } - void Init(const mblas::Matrix& SourceContext) { + void Init(const mblas::Tensor& SourceContext) { using namespace mblas; SCU_ = SourceContext * w_.U_; mblas::AddBiasVector<mblas::byRow>(SCU_, w_.B_); @@ -139,9 +139,9 @@ class Decoder { } void GetAlignedSourceContext( - mblas::Matrix& AlignedSourceContext, - const mblas::Matrix& HiddenState, - const mblas::Matrix& SourceContext) + mblas::Tensor& AlignedSourceContext, + const mblas::Tensor& HiddenState, + const mblas::Tensor& SourceContext) { using namespace mblas; @@ -150,7 +150,7 @@ class Decoder { LayerNormalization(Temp2_, w_.W_comb_lns_, w_.W_comb_lnb_); } - Temp1_ = Broadcast<Matrix>(Tanh(), SCU_, Temp2_); + Temp1_ = Broadcast<Tensor>(Tanh(), SCU_, Temp2_); A_.resize(Temp1_.rows(), 1); blaze::column(A_, 0) = Temp1_ * V_; @@ -166,21 +166,21 @@ class Decoder { AlignedSourceContext = A_ * SourceContext; } - void GetAttention(mblas::Matrix& Attention) { + void GetAttention(mblas::Tensor& Attention) { Attention = A_; } - mblas::Matrix& GetAttention() { + mblas::Tensor& GetAttention() { return A_; } private: const Weights& w_; - mblas::Matrix SCU_; - mblas::Matrix Temp1_; - mblas::Matrix Temp2_; - mblas::Matrix A_; + mblas::Tensor SCU_; + mblas::Tensor Temp1_; + mblas::Tensor Temp2_; + mblas::Tensor A_; mblas::ColumnVector V_; }; @@ -194,9 +194,9 @@ class Decoder { {} void GetProbs(mblas::ArrayMatrix& Probs, - const mblas::Matrix& State, - const mblas::Matrix& Embedding, - const mblas::Matrix& AlignedSourceContext) { + const mblas::Tensor& State, + const mblas::Tensor& Embedding, + const mblas::Tensor& AlignedSourceContext) { using namespace mblas; T1_ = 
State * w_.W1_; @@ -244,20 +244,20 @@ class Decoder { void Filter(const std::vector<unsigned>& ids) { filtered_ = true; using namespace mblas; - FilteredW4_ = Assemble<byColumn, Matrix>(w_.W4_, ids); - FilteredB4_ = Assemble<byColumn, Matrix>(w_.B4_, ids); + FilteredW4_ = Assemble<byColumn, Tensor>(w_.W4_, ids); + FilteredB4_ = Assemble<byColumn, Tensor>(w_.B4_, ids); } private: const Weights& w_; bool filtered_; - mblas::Matrix FilteredW4_; - mblas::Matrix FilteredB4_; + mblas::Tensor FilteredW4_; + mblas::Tensor FilteredB4_; - mblas::Matrix T1_; - mblas::Matrix T2_; - mblas::Matrix T3_; + mblas::Tensor T1_; + mblas::Tensor T2_; + mblas::Tensor T3_; }; public: @@ -270,10 +270,10 @@ class Decoder { {} void Decode( - mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Embeddings, - const mblas::Matrix& SourceContext) + mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Embeddings, + const mblas::Tensor& SourceContext) { GetHiddenState(HiddenState_, State, Embeddings); // std::cerr << "HIDDEN: " << std::endl; @@ -297,20 +297,20 @@ class Decoder { return Probs_; } - void EmptyState(mblas::Matrix& State, - const mblas::Matrix& SourceContext, + void EmptyState(mblas::Tensor& State, + const mblas::Tensor& SourceContext, size_t batchSize = 1) { rnn1_.InitializeState(State, SourceContext, batchSize); attention_.Init(SourceContext); } - void EmptyEmbedding(mblas::Matrix& Embedding, + void EmptyEmbedding(mblas::Tensor& Embedding, size_t batchSize = 1) { Embedding.resize(batchSize, embeddings_.GetCols()); Embedding = 0.0f; } - void Lookup(mblas::Matrix& Embedding, + void Lookup(mblas::Tensor& Embedding, const std::vector<unsigned>& w) { embeddings_.Lookup(Embedding, w); } @@ -319,11 +319,11 @@ class Decoder { softmax_.Filter(ids); } - void GetAttention(mblas::Matrix& attention) { + void GetAttention(mblas::Tensor& attention) { attention_.GetAttention(attention); } - mblas::Matrix& GetAttention() { + mblas::Tensor& 
GetAttention() { return attention_.GetAttention(); } @@ -333,34 +333,34 @@ class Decoder { private: - void GetHiddenState(mblas::Matrix& HiddenState, - const mblas::Matrix& PrevState, - const mblas::Matrix& Embedding) { + void GetHiddenState(mblas::Tensor& HiddenState, + const mblas::Tensor& PrevState, + const mblas::Tensor& Embedding) { rnn1_.GetNextState(HiddenState, PrevState, Embedding); } - void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext, - const mblas::Matrix& HiddenState, - const mblas::Matrix& SourceContext) { + void GetAlignedSourceContext(mblas::Tensor& AlignedSourceContext, + const mblas::Tensor& HiddenState, + const mblas::Tensor& SourceContext) { attention_.GetAlignedSourceContext(AlignedSourceContext, HiddenState, SourceContext); } - void GetNextState(mblas::Matrix& State, - const mblas::Matrix& HiddenState, - const mblas::Matrix& AlignedSourceContext) { + void GetNextState(mblas::Tensor& State, + const mblas::Tensor& HiddenState, + const mblas::Tensor& AlignedSourceContext) { rnn2_.GetNextState(State, HiddenState, AlignedSourceContext); } - void GetProbs(const mblas::Matrix& State, - const mblas::Matrix& Embedding, - const mblas::Matrix& AlignedSourceContext) { + void GetProbs(const mblas::Tensor& State, + const mblas::Tensor& Embedding, + const mblas::Tensor& AlignedSourceContext) { softmax_.GetProbs(Probs_, State, Embedding, AlignedSourceContext); } private: - mblas::Matrix HiddenState_; - mblas::Matrix AlignedSourceContext_; + mblas::Tensor HiddenState_; + mblas::Tensor AlignedSourceContext_; mblas::ArrayMatrix Probs_; Embeddings<Weights::Embeddings> embeddings_; diff --git a/src/amun/cpu/nematus/encoder.cpp b/src/amun/cpu/nematus/encoder.cpp index 49b9e8ea..240d7601 100644 --- a/src/amun/cpu/nematus/encoder.cpp +++ b/src/amun/cpu/nematus/encoder.cpp @@ -6,15 +6,15 @@ namespace amunmt { namespace CPU { namespace Nematus { -void Encoder::GetContext(const std::vector<unsigned>& words, mblas::Matrix& context) { - 
std::vector<mblas::Matrix> embeddedWords; +void Encoder::GetContext(const std::vector<unsigned>& words, mblas::Tensor& context) { + std::vector<mblas::Tensor> embeddedWords; context.resize(words.size(), forwardRnn_.GetStateLength() + backwardRnn_.GetStateLength()); for (auto& w : words) { embeddedWords.emplace_back(); - mblas::Matrix &embed = embeddedWords.back(); + mblas::Tensor &embed = embeddedWords.back(); embeddings_.Lookup(embed, w); } diff --git a/src/amun/cpu/nematus/encoder.h b/src/amun/cpu/nematus/encoder.h index 9e32dde6..b55246b4 100644 --- a/src/amun/cpu/nematus/encoder.h +++ b/src/amun/cpu/nematus/encoder.h @@ -1,6 +1,6 @@ #pragma once -#include "../mblas/matrix.h" +#include "../mblas/tensor.h" #include "model.h" #include "gru.h" #include "transition.h" @@ -20,7 +20,7 @@ class Encoder { : w_(model) {} - void Lookup(mblas::Matrix& Row, size_t i) { + void Lookup(mblas::Tensor& Row, size_t i) { size_t len = w_.E_.columns(); if(i < w_.E_.rows()) Row = blaze::submatrix(w_.E_, i, 0, 1, len); @@ -46,9 +46,9 @@ class Encoder { State_ = 0.0f; } - void GetNextState(mblas::Matrix& nextState, - const mblas::Matrix& state, - const mblas::Matrix& embd) { + void GetNextState(mblas::Tensor& nextState, + const mblas::Tensor& state, + const mblas::Tensor& embd) { gru_.GetNextState(nextState, state, embd); // std::cerr << "GRU: " << std::endl; // for (int i = 0; i < 10; ++i) std::cerr << nextState(0, i) << " "; @@ -60,7 +60,7 @@ class Encoder { } template <class It> - void GetContext(It it, It end, mblas::Matrix& Context, bool invert) { + void GetContext(It it, It end, mblas::Tensor& Context, bool invert) { InitializeState(); size_t n = std::distance(it, end); @@ -86,7 +86,7 @@ class Encoder { const GRU<WeightsGRU> gru_; const Transition transition_; - mblas::Matrix State_; + mblas::Tensor State_; }; ///////////////////////////////////////////////////////////////// @@ -98,7 +98,7 @@ class Encoder { {} void GetContext(const std::vector<unsigned>& words, - mblas::Matrix& 
context); + mblas::Tensor& context); private: Embeddings<Weights::Embeddings> embeddings_; diff --git a/src/amun/cpu/nematus/encoder_decoder.cpp b/src/amun/cpu/nematus/encoder_decoder.cpp index 8edc71bb..d3d3697a 100644 --- a/src/amun/cpu/nematus/encoder_decoder.cpp +++ b/src/amun/cpu/nematus/encoder_decoder.cpp @@ -7,7 +7,7 @@ #include "common/sentences.h" #include "cpu/decoder/encoder_decoder_loader.h" -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" using namespace std; @@ -64,17 +64,17 @@ void EncoderDecoder::AssembleBeamState(const State& in, const EDState& edIn = in.get<EDState>(); EDState& edOut = out.get<EDState>(); - edOut.GetStates() = mblas::Assemble<mblas::byRow, mblas::Matrix>(edIn.GetStates(), beamStateIds); + edOut.GetStates() = mblas::Assemble<mblas::byRow, mblas::Tensor>(edIn.GetStates(), beamStateIds); decoder_->Lookup(edOut.GetEmbeddings(), beamWords); } -void EncoderDecoder::GetAttention(mblas::Matrix& Attention) { +void EncoderDecoder::GetAttention(mblas::Tensor& Attention) { decoder_->GetAttention(Attention); } -mblas::Matrix& EncoderDecoder::GetAttention() { +mblas::Tensor& EncoderDecoder::GetAttention() { return decoder_->GetAttention(); } @@ -89,7 +89,7 @@ void EncoderDecoder::Filter(const std::vector<unsigned>& filterIds) { } -BaseMatrix& EncoderDecoder::GetProbs() { +BaseTensor& EncoderDecoder::GetProbs() { return decoder_->GetProbs(); } diff --git a/src/amun/cpu/nematus/encoder_decoder.h b/src/amun/cpu/nematus/encoder_decoder.h index ab81dd28..2fd43abb 100644 --- a/src/amun/cpu/nematus/encoder_decoder.h +++ b/src/amun/cpu/nematus/encoder_decoder.h @@ -8,7 +8,7 @@ #include "cpu/nematus/decoder.h" #include "cpu/nematus/model.h" -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" namespace amunmt { @@ -38,12 +38,12 @@ class EncoderDecoder : public CPUEncoderDecoderBase { const Beam& beam, State& out); - void GetAttention(mblas::Matrix& Attention); - mblas::Matrix& GetAttention(); + void GetAttention(mblas::Tensor& 
Attention); + mblas::Tensor& GetAttention(); unsigned GetVocabSize() const; - BaseMatrix& GetProbs(); + BaseTensor& GetProbs(); void Filter(const std::vector<unsigned>& filterIds); diff --git a/src/amun/cpu/nematus/gru.h b/src/amun/cpu/nematus/gru.h index 33e855fd..3166ca6e 100644 --- a/src/amun/cpu/nematus/gru.h +++ b/src/amun/cpu/nematus/gru.h @@ -1,5 +1,5 @@ #pragma once -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" #include <iomanip> namespace amunmt { @@ -13,15 +13,15 @@ class GRU { layerNormalization_(w_.W_lns_.rows()) { if (!layerNormalization_) { - WWx_ = mblas::Concat<mblas::byColumn, mblas::Matrix>(w_.W_, w_.Wx_); - UUx_ = mblas::Concat<mblas::byColumn, mblas::Matrix>(w_.U_, w_.Ux_); + WWx_ = mblas::Concat<mblas::byColumn, mblas::Tensor>(w_.W_, w_.Wx_); + UUx_ = mblas::Concat<mblas::byColumn, mblas::Tensor>(w_.U_, w_.Ux_); } } void GetNextState( - mblas::Matrix& nextState, - const mblas::Matrix& state, - const mblas::Matrix& context) const + mblas::Tensor& nextState, + const mblas::Tensor& state, + const mblas::Tensor& context) const { // std::cerr << "Get next state" << std::endl; if (layerNormalization_) { @@ -33,7 +33,7 @@ class GRU { mblas::AddBiasVector<mblas::byRow>(RUH_2_, w_.Bx1_); LayerNormalization(RUH_2_, w_.Wx_lns_, w_.Wx_lnb_); - RUH_ = mblas::Concat<mblas::byColumn, mblas::Matrix>(RUH_1_, RUH_2_); + RUH_ = mblas::Concat<mblas::byColumn, mblas::Tensor>(RUH_1_, RUH_2_); Temp_1_ = state * w_.U_; mblas::AddBiasVector<mblas::byRow>(Temp_1_, w_.Bx3_); @@ -43,7 +43,7 @@ class GRU { mblas::AddBiasVector<mblas::byRow>(Temp_2_, w_.Bx2_); LayerNormalization(Temp_2_, w_.Ux_lns_, w_.Ux_lnb_); - Temp_ = mblas::Concat<mblas::byColumn, mblas::Matrix>(Temp_1_, Temp_2_); + Temp_ = mblas::Concat<mblas::byColumn, mblas::Tensor>(Temp_1_, Temp_2_); ElementwiseOpsLayerNorm(nextState, state); @@ -54,7 +54,7 @@ class GRU { } } - void ElementwiseOps(mblas::Matrix& NextState, const mblas::Matrix& State) const { + void ElementwiseOps(mblas::Tensor& 
NextState, const mblas::Tensor& State) const { using namespace mblas; using namespace blaze; @@ -88,7 +88,7 @@ class GRU { } } - void ElementwiseOpsLayerNorm(mblas::Matrix& NextState, const mblas::Matrix& State) const { + void ElementwiseOpsLayerNorm(mblas::Tensor& NextState, const mblas::Tensor& State) const { using namespace mblas; using namespace blaze; @@ -129,21 +129,21 @@ class GRU { private: // Model matrices const Weights& w_; - mutable mblas::Matrix WWx_; - mutable mblas::Matrix UUx_; - mutable mblas::Matrix Wbbx_; - mutable mblas::Matrix lns_WWx_; - mutable mblas::Matrix lns_UUx_; - mutable mblas::Matrix lnb_WWx_; - mutable mblas::Matrix lnb_UUx_; + mutable mblas::Tensor WWx_; + mutable mblas::Tensor UUx_; + mutable mblas::Tensor Wbbx_; + mutable mblas::Tensor lns_WWx_; + mutable mblas::Tensor lns_UUx_; + mutable mblas::Tensor lnb_WWx_; + mutable mblas::Tensor lnb_UUx_; // reused to avoid allocation - mutable mblas::Matrix RUH_; - mutable mblas::Matrix RUH_1_; - mutable mblas::Matrix RUH_2_; - mutable mblas::Matrix Temp_; - mutable mblas::Matrix Temp_1_; - mutable mblas::Matrix Temp_2_; + mutable mblas::Tensor RUH_; + mutable mblas::Tensor RUH_1_; + mutable mblas::Tensor RUH_2_; + mutable mblas::Tensor Temp_; + mutable mblas::Tensor Temp_1_; + mutable mblas::Tensor Temp_2_; bool layerNormalization_; }; diff --git a/src/amun/cpu/nematus/model.cpp b/src/amun/cpu/nematus/model.cpp index de2da9a6..0a024937 100644 --- a/src/amun/cpu/nematus/model.cpp +++ b/src/amun/cpu/nematus/model.cpp @@ -20,13 +20,13 @@ Weights::Transition::Transition(const NpzConverter& model, TransitionType type, switch(type) { case TransitionType::Encoder: Bx1_.emplace_back(1, Ux_.back().dim(1)); - const_cast<mblas::Matrix&>(Bx1_.back()) = 0.0f; + const_cast<mblas::Tensor&>(Bx1_.back()) = 0.0f; Bx2_.emplace_back(model(name(prefix, "bx", infix, i), true)); break; case TransitionType::Decoder: Bx1_.emplace_back(model(name(prefix, "bx", infix, i), true)); Bx2_.emplace_back(1, 
Ux_.back().dim(1)); - const_cast<mblas::Matrix&>(Bx2_.back()) = 0.0f; + const_cast<mblas::Tensor&>(Bx2_.back()) = 0.0f; break; } } @@ -86,8 +86,8 @@ Weights::GRU::GRU(const NpzConverter& model, std::string prefix, std::vector<std Ux_lns_(model[prefix + keys.at(12)]), Ux_lnb_(model[prefix + keys.at(13)]) { - const_cast<mblas::Matrix&>(Bx2_) = 0.0f; - const_cast<mblas::Matrix&>(Bx3_) = 0.0f; + const_cast<mblas::Tensor&>(Bx2_) = 0.0f; + const_cast<mblas::Tensor&>(Bx3_) = 0.0f; } ////////////////////////////////////////////////////////////////////////////// @@ -119,8 +119,8 @@ Weights::DecGRU2::DecGRU2(const NpzConverter& model, std::string prefix, std::ve Ux_lnb_(model[prefix + keys.at(13)]) // Ux_nl_lnb { - const_cast<mblas::Matrix&>(B_) = 0.0f; - const_cast<mblas::Matrix&>(Bx1_) = 0.0f; + const_cast<mblas::Tensor&>(B_) = 0.0f; + const_cast<mblas::Tensor&>(Bx1_) = 0.0f; } Weights::DecAttention::DecAttention(const NpzConverter& model) diff --git a/src/amun/cpu/nematus/model.h b/src/amun/cpu/nematus/model.h index 82fe09f3..b378b434 100644 --- a/src/amun/cpu/nematus/model.h +++ b/src/amun/cpu/nematus/model.h @@ -6,7 +6,7 @@ #include "cpu/npz_converter.h" -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" namespace amunmt { namespace CPU { @@ -35,16 +35,16 @@ struct Weights { TransitionType type_; public: - std::vector<mblas::Matrix> B_; - std::vector<mblas::Matrix> Bx1_; - std::vector<mblas::Matrix> Bx2_; - std::vector<mblas::Matrix> U_; - std::vector<mblas::Matrix> Ux_; + std::vector<mblas::Tensor> B_; + std::vector<mblas::Tensor> Bx1_; + std::vector<mblas::Tensor> Bx2_; + std::vector<mblas::Tensor> U_; + std::vector<mblas::Tensor> Ux_; - std::vector<mblas::Matrix> U_lns_; - std::vector<mblas::Matrix> U_lnb_; - std::vector<mblas::Matrix> Ux_lns_; - std::vector<mblas::Matrix> Ux_lnb_; + std::vector<mblas::Tensor> U_lns_; + std::vector<mblas::Tensor> U_lnb_; + std::vector<mblas::Tensor> Ux_lns_; + std::vector<mblas::Tensor> Ux_lnb_; }; @@ -52,93 +52,93 @@ 
struct Weights { Embeddings(const NpzConverter& model, const std::string &key); Embeddings(const NpzConverter& model, const std::vector<std::pair<std::string, bool>> keys); - const mblas::Matrix E_; + const mblas::Tensor E_; }; struct GRU { GRU(const NpzConverter& model, std::string prefix, std::vector<std::string> keys); - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix Wx_; - const mblas::Matrix Bx1_; - const mblas::Matrix Bx2_; - const mblas::Matrix Bx3_; - const mblas::Matrix Ux_; - - const mblas::Matrix W_lns_; - const mblas::Matrix W_lnb_; - const mblas::Matrix Wx_lns_; - const mblas::Matrix Wx_lnb_; - const mblas::Matrix U_lns_; - const mblas::Matrix U_lnb_; - const mblas::Matrix Ux_lns_; - const mblas::Matrix Ux_lnb_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor Wx_; + const mblas::Tensor Bx1_; + const mblas::Tensor Bx2_; + const mblas::Tensor Bx3_; + const mblas::Tensor Ux_; + + const mblas::Tensor W_lns_; + const mblas::Tensor W_lnb_; + const mblas::Tensor Wx_lns_; + const mblas::Tensor Wx_lnb_; + const mblas::Tensor U_lns_; + const mblas::Tensor U_lnb_; + const mblas::Tensor Ux_lns_; + const mblas::Tensor Ux_lnb_; }; struct DecInit { DecInit(const NpzConverter& model); - const mblas::Matrix Wi_; - const mblas::Matrix Bi_; - const mblas::Matrix lns_; - const mblas::Matrix lnb_; + const mblas::Tensor Wi_; + const mblas::Tensor Bi_; + const mblas::Tensor lns_; + const mblas::Tensor lnb_; }; struct DecGRU2 { DecGRU2(const NpzConverter& model, std::string prefix, std::vector<std::string> keys); - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix Wx_; - const mblas::Matrix Bx3_; - const mblas::Matrix Bx2_; - const mblas::Matrix Bx1_; - const mblas::Matrix Ux_; - - const mblas::Matrix W_lns_; - const mblas::Matrix W_lnb_; - const mblas::Matrix Wx_lns_; - const mblas::Matrix Wx_lnb_; - const mblas::Matrix 
U_lns_; - const mblas::Matrix U_lnb_; - const mblas::Matrix Ux_lns_; - const mblas::Matrix Ux_lnb_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor Wx_; + const mblas::Tensor Bx3_; + const mblas::Tensor Bx2_; + const mblas::Tensor Bx1_; + const mblas::Tensor Ux_; + + const mblas::Tensor W_lns_; + const mblas::Tensor W_lnb_; + const mblas::Tensor Wx_lns_; + const mblas::Tensor Wx_lnb_; + const mblas::Tensor U_lns_; + const mblas::Tensor U_lnb_; + const mblas::Tensor Ux_lns_; + const mblas::Tensor Ux_lnb_; }; struct DecAttention { DecAttention(const NpzConverter& model); - const mblas::Matrix V_; - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix C_; - const mblas::Matrix Wc_att_lns_; - const mblas::Matrix Wc_att_lnb_; - const mblas::Matrix W_comb_lns_; - const mblas::Matrix W_comb_lnb_; + const mblas::Tensor V_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor C_; + const mblas::Tensor Wc_att_lns_; + const mblas::Tensor Wc_att_lnb_; + const mblas::Tensor W_comb_lns_; + const mblas::Tensor W_comb_lnb_; }; struct DecSoftmax { DecSoftmax(const NpzConverter& model); - const mblas::Matrix W1_; - const mblas::Matrix B1_; - const mblas::Matrix W2_; - const mblas::Matrix B2_; - const mblas::Matrix W3_; - const mblas::Matrix B3_; - const mblas::Matrix W4_; - const mblas::Matrix B4_; - const mblas::Matrix lns_1_; - const mblas::Matrix lns_2_; - const mblas::Matrix lns_3_; - const mblas::Matrix lnb_1_; - const mblas::Matrix lnb_2_; - const mblas::Matrix lnb_3_; + const mblas::Tensor W1_; + const mblas::Tensor B1_; + const mblas::Tensor W2_; + const mblas::Tensor B2_; + const mblas::Tensor W3_; + const mblas::Tensor B3_; + const mblas::Tensor W4_; + const mblas::Tensor B4_; + const mblas::Tensor lns_1_; + const mblas::Tensor lns_2_; + const mblas::Tensor lns_3_; + const mblas::Tensor lnb_1_; + const mblas::Tensor lnb_2_; + const 
mblas::Tensor lnb_3_; }; diff --git a/src/amun/cpu/nematus/transition.cpp b/src/amun/cpu/nematus/transition.cpp index a5ac1c8f..d019c9fa 100644 --- a/src/amun/cpu/nematus/transition.cpp +++ b/src/amun/cpu/nematus/transition.cpp @@ -14,7 +14,7 @@ Transition::Transition(const Weights::Transition& model) } -void Transition::GetNextState(mblas::Matrix& state) const +void Transition::GetNextState(mblas::Tensor& state) const { if (layerNormalization_) { for (int i = 0; i < w_.size(); ++i) { @@ -51,7 +51,7 @@ void Transition::GetNextState(mblas::Matrix& state) const } -void Transition::ElementwiseOps(mblas::Matrix& state, int idx) const { +void Transition::ElementwiseOps(mblas::Tensor& state, int idx) const { using namespace mblas; using namespace blaze; diff --git a/src/amun/cpu/nematus/transition.h b/src/amun/cpu/nematus/transition.h index 3db3c72e..651ea9ca 100644 --- a/src/amun/cpu/nematus/transition.h +++ b/src/amun/cpu/nematus/transition.h @@ -1,6 +1,6 @@ #pragma once -#include "cpu/mblas/matrix.h" +#include "cpu/mblas/tensor.h" #include "model.h" namespace amunmt { @@ -11,23 +11,23 @@ class Transition { public: Transition(const Weights::Transition& model); - void GetNextState(mblas::Matrix& state) const; + void GetNextState(mblas::Tensor& state) const; protected: - void ElementwiseOps(mblas::Matrix& state, int idx) const; + void ElementwiseOps(mblas::Tensor& state, int idx) const; private: // Model matrices const Weights::Transition& w_; // reused to avoid allocation - mutable mblas::Matrix UUx_; - mutable mblas::Matrix RUH_; - mutable mblas::Matrix RUH_1_; - mutable mblas::Matrix RUH_2_; - mutable mblas::Matrix Temp_; - mutable mblas::Matrix Temp_1_; - mutable mblas::Matrix Temp_2_; + mutable mblas::Tensor UUx_; + mutable mblas::Tensor RUH_; + mutable mblas::Tensor RUH_1_; + mutable mblas::Tensor RUH_2_; + mutable mblas::Tensor Temp_; + mutable mblas::Tensor Temp_1_; + mutable mblas::Tensor Temp_2_; bool layerNormalization_; }; diff --git 
a/src/amun/cpu/npz_converter.h b/src/amun/cpu/npz_converter.h index dfc7381d..15414c2b 100644 --- a/src/amun/cpu/npz_converter.h +++ b/src/amun/cpu/npz_converter.h @@ -1,7 +1,7 @@ #pragma once #include "cnpy/cnpy.h" -#include "mblas/matrix.h" +#include "mblas/tensor.h" namespace amunmt { namespace CPU { @@ -66,7 +66,7 @@ class NpzConverter { destructed_ = true; } - mblas::Matrix operator[](const std::string& key) const { + mblas::Tensor operator[](const std::string& key) const { BlazeWrapper matrix; auto it = model_.find(key); if(it != model_.end()) { @@ -79,19 +79,19 @@ class NpzConverter { } } - mblas::Matrix ret; + mblas::Tensor ret; ret = matrix; return std::move(ret); } - mblas::Matrix getFirstOfMany(const std::vector<std::pair<std::string, bool>> keys) const { + mblas::Tensor getFirstOfMany(const std::vector<std::pair<std::string, bool>> keys) const { BlazeWrapper matrix; for (auto key : keys) { auto it = model_.find(key.first); if(it != model_.end()) { NpyMatrixWrapper np(it->second); matrix = BlazeWrapper(np.data(), np.size1(), np.size2()); - mblas::Matrix ret; + mblas::Tensor ret; if (key.second) { const auto matrix2 = blaze::trans(matrix); ret = matrix2; @@ -103,11 +103,11 @@ class NpzConverter { } std::cerr << "Matrix not found: " << keys[0].first << "\n"; - mblas::Matrix ret; + mblas::Tensor ret; return std::move(ret); } - mblas::Matrix operator()(const std::string& key, + mblas::Tensor operator()(const std::string& key, bool transpose) const { BlazeWrapper matrix; auto it = model_.find(key); @@ -117,7 +117,7 @@ class NpzConverter { } else { std::cerr << "Missing " << key << std::endl; } - mblas::Matrix ret; + mblas::Tensor ret; if (transpose) { const auto matrix2 = blaze::trans(matrix); ret = matrix2; diff --git a/src/amun/fpga/best_hyps.cpp b/src/amun/fpga/best_hyps.cpp index 029f66e3..be9f35ec 100644 --- a/src/amun/fpga/best_hyps.cpp +++ b/src/amun/fpga/best_hyps.cpp @@ -10,7 +10,7 @@ namespace amunmt { namespace FPGA { BestHyps::BestHyps(const God 
&god, const OpenCLInfo &openCLInfo) -: BestHypsBase( +: BaseBestHyps( !god.Get<bool>("allow-unk"), god.Get<bool>("n-best"), god.Get<std::vector<std::string>>("softmax-filter").size(), @@ -23,12 +23,12 @@ BestHyps::BestHyps(const God &god, const OpenCLInfo &openCLInfo) //std::cerr << "BestHyps::BestHyps" << std::endl; } -void BestHyps::DisAllowUNK(mblas::Matrix& Prob) +void BestHyps::DisAllowUNK(mblas::Tensor& Prob) { SetColumn(Prob, UNK_ID, std::numeric_limits<float>::lowest()); } -void BestHyps::FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs, +void BestHyps::FindBests(const std::vector<uint>& beamSizes, mblas::Tensor& Probs, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, const bool isFirst) @@ -47,7 +47,7 @@ void BestHyps::CalcBeam( /* using namespace mblas; - mblas::Matrix& Probs = static_cast<mblas::Matrix&>(scorers[0]->GetProbs()); + mblas::Tensor& Probs = static_cast<mblas::Tensor&>(scorers[0]->GetProbs()); //cerr << "Probs=" << Probs.Debug(1) << endl; std::vector<float> vCosts; @@ -66,7 +66,7 @@ void BestHyps::CalcBeam( //std::cerr << "1Probs=" << Probs.Debug(1) << std::endl; for (size_t i = 1; i < scorers.size(); ++i) { - mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs()); + mblas::Tensor &currProbs = static_cast<mblas::Tensor&>(scorers[i]->GetProbs()); float weight = weights_.at(scorers[0]->GetName()); ElementAddWeighted(Probs, weight, currProbs); diff --git a/src/amun/fpga/best_hyps.h b/src/amun/fpga/best_hyps.h index 625d07c3..be26e482 100644 --- a/src/amun/fpga/best_hyps.h +++ b/src/amun/fpga/best_hyps.h @@ -7,14 +7,14 @@ namespace amunmt { namespace FPGA { -class BestHyps : public BestHypsBase +class BestHyps : public BaseBestHyps { public: BestHyps(const God &god, const OpenCLInfo &openCLInfo); - void DisAllowUNK(mblas::Matrix& Prob); + void DisAllowUNK(mblas::Tensor& Prob); - void FindBests(const std::vector<uint>& beamSizes, mblas::Matrix& Probs, + void FindBests(const std::vector<uint>& 
beamSizes, mblas::Tensor& Probs, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, const bool isFirst); diff --git a/src/amun/fpga/decoder.cpp b/src/amun/fpga/decoder.cpp index d11d11c5..dc37b5b2 100644 --- a/src/amun/fpga/decoder.cpp +++ b/src/amun/fpga/decoder.cpp @@ -3,8 +3,8 @@ namespace amunmt { namespace FPGA { -void Decoder::EmptyState(mblas::Matrix& State, - const mblas::Matrix& SourceContext, +void Decoder::EmptyState(mblas::Tensor& State, + const mblas::Tensor& SourceContext, size_t batchSize, const Array<int>& batchMapping) { @@ -12,15 +12,15 @@ void Decoder::EmptyState(mblas::Matrix& State, alignment_.Init(SourceContext); } -void Decoder::EmptyEmbedding(mblas::Matrix& Embedding, size_t batchSize) { +void Decoder::EmptyEmbedding(mblas::Tensor& Embedding, size_t batchSize) { Embedding.Resize(batchSize, embeddings_.GetCols()); mblas::Fill(Embedding, 0); } -void Decoder::Decode(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Embeddings, - const mblas::Matrix& SourceContext, +void Decoder::Decode(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Embeddings, + const mblas::Tensor& SourceContext, const Array<int>& mapping, const std::vector<uint>& beamSizes) { @@ -40,15 +40,15 @@ void Decoder::Decode(mblas::Matrix& NextState, } -void Decoder::GetHiddenState(mblas::Matrix& HiddenState, - const mblas::Matrix& PrevState, - const mblas::Matrix& Embedding) { +void Decoder::GetHiddenState(mblas::Tensor& HiddenState, + const mblas::Tensor& PrevState, + const mblas::Tensor& Embedding) { rnn1_.GetNextState(HiddenState, PrevState, Embedding); } -void Decoder::GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext, - const mblas::Matrix& HiddenState, - const mblas::Matrix& SourceContext, +void Decoder::GetAlignedSourceContext(mblas::Tensor& AlignedSourceContext, + const mblas::Tensor& HiddenState, + const mblas::Tensor& SourceContext, const Array<int>& mapping, const std::vector<uint>& 
beamSizes) { @@ -57,21 +57,21 @@ void Decoder::GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext, } -void Decoder::GetNextState(mblas::Matrix& State, - const mblas::Matrix& HiddenState, - const mblas::Matrix& AlignedSourceContext) +void Decoder::GetNextState(mblas::Tensor& State, + const mblas::Tensor& HiddenState, + const mblas::Tensor& AlignedSourceContext) { rnn2_.GetNextState(State, HiddenState, AlignedSourceContext); } -void Decoder::GetProbs(const mblas::Matrix& State, - const mblas::Matrix& Embedding, - const mblas::Matrix& AlignedSourceContext) +void Decoder::GetProbs(const mblas::Tensor& State, + const mblas::Tensor& Embedding, + const mblas::Tensor& AlignedSourceContext) { softmax_.GetProbs(Probs_, State, Embedding, AlignedSourceContext); } -void Decoder::Lookup(mblas::Matrix& Embedding, +void Decoder::Lookup(mblas::Tensor& Embedding, const std::vector<uint>& w) { embeddings_.Lookup(Embedding, w); diff --git a/src/amun/fpga/decoder.h b/src/amun/fpga/decoder.h index 7314b1ec..124a1585 100644 --- a/src/amun/fpga/decoder.h +++ b/src/amun/fpga/decoder.h @@ -22,7 +22,7 @@ class Decoder { , indices_(openCLInfo) {} - void Lookup(mblas::Matrix& Rows, const std::vector<uint>& ids) + void Lookup(mblas::Tensor& Rows, const std::vector<uint>& ids) { using namespace mblas; std::vector<uint> tids = ids; @@ -60,8 +60,8 @@ class Decoder { , Temp2_(openCLInfo) {} - void InitializeState(mblas::Matrix& State, - const mblas::Matrix& SourceContext, + void InitializeState(mblas::Tensor& State, + const mblas::Tensor& SourceContext, const size_t batchSize, const Array<int>& mapping) { @@ -92,9 +92,9 @@ class Decoder { } - void GetNextState(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Context) { + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Context) { gru_.GetNextState(NextState, State, Context); } @@ -102,8 +102,8 @@ class Decoder { const Weights1& w_; const GRU<Weights2> gru_; - 
mblas::Matrix Temp1_; - mblas::Matrix Temp2_; + mblas::Tensor Temp1_; + mblas::Tensor Temp2_; }; template <class Weights> @@ -112,9 +112,9 @@ class Decoder { RNNFinal(const OpenCLInfo &openCLInfo, const Weights& model) : gru_(openCLInfo, model) {} - void GetNextState(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Context) + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Context) { gru_.GetNextState(NextState, State, Context); } @@ -136,7 +136,7 @@ class Decoder { { } - void Init(const mblas::Matrix& SourceContext) + void Init(const mblas::Tensor& SourceContext) { using namespace mblas; @@ -149,9 +149,9 @@ class Decoder { } } - void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext, - const mblas::Matrix& HiddenState, - const mblas::Matrix& SourceContext, + void GetAlignedSourceContext(mblas::Tensor& AlignedSourceContext, + const mblas::Tensor& HiddenState, + const mblas::Tensor& SourceContext, const Array<int>& mapping, const std::vector<uint>& beamSizes) { @@ -218,7 +218,7 @@ class Decoder { //std::cerr << "2AlignedSourceContext=" << AlignedSourceContext.Debug() << std::endl; } - mblas::Matrix& GetAttention() { + mblas::Tensor& GetAttention() { return A_; } @@ -227,10 +227,10 @@ class Decoder { Array<int> dBatchMapping_; - mblas::Matrix SCU_; - mblas::Matrix Temp1_; - mblas::Matrix Temp2_; - mblas::Matrix A_; + mblas::Tensor SCU_; + mblas::Tensor Temp1_; + mblas::Tensor Temp2_; + mblas::Tensor A_; }; @@ -245,10 +245,10 @@ class Decoder { { } - void GetProbs(mblas::Matrix& Probs, - const mblas::Matrix& State, - const mblas::Matrix& Embedding, - const mblas::Matrix& AlignedSourceContext) + void GetProbs(mblas::Tensor& Probs, + const mblas::Tensor& State, + const mblas::Tensor& Embedding, + const mblas::Tensor& AlignedSourceContext) { using namespace mblas; @@ -295,9 +295,9 @@ class Decoder { const Weights& w_; bool filtered_; - mblas::Matrix T1_; - mblas::Matrix T2_; - 
mblas::Matrix T3_; + mblas::Tensor T1_; + mblas::Tensor T2_; + mblas::Tensor T3_; }; @@ -317,53 +317,53 @@ public: return embeddings_.GetRows(); } - mblas::Matrix& GetProbs() { + mblas::Tensor& GetProbs() { return Probs_; } - mblas::Matrix& GetAttention() { + mblas::Tensor& GetAttention() { return alignment_.GetAttention(); } - void EmptyState(mblas::Matrix& State, - const mblas::Matrix& SourceContext, + void EmptyState(mblas::Tensor& State, + const mblas::Tensor& SourceContext, size_t batchSize, const Array<int>& batchMapping); - void EmptyEmbedding(mblas::Matrix& Embedding, size_t batchSize = 1); + void EmptyEmbedding(mblas::Tensor& Embedding, size_t batchSize = 1); - void Decode(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Embeddings, - const mblas::Matrix& SourceContext, + void Decode(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Embeddings, + const mblas::Tensor& SourceContext, const Array<int>& mapping, const std::vector<uint>& beamSizes); - void GetHiddenState(mblas::Matrix& HiddenState, - const mblas::Matrix& PrevState, - const mblas::Matrix& Embedding); + void GetHiddenState(mblas::Tensor& HiddenState, + const mblas::Tensor& PrevState, + const mblas::Tensor& Embedding); - void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext, - const mblas::Matrix& HiddenState, - const mblas::Matrix& SourceContext, + void GetAlignedSourceContext(mblas::Tensor& AlignedSourceContext, + const mblas::Tensor& HiddenState, + const mblas::Tensor& SourceContext, const Array<int>& mapping, const std::vector<uint>& beamSizes); - void GetNextState(mblas::Matrix& State, - const mblas::Matrix& HiddenState, - const mblas::Matrix& AlignedSourceContext); + void GetNextState(mblas::Tensor& State, + const mblas::Tensor& HiddenState, + const mblas::Tensor& AlignedSourceContext); - void GetProbs(const mblas::Matrix& State, - const mblas::Matrix& Embedding, - const mblas::Matrix& AlignedSourceContext); + void 
GetProbs(const mblas::Tensor& State, + const mblas::Tensor& Embedding, + const mblas::Tensor& AlignedSourceContext); - void Lookup(mblas::Matrix& Embedding, + void Lookup(mblas::Tensor& Embedding, const std::vector<uint>& w); private: - mblas::Matrix HiddenState_; - mblas::Matrix AlignedSourceContext_; - mblas::Matrix Probs_; + mblas::Tensor HiddenState_; + mblas::Tensor AlignedSourceContext_; + mblas::Tensor Probs_; Embeddings<Weights::DecEmbeddings> embeddings_; RNNHidden<Weights::DecInit, Weights::DecGRU1> rnn1_; diff --git a/src/amun/fpga/encoder.cpp b/src/amun/fpga/encoder.cpp index 03c945df..f65d342a 100644 --- a/src/amun/fpga/encoder.cpp +++ b/src/amun/fpga/encoder.cpp @@ -37,7 +37,7 @@ std::vector<std::vector<size_t>> GetBatchInput(const Sentences& source, size_t t return matrix; } -void Encoder::Encode(const Sentences& source, size_t tab, mblas::Matrix& context, +void Encoder::Encode(const Sentences& source, size_t tab, mblas::Tensor& context, Array<int>& dMapping) { size_t maxSentenceLength = GetMaxLength(source, tab); diff --git a/src/amun/fpga/encoder.h b/src/amun/fpga/encoder.h index 7919923f..57701796 100644 --- a/src/amun/fpga/encoder.h +++ b/src/amun/fpga/encoder.h @@ -17,7 +17,7 @@ class Encoder { : w_(model) {} - void Lookup(const OpenCLInfo &openCLInfo, mblas::Matrix& Row, const Words& words) + void Lookup(const OpenCLInfo &openCLInfo, mblas::Tensor& Row, const Words& words) { std::vector<uint> knownWords(words.size(), 1); for (size_t i = 0; i < words.size(); ++i) { @@ -68,19 +68,19 @@ class Encoder { mblas::Fill(State_, 0.0f); } - void GetNextState(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Embd) { + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Embd) { gru_.GetNextState(NextState, State, Embd); } template <class It> - void Encode(It it, It end, mblas::Matrix& Context, size_t batchSize, bool invert, + void Encode(It it, It end, mblas::Tensor& Context, size_t 
batchSize, bool invert, const Array<int>* mapping=nullptr) { InitializeState(batchSize); - mblas::Matrix prevState(State_); + mblas::Tensor prevState(State_); //std::cerr << "State_=" << State_.Debug(1) << std::endl; //std::cerr << "prevState=" << prevState.Debug(1) << std::endl; @@ -118,14 +118,14 @@ class Encoder { // Model matrices const GRU<Weights> gru_; - mblas::Matrix State_; + mblas::Tensor State_; }; public: Encoder(const OpenCLInfo &openCLInfo, const Weights& model); - void Encode(const Sentences& source, size_t tab, mblas::Matrix& Context, + void Encode(const Sentences& source, size_t tab, mblas::Tensor& Context, Array<int>& dMapping); protected: @@ -134,8 +134,8 @@ protected: RNN<Weights::EncBackwardGRU> backwardRnn_; // reusing memory - std::vector<mblas::Matrix> embeddedWords_; - mblas::Matrix Context; + std::vector<mblas::Tensor> embeddedWords_; + mblas::Tensor Context; const OpenCLInfo &openCLInfo_; diff --git a/src/amun/fpga/encoder_decoder.cpp b/src/amun/fpga/encoder_decoder.cpp index 9c7bc577..1848bfe0 100644 --- a/src/amun/fpga/encoder_decoder.cpp +++ b/src/amun/fpga/encoder_decoder.cpp @@ -106,7 +106,7 @@ unsigned EncoderDecoder::GetVocabSize() const return decoder_->GetVocabSize(); } -BaseMatrix& EncoderDecoder::GetProbs() +BaseTensor& EncoderDecoder::GetProbs() { return decoder_->GetProbs(); } diff --git a/src/amun/fpga/encoder_decoder.h b/src/amun/fpga/encoder_decoder.h index 127f80d4..e5c4f1f6 100644 --- a/src/amun/fpga/encoder_decoder.h +++ b/src/amun/fpga/encoder_decoder.h @@ -41,11 +41,11 @@ public: virtual size_t GetVocabSize() const; - virtual BaseMatrix& GetProbs(); + virtual BaseTensor& GetProbs(); protected: const Weights& model_; - mblas::Matrix sourceContext_; + mblas::Tensor sourceContext_; std::unique_ptr<Encoder> encoder_; std::unique_ptr<Decoder> decoder_; diff --git a/src/amun/fpga/encoder_decoder_loader.cpp b/src/amun/fpga/encoder_decoder_loader.cpp index 5ff380e4..fa146bbb 100644 --- 
a/src/amun/fpga/encoder_decoder_loader.cpp +++ b/src/amun/fpga/encoder_decoder_loader.cpp @@ -63,10 +63,10 @@ ScorerPtr EncoderDecoderLoader::NewScorer(const God &god, const DeviceInfo &devi return ScorerPtr(ed); } -BestHypsBasePtr EncoderDecoderLoader::GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const +BaseBestHypsPtr EncoderDecoderLoader::GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const { BestHyps *obj = new BestHyps(god, openCLInfo_); - return BestHypsBasePtr(obj); + return BaseBestHypsPtr(obj); } diff --git a/src/amun/fpga/encoder_decoder_loader.h b/src/amun/fpga/encoder_decoder_loader.h index dac88962..d450938c 100644 --- a/src/amun/fpga/encoder_decoder_loader.h +++ b/src/amun/fpga/encoder_decoder_loader.h @@ -18,7 +18,7 @@ public: virtual void Load(const God &god); virtual ScorerPtr NewScorer(const God &god, const DeviceInfo &deviceInfo) const; - virtual BestHypsBasePtr GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const; + virtual BaseBestHypsPtr GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const; protected: diff --git a/src/amun/fpga/encoder_decoder_state.cpp b/src/amun/fpga/encoder_decoder_state.cpp index 17bde56f..ab49ce4b 100644 --- a/src/amun/fpga/encoder_decoder_state.cpp +++ b/src/amun/fpga/encoder_decoder_state.cpp @@ -13,19 +13,19 @@ EncoderDecoderState::EncoderDecoderState(const OpenCLInfo &openCLInfo) } -mblas::Matrix& EncoderDecoderState::GetStates() { +mblas::Tensor& EncoderDecoderState::GetStates() { return states_; } -const mblas::Matrix& EncoderDecoderState::GetStates() const { +const mblas::Tensor& EncoderDecoderState::GetStates() const { return states_; } -mblas::Matrix& EncoderDecoderState::GetEmbeddings() { +mblas::Tensor& EncoderDecoderState::GetEmbeddings() { return embeddings_; } -const mblas::Matrix& EncoderDecoderState::GetEmbeddings() const { +const mblas::Tensor& EncoderDecoderState::GetEmbeddings() const { return embeddings_; } diff --git a/src/amun/fpga/encoder_decoder_state.h 
b/src/amun/fpga/encoder_decoder_state.h index 8b569c99..e508c090 100644 --- a/src/amun/fpga/encoder_decoder_state.h +++ b/src/amun/fpga/encoder_decoder_state.h @@ -9,16 +9,16 @@ class EncoderDecoderState : public State { public: EncoderDecoderState(const OpenCLInfo &openCLInfo); - mblas::Matrix& GetStates(); - mblas::Matrix& GetEmbeddings(); - const mblas::Matrix& GetStates() const; - const mblas::Matrix& GetEmbeddings() const; + mblas::Tensor& GetStates(); + mblas::Tensor& GetEmbeddings(); + const mblas::Tensor& GetStates() const; + const mblas::Tensor& GetEmbeddings() const; virtual std::string Debug(size_t verbosity = 1) const; protected: - mblas::Matrix states_; - mblas::Matrix embeddings_; + mblas::Tensor states_; + mblas::Tensor embeddings_; }; diff --git a/src/amun/fpga/gru.h b/src/amun/fpga/gru.h index 362a3435..84294b19 100644 --- a/src/amun/fpga/gru.h +++ b/src/amun/fpga/gru.h @@ -24,9 +24,9 @@ public: return w_.U_.dim(0); } - void GetNextState(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Context) const + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Context) const { using namespace mblas; @@ -92,13 +92,13 @@ protected: const Weights& w_; // reused to avoid allocation - mutable mblas::Matrix RU_; - mutable mblas::Matrix H_; - mutable mblas::Matrix R_; - mutable mblas::Matrix U_; + mutable mblas::Tensor RU_; + mutable mblas::Tensor H_; + mutable mblas::Tensor R_; + mutable mblas::Tensor U_; - mutable mblas::Matrix Temp1_; - mutable mblas::Matrix Temp2_; + mutable mblas::Tensor Temp1_; + mutable mblas::Tensor Temp2_; }; @@ -148,9 +148,9 @@ public: return w_.U_.dim(0); } - void GetNextState(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& Context) const + void GetNextState(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& Context) const { using namespace mblas; @@ -181,10 +181,10 @@ public: } - void 
ElementwiseOps(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& RUH, - const mblas::Matrix& Temp) const + void ElementwiseOps(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& RUH, + const mblas::Tensor& Temp) const { const uint rows = State.dim(0) * State.dim(2) * State.dim(3); const uint cols = State.dim(1); @@ -202,11 +202,11 @@ protected: const Weights& w_; // reused to avoid allocation - mutable mblas::Matrix WWx_; - mutable mblas::Matrix UUx_; + mutable mblas::Tensor WWx_; + mutable mblas::Tensor UUx_; - mutable mblas::Matrix RUH_; - mutable mblas::Matrix Temp_; + mutable mblas::Tensor RUH_; + mutable mblas::Tensor Temp_; }; diff --git a/src/amun/fpga/matrix.cpp b/src/amun/fpga/matrix.cpp index d47cb13f..42e6c03c 100644 --- a/src/amun/fpga/matrix.cpp +++ b/src/amun/fpga/matrix.cpp @@ -78,7 +78,7 @@ void Matrix::Reshape2D() std::string Matrix::Debug(size_t verbosity) const { std::stringstream strm; - strm << BaseMatrix::Debug(verbosity) << " " << arr_.Debug(verbosity); + strm << BaseTensor::Debug(verbosity) << " " << arr_.Debug(verbosity); //cerr << "Debug1=" << strm.str() << endl; return strm.str(); diff --git a/src/amun/fpga/matrix.h b/src/amun/fpga/matrix.h index 94ee30e4..6af26c19 100644 --- a/src/amun/fpga/matrix.h +++ b/src/amun/fpga/matrix.h @@ -1,5 +1,5 @@ #pragma once -#include "common/base_matrix.h" +#include "common/base_tensor.h" #include "types-fpga.h" #include "array.h" @@ -7,7 +7,7 @@ namespace amunmt { namespace FPGA { namespace mblas { -class Matrix : public BaseMatrix { +class Matrix : public BaseTensor { public: Matrix(const OpenCLInfo &openCLInfo); Matrix(const OpenCLInfo &openCLInfo, size_t rows, size_t cols, bool zero = false); diff --git a/src/amun/fpga/matrix_functions.cpp b/src/amun/fpga/matrix_functions.cpp index 017bb062..7d373420 100644 --- a/src/amun/fpga/matrix_functions.cpp +++ b/src/amun/fpga/matrix_functions.cpp @@ -233,13 +233,13 @@ Matrix& Prod(Matrix& C, const 
Matrix& A, const Matrix& B, return C; } -void ElementwiseOps(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& RUH, - const mblas::Matrix& Temp, - const mblas::Matrix& B, - const mblas::Matrix& Bx1, - const mblas::Matrix& Bx2, +void ElementwiseOps(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& RUH, + const mblas::Tensor& Temp, + const mblas::Tensor& B, + const mblas::Tensor& Bx1, + const mblas::Tensor& Bx2, const uint &rows, const uint &cols) { @@ -536,7 +536,7 @@ void SetColumn(Matrix& In, int noColumn, float value) void MaxElement( Array<float> &d_out, const Array<int> &d_ind, - mblas::Matrix &d_in, + mblas::Tensor &d_in, int numBatches, const Array<int> &batchFirstElementIdxs) { @@ -554,7 +554,7 @@ void MaxElement( void NthElement( Array<float>& d_out, Array<unsigned> &d_ind, - const mblas::Matrix &Probs, + const mblas::Tensor &Probs, const Array<uint> &beamSizes, size_t maxBatchSize, const Array<uint> &d_cummulatedBeamSizes, diff --git a/src/amun/fpga/matrix_functions.h b/src/amun/fpga/matrix_functions.h index b6684b37..af4a1cc4 100644 --- a/src/amun/fpga/matrix_functions.h +++ b/src/amun/fpga/matrix_functions.h @@ -56,13 +56,13 @@ Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B, inline void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, float eps) {} -void ElementwiseOps(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& RUH, - const mblas::Matrix& Temp, - const mblas::Matrix& B, - const mblas::Matrix& Bx1, - const mblas::Matrix& Bx2, +void ElementwiseOps(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& RUH, + const mblas::Tensor& Temp, + const mblas::Tensor& B, + const mblas::Tensor& Bx1, + const mblas::Tensor& Bx2, const uint &rows, const uint &cols); @@ -106,7 +106,7 @@ void SetColumn(Matrix& In, int noColumn, float value); void MaxElement( Array<float> &d_out, const Array<int> &d_ind, - mblas::Matrix &d_in, + 
mblas::Tensor &d_in, int numBatches, const Array<int> &batchFirstElementIdxs); //float* d_out, int* d_ind, float* d_in, int numBatches, int* batchFirstElementIdxs @@ -114,7 +114,7 @@ void MaxElement( void NthElement( Array<float>& d_out, Array<unsigned> &d_ind, - const mblas::Matrix &Probs, + const mblas::Tensor &Probs, const Array<uint> &beamSizes, size_t maxBatchSize, const Array<uint> &d_cummulatedBeamSizes, diff --git a/src/amun/fpga/model.h b/src/amun/fpga/model.h index 8d681291..c0a274db 100644 --- a/src/amun/fpga/model.h +++ b/src/amun/fpga/model.h @@ -17,7 +17,7 @@ struct Weights { //std::cerr << "E_=" << E_.Debug() << std::endl; } - const mblas::Matrix E_; + const mblas::Tensor E_; }; struct EncForwardGRU { @@ -33,15 +33,15 @@ struct Weights { Gamma_2_(model.GetMatrix(openCLInfo, "encoder_gamma2")) { } - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix Wx_; - const mblas::Matrix Bx1_; - const mblas::Matrix Bx2_; - const mblas::Matrix Ux_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor Wx_; + const mblas::Tensor Bx1_; + const mblas::Tensor Bx2_; + const mblas::Tensor Ux_; + const mblas::Tensor Gamma_1_; + const mblas::Tensor Gamma_2_; }; struct EncBackwardGRU { @@ -57,15 +57,15 @@ struct Weights { Gamma_2_(model.GetMatrix(openCLInfo, "encoder_r_gamma2")) {} - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix Wx_; - const mblas::Matrix Bx1_; - const mblas::Matrix Bx2_; - const mblas::Matrix Ux_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor Wx_; + const mblas::Tensor Bx1_; + const mblas::Tensor Bx2_; + const mblas::Tensor Ux_; + const mblas::Tensor Gamma_1_; + const mblas::Tensor Gamma_2_; }; 
////////////////////////////////////////////////////////////////////////////// @@ -74,7 +74,7 @@ struct Weights { : E_(model.GetMatrix(openCLInfo, "Wemb_dec")) {} - const mblas::Matrix E_; + const mblas::Tensor E_; }; struct DecInit { @@ -84,9 +84,9 @@ struct Weights { Gamma_(model.GetMatrix(openCLInfo, "ff_state_gamma")) {} - const mblas::Matrix Wi_; - const mblas::Matrix Bi_; - const mblas::Matrix Gamma_; + const mblas::Tensor Wi_; + const mblas::Tensor Bi_; + const mblas::Tensor Gamma_; }; struct DecGRU1 { @@ -102,15 +102,15 @@ struct Weights { Gamma_2_(model.GetMatrix(openCLInfo, "decoder_cell1_gamma2")) {} - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix Wx_; - const mblas::Matrix Bx1_; - const mblas::Matrix Bx2_; - const mblas::Matrix Ux_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor Wx_; + const mblas::Tensor Bx1_; + const mblas::Tensor Bx2_; + const mblas::Tensor Ux_; + const mblas::Tensor Gamma_1_; + const mblas::Tensor Gamma_2_; }; struct DecGRU2 { @@ -126,15 +126,15 @@ struct Weights { Gamma_2_(model.GetMatrix(openCLInfo, "decoder_cell2_gamma2")) {} - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const mblas::Matrix Wx_; - const mblas::Matrix Bx2_; - const mblas::Matrix Bx1_; - const mblas::Matrix Ux_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor Wx_; + const mblas::Tensor Bx2_; + const mblas::Tensor Bx1_; + const mblas::Tensor Ux_; + const mblas::Tensor Gamma_1_; + const mblas::Tensor Gamma_2_; }; struct DecAlignment { @@ -148,13 +148,13 @@ struct Weights { Gamma_2_(model.GetMatrix(openCLInfo, "decoder_att_gamma2")) {} - const mblas::Matrix V_; - const mblas::Matrix W_; - const mblas::Matrix B_; - const mblas::Matrix U_; - const 
mblas::Matrix C_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor V_; + const mblas::Tensor W_; + const mblas::Tensor B_; + const mblas::Tensor U_; + const mblas::Tensor C_; + const mblas::Tensor Gamma_1_; + const mblas::Tensor Gamma_2_; }; struct DecSoftmax { @@ -172,17 +172,17 @@ struct Weights { Gamma_2_(model.GetMatrix(openCLInfo, "ff_logit_l1_gamma2")) {} - const mblas::Matrix W1_; - const mblas::Matrix B1_; - const mblas::Matrix W2_; - const mblas::Matrix B2_; - const mblas::Matrix W3_; - const mblas::Matrix B3_; - const mblas::Matrix W4_; - const mblas::Matrix B4_; - const mblas::Matrix Gamma_0_; - const mblas::Matrix Gamma_1_; - const mblas::Matrix Gamma_2_; + const mblas::Tensor W1_; + const mblas::Tensor B1_; + const mblas::Tensor W2_; + const mblas::Tensor B2_; + const mblas::Tensor W3_; + const mblas::Tensor B3_; + const mblas::Tensor W4_; + const mblas::Tensor B4_; + const mblas::Tensor Gamma_0_; + const mblas::Tensor Gamma_1_; + const mblas::Tensor Gamma_2_; }; ////////////////////////////////////////////////////////////////////////////// diff --git a/src/amun/fpga/npz_converter.cpp b/src/amun/fpga/npz_converter.cpp index 7ce9bdd4..16dac0a6 100644 --- a/src/amun/fpga/npz_converter.cpp +++ b/src/amun/fpga/npz_converter.cpp @@ -13,13 +13,13 @@ NpzConverter::NpzConverter(const std::string& file) cerr << "file=" << file << endl; } -mblas::Matrix NpzConverter::GetMatrix( +mblas::Tensor NpzConverter::GetMatrix( const OpenCLInfo &openCLInfo, const std::string& key, bool transpose ) const { - mblas::Matrix matrix(openCLInfo); + mblas::Tensor matrix(openCLInfo); //cerr << "key1=" << key << " " << matrix.Debug(1) << endl; cnpy::npz_t::const_iterator it = model_.find(key); diff --git a/src/amun/fpga/npz_converter.h b/src/amun/fpga/npz_converter.h index 6aa95eb1..565204ee 100644 --- a/src/amun/fpga/npz_converter.h +++ b/src/amun/fpga/npz_converter.h @@ -43,7 +43,7 @@ public: model_.destruct(); } - mblas::Matrix GetMatrix( + 
mblas::Tensor GetMatrix( const OpenCLInfo &openCLInfo, const std::string& key, bool transpose = false) const; diff --git a/src/amun/fpga/nth_element.cpp b/src/amun/fpga/nth_element.cpp index 5b2dcf69..e32aa1e8 100644 --- a/src/amun/fpga/nth_element.cpp +++ b/src/amun/fpga/nth_element.cpp @@ -17,7 +17,7 @@ NthElement::NthElement(const OpenCLInfo &openCLInfo, size_t maxBeamSize, size_t } -void NthElement::getNBestList(const std::vector<uint>& beamSizes, mblas::Matrix& Probs, +void NthElement::getNBestList(const std::vector<uint>& beamSizes, mblas::Tensor& Probs, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, const bool isFirst) { diff --git a/src/amun/fpga/nth_element.h b/src/amun/fpga/nth_element.h index bd3b0615..e926c749 100644 --- a/src/amun/fpga/nth_element.h +++ b/src/amun/fpga/nth_element.h @@ -15,7 +15,7 @@ public: NthElement(const NthElement &copy) = delete; NthElement(const OpenCLInfo &openCLInfo, size_t maxBeamSize, size_t maxBatchSize); - void getNBestList(const std::vector<uint>& beamSizes, mblas::Matrix& Probs, + void getNBestList(const std::vector<uint>& beamSizes, mblas::Tensor& Probs, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, const bool isFirst); diff --git a/src/amun/gpu/decoder/ape_penalty.h b/src/amun/gpu/decoder/ape_penalty.h index 54259151..3e969987 100644 --- a/src/amun/gpu/decoder/ape_penalty.h +++ b/src/amun/gpu/decoder/ape_penalty.h @@ -8,7 +8,7 @@ #include "common/base_best_hyps.h" #include "common/loader.h" -#include "gpu/mblas/matrix.h" +#include "gpu/mblas/tensor.h" namespace GPU { @@ -55,7 +55,7 @@ class ApePenalty : public Scorer { private: std::vector<float> costs_; const SrcTrgMap& srcTrgMap_; - mblas::Matrix Probs_; + mblas::Tensor Probs_; const Penalties& penalties_; }; diff --git a/src/amun/gpu/decoder/best_hyps.cu b/src/amun/gpu/decoder/best_hyps.cu index 4a921a30..9bcac353 100644 --- a/src/amun/gpu/decoder/best_hyps.cu +++ b/src/amun/gpu/decoder/best_hyps.cu @@ -6,7 +6,7 @@ namespace amunmt {
namespace GPU { BestHyps::BestHyps(const God &god) - : BestHypsBase(god), + : BaseBestHyps(god), keys_(god.Get<unsigned>("beam-size") * god.Get<unsigned>("mini-batch")), costs_(god.Get<unsigned>("beam-size") * god.Get<unsigned>("mini-batch")), maxBeamSize_(god.Get<unsigned>("beam-size")) @@ -17,11 +17,11 @@ BestHyps::BestHyps(const God &god) } } -void BestHyps::DisAllowUNK(mblas::Matrix& Prob) { +void BestHyps::DisAllowUNK(mblas::Tensor& Prob) { SetColumn(Prob, UNK_ID, std::numeric_limits<float>::lowest()); } -void BestHyps::FindBests(const std::vector<unsigned>& beamSizes, mblas::Matrix& Probs, +void BestHyps::FindBests(const std::vector<unsigned>& beamSizes, mblas::Tensor& Probs, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, const bool isFirst) @@ -30,7 +30,7 @@ void BestHyps::FindBests(const std::vector<unsigned>& beamSizes, mblas::Matrix& } // fast fused softmax and nth_element -void BestHyps::FindBests(const std::vector<unsigned>& beamSizes, mblas::Matrix& Probs, +void BestHyps::FindBests(const std::vector<unsigned>& beamSizes, mblas::Tensor& Probs, mblas::Vector<NthOutBatch> &nBest, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, @@ -45,7 +45,7 @@ std::vector<SoftAlignmentPtr> BestHyps::GetAlignments(const std::vector<ScorerPt std::vector<SoftAlignmentPtr> alignments; for (auto& scorer : scorers) { if (GPU::EncoderDecoder* encdec = dynamic_cast<GPU::EncoderDecoder*>(scorer.get())) { - const mblas::Matrix &attention = encdec->GetAttention(); + const mblas::Tensor &attention = encdec->GetAttention(); unsigned attLength = attention.dim(1); SoftAlignment *softAlignment = new SoftAlignment(attLength); @@ -76,7 +76,7 @@ void BestHyps::CalcBeam( using namespace mblas; - mblas::Matrix& Probs = static_cast<mblas::Matrix&>(scorers[0]->GetProbs()); + mblas::Tensor& Probs = static_cast<mblas::Tensor&>(scorers[0]->GetProbs()); std::vector<float> vCosts; for (auto& h : prevHyps) { @@ -97,7 +97,7 @@ void BestHyps::CalcBeam( const bool isFirst = 
(vCosts[0] == 0.0f) ? true : false; if (god_.UseFusedSoftmax()) { - const mblas::Matrix& b4 = *static_cast<const mblas::Matrix*>(scorers[0]->GetBias()); + const mblas::Tensor& b4 = *static_cast<const mblas::Tensor*>(scorers[0]->GetBias()); mblas::Vector<NthOutBatch> &nBest = *static_cast<mblas::Vector<NthOutBatch>*>(scorers[0]->GetNBest()); nBest.newSize(beamSizeSum); @@ -115,7 +115,7 @@ void BestHyps::CalcBeam( BroadcastVecColumn(weights_.at(scorers[0]->GetName()) * _1 + _2, Probs, costs_); for (unsigned i = 1; i < scorers.size(); ++i) { - mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs()); + mblas::Tensor &currProbs = static_cast<mblas::Tensor&>(scorers[i]->GetProbs()); Element(_1 + weights_.at(scorers[i]->GetName()) * _2, Probs, currProbs); } @@ -132,7 +132,7 @@ void BestHyps::CalcBeam( breakDowns.push_back(bestCosts); for (unsigned i = 1; i < scorers.size(); ++i) { std::vector<float> modelCosts(beamSizeSum); - mblas::Matrix &currProbs = static_cast<mblas::Matrix&>(scorers[i]->GetProbs()); + mblas::Tensor &currProbs = static_cast<mblas::Tensor&>(scorers[i]->GetProbs()); nthElement_->getValueByKey(modelCosts, currProbs); breakDowns.push_back(modelCosts); @@ -194,7 +194,7 @@ void BestHyps::CalcBeam( ////////////////////////////////////////////////////////////////////////// void BestHyps::getNBestList(const std::vector<unsigned>& beamSizes, - mblas::Matrix& Probs, + mblas::Tensor& Probs, mblas::Vector<NthOutBatch> &nBest, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, diff --git a/src/amun/gpu/decoder/best_hyps.h b/src/amun/gpu/decoder/best_hyps.h index 47695f36..5fa05476 100644 --- a/src/amun/gpu/decoder/best_hyps.h +++ b/src/amun/gpu/decoder/best_hyps.h @@ -8,7 +8,7 @@ #include "common/exception.h" #include "common/god.h" #include "common/utils.h" -#include "gpu/mblas/matrix_functions.h" +#include "gpu/mblas/tensor_functions.h" #include "gpu/mblas/nth_element.h" #include "gpu/mblas/vector.h" @@ -17,16 +17,16 @@ namespace 
amunmt { namespace GPU { -class BestHyps : public BestHypsBase +class BestHyps : public BaseBestHyps { public: BestHyps(const BestHyps &copy) = delete; BestHyps(const God &god); - void DisAllowUNK(mblas::Matrix& Prob); + void DisAllowUNK(mblas::Tensor& Prob); // standard nth_element - void FindBests(const std::vector<unsigned>& beamSizes, mblas::Matrix& Probs, + void FindBests(const std::vector<unsigned>& beamSizes, mblas::Tensor& Probs, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, const bool isFirst); @@ -48,14 +48,14 @@ class BestHyps : public BestHypsBase unsigned maxBeamSize_; // fast fused softmax and nth_element - void FindBests(const std::vector<unsigned>& beamSizes, mblas::Matrix& Probs, + void FindBests(const std::vector<unsigned>& beamSizes, mblas::Tensor& Probs, mblas::Vector<NthOutBatch> &nBest, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, const bool isFirst); void getNBestList(const std::vector<unsigned>& beamSizes, - mblas::Matrix& Probs, + mblas::Tensor& Probs, mblas::Vector<NthOutBatch> &nBest, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, diff --git a/src/amun/gpu/decoder/encoder_decoder.cu b/src/amun/gpu/decoder/encoder_decoder.cu index 4ce69c4f..e43202b0 100644 --- a/src/amun/gpu/decoder/encoder_decoder.cu +++ b/src/amun/gpu/decoder/encoder_decoder.cu @@ -5,7 +5,7 @@ #include "common/sentences.h" #include "encoder_decoder.h" -#include "gpu/mblas/matrix_functions.h" +#include "gpu/mblas/tensor_functions.h" #include "gpu/dl4mt/dl4mt.h" #include "gpu/decoder/encoder_decoder_state.h" #include "gpu/decoder/best_hyps.h" @@ -29,7 +29,7 @@ EncoderDecoder::EncoderDecoder( encoder_(new Encoder(model_, config)), decoder_(new Decoder(god, model_, config)), indices_(god.Get<unsigned>("beam-size")), - SourceContext_(new mblas::Matrix()) + SourceContext_(new mblas::Tensor()) { BEGIN_TIMER("EncoderDecoder"); } @@ -144,11 +144,11 @@ void EncoderDecoder::AssembleBeamState(const State& in,
//PAUSE_TIMER("AssembleBeamState"); } -void EncoderDecoder::GetAttention(mblas::Matrix& Attention) { +void EncoderDecoder::GetAttention(mblas::Tensor& Attention) { decoder_->GetAttention(Attention); } -BaseMatrix& EncoderDecoder::GetProbs() { +BaseTensor& EncoderDecoder::GetProbs() { return decoder_->GetProbs(); } @@ -157,12 +157,12 @@ void *EncoderDecoder::GetNBest() return &decoder_->GetNBest(); } -const BaseMatrix *EncoderDecoder::GetBias() const +const BaseTensor *EncoderDecoder::GetBias() const { return decoder_->GetBias(); } -mblas::Matrix& EncoderDecoder::GetAttention() { +mblas::Tensor& EncoderDecoder::GetAttention() { return decoder_->GetAttention(); } diff --git a/src/amun/gpu/decoder/encoder_decoder.h b/src/amun/gpu/decoder/encoder_decoder.h index 7c9ccac7..d145559e 100644 --- a/src/amun/gpu/decoder/encoder_decoder.h +++ b/src/amun/gpu/decoder/encoder_decoder.h @@ -7,7 +7,7 @@ #include "common/base_best_hyps.h" #include "common/threadpool.h" #include "gpu/types-gpu.h" -#include "gpu/mblas/matrix.h" +#include "gpu/mblas/tensor.h" #include "gpu/mblas/handles.h" #include "gpu/mblas/vector.h" @@ -46,13 +46,13 @@ class EncoderDecoder : public Scorer { const Beam& beam, State& out); - void GetAttention(mblas::Matrix& Attention); + void GetAttention(mblas::Tensor& Attention); - mblas::Matrix& GetAttention(); - virtual BaseMatrix& GetProbs(); + mblas::Tensor& GetAttention(); + virtual BaseTensor& GetProbs(); virtual void *GetNBest(); - virtual const BaseMatrix *GetBias() const; + virtual const BaseTensor *GetBias() const; unsigned GetVocabSize() const; @@ -67,7 +67,7 @@ class EncoderDecoder : public Scorer { mblas::Vector<unsigned> sentenceLengths_; // set in Encoder::GetContext() to length (maxSentenceLength * batchSize). 
1 if it's a word, 0 otherwise - std::unique_ptr<mblas::Matrix> SourceContext_; + std::unique_ptr<mblas::Tensor> SourceContext_; EncoderDecoder(const EncoderDecoder&) = delete; diff --git a/src/amun/gpu/decoder/encoder_decoder_loader.cu b/src/amun/gpu/decoder/encoder_decoder_loader.cu index f6496d53..8fc0c1ec 100644 --- a/src/amun/gpu/decoder/encoder_decoder_loader.cu +++ b/src/amun/gpu/decoder/encoder_decoder_loader.cu @@ -60,8 +60,8 @@ ScorerPtr EncoderDecoderLoader::NewScorer(const God &god, const DeviceInfo &devi tab, *weights_[d])); } -BestHypsBasePtr EncoderDecoderLoader::GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const { - BestHypsBasePtr obj(new GPU::BestHyps(god)); +BaseBestHypsPtr EncoderDecoderLoader::GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const { + BaseBestHypsPtr obj(new GPU::BestHyps(god)); //std::thread::id this_id = std::this_thread::get_id(); //std::cerr << "deviceInfo=" << deviceInfo << " thread " << this_id << " sleeping...\n"; diff --git a/src/amun/gpu/decoder/encoder_decoder_loader.h b/src/amun/gpu/decoder/encoder_decoder_loader.h index cad4887b..4377bdd0 100644 --- a/src/amun/gpu/decoder/encoder_decoder_loader.h +++ b/src/amun/gpu/decoder/encoder_decoder_loader.h @@ -17,7 +17,7 @@ class EncoderDecoderLoader : public Loader { virtual void Load(const God &god); virtual ScorerPtr NewScorer(const God &god, const DeviceInfo &deviceInfo) const; - virtual BestHypsBasePtr GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const; + virtual BaseBestHypsPtr GetBestHyps(const God &god, const DeviceInfo &deviceInfo) const; private: std::vector<std::unique_ptr<Weights>> weights_; // MUST be indexed by gpu id. eg. 
weights_[2] is for gpu2 diff --git a/src/amun/gpu/decoder/encoder_decoder_state.cu b/src/amun/gpu/decoder/encoder_decoder_state.cu index 1c8be9b3..3fb0351b 100644 --- a/src/amun/gpu/decoder/encoder_decoder_state.cu +++ b/src/amun/gpu/decoder/encoder_decoder_state.cu @@ -21,7 +21,7 @@ CellState& EncoderDecoderState::GetStates() { return states_; } -mblas::Matrix& EncoderDecoderState::GetEmbeddings() { +mblas::Tensor& EncoderDecoderState::GetEmbeddings() { return embeddings_; } @@ -29,7 +29,7 @@ const CellState& EncoderDecoderState::GetStates() const { return states_; } -const mblas::Matrix& EncoderDecoderState::GetEmbeddings() const { +const mblas::Tensor& EncoderDecoderState::GetEmbeddings() const { return embeddings_; } diff --git a/src/amun/gpu/decoder/encoder_decoder_state.h b/src/amun/gpu/decoder/encoder_decoder_state.h index 9399b850..713abc68 100644 --- a/src/amun/gpu/decoder/encoder_decoder_state.h +++ b/src/amun/gpu/decoder/encoder_decoder_state.h @@ -16,13 +16,13 @@ class EncoderDecoderState : public State { virtual std::string Debug(unsigned verbosity = 1) const; CellState& GetStates(); - mblas::Matrix& GetEmbeddings(); + mblas::Tensor& GetEmbeddings(); const CellState& GetStates() const; - const mblas::Matrix& GetEmbeddings() const; + const mblas::Tensor& GetEmbeddings() const; private: CellState states_; - mblas::Matrix embeddings_; + mblas::Tensor embeddings_; }; } diff --git a/src/amun/gpu/decoder/language_model.h b/src/amun/gpu/decoder/language_model.h index 728eb2f2..7fc9f33c 100644 --- a/src/amun/gpu/decoder/language_model.h +++ b/src/amun/gpu/decoder/language_model.h @@ -4,7 +4,7 @@ #include "types.h" #include "scorer.h" -#include "matrix.h" +#include "tensor.h" #include "dl4mt.h" #include "threadpool.h" #include "kenlm.h" diff --git a/src/amun/gpu/dl4mt/cell.h b/src/amun/gpu/dl4mt/cell.h index d058874d..7b4ac955 100644 --- a/src/amun/gpu/dl4mt/cell.h +++ b/src/amun/gpu/dl4mt/cell.h @@ -1,6 +1,6 @@ #pragma once -#include 
"gpu/mblas/matrix_functions.h" -#include "gpu/mblas/matrix_wrapper.h" +#include "gpu/mblas/tensor_functions.h" +#include "gpu/mblas/tensor_wrapper.h" #include "gpu/mblas/handles.h" #include "cellstate.h" @@ -18,7 +18,7 @@ class Cell { public: virtual void GetNextState(CellState& NextState, const CellState& State, - const mblas::Matrix& Context) const = 0; + const mblas::Tensor& Context) const = 0; virtual CellLength GetStateLength() const = 0; }; diff --git a/src/amun/gpu/dl4mt/cellstate.h b/src/amun/gpu/dl4mt/cellstate.h index d22b0e8d..3a316a55 100644 --- a/src/amun/gpu/dl4mt/cellstate.h +++ b/src/amun/gpu/dl4mt/cellstate.h @@ -1,6 +1,6 @@ #pragma once -#include "gpu/mblas/matrix_functions.h" -#include "gpu/mblas/matrix_wrapper.h" +#include "gpu/mblas/tensor_functions.h" +#include "gpu/mblas/tensor_wrapper.h" #include "gpu/mblas/handles.h" namespace amunmt { @@ -8,15 +8,15 @@ namespace GPU { struct CellState { CellState(){ - output = std::unique_ptr<mblas::Matrix>(new mblas::Matrix()); - cell = std::unique_ptr<mblas::Matrix>(new mblas::Matrix()); + output = std::unique_ptr<mblas::Tensor>(new mblas::Tensor()); + cell = std::unique_ptr<mblas::Tensor>(new mblas::Tensor()); }; - CellState(std::unique_ptr<mblas::Matrix> cell, std::unique_ptr<mblas::Matrix> output): + CellState(std::unique_ptr<mblas::Tensor> cell, std::unique_ptr<mblas::Tensor> output): cell(std::move(cell)), output(std::move(output)) {} - std::unique_ptr<mblas::Matrix> output; - std::unique_ptr<mblas::Matrix> cell; + std::unique_ptr<mblas::Tensor> output; + std::unique_ptr<mblas::Tensor> cell; }; } } diff --git a/src/amun/gpu/dl4mt/decoder.h b/src/amun/gpu/dl4mt/decoder.h index d9eca43b..9f61aff4 100644 --- a/src/amun/gpu/dl4mt/decoder.h +++ b/src/amun/gpu/dl4mt/decoder.h @@ -3,7 +3,7 @@ #include <yaml-cpp/yaml.h> #include "gpu/mblas/vector.h" -#include "gpu/mblas/matrix_functions.h" +#include "gpu/mblas/tensor_functions.h" #include "model.h" #include "gru.h" #include "lstm.h" @@ -24,7 +24,7 @@ class 
Decoder { : w_(model) {} - void Lookup(mblas::Matrix& Rows, const std::vector<unsigned>& ids) { + void Lookup(mblas::Tensor& Rows, const std::vector<unsigned>& ids) { using namespace mblas; std::vector<unsigned> tids = ids; for(auto&& id : tids) @@ -65,7 +65,7 @@ class Decoder { {} void InitializeState(CellState& State, - const mblas::Matrix& SourceContext, + const mblas::Tensor& SourceContext, const unsigned batchSize, const mblas::Vector<unsigned> &sentenceLengths) { @@ -104,7 +104,7 @@ class Decoder { void GetNextState(CellState& NextState, const CellState& State, - const mblas::Matrix& Context) { + const mblas::Tensor& Context) { gru_->GetNextState(NextState, State, Context); } @@ -112,7 +112,7 @@ class Decoder { const Weights& w_; std::unique_ptr<Cell> gru_; - mblas::Matrix Temp2_; + mblas::Tensor Temp2_; RNNHidden(const RNNHidden&) = delete; }; @@ -124,7 +124,7 @@ class Decoder { void GetNextState(CellState& NextState, const CellState& State, - const mblas::Matrix& Context) { + const mblas::Tensor& Context) { gru_->GetNextState(NextState, State, Context); } @@ -142,7 +142,7 @@ class Decoder { , dBatchMapping_(god.Get<unsigned>("mini-batch") * god.Get<unsigned>("beam-size"), 0) {} - void Init(const mblas::Matrix& SourceContext) { + void Init(const mblas::Tensor& SourceContext) { using namespace mblas; Prod(/*h_[0],*/ SCU_, SourceContext, *w_.U_); @@ -168,9 +168,9 @@ class Decoder { return ret; } - void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext, + void GetAlignedSourceContext(mblas::Tensor& AlignedSourceContext, const CellState& HiddenState, - const mblas::Matrix& SourceContext, + const mblas::Tensor& SourceContext, const std::vector<unsigned>& h_sentenceLengths, const mblas::Vector<unsigned> &sentenceLengths, const std::vector<unsigned>& beamSizes) @@ -245,11 +245,11 @@ class Decoder { PAUSE_TIMER("GetAlignedSourceContext"); } - void GetAttention(mblas::Matrix& Attention) { + void GetAttention(mblas::Tensor& Attention) { 
mblas::Copy(Attention, A_); } - mblas::Matrix& GetAttention() { + mblas::Tensor& GetAttention() { return A_; } @@ -258,13 +258,13 @@ class Decoder { mblas::Vector<unsigned> dBatchMapping_; - mblas::Matrix SCU_; - mblas::Matrix Temp1_; - mblas::Matrix Temp2_; - mblas::Matrix A_; + mblas::Tensor SCU_; + mblas::Tensor Temp1_; + mblas::Tensor Temp2_; + mblas::Tensor A_; - mblas::Matrix Ones_; - mblas::Matrix Sums_; + mblas::Tensor Ones_; + mblas::Tensor Sums_; Alignment(const Alignment&) = delete; }; @@ -290,11 +290,11 @@ class Decoder { } } - void GetProbs(mblas::Matrix& Probs, - std::shared_ptr<mblas::Matrix> &b4, + void GetProbs(mblas::Tensor& Probs, + std::shared_ptr<mblas::Tensor> &b4, const CellState& State, - const mblas::Matrix& Embedding, - const mblas::Matrix& AlignedSourceContext, + const mblas::Tensor& Embedding, + const mblas::Tensor& AlignedSourceContext, bool useFusedSoftmax) { using namespace mblas; @@ -339,7 +339,7 @@ class Decoder { Element(Tanh(_1 + _2 + _3), T1_, T2_, T3_); //PAUSE_TIMER("GetProbs.Element"); - std::shared_ptr<mblas::Matrix> w4; + std::shared_ptr<mblas::Tensor> w4; if(!filtered_) { w4 = w_.W4_; b4 = w_.B4_; @@ -348,6 +348,8 @@ class Decoder { b4.reset(&FilteredB4_); } + BEGIN_TIMER("OutputLayer"); + BEGIN_TIMER("GetProbs.Prod4"); Prod(Probs, T1_, *w4); PAUSE_TIMER("GetProbs.Prod4"); @@ -361,6 +363,8 @@ class Decoder { mblas::LogSoftmax(Probs); PAUSE_TIMER("GetProbs.LogSoftMax"); } + + PAUSE_TIMER("OutputLayer"); } void Filter(const std::vector<unsigned>& ids) { @@ -379,15 +383,15 @@ class Decoder { const Weights& w_; bool filtered_; - mblas::Matrix FilteredW4_; - mblas::Matrix FilteredB4_; + mblas::Tensor FilteredW4_; + mblas::Tensor FilteredB4_; - mblas::Matrix T1_; - mblas::Matrix T2_; - mblas::Matrix T3_; + mblas::Tensor T1_; + mblas::Tensor T2_; + mblas::Tensor T3_; - mblas::Matrix TempW4; - mblas::Matrix TempB4; + mblas::Tensor TempW4; + mblas::Tensor TempB4; Softmax(const Softmax&) = delete; }; @@ -403,8 +407,8 @@ class Decoder 
{ void Decode(CellState& NextState, const CellState& State, - const mblas::Matrix& Embeddings, - const mblas::Matrix& SourceContext, + const mblas::Tensor& Embeddings, + const mblas::Tensor& SourceContext, const std::vector<unsigned>& h_sentenceLengths, const mblas::Vector<unsigned> &sentenceLengths, const std::vector<unsigned>& beamSizes, @@ -443,12 +447,12 @@ class Decoder { //PAUSE_TIMER("Decode"); } - mblas::Matrix& GetProbs() { + mblas::Tensor& GetProbs() { return Probs_; } void EmptyState(CellState& State, - const mblas::Matrix& SourceContext, + const mblas::Tensor& SourceContext, unsigned batchSize, const mblas::Vector<unsigned> &sentenceLengths) { @@ -456,12 +460,12 @@ class Decoder { alignment_.Init(SourceContext); } - void EmptyEmbedding(mblas::Matrix& Embedding, unsigned batchSize = 1) { + void EmptyEmbedding(mblas::Tensor& Embedding, unsigned batchSize = 1) { Embedding.NewSize(batchSize, embeddings_.GetCols()); mblas::Fill(Embedding, 0); } - void Lookup(mblas::Matrix& Embedding, + void Lookup(mblas::Tensor& Embedding, const std::vector<unsigned>& w) { embeddings_.Lookup(Embedding, w); } @@ -470,7 +474,7 @@ class Decoder { softmax_.Filter(ids); } - void GetAttention(mblas::Matrix& Attention) { + void GetAttention(mblas::Tensor& Attention) { alignment_.GetAttention(Attention); } @@ -478,7 +482,7 @@ class Decoder { return embeddings_.GetRows(); } - mblas::Matrix& GetAttention() { + mblas::Tensor& GetAttention() { return alignment_.GetAttention(); } @@ -486,7 +490,7 @@ class Decoder { return nBest_; } - const mblas::Matrix *GetBias() const { + const mblas::Tensor *GetBias() const { return b4_.get(); } @@ -494,13 +498,13 @@ class Decoder { void GetHiddenState(CellState& HiddenState, const CellState& PrevState, - const mblas::Matrix& Embedding) { + const mblas::Tensor& Embedding) { rnn1_.GetNextState(HiddenState, PrevState, Embedding); } - void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext, + void GetAlignedSourceContext(mblas::Tensor& 
AlignedSourceContext, const CellState& HiddenState, - const mblas::Matrix& SourceContext, + const mblas::Tensor& SourceContext, const std::vector<unsigned>& h_sentenceLengths, const mblas::Vector<unsigned> &sentenceLengths, const std::vector<unsigned>& beamSizes) @@ -515,14 +519,14 @@ class Decoder { void GetNextState(CellState& State, const CellState& HiddenState, - const mblas::Matrix& AlignedSourceContext) { + const mblas::Tensor& AlignedSourceContext) { rnn2_.GetNextState(State, HiddenState, AlignedSourceContext); } void GetProbs(const CellState& State, - const mblas::Matrix& Embedding, - const mblas::Matrix& AlignedSourceContext, + const mblas::Tensor& Embedding, + const mblas::Tensor& AlignedSourceContext, bool useFusedSoftmax) { softmax_.GetProbs(Probs_, b4_, State, Embedding, AlignedSourceContext, useFusedSoftmax); @@ -559,8 +563,8 @@ class Decoder { private: CellState HiddenState_; - mblas::Matrix AlignedSourceContext_; - mblas::Matrix Probs_; + mblas::Tensor AlignedSourceContext_; + mblas::Tensor Probs_; Embeddings<Weights::DecEmbeddings> embeddings_; RNNHidden<Weights::DecInit> rnn1_; @@ -569,7 +573,7 @@ class Decoder { Softmax<Weights::DecSoftmax> softmax_; mblas::Vector<NthOutBatch> nBest_; - std::shared_ptr<mblas::Matrix> b4_; + std::shared_ptr<mblas::Tensor> b4_; Decoder(const Decoder&) = delete; }; diff --git a/src/amun/gpu/dl4mt/encoder.cu b/src/amun/gpu/dl4mt/encoder.cu index b3f14c81..03380992 100644 --- a/src/amun/gpu/dl4mt/encoder.cu +++ b/src/amun/gpu/dl4mt/encoder.cu @@ -64,7 +64,7 @@ std::vector<std::vector<FactWord>> GetBatchInput(const Sentences& source, unsign void Encoder::Encode(const Sentences& source, unsigned tab, - mblas::Matrix& context, + mblas::Tensor& context, std::vector<unsigned> &h_sentenceLengths, mblas::Vector<unsigned> &sentenceLengths) { diff --git a/src/amun/gpu/dl4mt/encoder.h b/src/amun/gpu/dl4mt/encoder.h index 557f64f6..c29b5556 100644 --- a/src/amun/gpu/dl4mt/encoder.h +++ b/src/amun/gpu/dl4mt/encoder.h @@ -2,7 +2,7 
@@ #include <yaml-cpp/yaml.h> -#include "gpu/mblas/matrix_functions.h" +#include "gpu/mblas/tensor_functions.h" #include "model.h" #include "gru.h" #include "common/sentence.h" @@ -28,7 +28,7 @@ class Encoder { : w_(model) {} - void Lookup(mblas::Matrix& Row, const std::vector<std::vector<Word>>& words) { + void Lookup(mblas::Tensor& Row, const std::vector<std::vector<Word>>& words) { std::vector<std::vector<unsigned>> knownWords(w_.Es_.size(), std::vector<unsigned>(words.size(), 1)); unsigned factorCount = w_.Es_.size(); @@ -36,7 +36,7 @@ class Encoder { const std::vector<Word>& factors = words[i]; for (unsigned factorIdx = 0; factorIdx < factors.size(); ++factorIdx) { const Word& factor = factors[factorIdx]; - const std::shared_ptr<mblas::Matrix>& Emb = w_.Es_.at(factorIdx); + const std::shared_ptr<mblas::Tensor>& Emb = w_.Es_.at(factorIdx); if (factor < Emb->dim(0)) { knownWords[factorIdx][i] = factor; @@ -48,14 +48,14 @@ class Encoder { unsigned wordCount = words.size() / factorCount; //Row.NewSize(0, wordCount); - /* std::vector<std::shared_ptr<mblas::Matrix>>::iterator eit = w_.Es_.begin(); */ + /* std::vector<std::shared_ptr<mblas::Tensor>>::iterator eit = w_.Es_.begin(); */ /* std::vector<HostVector<unsigned>>::iterator wit = knownWords.begin(); */ for (unsigned i = 0; i < knownWords.size(); i++) { const std::vector<unsigned>& factorWords = knownWords.at(i); mblas::Vector<unsigned> dKnownWords(factorWords); - const std::shared_ptr<mblas::Matrix>& Emb = w_.Es_.at(i); - mblas::Matrix factorRow; + const std::shared_ptr<mblas::Tensor>& Emb = w_.Es_.at(i); + mblas::Tensor factorRow; factorRow.NewSize(wordCount, Emb->dim(1)); mblas::Assemble(factorRow, *Emb, dKnownWords); mblas::Transpose(factorRow); @@ -103,19 +103,19 @@ class Encoder { void GetNextState(CellState& NextState, const CellState& State, - const mblas::Matrix& Embd) { + const mblas::Tensor& Embd) { gru_->GetNextState(NextState, State, Embd); } template <class It> - void Encode(It it, It end, 
mblas::Matrix& Context, + void Encode(It it, It end, mblas::Tensor& Context, unsigned batchSize, bool invert, const mblas::Vector<unsigned> *sentenceLengths=nullptr) { InitializeState(batchSize); - CellState prevState(std::unique_ptr<mblas::Matrix>(new mblas::Matrix(*(State_.cell))), - std::unique_ptr<mblas::Matrix>(new mblas::Matrix(*(State_.output)))); + CellState prevState(std::unique_ptr<mblas::Tensor>(new mblas::Tensor(*(State_.cell))), + std::unique_ptr<mblas::Tensor>(new mblas::Tensor(*(State_.output)))); unsigned n = std::distance(it, end); unsigned i = 0; @@ -166,7 +166,7 @@ class Encoder { void Encode(const Sentences& words, unsigned tab, - mblas::Matrix& context, + mblas::Tensor& context, std::vector<unsigned> &h_sentenceLengths, mblas::Vector<unsigned> &sentenceLengths); @@ -180,7 +180,7 @@ class Encoder { RNN backwardRnn_; // reusing memory - std::vector<mblas::Matrix> embeddedWords_; + std::vector<mblas::Tensor> embeddedWords_; Encoder(const Encoder&) = delete; }; diff --git a/src/amun/gpu/dl4mt/gru.cu b/src/amun/gpu/dl4mt/gru.cu index a3b4d7ae..4b371e1d 100644 --- a/src/amun/gpu/dl4mt/gru.cu +++ b/src/amun/gpu/dl4mt/gru.cu @@ -5,13 +5,13 @@ using namespace std; namespace amunmt { namespace GPU { -__global__ void gElementwiseOps(mblas::MatrixWrapper<float> outWrap, - const mblas::MatrixWrapper<float> stateWrap, - const mblas::MatrixWrapper<float> ruhWrap, - const mblas::MatrixWrapper<float> tempWrap, - const mblas::MatrixWrapper<float> bWrap, - const mblas::MatrixWrapper<float> bx1Wrap, - const mblas::MatrixWrapper<float> bx2Wrap) +__global__ void gElementwiseOps(mblas::TensorWrapper<float> outWrap, + const mblas::TensorWrapper<float> stateWrap, + const mblas::TensorWrapper<float> ruhWrap, + const mblas::TensorWrapper<float> tempWrap, + const mblas::TensorWrapper<float> bWrap, + const mblas::TensorWrapper<float> bx1Wrap, + const mblas::TensorWrapper<float> bx2Wrap) { const unsigned rows = stateWrap.dim(0); const unsigned cols = stateWrap.dim(1); diff 
--git a/src/amun/gpu/dl4mt/gru.h b/src/amun/gpu/dl4mt/gru.h index fd9e80dc..8b82d81a 100644 --- a/src/amun/gpu/dl4mt/gru.h +++ b/src/amun/gpu/dl4mt/gru.h @@ -1,7 +1,7 @@ #pragma once #include <boost/timer/timer.hpp> -#include "gpu/mblas/matrix_functions.h" -#include "gpu/mblas/matrix_wrapper.h" +#include "gpu/mblas/tensor_functions.h" +#include "gpu/mblas/tensor_wrapper.h" #include "gpu/mblas/handles.h" #include "gpu/dl4mt/cell.h" #include "cellstate.h" @@ -17,7 +17,7 @@ class SlowGRU: public Cell { virtual void GetNextState(CellState& NextState, const CellState& State, - const mblas::Matrix& Context) const { + const mblas::Tensor& Context) const { using namespace mblas; //std::cerr << std::endl; @@ -90,25 +90,25 @@ class SlowGRU: public Cell { const Weights& w_; // reused to avoid allocation - mutable mblas::Matrix RU_; - mutable mblas::Matrix R_; - mutable mblas::Matrix U_; - mutable mblas::Matrix H_; - mutable mblas::Matrix Temp1_; - mutable mblas::Matrix Temp2_; + mutable mblas::Tensor RU_; + mutable mblas::Tensor R_; + mutable mblas::Tensor U_; + mutable mblas::Tensor H_; + mutable mblas::Tensor Temp1_; + mutable mblas::Tensor Temp2_; SlowGRU(const SlowGRU&) = delete; }; /////////////////////////////////////////////////////////////////////////////////////////////// -__global__ void gElementwiseOps(mblas::MatrixWrapper<float> outWrap, - const mblas::MatrixWrapper<float> stateWrap, - const mblas::MatrixWrapper<float> ruhWrap, - const mblas::MatrixWrapper<float> tempWrap, - const mblas::MatrixWrapper<float> bWrap, - const mblas::MatrixWrapper<float> bx1Wrap, - const mblas::MatrixWrapper<float> bx2Wrap); +__global__ void gElementwiseOps(mblas::TensorWrapper<float> outWrap, + const mblas::TensorWrapper<float> stateWrap, + const mblas::TensorWrapper<float> ruhWrap, + const mblas::TensorWrapper<float> tempWrap, + const mblas::TensorWrapper<float> bWrap, + const mblas::TensorWrapper<float> bx1Wrap, + const mblas::TensorWrapper<float> bx2Wrap); template <class Weights> 
class FastGRU: public Cell { @@ -129,7 +129,7 @@ class FastGRU: public Cell { //std::cerr << "w_.W_=" << w_.W_.Debug(1) << std::endl; //std::cerr << "1WWx_=" << WWx_.Debug(1) << std::endl; - Matrix WxT; + Tensor WxT; Transpose(WxT, *w_.Wx_); //std::cerr << "w_.Wx_=" << w_.Wx_.Debug(1) << std::endl; //std::cerr << "WxT=" << WxT.Debug(1) << std::endl; @@ -141,7 +141,7 @@ class FastGRU: public Cell { //std::cerr << "3WWx_=" << WWx_.Debug(1) << std::endl; Transpose(UUx_, *w_.U_); - Matrix UxT; + Tensor UxT; Transpose(UxT, *w_.Ux_); Concat(UUx_, UxT); Transpose(UUx_); @@ -151,7 +151,7 @@ class FastGRU: public Cell { virtual void GetNextState(CellState& NextState, const CellState& State, - const mblas::Matrix& Context) const { + const mblas::Tensor& Context) const { using namespace mblas; //std::cerr << std::endl; @@ -183,10 +183,10 @@ class FastGRU: public Cell { } - void ElementwiseOps(mblas::Matrix& NextState, - const mblas::Matrix& State, - const mblas::Matrix& RUH, - const mblas::Matrix& Temp) const + void ElementwiseOps(mblas::Tensor& NextState, + const mblas::Tensor& State, + const mblas::Tensor& RUH, + const mblas::Tensor& Temp) const { //BEGIN_TIMER("ElementwiseOps"); @@ -200,13 +200,13 @@ class FastGRU: public Cell { NextState.NewSize(State.dim(0), State.dim(1), 1, 1); //std::cerr << "NextState=" << NextState.Debug() << std::endl; - mblas::MatrixWrapper<float> nextWrap(NextState); - const mblas::MatrixWrapper<float> stateWrap(State); - const mblas::MatrixWrapper<float> ruhWrap(RUH); - const mblas::MatrixWrapper<float> tempWrap(Temp); - const mblas::MatrixWrapper<float> bWrap(*w_.B_); - const mblas::MatrixWrapper<float> bx1Wrap(*w_.Bx1_); - const mblas::MatrixWrapper<float> bx2Wrap(*w_.Bx2_); + mblas::TensorWrapper<float> nextWrap(NextState); + const mblas::TensorWrapper<float> stateWrap(State); + const mblas::TensorWrapper<float> ruhWrap(RUH); + const mblas::TensorWrapper<float> tempWrap(Temp); + const mblas::TensorWrapper<float> bWrap(*w_.B_); + const 
mblas::TensorWrapper<float> bx1Wrap(*w_.Bx1_); + const mblas::TensorWrapper<float> bx2Wrap(*w_.Bx2_); /* std::cerr << "nextWrap=" << nextWrap.Debug() << std::endl; @@ -243,11 +243,11 @@ class FastGRU: public Cell { const Weights& w_; // reused to avoid allocation - mutable mblas::Matrix WWx_; - mutable mblas::Matrix UUx_; + mutable mblas::Tensor WWx_; + mutable mblas::Tensor UUx_; - mutable mblas::Matrix RUH_; - mutable mblas::Matrix Temp_; + mutable mblas::Tensor RUH_; + mutable mblas::Tensor Temp_; FastGRU(const FastGRU&) = delete; }; diff --git a/src/amun/gpu/dl4mt/lstm.h b/src/amun/gpu/dl4mt/lstm.h index b236e01a..2f343db6 100644 --- a/src/amun/gpu/dl4mt/lstm.h +++ b/src/amun/gpu/dl4mt/lstm.h @@ -1,7 +1,7 @@ #pragma once #include <boost/timer/timer.hpp> -#include "gpu/mblas/matrix_functions.h" -#include "gpu/mblas/matrix_wrapper.h" +#include "gpu/mblas/tensor_functions.h" +#include "gpu/mblas/tensor_wrapper.h" #include "gpu/mblas/handles.h" #include "gpu/dl4mt/cell.h" #include "cellstate.h" @@ -17,7 +17,7 @@ class SlowLSTM: public Cell { virtual void GetNextState(CellState& NextState, const CellState& State, - const mblas::Matrix& Context) const { + const mblas::Tensor& Context) const { using namespace mblas; /* HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream())); */ @@ -66,13 +66,13 @@ class SlowLSTM: public Cell { const Weights& w_; // reused to avoid allocation - mutable mblas::Matrix FIO_; - mutable mblas::Matrix F_; - mutable mblas::Matrix I_; - mutable mblas::Matrix O_; - mutable mblas::Matrix H_; - mutable mblas::Matrix Temp1_; - mutable mblas::Matrix Temp2_; + mutable mblas::Tensor FIO_; + mutable mblas::Tensor F_; + mutable mblas::Tensor I_; + mutable mblas::Tensor O_; + mutable mblas::Tensor H_; + mutable mblas::Tensor Temp1_; + mutable mblas::Tensor Temp2_; SlowLSTM(const SlowLSTM&) = delete; }; diff --git a/src/amun/gpu/dl4mt/model.cu b/src/amun/gpu/dl4mt/model.cu index 9386f704..9942d93c 100644 --- 
a/src/amun/gpu/dl4mt/model.cu +++ b/src/amun/gpu/dl4mt/model.cu @@ -11,7 +11,7 @@ namespace GPU { for(int i=1; true; i++) { std::string factorKey = "Wemb" + std::to_string(i); - std::shared_ptr<mblas::Matrix> factorEmb = model.get(factorKey, false); + std::shared_ptr<mblas::Tensor> factorEmb = model.get(factorKey, false); if (factorEmb->size() <= 0){ break; } @@ -26,7 +26,7 @@ Weights::EncForwardGRU::EncForwardGRU(const NpzConverter& model) U_(model.get("encoder_U", true)), Wx_(model.get("encoder_Wx", true)), Bx1_(model.get("encoder_bx", true, true)), - Bx2_(new mblas::Matrix(Bx1_->dim(0), Bx1_->dim(1), Bx1_->dim(2), Bx1_->dim(3), true)), + Bx2_(new mblas::Tensor(Bx1_->dim(0), Bx1_->dim(1), Bx1_->dim(2), Bx1_->dim(3), true)), Ux_(model.get("encoder_Ux", true)), Gamma_1_(model.get("encoder_gamma1", false)), Gamma_2_(model.get("encoder_gamma2", false)) @@ -63,7 +63,7 @@ Weights::EncBackwardGRU::EncBackwardGRU(const NpzConverter& model) U_(model.get("encoder_r_U", true)), Wx_(model.get("encoder_r_Wx", true)), Bx1_(model.get("encoder_r_bx", true, true)), - Bx2_(new mblas::Matrix( Bx1_->dim(0), Bx1_->dim(1), Bx1_->dim(2), Bx1_->dim(3), true)), + Bx2_(new mblas::Tensor( Bx1_->dim(0), Bx1_->dim(1), Bx1_->dim(2), Bx1_->dim(3), true)), Ux_(model.get("encoder_r_Ux", true)), Gamma_1_(model.get("encoder_r_gamma1", false)), Gamma_2_(model.get("encoder_r_gamma2", false)) @@ -89,7 +89,7 @@ Weights::DecGRU1::DecGRU1(const NpzConverter& model) U_(model.get("decoder_U", true)), Wx_(model.get("decoder_Wx", true)), Bx1_(model.get("decoder_bx", true, true)), - Bx2_(new mblas::Matrix(Bx1_->dim(0), Bx1_->dim(1), Bx1_->dim(2), Bx1_->dim(3), true)), + Bx2_(new mblas::Tensor(Bx1_->dim(0), Bx1_->dim(1), Bx1_->dim(2), Bx1_->dim(3), true)), Ux_(model.get("decoder_Ux", true)), Gamma_1_(model.get("decoder_cell1_gamma1", false)), Gamma_2_(model.get("decoder_cell1_gamma2", false)) @@ -102,7 +102,7 @@ Weights::DecGRU2::DecGRU2(const NpzConverter& model) U_(model.get("decoder_U_nl", true)), 
Wx_(model.get("decoder_Wcx", true)), Bx2_(model.get("decoder_bx_nl", true, true)), - Bx1_(new mblas::Matrix(Bx2_->dim(0), Bx2_->dim(1), Bx2_->dim(2), Bx2_->dim(3), true)), + Bx1_(new mblas::Tensor(Bx2_->dim(0), Bx2_->dim(1), Bx2_->dim(2), Bx2_->dim(3), true)), Ux_(model.get("decoder_Ux_nl", true)), Gamma_1_(model.get("decoder_cell2_gamma1", false)), Gamma_2_(model.get("decoder_cell2_gamma2", false)) diff --git a/src/amun/gpu/dl4mt/model.h b/src/amun/gpu/dl4mt/model.h index f95c4424..0829d233 100644 --- a/src/amun/gpu/dl4mt/model.h +++ b/src/amun/gpu/dl4mt/model.h @@ -4,7 +4,7 @@ #include <string> #include <yaml-cpp/yaml.h> -#include "gpu/mblas/matrix.h" +#include "gpu/mblas/tensor.h" #include "gpu/npz_converter.h" namespace amunmt { @@ -19,7 +19,7 @@ struct Weights { // Embedding matrices for word factors. The first factor is the word // surface form. The rest are optional. - std::vector<std::shared_ptr<mblas::Matrix>> Es_; + std::vector<std::shared_ptr<mblas::Tensor>> Es_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -27,15 +27,15 @@ struct Weights { EncForwardGRU(const EncForwardGRU&) = delete; EncForwardGRU(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> W_; - const std::shared_ptr<mblas::Matrix> B_; - const std::shared_ptr<mblas::Matrix> U_; - const std::shared_ptr<mblas::Matrix> Wx_; - const std::shared_ptr<mblas::Matrix> Bx1_; - const std::shared_ptr<mblas::Matrix> Bx2_; - const std::shared_ptr<mblas::Matrix> Ux_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> W_; + const std::shared_ptr<mblas::Tensor> B_; + const std::shared_ptr<mblas::Tensor> U_; + const std::shared_ptr<mblas::Tensor> Wx_; + const std::shared_ptr<mblas::Tensor> Bx1_; + const std::shared_ptr<mblas::Tensor> Bx2_; + const std::shared_ptr<mblas::Tensor> Ux_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const 
std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -43,15 +43,15 @@ struct Weights { EncBackwardGRU(const EncBackwardGRU&) = delete; EncBackwardGRU(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> W_; - const std::shared_ptr<mblas::Matrix> B_; - const std::shared_ptr<mblas::Matrix> U_; - const std::shared_ptr<mblas::Matrix> Wx_; - const std::shared_ptr<mblas::Matrix> Bx1_; - const std::shared_ptr<mblas::Matrix> Bx2_; - const std::shared_ptr<mblas::Matrix> Ux_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> W_; + const std::shared_ptr<mblas::Tensor> B_; + const std::shared_ptr<mblas::Tensor> U_; + const std::shared_ptr<mblas::Tensor> Wx_; + const std::shared_ptr<mblas::Tensor> Bx1_; + const std::shared_ptr<mblas::Tensor> Bx2_; + const std::shared_ptr<mblas::Tensor> Ux_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -59,14 +59,14 @@ struct Weights { EncForwardLSTM(const EncForwardLSTM&) = delete; EncForwardLSTM(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> W_; - const std::shared_ptr<mblas::Matrix> B_; - const std::shared_ptr<mblas::Matrix> U_; - const std::shared_ptr<mblas::Matrix> Wx_; - const std::shared_ptr<mblas::Matrix> Bx_; - const std::shared_ptr<mblas::Matrix> Ux_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> W_; + const std::shared_ptr<mblas::Tensor> B_; + const std::shared_ptr<mblas::Tensor> U_; + const std::shared_ptr<mblas::Tensor> Wx_; + const std::shared_ptr<mblas::Tensor> Bx_; + const std::shared_ptr<mblas::Tensor> Ux_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const 
std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -74,14 +74,14 @@ struct Weights { EncBackwardLSTM(const EncBackwardLSTM&) = delete; EncBackwardLSTM(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> W_; - const std::shared_ptr<mblas::Matrix> B_; - const std::shared_ptr<mblas::Matrix> U_; - const std::shared_ptr<mblas::Matrix> Wx_; - const std::shared_ptr<mblas::Matrix> Bx_; - const std::shared_ptr<mblas::Matrix> Ux_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> W_; + const std::shared_ptr<mblas::Tensor> B_; + const std::shared_ptr<mblas::Tensor> U_; + const std::shared_ptr<mblas::Tensor> Wx_; + const std::shared_ptr<mblas::Tensor> Bx_; + const std::shared_ptr<mblas::Tensor> Ux_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -90,7 +90,7 @@ struct Weights { DecEmbeddings(const DecEmbeddings&) = delete; DecEmbeddings(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> E_; + const std::shared_ptr<mblas::Tensor> E_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -99,9 +99,9 @@ struct Weights { DecInit(const DecInit&) = delete; DecInit(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> Wi_; - const std::shared_ptr<mblas::Matrix> Bi_; - const std::shared_ptr<mblas::Matrix> Gamma_; + const std::shared_ptr<mblas::Tensor> Wi_; + const std::shared_ptr<mblas::Tensor> Bi_; + const std::shared_ptr<mblas::Tensor> Gamma_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -110,15 +110,15 @@ struct Weights { DecGRU1(const DecGRU1&) = delete; DecGRU1(const NpzConverter& model); 
- const std::shared_ptr<mblas::Matrix> W_; - const std::shared_ptr<mblas::Matrix> B_; - const std::shared_ptr<mblas::Matrix> U_; - const std::shared_ptr<mblas::Matrix> Wx_; - const std::shared_ptr<mblas::Matrix> Bx1_; - const std::shared_ptr<mblas::Matrix> Bx2_; - const std::shared_ptr<mblas::Matrix> Ux_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> W_; + const std::shared_ptr<mblas::Tensor> B_; + const std::shared_ptr<mblas::Tensor> U_; + const std::shared_ptr<mblas::Tensor> Wx_; + const std::shared_ptr<mblas::Tensor> Bx1_; + const std::shared_ptr<mblas::Tensor> Bx2_; + const std::shared_ptr<mblas::Tensor> Ux_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -127,15 +127,15 @@ struct Weights { DecGRU2(const DecGRU2&) = delete; DecGRU2(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> W_; - const std::shared_ptr<mblas::Matrix> B_; - const std::shared_ptr<mblas::Matrix> U_; - const std::shared_ptr<mblas::Matrix> Wx_; - const std::shared_ptr<mblas::Matrix> Bx2_; - const std::shared_ptr<mblas::Matrix> Bx1_; - const std::shared_ptr<mblas::Matrix> Ux_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> W_; + const std::shared_ptr<mblas::Tensor> B_; + const std::shared_ptr<mblas::Tensor> U_; + const std::shared_ptr<mblas::Tensor> Wx_; + const std::shared_ptr<mblas::Tensor> Bx2_; + const std::shared_ptr<mblas::Tensor> Bx1_; + const std::shared_ptr<mblas::Tensor> Ux_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -144,14 +144,14 @@ struct Weights { DecLSTM1(const 
DecLSTM1&) = delete; DecLSTM1(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> W_; - const std::shared_ptr<mblas::Matrix> B_; - const std::shared_ptr<mblas::Matrix> U_; - const std::shared_ptr<mblas::Matrix> Wx_; - const std::shared_ptr<mblas::Matrix> Bx_; - const std::shared_ptr<mblas::Matrix> Ux_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> W_; + const std::shared_ptr<mblas::Tensor> B_; + const std::shared_ptr<mblas::Tensor> U_; + const std::shared_ptr<mblas::Tensor> Wx_; + const std::shared_ptr<mblas::Tensor> Bx_; + const std::shared_ptr<mblas::Tensor> Ux_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -160,14 +160,14 @@ struct Weights { DecLSTM2(const DecLSTM2&) = delete; DecLSTM2(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> W_; - const std::shared_ptr<mblas::Matrix> B_; - const std::shared_ptr<mblas::Matrix> U_; - const std::shared_ptr<mblas::Matrix> Wx_; - const std::shared_ptr<mblas::Matrix> Bx_; - const std::shared_ptr<mblas::Matrix> Ux_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> W_; + const std::shared_ptr<mblas::Tensor> B_; + const std::shared_ptr<mblas::Tensor> U_; + const std::shared_ptr<mblas::Tensor> Wx_; + const std::shared_ptr<mblas::Tensor> Bx_; + const std::shared_ptr<mblas::Tensor> Ux_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -184,10 +184,10 @@ struct Weights { Um_(model.get(p(prefix, "Um"), true)), Bmu_(model.get(p(prefix, "bmu"), true, true)) {} - const std::shared_ptr<mblas::Matrix> Wm_; - const 
std::shared_ptr<mblas::Matrix> Bm_; - const std::shared_ptr<mblas::Matrix> Um_; - const std::shared_ptr<mblas::Matrix> Bmu_; + const std::shared_ptr<mblas::Tensor> Wm_; + const std::shared_ptr<mblas::Tensor> Bm_; + const std::shared_ptr<mblas::Tensor> Um_; + const std::shared_ptr<mblas::Tensor> Bmu_; private: std::string p(std::string prefix, std::string sufix){ return prefix + "_" + sufix; @@ -200,13 +200,13 @@ struct Weights { DecAlignment(const DecAlignment&) = delete; DecAlignment(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> V_; - const std::shared_ptr<mblas::Matrix> W_; - const std::shared_ptr<mblas::Matrix> B_; - const std::shared_ptr<mblas::Matrix> U_; - const std::shared_ptr<mblas::Matrix> C_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> V_; + const std::shared_ptr<mblas::Tensor> W_; + const std::shared_ptr<mblas::Tensor> B_; + const std::shared_ptr<mblas::Tensor> U_; + const std::shared_ptr<mblas::Tensor> C_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -215,17 +215,17 @@ struct Weights { DecSoftmax(const DecSoftmax&) = delete; DecSoftmax(const NpzConverter& model); - const std::shared_ptr<mblas::Matrix> W1_; - const std::shared_ptr<mblas::Matrix> B1_; - const std::shared_ptr<mblas::Matrix> W2_; - const std::shared_ptr<mblas::Matrix> B2_; - const std::shared_ptr<mblas::Matrix> W3_; - const std::shared_ptr<mblas::Matrix> B3_; - const std::shared_ptr<mblas::Matrix> W4_; - const std::shared_ptr<mblas::Matrix> B4_; - const std::shared_ptr<mblas::Matrix> Gamma_0_; - const std::shared_ptr<mblas::Matrix> Gamma_1_; - const std::shared_ptr<mblas::Matrix> Gamma_2_; + const std::shared_ptr<mblas::Tensor> W1_; + const std::shared_ptr<mblas::Tensor> B1_; + const std::shared_ptr<mblas::Tensor> W2_; + 
const std::shared_ptr<mblas::Tensor> B2_; + const std::shared_ptr<mblas::Tensor> W3_; + const std::shared_ptr<mblas::Tensor> B3_; + const std::shared_ptr<mblas::Tensor> W4_; + const std::shared_ptr<mblas::Tensor> B4_; + const std::shared_ptr<mblas::Tensor> Gamma_0_; + const std::shared_ptr<mblas::Tensor> Gamma_1_; + const std::shared_ptr<mblas::Tensor> Gamma_2_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/amun/gpu/dl4mt/multiplicative.h b/src/amun/gpu/dl4mt/multiplicative.h index e3265cd0..644413d9 100644 --- a/src/amun/gpu/dl4mt/multiplicative.h +++ b/src/amun/gpu/dl4mt/multiplicative.h @@ -1,7 +1,7 @@ #pragma once #include <boost/timer/timer.hpp> -#include "gpu/mblas/matrix_functions.h" -#include "gpu/mblas/matrix_wrapper.h" +#include "gpu/mblas/tensor_functions.h" +#include "gpu/mblas/tensor_wrapper.h" #include "gpu/mblas/handles.h" #include "gpu/dl4mt/cell.h" #include "cellstate.h" @@ -19,7 +19,7 @@ class Multiplicative: public Cell { {} virtual void GetNextState(CellState& NextState, const CellState& State, - const mblas::Matrix& Context) const { + const mblas::Tensor& Context) const { using namespace mblas; // TODO: the weight matrix naming probably is inconsistent /* HANDLE_ERROR( cudaStreamSynchronize(mblas::CudaStreamHandler::GetStream())); */ @@ -39,7 +39,7 @@ class Multiplicative: public Cell { private: CellType<InnerWeights> innerCell_; const Weights::MultWeights<InnerWeights>& w_; - mutable mblas::Matrix x_mult_; + mutable mblas::Tensor x_mult_; mutable CellState tempState_; }; } diff --git a/src/amun/gpu/mblas/nth_element.cu b/src/amun/gpu/mblas/nth_element.cu index 6df073c6..2f979169 100644 --- a/src/amun/gpu/mblas/nth_element.cu +++ b/src/amun/gpu/mblas/nth_element.cu @@ -1,9 +1,9 @@ #include <iostream> #include "common/utils.h" -#include "matrix_wrapper.h" +#include "tensor_wrapper.h" #include "vector_wrapper.h" #include "nth_element.h" -#include "matrix_functions.h" 
+#include "tensor_functions.h" using namespace std; @@ -29,7 +29,7 @@ NthElement::~NthElement() //cerr << "FOO2" << endl; } -void NthElement::getNBestList(const std::vector<unsigned>& beamSizes, mblas::Matrix& Probs, +void NthElement::getNBestList(const std::vector<unsigned>& beamSizes, mblas::Tensor& Probs, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, const bool isFirst) { /* @@ -75,7 +75,7 @@ void NthElement::getNBestList(const std::vector<unsigned>& beamSizes, mblas::Mat //cerr << "outKeys=" << Debug(outKeys, 2) << endl; } -void NthElement::getNBestList(mblas::Matrix &probs, +void NthElement::getNBestList(mblas::Tensor &probs, const std::vector<unsigned>& batchFirstElementIdxs, const std::vector<unsigned>& cummulatedBeamSizes) { @@ -100,7 +100,7 @@ void NthElement::getNBestList(mblas::Matrix &probs, cudaMemcpyHostToDevice); mblas::VectorWrapper<NthOut> outWrap(d_out); - mblas::MatrixWrapper<float> probsWrap(probs); + mblas::TensorWrapper<float> probsWrap(probs); mblas::VectorWrapper<unsigned> batchPositionWrap(d_batchPosition); mblas::VectorWrapper<NthOut> resWrap(d_res); mblas::VectorWrapper<unsigned> cumBeamSizesWrap(d_cumBeamSizes); @@ -153,7 +153,7 @@ void NthElement::GetPairs(unsigned number, } } -void NthElement::getValueByKey(std::vector<float>& out, const mblas::Matrix &d_in) const +void NthElement::getValueByKey(std::vector<float>& out, const mblas::Tensor &d_in) const { // need a model with multiple scorers to test this method assert(false); @@ -161,7 +161,7 @@ void NthElement::getValueByKey(std::vector<float>& out, const mblas::Matrix &d_i out.resize(d_breakdown.size()); //mblas::VectorWrapper<float> breakdownWrap(d_breakdown); - //const mblas::MatrixWrapper<float> inWrap(d_in); + //const mblas::TensorWrapper<float> inWrap(d_in); //gGetValueByKey<<<1, lastN_, 0, stream_>>> // (breakdownWrap, inWrap, h_res_idx, lastN_); /* diff --git a/src/amun/gpu/mblas/nth_element.h b/src/amun/gpu/mblas/nth_element.h index 9bce6b24..5cc9a9d8 100644 
--- a/src/amun/gpu/mblas/nth_element.h +++ b/src/amun/gpu/mblas/nth_element.h @@ -4,7 +4,7 @@ #include <algorithm> #include <cuda.h> -#include "gpu/mblas/matrix.h" +#include "gpu/mblas/tensor.h" #include "gpu/mblas/vector.h" #include "nth_element_kernels.h" @@ -21,7 +21,7 @@ class NthElement { // standard nth_element void getNBestList(const std::vector<unsigned>& beamSizes, - mblas::Matrix& Probs, + mblas::Tensor& Probs, std::vector<float>& outCosts, std::vector<unsigned>& outKeys, const bool isFirst=false); @@ -30,7 +30,7 @@ class NthElement { std::vector<unsigned>& outKeys, std::vector<float>& outValues); - void getValueByKey(std::vector<float>& out, const mblas::Matrix &d_in) const; + void getValueByKey(std::vector<float>& out, const mblas::Tensor &d_in) const; private: const unsigned BLOCK_SIZE = 512; @@ -46,7 +46,7 @@ class NthElement { unsigned maxBeamSize_, maxBatchSize_; - void getNBestList(mblas::Matrix &probs, + void getNBestList(mblas::Tensor &probs, const std::vector<unsigned>& batchFirstElementIdxs, const std::vector<unsigned>& cummulatedBeamSizes); diff --git a/src/amun/gpu/mblas/nth_element_kernels.cu b/src/amun/gpu/mblas/nth_element_kernels.cu index f7707f71..5c1ae460 100644 --- a/src/amun/gpu/mblas/nth_element_kernels.cu +++ b/src/amun/gpu/mblas/nth_element_kernels.cu @@ -20,7 +20,7 @@ void UnrollMaxArgLoop(unsigned n, unsigned max, unsigned tid, float *sdata, unsi } __global__ void gMaxElement(mblas::VectorWrapper<NthOut> out, - const mblas::MatrixWrapper<float> probsWrap, + const mblas::TensorWrapper<float> probsWrap, const mblas::VectorWrapper<unsigned> batchPositionWrap, unsigned numBatches) { extern __shared__ float sdata[]; @@ -98,7 +98,7 @@ __global__ void gMaxElement(mblas::VectorWrapper<NthOut> out, } __global__ void gMaxElementUpdate(mblas::VectorWrapper<NthOut> out, - mblas::MatrixWrapper<float> probsWrap, + mblas::TensorWrapper<float> probsWrap, mblas::VectorWrapper<NthOut> resWrap, const mblas::VectorWrapper<unsigned> 
batchPositionWrap, const mblas::VectorWrapper<unsigned> cumBeamSizesWrap, @@ -253,8 +253,8 @@ __global__ void gMaxElementUpdate(mblas::VectorWrapper<NthOut> out, } } -__global__ void gGetValueByKey(mblas::MatrixWrapper<float> out, - const mblas::MatrixWrapper<float> in, +__global__ void gGetValueByKey(mblas::TensorWrapper<float> out, + const mblas::TensorWrapper<float> in, unsigned* indices, unsigned n) { unsigned tid = threadIdx.x + blockDim.x * blockIdx.x; diff --git a/src/amun/gpu/mblas/nth_element_kernels.h b/src/amun/gpu/mblas/nth_element_kernels.h index aeefcdd7..1be357e0 100644 --- a/src/amun/gpu/mblas/nth_element_kernels.h +++ b/src/amun/gpu/mblas/nth_element_kernels.h @@ -1,6 +1,6 @@ #pragma once -#include "matrix_wrapper.h" +#include "tensor_wrapper.h" #include "vector_wrapper.h" namespace amunmt { @@ -47,7 +47,7 @@ struct NthOutBatch __device__ __host__ NthOutBatch(const float& rhs) { - // only to be used to init variable in matrix.h gSum + // only to be used to init variable in tensor.h gSum assert(rhs == 0.0f); ind = rhs; score = rhs; @@ -111,19 +111,19 @@ inline std::ostream& operator<<(std::ostream &out, const NthOutBatch &obj) ///////////////////////////////////////////////////////////////////////////////////////// __global__ void gMaxElement(mblas::VectorWrapper<NthOut> out, - const mblas::MatrixWrapper<float> probsWrap, + const mblas::TensorWrapper<float> probsWrap, const mblas::VectorWrapper<unsigned> batchPositionWrap, unsigned numBatches); __global__ void gMaxElementUpdate(mblas::VectorWrapper<NthOut> out, - mblas::MatrixWrapper<float> probsWrap, + mblas::TensorWrapper<float> probsWrap, mblas::VectorWrapper<NthOut> resWrap, const mblas::VectorWrapper<unsigned> batchPositionWrap, const mblas::VectorWrapper<unsigned> cumBeamSizesWrap, unsigned numBlocks); -__global__ void gGetValueByKey(mblas::MatrixWrapper<float> out, - const mblas::MatrixWrapper<float> in, +__global__ void gGetValueByKey(mblas::TensorWrapper<float> out, + const 
mblas::TensorWrapper<float> in, unsigned* indices, unsigned n); } diff --git a/src/amun/gpu/mblas/matrix.cu b/src/amun/gpu/mblas/tensor.cu index 98946daa..303dce1c 100644 --- a/src/amun/gpu/mblas/matrix.cu +++ b/src/amun/gpu/mblas/tensor.cu @@ -1,4 +1,4 @@ -#include "matrix.h" +#include "tensor.h" using namespace std; diff --git a/src/amun/gpu/mblas/matrix.h b/src/amun/gpu/mblas/tensor.h index a79f6c47..231f9aff 100644 --- a/src/amun/gpu/mblas/matrix.h +++ b/src/amun/gpu/mblas/tensor.h @@ -6,7 +6,7 @@ #include <thrust/functional.h> #include "common/exception.h" -#include "common/base_matrix.h" +#include "common/base_tensor.h" #include "gpu/types-gpu.h" #include "handles.h" #include "vector.h" @@ -52,11 +52,11 @@ T Sum(const T *data, unsigned count) /////////////////////////////////////////////////////////////////////////////////////////////////// template <typename T> -class TMatrix : public BaseMatrix { +class TTensor : public BaseTensor { public: typedef T value_type; - TMatrix() + TTensor() { dim_[0] = 0; dim_[1] = 0; @@ -64,7 +64,7 @@ class TMatrix : public BaseMatrix { dim_[3] = 0; } - TMatrix(unsigned rows, unsigned cols, unsigned c, unsigned d, bool zero = false) + TTensor(unsigned rows, unsigned cols, unsigned c, unsigned d, bool zero = false) { dim_[0] = rows; dim_[1] = cols; @@ -79,13 +79,13 @@ class TMatrix : public BaseMatrix { } } - TMatrix(TMatrix&& m) - : TMatrix() + TTensor(TTensor&& m) + : TTensor() { swap(m); } - TMatrix(const TMatrix& m) + TTensor(const TTensor& m) : vec_(m.vec_) { dim_[0] = m.dim_[0]; @@ -94,7 +94,7 @@ class TMatrix : public BaseMatrix { dim_[3] = m.dim_[3]; } - ~TMatrix() + ~TTensor() { } @@ -131,7 +131,7 @@ class TMatrix : public BaseMatrix { virtual std::string Debug(unsigned verbosity = 1) const { std::stringstream strm; - strm << BaseMatrix::Debug(verbosity) << " "; + strm << BaseTensor::Debug(verbosity) << " "; strm << vec_.data() << " " << vec_.size() << " " << vec_.maxSize() << " " @@ -170,7 +170,7 @@ class TMatrix : 
public BaseMatrix { return vec_.data(); } - void swap(TMatrix &other) + void swap(TTensor &other) { std::swap(dim_, other.dim_); vec_.swap(other.vec_); @@ -181,8 +181,7 @@ class TMatrix : public BaseMatrix { Vector<T> vec_; }; -typedef TMatrix<float> Matrix; -typedef TMatrix<unsigned> IMatrix; +typedef TTensor<float> Tensor; } // namespace mblas diff --git a/src/amun/gpu/mblas/matrix_functions.cu b/src/amun/gpu/mblas/tensor_functions.cu index 5a30c0c2..aada422c 100644 --- a/src/amun/gpu/mblas/matrix_functions.cu +++ b/src/amun/gpu/mblas/tensor_functions.cu @@ -1,4 +1,4 @@ -#include "gpu/mblas/matrix_functions.h" +#include "gpu/mblas/tensor_functions.h" #include "gpu/mblas/handles.h" using namespace std; @@ -11,13 +11,13 @@ thread_local CudaStreamHandler CudaStreamHandler::instance_; thread_local CublasHandler CublasHandler::instance_; -Matrix& Swap(Matrix& Out, Matrix& In) { +Tensor& Swap(Tensor& Out, Tensor& In) { Out.swap(In); return Out; } -__global__ void gMean(MatrixWrapper<float> out, - const MatrixWrapper<float> in, +__global__ void gMean(TensorWrapper<float> out, + const TensorWrapper<float> in, const VectorWrapper<unsigned> sentenceLengths) { // out = batches * states @@ -51,8 +51,8 @@ __global__ void gMean(MatrixWrapper<float> out, } } -void Mean(Matrix& Out, - const Matrix& In, +void Mean(Tensor& Out, + const Tensor& In, const mblas::Vector<unsigned> &sentenceLengths) { assert(Out.dim(2) == 1); @@ -65,8 +65,8 @@ void Mean(Matrix& Out, unsigned stateLength = Out.dim(1); unsigned sentenceLength = (In.dim(0) * In.dim(2) * In.dim(3)) / batchNum; - MatrixWrapper<float> outWrap(Out); - MatrixWrapper<float> inWrap(In); + TensorWrapper<float> outWrap(Out); + TensorWrapper<float> inWrap(In); //cerr << "outWrap=" << outWrap.Debug() << endl; VectorWrapper<unsigned> sentenceLengthsWrap(sentenceLengths); @@ -81,9 +81,9 @@ void Mean(Matrix& Out, } -__global__ void gWeightedMean(MatrixWrapper<float> out, - const MatrixWrapper<float> weights, - const 
MatrixWrapper<float> in, +__global__ void gWeightedMean(TensorWrapper<float> out, + const TensorWrapper<float> weights, + const TensorWrapper<float> in, const VectorWrapper<unsigned> mapping ) { @@ -107,16 +107,16 @@ __global__ void gWeightedMean(MatrixWrapper<float> out, } } -void WeightedMean(Matrix& Out,const Matrix& Weights, const Matrix& In, const mblas::Vector<unsigned>& mapping) +void WeightedMean(Tensor& Out,const Tensor& Weights, const Tensor& In, const mblas::Vector<unsigned>& mapping) { int numHypos = Weights.dim(0); int states = In.dim(1); Out.NewSize(numHypos, states); - MatrixWrapper<float> outWrap(Out); - MatrixWrapper<float> weightsWrap(Weights); - MatrixWrapper<float> inWrap(In); + TensorWrapper<float> outWrap(Out); + TensorWrapper<float> weightsWrap(Weights); + TensorWrapper<float> inWrap(In); VectorWrapper<unsigned> mappingWrap(mapping); unsigned size = Out.size(); @@ -140,7 +140,7 @@ void WeightedMean(Matrix& Out,const Matrix& Weights, const Matrix& In, const mbl */ } -Matrix& Transpose(Matrix& Out, const Matrix& In) { +Tensor& Transpose(Tensor& Out, const Tensor& In) { unsigned m = In.dim(0); unsigned n = In.dim(1); @@ -155,14 +155,14 @@ Matrix& Transpose(Matrix& Out, const Matrix& In) { return Out; } -Matrix& Transpose(Matrix& Out) { - thread_local Matrix Temp; +Tensor& Transpose(Tensor& Out) { + thread_local Tensor Temp; Transpose(Temp, Out); Swap(Out, Temp); return Out; } -Matrix& Concat(Matrix& Out, const Matrix& In) { +Tensor& Concat(Tensor& Out, const Tensor& In) { unsigned oldSize = Out.size(); Out.Resize(Out.dim(0) + In.dim(0), Out.dim(1)); @@ -171,7 +171,7 @@ Matrix& Concat(Matrix& Out, const Matrix& In) { return Out; } -Matrix& Copy(Matrix& Out, const Matrix& In) { +Tensor& Copy(Tensor& Out, const Tensor& In) { Out.NewSize(In.dim(0), In.dim(1), In.dim(2), In.dim(3)); mblas::copy(In.data(), In.size(), Out.data(), cudaMemcpyDeviceToDevice); @@ -179,8 +179,8 @@ Matrix& Copy(Matrix& Out, const Matrix& In) { return Out; } -__global__ void 
gPasteRows(MatrixWrapper<float> out, - const MatrixWrapper<float> in, +__global__ void gPasteRows(TensorWrapper<float> out, + const TensorWrapper<float> in, int rowNo, int colNo) { int inRows = in.dim(0); @@ -198,10 +198,10 @@ __global__ void gPasteRows(MatrixWrapper<float> out, } } -void PasteRows(Matrix& Out, const Matrix& In, const unsigned rowNo, unsigned colNo) +void PasteRows(Tensor& Out, const Tensor& In, const unsigned rowNo, unsigned colNo) { - MatrixWrapper<float> outWrap(Out); - MatrixWrapper<float> inWrap(In); + TensorWrapper<float> outWrap(Out); + TensorWrapper<float> inWrap(In); unsigned size = In.size(); unsigned nThreads = std::min((unsigned) MAX_THREADS, (unsigned)size); @@ -213,8 +213,8 @@ void PasteRows(Matrix& Out, const Matrix& In, const unsigned rowNo, unsigned col } -Matrix& PasteRow(Matrix& Out, - const Matrix& In, +Tensor& PasteRow(Tensor& Out, + const Tensor& In, const unsigned r, const unsigned c) { unsigned start = r * Out.dim(1) + c; @@ -224,8 +224,8 @@ Matrix& PasteRow(Matrix& Out, return Out; } -Matrix& CopyRow(Matrix& Out, - const Matrix& In, +Tensor& CopyRow(Tensor& Out, + const Tensor& In, const unsigned r, const unsigned c) { unsigned length = In.dim(1) - c; Out.NewSize(1, length); @@ -238,8 +238,8 @@ Matrix& CopyRow(Matrix& Out, return Out; } -__global__ void gCopyRows(MatrixWrapper<float> out, - const MatrixWrapper<float> in, +__global__ void gCopyRows(TensorWrapper<float> out, + const TensorWrapper<float> in, const VectorWrapper<unsigned> indicesWrap) { int id = threadIdx.x + blockIdx.x * blockDim.x; @@ -256,8 +256,8 @@ __global__ void gCopyRows(MatrixWrapper<float> out, } } -Matrix& CopyRows(Matrix& Out, - const Matrix& In, +Tensor& CopyRows(Tensor& Out, + const Tensor& In, const mblas::Vector<unsigned>& indices) { assert(In.dim(1) == Out.dim(1)); @@ -279,8 +279,8 @@ Matrix& CopyRows(Matrix& Out, unsigned numPairs = indices.size(); - MatrixWrapper<float> outWrap(Out); - const MatrixWrapper<float> inWrap(In); + 
TensorWrapper<float> outWrap(Out); + const TensorWrapper<float> inWrap(In); const VectorWrapper<unsigned> indicesWrap(indices); //cerr << "size=" << size << endl; @@ -295,8 +295,8 @@ Matrix& CopyRows(Matrix& Out, } -Matrix& Assemble(Matrix& Out, - const Matrix& In, +Tensor& Assemble(Tensor& Out, + const Tensor& In, const mblas::Vector<unsigned>& indices) { Out.NewSize(indices.size(), In.dim(1)); //cerr << "Assemble=" << Out.Debug() << " " << In.Debug() << indices.size() << endl; @@ -305,8 +305,8 @@ Matrix& Assemble(Matrix& Out, return Out; } -__global__ void gSlice(MatrixWrapper<float> out, - const MatrixWrapper<float> in, +__global__ void gSlice(TensorWrapper<float> out, + const TensorWrapper<float> in, unsigned n, unsigned dim) { unsigned row = blockIdx.x; @@ -323,8 +323,8 @@ __global__ void gSlice(MatrixWrapper<float> out, } -Matrix& Slice(Matrix& Out, - const Matrix& In, +Tensor& Slice(Tensor& Out, + const Tensor& In, unsigned n, unsigned dim) { assert(In.dim(2) == 1); @@ -332,8 +332,8 @@ Matrix& Slice(Matrix& Out, Out.NewSize(In.dim(0), dim); - MatrixWrapper<float> outWrap(Out); - const MatrixWrapper<float> inWrap(In); + TensorWrapper<float> outWrap(Out); + const TensorWrapper<float> inWrap(In); /* cerr << "outWrap=" << outWrap.Debug() << endl; @@ -353,13 +353,13 @@ Matrix& Slice(Matrix& Out, return Out; } -Matrix& Prod(cublasHandle_t handle, Matrix& C, const Matrix& A, const Matrix& B, bool transB) +Tensor& Prod(cublasHandle_t handle, Tensor& C, const Tensor& A, const Tensor& B, bool transB) { BEGIN_TIMER("Prod"); assert((A.dim(2) == A.dim(3) == 1) || (B.dim(2) == B.dim(3) == 1)); - Matrix::value_type alpha = 1.0; - Matrix::value_type beta = 0.0; + Tensor::value_type alpha = 1.0; + Tensor::value_type beta = 0.0; unsigned m = A.dim(0) * A.dim(2) * A.dim(3); unsigned k = A.dim(1); @@ -402,6 +402,7 @@ Matrix& Prod(cublasHandle_t handle, Matrix& C, const Matrix& A, const Matrix& B, cerr << "B=" << B.Debug(0) << endl; cerr << "transB=" << transB << endl; cerr << m 
<< " " << n << " " << k << endl; + cerr << lda << " " << ldb << " " << ldc << endl; cerr << endl; */ bool transA = false; @@ -419,20 +420,20 @@ Matrix& Prod(cublasHandle_t handle, Matrix& C, const Matrix& A, const Matrix& B, return C; } -Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B, +Tensor& Prod(Tensor& C, const Tensor& A, const Tensor& B, bool transB) { //std::cerr << "1C=" << C.Debug() << std::endl; //std::cerr << "1A=" << A.Debug() << std::endl; //std::cerr << "1B=" << B.Debug() << std::endl; - Matrix &ret = Prod(CublasHandler::GetHandle(), C, A, B, transB); + Tensor &ret = Prod(CublasHandler::GetHandle(), C, A, B, transB); //std::cerr << "2C=" << C.Debug() << std::endl; return ret; } -__global__ void gSoftMax(MatrixWrapper<float> out, +__global__ void gSoftMax(TensorWrapper<float> out, const VectorWrapper<unsigned> batchIdsWrap, const VectorWrapper<unsigned> sentenceLengthsWrap, unsigned shareSize) @@ -517,14 +518,14 @@ __global__ void gSoftMax(MatrixWrapper<float> out, } } -Matrix& Softmax(Matrix& Out, +Tensor& Softmax(Tensor& Out, const mblas::Vector<unsigned>& batchIds, const mblas::Vector<unsigned> &sentenceLengths, unsigned batchSize) { unsigned maxLength = Out.dim(1); - MatrixWrapper<float> outWrap(Out); + TensorWrapper<float> outWrap(Out); const VectorWrapper<unsigned> batchIdsWrap(batchIds); const VectorWrapper<unsigned> sentenceLengthsWrap(sentenceLengths); @@ -539,7 +540,7 @@ Matrix& Softmax(Matrix& Out, return Out; } -__global__ void gLogSoftMax(MatrixWrapper<float> out, unsigned shareSize) +__global__ void gLogSoftMax(TensorWrapper<float> out, unsigned shareSize) { extern __shared__ float _share[]; @@ -619,9 +620,9 @@ __global__ void gLogSoftMax(MatrixWrapper<float> out, unsigned shareSize) } -Matrix& LogSoftmax(Matrix& Out) +Tensor& LogSoftmax(Tensor& Out) { - MatrixWrapper<float> outWrap(Out); + TensorWrapper<float> outWrap(Out); int blocks = std::min(MAX_BLOCKS, (int)Out.dim(0)); int threads = std::min(MAX_THREADS, (int)Out.dim(1)); 
@@ -634,7 +635,7 @@ Matrix& LogSoftmax(Matrix& Out) return Out; } -__global__ void gSetColumn(MatrixWrapper<float> in, int noColumn, float value) { +__global__ void gSetColumn(TensorWrapper<float> in, int noColumn, float value) { int n_rows = in.dim(0); int rowNumber = threadIdx.x + blockDim.x * blockIdx.x; @@ -644,33 +645,33 @@ __global__ void gSetColumn(MatrixWrapper<float> in, int noColumn, float value) { } } -void SetColumn(Matrix& In, int noColumn, float value) { +void SetColumn(Tensor& In, int noColumn, float value) { int nRows = In.dim(0); int nBlocks = nRows / MAX_THREADS + ((nRows % MAX_THREADS == 0) ? 0 : 1); int nThreads = std::min(MAX_THREADS, nRows); - MatrixWrapper<float> inWrap(In); + TensorWrapper<float> inWrap(In); gSetColumn<<<nBlocks, nThreads, 0, mblas::CudaStreamHandler::GetStream()>>> (inWrap, noColumn, value); HANDLE_ERROR(cudaGetLastError()); } -__global__ void gFill(MatrixWrapper<float> in, float val) { +__global__ void gFill(TensorWrapper<float> in, float val) { int index = threadIdx.x + blockDim.x * blockIdx.x; if (index < in.size()) { in[index] = val; } } -void Fill(Matrix& In, float value) { +void Fill(Tensor& In, float value) { unsigned size = In.size(); if (value) { int nThreads = std::min(MAX_THREADS, (int)size); int nBlocks = (size / nThreads) + ((size % nThreads == 0) ? 
0 : 1); - MatrixWrapper<float> inWrap(In); + TensorWrapper<float> inWrap(In); gFill<<<nBlocks, nThreads, 0, CudaStreamHandler::GetStream()>>> (inWrap, value); @@ -683,7 +684,7 @@ void Fill(Matrix& In, float value) { } __global__ -void gMapMatrix(MatrixWrapper<float> in, +void gMapMatrix(TensorWrapper<float> in, const VectorWrapper<unsigned> sentenceLengthsWrap, int i) { @@ -698,7 +699,7 @@ void gMapMatrix(MatrixWrapper<float> in, } } -void MapMatrix(Matrix& state, +void MapMatrix(Tensor& state, const mblas::Vector<unsigned> &sentenceLengths, unsigned i) { @@ -711,7 +712,7 @@ void MapMatrix(Matrix& state, int numThreads = std::min((int)state.size(), MAX_THREADS); int numBlocks = (state.size() / numThreads) + ((state.size() % numThreads == 0) ? 0 : 1); - MatrixWrapper<float> stateWrap(state); + TensorWrapper<float> stateWrap(state); VectorWrapper<unsigned> sentenceLengthsWrap(sentenceLengths); gMapMatrix<<<numBlocks, numThreads, 0, CudaStreamHandler::GetStream()>>> @@ -737,10 +738,10 @@ __device__ unsigned getIndex(const dim3 &dim, const dim3 &val) } -__global__ void gLNormalization(MatrixWrapper<float> out, - const MatrixWrapper<float> in, - const MatrixWrapper<float> alphaWrap, - const MatrixWrapper<float> betaWrap, +__global__ void gLNormalization(TensorWrapper<float> out, + const TensorWrapper<float> in, + const TensorWrapper<float> alphaWrap, + const TensorWrapper<float> betaWrap, float eps=0.00001) { extern __shared__ float _share[]; @@ -814,10 +815,10 @@ __global__ void gLNormalization(MatrixWrapper<float> out, } -void Normalization(Matrix &out, - const Matrix &in, - const Matrix &alpha, - const Matrix *beta, +void Normalization(Tensor &out, + const Tensor &in, + const Tensor &alpha, + const Tensor *beta, float eps) { assert(in.dim(0) < MAX_BLOCKS); @@ -830,10 +831,10 @@ void Normalization(Matrix &out, dim3 numBlocks(in.dim(0), in.dim(2), in.dim(3)); int shared = numThreads * sizeof(float) * 2; - MatrixWrapper<float> outWrap(out); - const MatrixWrapper<float> 
inWrap(in); - const MatrixWrapper<float> alphaWrap(alpha); - MatrixWrapper<float> *betaWrap = beta ? new MatrixWrapper<float>(*beta) : new MatrixWrapper<float>(); + TensorWrapper<float> outWrap(out); + const TensorWrapper<float> inWrap(in); + const TensorWrapper<float> alphaWrap(alpha); + TensorWrapper<float> *betaWrap = beta ? new TensorWrapper<float>(*beta) : new TensorWrapper<float>(); gLNormalization<<<numBlocks, numThreads, shared, CudaStreamHandler::GetStream()>>> (outWrap, inWrap, alphaWrap, *betaWrap, eps); @@ -853,13 +854,13 @@ void Normalization(Matrix &out, delete betaWrap; } -void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, const Matrix& beta, +void Normalization(Tensor& out, const Tensor& in, const Tensor& alpha, const Tensor& beta, float eps) { Normalization(out, in, alpha, &beta, eps); } -void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, float eps) +void Normalization(Tensor& out, const Tensor& in, const Tensor& alpha, float eps) { Normalization(out, in, alpha, nullptr, eps); } @@ -927,7 +928,7 @@ void gBeamSizeInit(VectorWrapper<unsigned> hypo2BeamSizeWrap, } __device__ -float GetMaxScore(const MatrixWrapper<NthOutBatch> &nBestMatrix) +float GetMaxScore(const TensorWrapper<NthOutBatch> &nBestMatrix) { float ret = LOWEST_FLOAT; for (unsigned i = 0; i < nBestMatrix.dim(1); ++i) { @@ -1017,8 +1018,8 @@ void MergeElement(float &minScore, __device__ void NBestAndMax(VectorWrapper<NthOutBatch> &nBestCandidatesWrap, float &topScore, - const MatrixWrapper<float> &in, - const MatrixWrapper<float> &b4Wrap, + const TensorWrapper<float> &in, + const TensorWrapper<float> &b4Wrap, unsigned hypoInd, unsigned maxBeamSize, bool forbidUNK, @@ -1028,10 +1029,10 @@ void NBestAndMax(VectorWrapper<NthOutBatch> &nBestCandidatesWrap, extern __shared__ char _sharePtr[]; // placeholder for shared mem in subsequent function SumAndLogSoftMax - //MatrixWrapper<float> maxMatrix((float*)_sharePtr, blockDim.x, 1, 1, 1); + 
//TensorWrapper<float> maxMatrix((float*)_sharePtr, blockDim.x, 1, 1, 1); void *ptrOffset = _sharePtr + sizeof(float) * blockDim.x; - MatrixWrapper<NthOutBatch> nBestMatrix((NthOutBatch*)ptrOffset, blockDim.x, maxBeamSize, 1, 1); + TensorWrapper<NthOutBatch> nBestMatrix((NthOutBatch*)ptrOffset, blockDim.x, maxBeamSize, 1, 1); VectorWrapper<NthOutBatch> row = nBestMatrix.Row(threadIdx.x); unsigned vocabSize = in.dim(1); @@ -1106,8 +1107,8 @@ void NBestAndMax(VectorWrapper<NthOutBatch> &nBestCandidatesWrap, /////////////////////////////////////////////////////////////////////////////////////////////////////// __device__ void SumAndLogSoftMax(VectorWrapper<NthOutBatch> &nBestCandidatesWrap, - const MatrixWrapper<float> &in, - const MatrixWrapper<float> &b4Wrap, + const TensorWrapper<float> &in, + const TensorWrapper<float> &b4Wrap, unsigned hypoInd, unsigned maxBeamSize, float topScore, @@ -1159,8 +1160,8 @@ void SumAndLogSoftMax(VectorWrapper<NthOutBatch> &nBestCandidatesWrap, /////////////////////////////////////////////////////////////////////////////////////////////////////// __global__ void gLogSoftMax(VectorWrapper<NthOutBatch> nBestCandidatesWrap, - const MatrixWrapper<float> in, - const MatrixWrapper<float> b4Wrap, + const TensorWrapper<float> in, + const TensorWrapper<float> b4Wrap, unsigned maxBeamSize, bool forbidUNK, const VectorWrapper<unsigned> hypo2BeamSizeWrap, @@ -1204,7 +1205,7 @@ __global__ void gLogSoftMax(VectorWrapper<NthOutBatch> nBestCandidatesWrap, /////////////////////////////////////////////////////////////////////////////////////////////////////// __global__ void gNBestPerBatch(VectorWrapper<NthOutBatch> nBestWrap, VectorWrapper<NthOutBatch> nBestCandidatesWrap, - const MatrixWrapper<float> in, + const TensorWrapper<float> in, const VectorWrapper<float> costsWrap, unsigned maxBeamSize, bool forbidUNK, @@ -1299,8 +1300,8 @@ __global__ void gNBestPerBatch(VectorWrapper<NthOutBatch> nBestWrap, 
/////////////////////////////////////////////////////////////////////////////////////////////////////// void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest, - const Matrix& in, - const Matrix& b4, + const Tensor& in, + const Tensor& b4, const mblas::Vector<float> &costs, bool forbidUNK, unsigned maxBeamSize, @@ -1352,8 +1353,8 @@ void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest, cerr << endl; */ - MatrixWrapper<float> inWrap(in); - MatrixWrapper<float> b4Wrap(b4); + TensorWrapper<float> inWrap(in); + TensorWrapper<float> b4Wrap(b4); VectorWrapper<unsigned> hypo2BeamSizeWrap(hypo2BeamSize); VectorWrapper<unsigned> hypo2CandidateWrap(hypo2Candidate); VectorWrapper<unsigned> batch2HypoWrap(batch2Hypo); diff --git a/src/amun/gpu/mblas/matrix_functions.h b/src/amun/gpu/mblas/tensor_functions.h index b7d7a0e5..720f811b 100644 --- a/src/amun/gpu/mblas/matrix_functions.h +++ b/src/amun/gpu/mblas/tensor_functions.h @@ -10,8 +10,8 @@ #include <iostream> #include "gpu/mblas/thrust_functions.h" -#include "gpu/mblas/matrix.h" -#include "gpu/mblas/matrix_wrapper.h" +#include "gpu/mblas/tensor.h" +#include "gpu/mblas/tensor_wrapper.h" #include "gpu/mblas/handles.h" #include "gpu/mblas/nth_element_kernels.h" #include "gpu/mblas/vector_wrapper.h" @@ -73,66 +73,66 @@ void copy(const T *in, unsigned count, T *out, cudaMemcpyKind kind) { HANDLE_ERROR( cudaMemcpyAsync(out, in, count * sizeof(T), kind, CudaStreamHandler::GetStream()) ); } -void Fill(Matrix& In, float value=0.0f); +void Fill(Tensor& In, float value=0.0f); -Matrix& Swap(Matrix& Out, Matrix& In); +Tensor& Swap(Tensor& Out, Tensor& In); -void Mean(Matrix& Out, - const Matrix& In, +void Mean(Tensor& Out, + const Tensor& In, const mblas::Vector<unsigned> &sentenceLengths); -void WeightedMean(Matrix& Out,const Matrix& Weights, const Matrix& In, const mblas::Vector<unsigned>& mapping); +void WeightedMean(Tensor& Out,const Tensor& Weights, const Tensor& In, const mblas::Vector<unsigned>& mapping); -Matrix& 
Transpose(Matrix& Out, const Matrix& In); +Tensor& Transpose(Tensor& Out, const Tensor& In); -Matrix& Transpose(Matrix& Out); +Tensor& Transpose(Tensor& Out); -Matrix& Copy(Matrix& Out, const Matrix& In); +Tensor& Copy(Tensor& Out, const Tensor& In); -Matrix& PasteRow(Matrix& Out, - const Matrix& In, +Tensor& PasteRow(Tensor& Out, + const Tensor& In, const unsigned r = 0, const unsigned c = 0); -void PasteRows(Matrix& Out, const Matrix& In, const unsigned rowNo, unsigned colNo=0); +void PasteRows(Tensor& Out, const Tensor& In, const unsigned rowNo, unsigned colNo=0); -Matrix& CopyRow(Matrix& Out, - const Matrix& In, +Tensor& CopyRow(Tensor& Out, + const Tensor& In, const unsigned r = 0, const unsigned c = 0); -Matrix& Concat(Matrix& Out, const Matrix& In); +Tensor& Concat(Tensor& Out, const Tensor& In); -void MapMatrix(Matrix& state, +void MapMatrix(Tensor& state, const mblas::Vector<unsigned> &sentenceLengths, unsigned i); -Matrix& CopyRows(Matrix& Out, - const Matrix& In, +Tensor& CopyRows(Tensor& Out, + const Tensor& In, const mblas::Vector<unsigned>& indices); -Matrix& Assemble(Matrix& Out, - const Matrix& In, +Tensor& Assemble(Tensor& Out, + const Tensor& In, const mblas::Vector<unsigned>& indices); -Matrix& Slice(Matrix& Out, - const Matrix& In, +Tensor& Slice(Tensor& Out, + const Tensor& In, unsigned n, unsigned dim); -Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B, +Tensor& Prod(Tensor& C, const Tensor& A, const Tensor& B, bool transB = false); -Matrix& Softmax(Matrix& Out, +Tensor& Softmax(Tensor& Out, const mblas::Vector<unsigned>& batchIds, const mblas::Vector<unsigned> &sentenceLengths, unsigned batchSize); -Matrix& LogSoftmax(Matrix& Out); +Tensor& LogSoftmax(Tensor& Out); template <class Functor> __global__ void gBroadcast(Functor functor, - MatrixWrapper<float> outWrap, - const MatrixWrapper<float> in1Wrap, - const MatrixWrapper<float> in2Wrap, + TensorWrapper<float> outWrap, + const TensorWrapper<float> in1Wrap, + const 
TensorWrapper<float> in2Wrap, const VectorWrapper<unsigned> batchMappingWrap) { int id = threadIdx.x + blockIdx.x * blockDim.x; @@ -168,10 +168,10 @@ __global__ void gBroadcast(Functor functor, } template <class Functor> -Matrix& Broadcast(Functor functor, - Matrix& out, - const Matrix& in1, - const Matrix& in2, +Tensor& Broadcast(Functor functor, + Tensor& out, + const Tensor& in1, + const Tensor& in2, const mblas::Vector<unsigned>& batchMapping, unsigned srcSize) { @@ -183,9 +183,9 @@ Matrix& Broadcast(Functor functor, out.NewSize(srcSize, cols, sumOfBeamSizes); - MatrixWrapper<float> outWrap(out); - const MatrixWrapper<float> in1Wrap(in1); - const MatrixWrapper<float> in2Wrap(in2); + TensorWrapper<float> outWrap(out); + const TensorWrapper<float> in1Wrap(in1); + const TensorWrapper<float> in2Wrap(in2); const VectorWrapper<unsigned> batchMappingWrap(batchMapping); unsigned size = out.size(); @@ -215,7 +215,7 @@ Matrix& Broadcast(Functor functor, template <class Functor> __global__ void gBroadcastVecColumn(Functor functor, - MatrixWrapper<float> outWrap, + TensorWrapper<float> outWrap, const VectorWrapper<float> inWrap) { extern __shared__ float sdataOrig[]; @@ -240,12 +240,12 @@ __global__ void gBroadcastVecColumn(Functor functor, } template <class Functor> -Matrix& BroadcastVecColumn(Functor functor, Matrix& Out, const mblas::Vector<float>& In) +Tensor& BroadcastVecColumn(Functor functor, Tensor& Out, const mblas::Vector<float>& In) { unsigned rows = Out.dim(0); unsigned cols = Out.dim(1); - MatrixWrapper<float> outWrap(Out); + TensorWrapper<float> outWrap(Out); const VectorWrapper<float> inWrap(In); int threads = std::min(MAX_THREADS, (int)cols); @@ -260,8 +260,8 @@ Matrix& BroadcastVecColumn(Functor functor, Matrix& Out, const mblas::Vector<flo template <class Functor> __global__ void gBroadcastVec(Functor functor, - MatrixWrapper<float> outWrap, - const MatrixWrapper<float> inWrap) + TensorWrapper<float> outWrap, + const TensorWrapper<float> inWrap) { 
unsigned cols = outWrap.dim(1); @@ -282,15 +282,15 @@ __global__ void gBroadcastVec(Functor functor, } template <class Functor> -Matrix& BroadcastVec(Functor functor, Matrix& Out, const Matrix& In) +Tensor& BroadcastVec(Functor functor, Tensor& Out, const Tensor& In) { //std::cerr << "Out=" << Out.Debug() << std::endl; //std::cerr << "In=" << In.Debug() << std::endl; unsigned cols = Out.dim(1); - MatrixWrapper<float> outWrap(Out); - const MatrixWrapper<float> inWrap(In); + TensorWrapper<float> outWrap(Out); + const TensorWrapper<float> inWrap(In); int threads = std::min(MAX_THREADS, (int)cols); int blocks = cols / threads + ((cols % threads == 0) ? 0 : 1); @@ -305,7 +305,7 @@ Matrix& BroadcastVec(Functor functor, Matrix& Out, const Matrix& In) template <class Functor> __global__ void gElement(Functor functor, - MatrixWrapper<float> outWrap) + TensorWrapper<float> outWrap) { unsigned ind = blockIdx.x * blockDim.x + threadIdx.x; if (ind < outWrap.size()) { @@ -314,15 +314,15 @@ __global__ void gElement(Functor functor, } template <class Functor> -Matrix& Element(Functor functor, - Matrix& Out) +Tensor& Element(Functor functor, + Tensor& Out) { unsigned size = Out.size(); unsigned threads = std::min((unsigned) MAX_THREADS, (unsigned)size); unsigned blocks = size / threads + ((size % threads == 0) ? 
0 : 1); const cudaStream_t& stream = CudaStreamHandler::GetStream(); - MatrixWrapper<float> outWrap(Out); + TensorWrapper<float> outWrap(Out); gElement<<<blocks, threads, 0, stream>>> (functor, outWrap); @@ -333,8 +333,8 @@ Matrix& Element(Functor functor, template <class Functor> __global__ void gElement(Functor functor, - MatrixWrapper<float> outWrap, - const MatrixWrapper<float> inWrap) + TensorWrapper<float> outWrap, + const TensorWrapper<float> inWrap) { unsigned ind = blockIdx.x * blockDim.x + threadIdx.x; if (ind < outWrap.size()) { @@ -343,8 +343,8 @@ __global__ void gElement(Functor functor, } template <class Functor> -Matrix& Element(Functor functor, - Matrix& Out, const Matrix& In) +Tensor& Element(Functor functor, + Tensor& Out, const Tensor& In) { assert(Out.size() == In.size()); @@ -353,8 +353,8 @@ Matrix& Element(Functor functor, unsigned blocks = size / threads + ((size % threads == 0) ? 0 : 1); const cudaStream_t& stream = CudaStreamHandler::GetStream(); - MatrixWrapper<float> outWrap(Out); - const MatrixWrapper<float> inWrap(In); + TensorWrapper<float> outWrap(Out); + const TensorWrapper<float> inWrap(In); gElement<<<blocks, threads, 0, stream>>> (functor, outWrap, inWrap); @@ -365,9 +365,9 @@ Matrix& Element(Functor functor, template <class Functor> __global__ void gElement(Functor functor, - MatrixWrapper<float> outWrap, - const MatrixWrapper<float> in1Wrap, - const MatrixWrapper<float> in2Wrap) + TensorWrapper<float> outWrap, + const TensorWrapper<float> in1Wrap, + const TensorWrapper<float> in2Wrap) { unsigned ind = blockIdx.x * blockDim.x + threadIdx.x; if (ind < outWrap.size()) { @@ -376,8 +376,8 @@ __global__ void gElement(Functor functor, } template <class Functor> -Matrix& Element(Functor functor, - Matrix& Out, const Matrix& In1, const Matrix& In2) +Tensor& Element(Functor functor, + Tensor& Out, const Tensor& In1, const Tensor& In2) { //std::cerr << "Out=" << Out.Debug() << std::endl; //std::cerr << "In1=" << In1.Debug() << std::endl; 
@@ -395,9 +395,9 @@ Matrix& Element(Functor functor, //std::cerr << "Element3=" << In1.Debug(0) << std::endl; //std::cerr << "Element3=" << In2.Debug(0) << std::endl; //std::cerr << std::endl; - MatrixWrapper<float> outWrap(Out); - const MatrixWrapper<float> in1Wrap(In1); - const MatrixWrapper<float> in2Wrap(In2); + TensorWrapper<float> outWrap(Out); + const TensorWrapper<float> in1Wrap(In1); + const TensorWrapper<float> in2Wrap(In2); //std::cerr << "outWrap=" << outWrap.Debug() << std::endl; gElement<<<blocks, threads, 0, stream>>> @@ -411,16 +411,16 @@ Matrix& Element(Functor functor, return Out; } -void SetColumn(Matrix& In, int noColumn, float value); +void SetColumn(Tensor& In, int noColumn, float value); -void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, const Matrix& beta, +void Normalization(Tensor& out, const Tensor& in, const Tensor& alpha, const Tensor& beta, float eps); -void Normalization(Matrix& out, const Matrix& in, const Matrix& alpha, float eps); +void Normalization(Tensor& out, const Tensor& in, const Tensor& alpha, float eps); void LogSoftmaxAndNBest(mblas::Vector<NthOutBatch> &nBest, - const Matrix& in, - const Matrix& b4, + const Tensor& in, + const Tensor& b4, const mblas::Vector<float> &costs, bool forbidUNK, unsigned maxBeamSize, diff --git a/src/amun/gpu/mblas/matrix_wrapper.h b/src/amun/gpu/mblas/tensor_wrapper.h index 4e50a9aa..4dff4bc7 100644 --- a/src/amun/gpu/mblas/matrix_wrapper.h +++ b/src/amun/gpu/mblas/tensor_wrapper.h @@ -1,5 +1,5 @@ #pragma once -#include "matrix.h" +#include "tensor.h" #include "vector_wrapper.h" namespace amunmt { @@ -7,10 +7,10 @@ namespace GPU { namespace mblas { template <typename T> -class MatrixWrapper +class TensorWrapper { public: - MatrixWrapper() + TensorWrapper() { dim_[0] = 0; dim_[1] = 0; @@ -22,7 +22,7 @@ public: dataConst_ = nullptr; } - MatrixWrapper(const TMatrix<T> &matrix) + TensorWrapper(const TTensor<T> &matrix) { dim_[0] = matrix.dim(0); dim_[1] = matrix.dim(1); @@ 
-34,7 +34,7 @@ public: dataConst_ = matrix.data(); } - MatrixWrapper(TMatrix<T> &matrix) + TensorWrapper(TTensor<T> &matrix) { dim_[0] = matrix.dim(0); dim_[1] = matrix.dim(1); @@ -46,7 +46,7 @@ public: dataConst_ = data_; } - MatrixWrapper(unsigned a, unsigned b, unsigned c, unsigned d) + TensorWrapper(unsigned a, unsigned b, unsigned c, unsigned d) { // test constructor dim_[0] = a; dim_[1] = b; @@ -59,7 +59,7 @@ public: } __device__ - MatrixWrapper(T *ptr, unsigned a, unsigned b, unsigned c, unsigned d) + TensorWrapper(T *ptr, unsigned a, unsigned b, unsigned c, unsigned d) { dim_[0] = a; dim_[1] = b; @@ -309,7 +309,7 @@ protected: inline void testidToMatrixInd() { - MatrixWrapper<float> matrix(2, 4, 3, 5); + TensorWrapper<float> matrix(2, 4, 3, 5); std::cerr << "matrix=" << matrix.Debug() << std::endl; diff --git a/src/amun/gpu/mblas/vector_wrapper.h b/src/amun/gpu/mblas/vector_wrapper.h index 98ccfb85..15cce201 100644 --- a/src/amun/gpu/mblas/vector_wrapper.h +++ b/src/amun/gpu/mblas/vector_wrapper.h @@ -1,6 +1,6 @@ #pragma once #include <sstream> -#include "matrix.h" +#include "tensor.h" #include "gpu/mblas/vector.h" namespace amunmt { diff --git a/src/amun/gpu/npz_converter.cu b/src/amun/gpu/npz_converter.cu index 9791206a..c808ad32 100644 --- a/src/amun/gpu/npz_converter.cu +++ b/src/amun/gpu/npz_converter.cu @@ -1,6 +1,6 @@ #include "npz_converter.h" #include "common/exception.h" -#include "mblas/matrix_functions.h" +#include "mblas/tensor_functions.h" using namespace std; @@ -33,17 +33,17 @@ T Debug(const T *data, unsigned size) return sum; } -std::shared_ptr<mblas::Matrix> NpzConverter::get(const std::string& key, bool mandatory, bool transpose) const +std::shared_ptr<mblas::Tensor> NpzConverter::get(const std::string& key, bool mandatory, bool transpose) const { //mblas::TestMemCpy(); - std::shared_ptr<mblas::Matrix> ret; + std::shared_ptr<mblas::Tensor> ret; auto it = model_.find(key); if(it != model_.end()) { NpyMatrixWrapper np(it->second); unsigned 
size = np.size(); - mblas::Matrix *matrix = new mblas::Matrix(np.size1(), np.size2(), 1, 1); + mblas::Tensor *matrix = new mblas::Tensor(np.size1(), np.size2(), 1, 1); mblas::copy(np.data(), size, matrix->data(), cudaMemcpyHostToDevice); if (transpose) { @@ -53,12 +53,12 @@ std::shared_ptr<mblas::Matrix> NpzConverter::get(const std::string& key, bool ma ret.reset(matrix); } else if (mandatory) { - std::cerr << "Error: Matrix not found:" << key << std::endl; + std::cerr << "Error: Tensor not found:" << key << std::endl; //amunmt_UTIL_THROW2(strm.str()); // << key << std::endl abort(); } else { - mblas::Matrix *matrix = new mblas::Matrix(); + mblas::Tensor *matrix = new mblas::Tensor(); ret.reset(matrix); } @@ -66,14 +66,14 @@ std::shared_ptr<mblas::Matrix> NpzConverter::get(const std::string& key, bool ma return ret; } -std::shared_ptr<mblas::Matrix> NpzConverter::getFirstOfMany(const std::vector<std::pair<std::string, bool>> keys, bool mandatory) const +std::shared_ptr<mblas::Tensor> NpzConverter::getFirstOfMany(const std::vector<std::pair<std::string, bool>> keys, bool mandatory) const { - std::shared_ptr<mblas::Matrix> ret; + std::shared_ptr<mblas::Tensor> ret; for (auto key : keys) { auto it = model_.find(key.first); if(it != model_.end()) { NpyMatrixWrapper np(it->second); - mblas::Matrix *matrix = new mblas::Matrix(np.size1(), np.size2(), 1, 1); + mblas::Tensor *matrix = new mblas::Tensor(np.size1(), np.size2(), 1, 1); mblas::copy(np.data(), np.size(), matrix->data(), cudaMemcpyHostToDevice); if (key.second) { @@ -85,7 +85,7 @@ std::shared_ptr<mblas::Matrix> NpzConverter::getFirstOfMany(const std::vector<st } if (mandatory) { - std::cerr << "Error: Matrix not found:" << keys[0].first << std::endl; + std::cerr << "Error: Tensor not found:" << keys[0].first << std::endl; //amunmt_UTIL_THROW2(strm.str()); // << key << std::endl abort(); } diff --git a/src/amun/gpu/npz_converter.h b/src/amun/gpu/npz_converter.h index 204a1f4b..074c66a1 100644 --- 
a/src/amun/gpu/npz_converter.h +++ b/src/amun/gpu/npz_converter.h @@ -1,7 +1,7 @@ #pragma once #include "cnpy/cnpy.h" -#include "mblas/matrix_functions.h" +#include "mblas/tensor_functions.h" namespace amunmt { namespace GPU { @@ -46,8 +46,8 @@ class NpzConverter { void Destruct(); - std::shared_ptr<mblas::Matrix> get(const std::string& key, bool mandatory, bool transpose = false) const; - std::shared_ptr<mblas::Matrix> getFirstOfMany(const std::vector<std::pair<std::string, bool>> keys, bool mandatory) const; + std::shared_ptr<mblas::Tensor> get(const std::string& key, bool mandatory, bool transpose = false) const; + std::shared_ptr<mblas::Tensor> getFirstOfMany(const std::vector<std::pair<std::string, bool>> keys, bool mandatory) const; private: cnpy::npz_t model_; diff --git a/src/marian b/src/marian -Subproject 8b00026868ce0e3cb6107aa4a866c59447aa427 +Subproject 3deb9015a8729f4453f0f06b27385e764cce196 |