github.com/marian-nmt/marian.git
author     Marcin Junczys-Dowmunt <marcinjd@microsoft.com>  2022-02-06 23:00:48 +0300
committer  Marcin Junczys-Dowmunt <marcinjd@microsoft.com>  2022-02-06 23:00:48 +0300
commit     8da539e835e8661d00697c8bd01164e64ab9ce62 (patch)
tree       37c3dc44a73c127e897ee8698c57931642460fa7
parent     266b931daa11bcd0f682d79a05542833e328849b (diff)

merged with master
-rw-r--r--  azure-pipelines.yml                    |  40
-rw-r--r--  src/common/aliases.cpp                 |   4
-rw-r--r--  src/common/config_parser.cpp           |  28
-rw-r--r--  src/common/definitions.h               |  10
-rw-r--r--  src/common/utils.cpp                   |   8
-rw-r--r--  src/data/batch_generator.h             |  35
-rw-r--r--  src/data/corpus.cpp                    | 152
-rw-r--r--  src/data/corpus.h                      |   3
-rw-r--r--  src/data/corpus_base.cpp               |  77
-rw-r--r--  src/data/corpus_base.h                 | 105
-rw-r--r--  src/data/corpus_nbest.cpp              |   7
-rw-r--r--  src/data/corpus_sqlite.cpp             |   6
-rw-r--r--  src/data/sentencepiece_vocab.cpp       |   8
-rw-r--r--  src/data/text_input.cpp                |   6
-rw-r--r--  src/graph/expression_operators.cpp     |   7
-rw-r--r--  src/graph/expression_operators.h       |  15
-rw-r--r--  src/graph/node_operators_binary.h      |  61
-rw-r--r--  src/graph/node_operators_tuple.h       |   2
-rw-r--r--  src/layers/output.cpp                  |  22
-rw-r--r--  src/models/costs.cpp                   |  35
-rw-r--r--  src/models/costs.h                     |  32
-rw-r--r--  src/models/encoder_decoder.cpp         |   2
-rw-r--r--  src/models/model_factory.cpp           |  21
-rw-r--r--  src/models/transformer.h               |  36
-rwxr-xr-x  src/tensors/cpu/tensor_operators.cpp   |  13
-rwxr-xr-x  src/tensors/gpu/element.cu             |  12
-rwxr-xr-x  src/tensors/gpu/prod.cpp               |   6
-rw-r--r--  src/tensors/gpu/tensor_operators.cu    | 210
-rw-r--r--  src/tensors/tensor_operators.h         |  42
-rw-r--r--  src/training/graph_group.cpp           | 135
-rw-r--r--  src/training/graph_group.h             |  18
-rw-r--r--  src/training/graph_group_async.cpp     |   6
-rw-r--r--  src/training/graph_group_singleton.cpp |   8
-rw-r--r--  src/training/graph_group_sync.cpp      |   8
-rw-r--r--  src/translator/beam_search.cpp         |   5
-rw-r--r--  src/translator/nth_element.cpp         |   2
-rw-r--r--  src/translator/translator.h            |   2
37 files changed, 849 insertions, 340 deletions
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 4f7ce02d..bc76f85c 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -6,6 +6,13 @@
# 3. Choose "Existing Azure Pipelines YAML file" and specify path to this file
# 4. "More actions" > "Save"
+parameters:
+# Allow skipping the entire 'Build' stage
+- name: runBuilds
+ displayName: Run builds? Uncheck to run regression tests only.
+ type: boolean
+ default: true
+
# The pipeline CI trigger is set on the branch master only and PR trigger on a
# (non-draft) pull request to any branch
trigger:
@@ -45,6 +52,7 @@ stages:
######################################################################
- job: BuildWindows
+ condition: eq(${{ parameters.runBuilds }}, true)
displayName: Windows
strategy:
@@ -180,6 +188,7 @@ stages:
######################################################################
- job: BuildUbuntu
+ condition: eq(${{ parameters.runBuilds }}, true)
displayName: Ubuntu
timeoutInMinutes: 90
@@ -237,17 +246,7 @@ stages:
examples: true
static: true
################################################################
- # Ubuntu 16.04 supports CUDA 8+
- "16.04 CUDA 9.2 gcc-7":
- image: ubuntu-16.04
- boost: true
- cpu: true
- gpu: true
- cuda: 9.2
- gcc: 7
- unit_tests: true
- examples: true
- static: false
+ # Ubuntu 16.04 is no longer available on Azure-hosted machines
pool:
vmImage: $(image)
@@ -322,18 +321,17 @@ stages:
######################################################################
- job: BuildUbuntuMinimal
- displayName: Ubuntu CPU+GPU gcc-5 cmake 3.5
+ condition: eq(${{ parameters.runBuilds }}, true)
+ displayName: Ubuntu CPU+GPU gcc-7 cmake 3.5
pool:
- vmImage: ubuntu-16.04
+ vmImage: ubuntu-18.04
steps:
- checkout: self
submodules: true
# The script simplifies installation of different versions of CUDA.
- # Ubuntu 16.04 on Azure-hosted VMs have GCC 5.5 as gcc-5, which is not compatible with CUDA 9.
- # Downgrading to GCC 5.4 (the default gcc on Ubuntu 16.04) would be more work...
- bash: ./scripts/ci/install_cuda_ubuntu.sh "10.0"
displayName: Install CUDA
@@ -346,10 +344,10 @@ stages:
# GCC 5 is the minimum version supported
- bash: |
- /usr/bin/gcc-5 --version
+ /usr/bin/gcc-7 --version
mkdir -p build
cd build
- CC=/usr/bin/gcc-5 CXX=/usr/bin/g++-5 CUDAHOSTCXX=/usr/bin/g++-5 \
+ CC=/usr/bin/gcc-7 CXX=/usr/bin/g++-7 CUDAHOSTCXX=/usr/bin/g++-7 \
../cmake-3.5.1-Linux-x86_64/bin/cmake .. \
-DCOMPILE_CPU=on \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.0
@@ -368,10 +366,11 @@ stages:
######################################################################
- job: BuildMacOS
+ condition: eq(${{ parameters.runBuilds }}, true)
displayName: macOS CPU clang
pool:
- vmImage: macos-latest
+ vmImage: macos-10.15
steps:
- checkout: self
@@ -416,6 +415,7 @@ stages:
######################################################################
- job: BuildInstall
+ condition: eq(${{ parameters.runBuilds }}, true)
displayName: Linux CPU library install
pool:
@@ -580,7 +580,7 @@ stages:
# Avoid using $(Build.SourcesDirectory) in bash tasks because on Windows pools it uses '\'
# instead of '/', which often breaks the job
- - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics'
+ - bash: MARIAN=../marian-dev/build TIMEOUT=10m bash ./run_mrt.sh '#cpu' '#basics' '#devops'
continueOnError: true
displayName: Run tests
workingDirectory: marian-prod-tests
@@ -677,7 +677,7 @@ stages:
AWS_SECRET_SAS_TOKEN: $(blob-sas-token)
workingDirectory: marian-prod-tests
- - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics'
+ - bash: MARIAN=../marian-dev/build bash ./run_mrt.sh '#cpu' '#basics' '#devops'
continueOnError: true
displayName: Run tests
workingDirectory: marian-prod-tests
diff --git a/src/common/aliases.cpp b/src/common/aliases.cpp
index 36613327..b38ccc64 100644
--- a/src/common/aliases.cpp
+++ b/src/common/aliases.cpp
@@ -31,8 +31,8 @@ void ConfigParser::addAliases(cli::CLIWrapper& cli) {
cli.alias("fp16", "true", [&](YAML::Node& config) {
if(mode_ == cli::mode::training) {
config["precision"] = std::vector<std::string>({"float16", "float32"}); // inference type, optimization type, save type
- // scaling factor (power of 2), frequency, multiplier at increase, tolerance, range, minium factor
- config["cost-scaling"] = std::vector<std::string>({"0", "1000", "2", "0.05", "10", "1e-5"});
+ // scaling factor, frequency, multiplier at increase, minimum scaling factor
+ config["cost-scaling"] = std::vector<std::string>({"256.f", "1000", "2.f", "256.f"});
} else {
config["precision"] = std::vector<std::string>({"float16"}); // for inference we do not need the other types
}
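
The new scheme starts from a fixed scale instead of searching powers of two, and drops the tolerance/range parameters. A minimal sketch of the schedule that "256.f 1000 2.f 256.f" describes, with illustrative names rather than Marian's actual optimizer code:

#include <algorithm>

// Illustrative dynamic cost-scaling schedule for "256.f 1000 2.f 256.f";
// a sketch under stated assumptions, not Marian's implementation.
struct CostScaler {
  float scale      = 256.f; // initial scaling factor
  int   frequency  = 1000;  // finite updates required before an increase
  float multiplier = 2.f;   // growth/backoff factor
  float minScale   = 256.f; // lower bound on the scaling factor
  int   goodSteps  = 0;     // consecutive updates with finite gradients

  void update(bool gradientFinite) {
    if(gradientFinite) {
      if(++goodSteps >= frequency) { // stable long enough: scale up
        scale *= multiplier;
        goodSteps = 0;
      }
    } else { // overflow: back off, but never below the minimum
      scale = std::max(scale / multiplier, minScale);
      goodSteps = 0;
    }
  }
};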
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 333d87a7..ebbe4a89 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -267,10 +267,16 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) {
"Pool encoder states instead of using cross attention (selects first encoder state, best used with special token)");
cli.add<int>("--transformer-dim-ffn",
"Size of position-wise feed-forward network (transformer)",
- 2048);
+ 2048);
+ cli.add<int>("--transformer-decoder-dim-ffn",
+ "Size of position-wise feed-forward network in decoder (transformer). Uses --transformer-dim-ffn if 0.",
+ 0);
cli.add<int>("--transformer-ffn-depth",
"Depth of filters (transformer)",
2);
+ cli.add<int>("--transformer-decoder-ffn-depth",
+ "Depth of filters in decoder (transformer). Uses --transformer-ffn-depth if 0",
+ 0);
cli.add<std::string>("--transformer-ffn-activation",
"Activation between filters: swish or relu (transformer)",
"swish");
@@ -528,15 +534,15 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
// mixed precision training
cli.add<bool>("--fp16",
"Shortcut for mixed precision training with float16 and cost-scaling, "
- "corresponds to: --precision float16 float32 --cost-scaling 0 1000 2 0.05 10 1e-5f");
+ "corresponds to: --precision float16 float32 --cost-scaling 256.f 1000 2.f 256.f");
cli.add<std::vector<std::string>>("--precision",
"Mixed precision training for forward/backward pass and optimizaton. "
"Defines types for: forward/backward pass, optimization.",
{"float32", "float32"});
cli.add<std::vector<std::string>>("--cost-scaling",
"Dynamic cost scaling for mixed precision training: "
- "power of 2, scaling window, scaling factor, tolerance, range, minimum factor")
- ->implicit_val("0.f 1000 2.f 0.05f 10 1e-5f");
+ "scaling factor, frequency, multiplier, minimum factor")
+ ->implicit_val("256.f 1000 2.f 256.f");
cli.add<size_t>("--gradient-norm-average-window",
"Window size over which the exponential average of the gradient norm is recorded (for logging and scaling). "
"After this many updates about 90% of the mass of the exponential average comes from these updates",
@@ -702,9 +708,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Use softmax shortlist: path first best prune");
cli.add<std::vector<float>>("--weights",
"Scorer weights");
- cli.add<bool>("--output-sampling",
- "Noise output layer with gumbel noise",
- false);
+ cli.add<std::vector<std::string>>("--output-sampling",
+ "Noise output layer with gumbel noise. Implicit default is 'full' for sampling from full distribution. "
+ " Also accepts 'topk num' (e.g. topk 100) for top-100 sampling.")
+ ->implicit_val("full");
cli.add<std::vector<int>>("--output-approx-knn",
"Use approximate knn search in output layer (currently only in transformer)")
->implicit_val("100 1024");
@@ -889,6 +896,10 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
if(mode_ == cli::mode::training) {
cli.add<bool>("--shuffle-in-ram",
"Keep shuffled corpus in RAM, do not write to temp file");
+
+ cli.add<size_t>("--data-threads",
+ "Number of concurrent threads to use during data reading and processing", 1);
+
// @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope.
cli.add<size_t>("--all-caps-every",
"When forming minibatches, preprocess every Nth line on the fly to all-caps. Assumes UTF-8");
@@ -907,6 +918,9 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) {
cli.add<bool>("--mini-batch-round-up",
"Round up batch size to next power of 2 for more efficient training, but this can make batch size less stable. Disable with --mini-batch-round-up=false",
true);
+ } else {
+ cli.add<size_t>("--data-threads",
+ "Number of concurrent threads to use during data reading and processing", 1);
}
// clang-format on
}
diff --git a/src/common/definitions.h b/src/common/definitions.h
index 159791d0..e28ea5dc 100644
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -106,24 +106,24 @@ using Weak = std::weak_ptr<T>;
/** @brief Creates shared_ptr of any type, passes all arguments to any available
* constructor */
template <class T, typename... Args>
-Ptr<T> New(Args&&... args) {
- return Ptr<T>(new T(std::forward<Args>(args)...));
+inline Ptr<T> New(Args&&... args) {
+ return std::make_shared<T>(std::forward<Args>(args)...);
}
template <class T>
-Ptr<T> New(Ptr<T> p) {
+inline Ptr<T> New(Ptr<T> p) {
return Ptr<T>(p);
}
/** @brief Creates InstrusivePtr of any type, passes all arguments to any available
* constructor */
template <class T, typename... Args>
-IPtr<T> INew(Args&&... args) {
+inline IPtr<T> INew(Args&&... args) {
return IPtr<T>(new T(std::forward<Args>(args)...));
}
template <class T>
-IPtr<T> INew(Ptr<T> p) {
+inline IPtr<T> INew(Ptr<T> p) {
return IPtr<T>(p);
}
diff --git a/src/common/utils.cpp b/src/common/utils.cpp
index 72624041..99fc790a 100644
--- a/src/common/utils.cpp
+++ b/src/common/utils.cpp
@@ -70,22 +70,20 @@ void split(const std::string& line,
// the function guarantees that the output has as many elements as requested
void splitTsv(const std::string& line, std::vector<std::string>& fields, size_t numFields) {
fields.clear();
+ fields.resize(numFields); // make sure there are as many elements as requested
size_t begin = 0;
size_t pos = 0;
for(size_t i = 0; i < numFields; ++i) {
pos = line.find('\t', begin);
if(pos == std::string::npos) {
- fields.push_back(line.substr(begin));
+ fields[i] = line.substr(begin);
break;
}
- fields.push_back(line.substr(begin, pos - begin));
+ fields[i] = line.substr(begin, pos - begin);
begin = pos + 1;
}
- if(fields.size() < numFields) // make sure there is as many elements as requested
- fields.resize(numFields);
-
ABORT_IF(pos != std::string::npos, "Excessive field(s) in the tab-separated line: '{}'", line);
}
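
Behavior is unchanged by the refactoring: the output always has exactly numFields entries, missing trailing fields stay empty, and excess fields abort. A small usage sketch with hypothetical inputs:

// Hypothetical splitTsv usage; fields is padded to the requested size.
std::vector<std::string> fields;
utils::splitTsv("a\tb", fields, 3);  // fields == {"a", "b", ""}
// utils::splitTsv("a\tb\tc\td", fields, 3) would hit the ABORT_IF above.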
diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h
index a248db23..ea977468 100644
--- a/src/data/batch_generator.h
+++ b/src/data/batch_generator.h
@@ -2,6 +2,7 @@
#include "common/options.h"
#include "common/signal_handling.h"
+#include "common/timer.h"
#include "data/batch_stats.h"
#include "data/rng_engine.h"
#include "training/training_state.h"
@@ -92,6 +93,8 @@ private:
// this runs on a bg thread; sequencing is handled by caller, but locking is done in here
std::deque<BatchPtr> fetchBatches() {
+ timer::Timer total;
+
typedef typename Sample::value_type Item;
auto itemCmp = [](const Item& sa, const Item& sb) { return sa.size() < sb.size(); }; // sort by element length, not content
@@ -135,19 +138,29 @@ private:
if(current_ != data_->end())
++current_;
}
- size_t sets = 0;
- while(current_ != data_->end() && maxiBatch->size() < maxSize) { // loop over data
+
+ Samples maxiBatchTemp;
+ while(current_ != data_->end() && maxiBatchTemp.size() < maxSize) { // loop over data
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
- maxiBatch->push(*current_);
- sets = current_->size();
+
+ maxiBatchTemp.push_back(*current_);
+
// do not consume more than required for the maxi batch as this would
// delay line-by-line translation by one sentence
- bool last = maxiBatch->size() == maxSize;
+ bool last = maxiBatchTemp.size() == maxSize;
if(!last)
++current_; // this actually reads the next line and pre-processes it
}
- size_t numSentencesRead = maxiBatch->size();
+ size_t numSentencesRead = maxiBatchTemp.size();
+
+ size_t sets = 0;
+ for(auto&& s : maxiBatchTemp) {
+ if(!s.empty()) {
+ sets = s.size();
+ maxiBatch->push(s);
+ }
+ }
// construct the actual batches and place them in the queue
Samples batchVector;
@@ -163,6 +176,7 @@ private:
BatchStats::const_iterator cachedStatsIter;
if (stats_)
cachedStatsIter = stats_->begin();
+
while(!maxiBatch->empty()) { // while there are sentences in the queue
if (saveAndExitRequested()) // stop generating batches
return std::deque<BatchPtr>();
@@ -178,12 +192,7 @@ private:
lengths[i] = batchVector.back()[i].size(); // record max lengths so far
maxBatchSize = stats_->findBatchSize(lengths, cachedStatsIter);
- // this optimization makes no difference indeed
-#if 0 // sanity check: would we find the same entry if searching from the start?
- auto it = stats_->lower_bound(lengths);
- auto maxBatchSize1 = stats_->findBatchSize(lengths, it);
- ABORT_IF(maxBatchSize != maxBatchSize1, "findBatchSize iter caching logic is borked");
-#endif
+
makeBatch = batchVector.size() >= maxBatchSize;
// if last added sentence caused a bump then we likely have bad padding, so rather move it into the next batch
if(batchVector.size() > maxBatchSize) {
@@ -231,6 +240,8 @@ private:
LOG(debug, "[data] fetched {} batches with {} sentences. Per batch: {} sentences, {} labels.",
tempBatches.size(), numSentencesRead,
(double)totalSent / (double)totalDenom, (double)totalLabels / (double)totalDenom);
+ LOG(debug, "[data] fetching batches took {:.2f} seconds, {:.2f} sents/s", total.elapsed(), (double)numSentencesRead / total.elapsed());
+
return tempBatches;
}
diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp
index d8a364b2..643a7de9 100644
--- a/src/data/corpus.cpp
+++ b/src/data/corpus.cpp
@@ -14,18 +14,30 @@ namespace data {
Corpus::Corpus(Ptr<Options> options, bool translate /*= false*/, size_t seed /*= Config:seed*/)
: CorpusBase(options, translate, seed),
- shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
- allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
- titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
+ shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
+ allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
+ titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {
+
+ auto numThreads = options_->get<size_t>("data-threads", 1);
+ if(numThreads > 1)
+ threadPool_.reset(new ThreadPool(numThreads));
+
+}
Corpus::Corpus(std::vector<std::string> paths,
std::vector<Ptr<Vocab>> vocabs,
Ptr<Options> options,
size_t seed /*= Config:seed*/)
: CorpusBase(paths, vocabs, options, seed),
- shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
- allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
- titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {}
+ shuffleInRAM_(options_->get<bool>("shuffle-in-ram", false)),
+ allCapsEvery_(options_->get<size_t>("all-caps-every", 0)),
+ titleCaseEvery_(options_->get<size_t>("english-title-case-every", 0)) {
+
+ auto numThreads = options_->get<size_t>("data-threads", 1);
+ if(numThreads > 1)
+ threadPool_.reset(new ThreadPool(numThreads));
+
+}
void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) {
bool isFactoredVocab = vocabs_.back()->tryAs<FactoredVocab>() != nullptr;
@@ -52,16 +64,10 @@ void Corpus::preprocessLine(std::string& line, size_t streamId, bool& altered) {
}
SentenceTuple Corpus::next() {
- // Used for handling TSV inputs
- // Determine the total number of fields including alignments or weights
- auto tsvNumAllFields = tsvNumInputFields_;
- if(alignFileIdx_ > -1)
- ++tsvNumAllFields;
- if(weightFileIdx_ > -1)
- ++tsvNumAllFields;
- std::vector<std::string> fields(tsvNumAllFields);
-
- for(;;) { // (this is a retry loop for skipping invalid sentences)
+ size_t numStreams = corpusInRAM_.empty() ? files_.size() : corpusInRAM_.size();
+ std::vector<std::string> fields(numStreams);
+
+ while(true) { // retry loop
// get index of the current sentence
size_t curId = pos_; // note: at end, pos_ == total size
// if corpus has been shuffled, ids_ contains sentence indexes
@@ -69,83 +75,91 @@ SentenceTuple Corpus::next() {
curId = ids_[pos_];
pos_++;
- // fill up the sentence tuple with sentences from all input files
- SentenceTuple tup(curId);
size_t eofsHit = 0;
- size_t numStreams = corpusInRAM_.empty() ? files_.size() : corpusInRAM_.size();
- for(size_t i = 0; i < numStreams; ++i) {
- std::string line;
-
+ for(size_t i = 0; i < numStreams; ++i) { // looping over all streams
// fetch line, from cached copy in RAM or actual file
if (!corpusInRAM_.empty()) {
if (curId < corpusInRAM_[i].size())
- line = corpusInRAM_[i][curId];
+ fields[i] = corpusInRAM_[i][curId];
else {
eofsHit++;
continue;
}
}
else {
- bool gotLine = io::getline(*files_[i], line).good();
+ bool gotLine = io::getline(*files_[i], fields[i]).good();
if(!gotLine) {
eofsHit++;
continue;
}
}
+ }
- if(i > 0 && i == alignFileIdx_) {
- addAlignmentToSentenceTuple(line, tup);
- } else if(i > 0 && i == weightFileIdx_) {
- addWeightsToSentenceTuple(line, tup);
- } else {
- if(tsv_) { // split TSV input and add each field into the sentence tuple
- utils::splitTsv(line, fields, tsvNumAllFields);
- size_t shift = 0;
- for(size_t j = 0; j < tsvNumAllFields; ++j) {
- // index j needs to be shifted to get the proper vocab index if guided-alignment or
- // data-weighting are preceding source or target sequences in TSV input
- if(j == alignFileIdx_ || j == weightFileIdx_) {
- ++shift;
- } else {
- size_t vocabId = j - shift;
- bool altered;
- preprocessLine(fields[j], vocabId, /*out=*/altered);
- if (altered)
- tup.markAltered();
- addWordsToSentenceTuple(fields[j], vocabId, tup);
- }
- }
-
- // weights are added last to the sentence tuple, because this runs a validation that needs
- // length of the target sequence
- if(alignFileIdx_ > -1)
- addAlignmentToSentenceTuple(fields[alignFileIdx_], tup);
- if(weightFileIdx_ > -1)
- addWeightsToSentenceTuple(fields[weightFileIdx_], tup);
+ if(eofsHit == numStreams)
+ return SentenceTuple(); // uninitialized SentenceTuple which will be invalid when tested
+ ABORT_IF(eofsHit != 0, "not all input files have the same number of lines");
+
+ auto makeSentenceTuple = [this](size_t curId, std::vector<std::string> fields) {
+ if(tsv_) {
+ // with TSV input data, there is only one input stream, hence we only have one field
+ // which needs to be tokenized into tab-separated fields
+ ABORT_IF(fields.size() != 1, "Reading TSV file, but we don't have exactly one stream??");
+ size_t numAllFields = tsvNumInputFields_;
+ if(alignFileIdx_ > -1)
+ ++numAllFields;
+ if(weightFileIdx_ > -1)
+ ++numAllFields;
+ // replace single-element fields array with extracted tsv fields
+ std::vector<std::string> tmpFields;
+ utils::splitTsv(fields[0], tmpFields, numAllFields); // this verifies the number of fields
+ fields.swap(tmpFields);
+ }
+
+ // fill up the sentence tuple with sentences from all input files
+ SentenceTupleImpl tup(curId);
+ size_t shift = 0;
+ for(size_t i = 0; i < fields.size(); ++i) {
+ // index i needs to be shifted to get the proper vocab index if guided-alignment or
+ // data-weighting precede source or target sequences in TSV input
+ if(i == alignFileIdx_ || i == weightFileIdx_) {
+ ++shift;
} else {
+ size_t vocabId = i - shift;
bool altered;
- preprocessLine(line, i, /*out=*/altered);
+ preprocessLine(fields[i], vocabId, /*out=*/altered);
if (altered)
tup.markAltered();
- addWordsToSentenceTuple(line, i, tup);
+ addWordsToSentenceTuple(fields[i], vocabId, tup);
}
+
+ // weights are added last to the sentence tuple, because this runs a validation that needs
+ // length of the target sequence
+ if(alignFileIdx_ > -1)
+ addAlignmentToSentenceTuple(fields[alignFileIdx_], tup);
+ if(weightFileIdx_ > -1)
+ addWeightsToSentenceTuple(fields[weightFileIdx_], tup);
}
- }
-
- if (eofsHit == numStreams)
- return SentenceTuple(0);
- ABORT_IF(eofsHit != 0, "not all input files have the same number of lines");
- // check if all streams are valid, that is, non-empty and no longer than maximum allowed length
- if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
- return words.size() > 0 && words.size() <= maxLength_;
- }))
- return tup;
+ // check if all streams are valid, that is, non-empty and no longer than maximum allowed length
+ if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
+ return words.size() > 0 && words.size() <= maxLength_;
+ })) {
+ return tup;
+ } else {
+ return SentenceTupleImpl(); // return an empty tuple if above test does not pass
+ }
+ };
+
+ if(threadPool_) { // use thread pool if available
+ return SentenceTuple(threadPool_->enqueue(makeSentenceTuple, curId, fields));
+ } else { // otherwise launch here and just pass the result into the wrapper
+ auto tup = makeSentenceTuple(curId, fields);
+ if(!tup.empty())
+ return SentenceTuple(tup);
+ }
- // otherwise skip this sentence and try the next one
- // @TODO: tail recursion?
- }
+ } // end of retry loop
}
// reset and initialize shuffled reading
@@ -167,6 +181,8 @@ void Corpus::reset() {
pos_ = 0;
for (size_t i = 0; i < paths_.size(); ++i) {
if(paths_[i] == "stdin" || paths_[i] == "-") {
+ std::cin.tie(0);
+ std::ios_base::sync_with_stdio(false);
files_[i].reset(new std::istream(std::cin.rdbuf()));
// Probably not necessary, unless there are some buffers
// that we want flushed.
diff --git a/src/data/corpus.h b/src/data/corpus.h
index e8e9a9fd..281d43a2 100644
--- a/src/data/corpus.h
+++ b/src/data/corpus.h
@@ -4,6 +4,7 @@
#include <iostream>
#include <random>
+#include "3rd_party/threadpool.h"
#include "common/definitions.h"
#include "common/file_stream.h"
#include "common/options.h"
@@ -20,6 +21,8 @@ class Corpus : public CorpusBase {
private:
std::vector<UPtr<io::TemporaryFile>> tempFiles_;
std::vector<size_t> ids_;
+
+ UPtr<ThreadPool> threadPool_; // thread pool for parallelized data reading
// for shuffle-in-ram
bool shuffleInRAM_{false};
diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp
index 9d95a121..20301103 100644
--- a/src/data/corpus_base.cpp
+++ b/src/data/corpus_base.cpp
@@ -12,7 +12,24 @@ typedef std::vector<float> MaskBatch;
typedef std::pair<WordBatch, MaskBatch> WordMask;
typedef std::vector<WordMask> SentBatch;
-CorpusIterator::CorpusIterator() : pos_(-1), tup_(0) {}
+void SentenceTupleImpl::setWeights(const std::vector<float>& weights) {
+ if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine
+ ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights");
+ auto numWeights = weights.size();
+ auto numTrgWords = back().size();
+ // word-level weights may or may not contain a weight for EOS tokens
+ if(numWeights != numTrgWords && numWeights != numTrgWords - 1)
+ LOG(warn,
+ "[warn] "
+ "Number of weights ({}) does not match the number of target words ({}) in line #{}",
+ numWeights,
+ numTrgWords,
+ id_);
+ }
+ weights_ = weights;
+}
+
+CorpusIterator::CorpusIterator() : pos_(-1) {}
CorpusIterator::CorpusIterator(CorpusBase* corpus)
: corpus_(corpus), pos_(0), tup_(corpus_->next()) {}
@@ -23,7 +40,7 @@ void CorpusIterator::increment() {
}
bool CorpusIterator::equal(CorpusIterator const& other) const {
- return this->pos_ == other.pos_ || (this->tup_.empty() && other.tup_.empty());
+ return this->pos_ == other.pos_ || (!this->tup_.valid() && !other.tup_.valid());
}
const SentenceTuple& CorpusIterator::dereference() const {
@@ -390,7 +407,7 @@ CorpusBase::CorpusBase(Ptr<Options> options, bool translate, size_t seed)
void CorpusBase::addWordsToSentenceTuple(const std::string& line,
size_t batchIndex,
- SentenceTuple& tup) const {
+ SentenceTupleImpl& tup) const {
// This turns a string in to a sequence of numerical word ids. Depending
// on the vocabulary type, this can be non-trivial, e.g. when SentencePiece
// is used.
@@ -411,7 +428,7 @@ void CorpusBase::addWordsToSentenceTuple(const std::string& line,
}
void CorpusBase::addAlignmentToSentenceTuple(const std::string& line,
- SentenceTuple& tup) const {
+ SentenceTupleImpl& tup) const {
ABORT_IF(rightLeft_,
"Guided alignment and right-left model cannot be used "
"together at the moment");
@@ -420,7 +437,7 @@ void CorpusBase::addAlignmentToSentenceTuple(const std::string& line,
tup.setAlignment(align);
}
-void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const {
+void CorpusBase::addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const {
auto elements = utils::split(line, " ");
if(!elements.empty()) {
@@ -549,6 +566,7 @@ size_t CorpusBase::getNumberOfTSVInputFields(Ptr<Options> options) {
return 0;
}
+<<<<<<< HEAD
void SentenceTuple::setWeights(const std::vector<float>& weights) {
if(weights.size() != 1) { // this assumes a single sentence-level weight is always fine
ABORT_IF(empty(), "Source and target sequences should be added to a tuple before data weights");
@@ -564,6 +582,55 @@ void SentenceTuple::setWeights(const std::vector<float>& weights) {
id_);
}
weights_ = weights;
+=======
+// experimental: hide inline-fix source tokens from cross attention
+std::vector<float> SubBatch::crossMaskWithInlineFixSourceSuppressed() const
+{
+ const auto& srcVocab = *vocab();
+
+ auto factoredVocab = vocab()->tryAs<FactoredVocab>();
+ size_t inlineFixGroupIndex = 0, inlineFixSrc = 0;
+ auto hasInlineFixFactors = factoredVocab && factoredVocab->tryGetFactor(FactoredVocab_INLINE_FIX_WHAT_serialized, /*out*/ inlineFixGroupIndex, /*out*/ inlineFixSrc);
+
+ auto fixSrcId = srcVocab[FactoredVocab_FIX_SRC_ID_TAG];
+ auto fixTgtId = srcVocab[FactoredVocab_FIX_TGT_ID_TAG];
+ auto fixEndId = srcVocab[FactoredVocab_FIX_END_ID_TAG];
+ auto unkId = srcVocab.getUnkId();
+ auto hasInlineFixTags = fixSrcId != unkId && fixTgtId != unkId && fixEndId != unkId;
+
+ auto m = mask(); // default return value, which we will modify in-place below in case we need to
+ if (hasInlineFixFactors || hasInlineFixTags) {
+ LOG_ONCE(info, "[data] Suppressing cross-attention into inline-fix source tokens");
+
+ // example: force French translation of name "frank" to always be "franck"
+ // - hasInlineFixFactors: "frank|is franck|it", "frank|is" cannot be cross-attended to
+ // - hasInlineFixTags: "<IOPEN> frank <IDELIM> franck <ICLOSE>", "frank" and all tags cannot be cross-attended to
+ auto dimBatch = batchSize(); // number of sentences in the batch
+ auto dimWidth = batchWidth(); // number of words in the longest sentence in the batch
+ const auto& d = data();
+ size_t numWords = 0;
+ for (size_t b = 0; b < dimBatch; b++) { // loop over batch entries
+ bool inside = false;
+ for (size_t s = 0; s < dimWidth; s++) { // loop over source positions
+ auto i = locate(/*batchIdx=*/b, /*wordPos=*/s);
+ if (!m[i])
+ break;
+ numWords++;
+ // keep track of entering/exiting the inline-fix source tags
+ auto w = d[i];
+ if (w == fixSrcId)
+ inside = true;
+ else if (w == fixTgtId)
+ inside = false;
+ bool wHasSrcIdFactor = hasInlineFixFactors && factoredVocab->getFactor(w, inlineFixGroupIndex) == inlineFixSrc;
+ if (inside || w == fixSrcId || w == fixTgtId || w == fixEndId || wHasSrcIdFactor)
+ m[i] = 0.0f; // decoder must not look at embedded source, nor the markup tokens
+ }
+ }
+ ABORT_IF(batchWords() != 0/*n/a*/ && numWords != batchWords(), "batchWords() inconsistency??");
+ }
+ return m;
+>>>>>>> master
}
} // namespace data
diff --git a/src/data/corpus_base.h b/src/data/corpus_base.h
index d504a7ea..a54c20f8 100644
--- a/src/data/corpus_base.h
+++ b/src/data/corpus_base.h
@@ -11,6 +11,8 @@
#include "data/rng_engine.h"
#include "data/vocab.h"
+#include <future>
+
namespace marian {
namespace data {
@@ -22,7 +24,7 @@ namespace data {
* construction of marian::data::CorpusBatch objects. They are not a part of
* marian::data::CorpusBatch.
*/
-class SentenceTuple {
+class SentenceTupleImpl {
private:
size_t id_;
std::vector<Words> tuple_; // [stream index][step index]
@@ -34,11 +36,16 @@ public:
typedef Words value_type;
/**
+ * @brief Creates an empty tuple with 0 id (default constructor).
+ */
+ SentenceTupleImpl() : id_(0) {}
+
+ /**
* @brief Creates an empty tuple with the given Id.
*/
- SentenceTuple(size_t id) : id_(id) {}
+ SentenceTupleImpl(size_t id) : id_(id) {}
- ~SentenceTuple() { tuple_.clear(); }
+ ~SentenceTupleImpl() {}
/**
* @brief Returns the sentence's ID.
@@ -114,6 +121,92 @@ public:
void setAlignment(const WordAlignment& alignment) { alignment_ = alignment; }
};
+class SentenceTuple {
+private:
+ std::shared_ptr<std::future<SentenceTupleImpl>> fImpl_;
+ mutable std::shared_ptr<SentenceTupleImpl> impl_;
+
+public:
+ typedef Words value_type;
+
+ /**
+ * @brief Creates an empty tuple with no associated future.
+ */
+ SentenceTuple() {}
+
+ SentenceTuple(const SentenceTupleImpl& tupImpl)
+ : impl_(std::make_shared<SentenceTupleImpl>(tupImpl)) {}
+
+ SentenceTuple(std::future<SentenceTupleImpl>&& fImpl)
+ : fImpl_(new std::future<SentenceTupleImpl>(std::move(fImpl))) {}
+
+ SentenceTupleImpl& get() const {
+ if(!impl_) {
+ ABORT_IF(!fImpl_ || !fImpl_->valid(), "No future tuple associated with SentenceTuple");
+ impl_ = std::make_shared<SentenceTupleImpl>(fImpl_->get());
+ }
+ return *impl_;
+ }
+
+ /**
+ * @brief Returns the sentence's ID.
+ */
+ size_t getId() const { return get().getId(); }
+
+ /**
+ * @brief Returns whether this Tuple was altered or augmented from what
+ * was provided to Marian in input.
+ */
+ bool isAltered() const { return get().isAltered(); }
+
+ /**
+ * @brief The size of the tuple, e.g. two for parallel data with a source and
+ * target sentences.
+ */
+ size_t size() const { return get().size(); }
+
+ /**
+ * @brief confirms that the tuple has been populated with data
+ */
+ bool valid() const {
+ return fImpl_ || impl_;
+ }
+
+ /**
+ * @brief The i-th tuple sentence.
+ *
+ * @param i Tuple's index.
+ */
+ Words& operator[](size_t i) { return get()[i]; }
+ const Words& operator[](size_t i) const { return get()[i]; }
+
+ /**
+ * @brief The last tuple sentence, i.e. the target sentence.
+ */
+ Words& back() { return get().back(); }
+ const Words& back() const { return get().back(); }
+
+ /**
+ * @brief Checks whether the tuple is empty.
+ */
+ bool empty() const { return get().empty(); }
+
+ auto begin() const -> decltype(get().begin()) { return get().begin(); }
+ auto end() const -> decltype(get().end()) { return get().end(); }
+
+ auto rbegin() const -> decltype(get().rbegin()) { return get().rbegin(); }
+ auto rend() const -> decltype(get().rend()) { return get().rend(); }
+
+ /**
+ * @brief Get sentence weights.
+ *
+ * For sentence-level weights the vector contains only one element.
+ */
+ const std::vector<float>& getWeights() const { return get().getWeights(); }
+
+ const WordAlignment& getAlignment() const { return get().getAlignment(); }
+};
+
/**
* @brief Batch of sentences represented as word indices with masking.
*/
@@ -583,17 +676,17 @@ protected:
* @brief Helper function converting a line of text into words using the i-th
* vocabulary and adding them to the sentence tuple.
*/
- void addWordsToSentenceTuple(const std::string& line, size_t batchIndex, SentenceTuple& tup) const;
+ void addWordsToSentenceTuple(const std::string& line, size_t batchIndex, SentenceTupleImpl& tup) const;
/**
* @brief Helper function parsing a line with word alignments and adding them
* to the sentence tuple.
*/
- void addAlignmentToSentenceTuple(const std::string& line, SentenceTuple& tup) const;
+ void addAlignmentToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const;
/**
* @brief Helper function parsing a line of weights and adding them to the
* sentence tuple.
*/
- void addWeightsToSentenceTuple(const std::string& line, SentenceTuple& tup) const;
+ void addWeightsToSentenceTuple(const std::string& line, SentenceTupleImpl& tup) const;
void addAlignmentsToBatch(Ptr<CorpusBatch> batch, const std::vector<Sample>& batchVector);
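
The future-backed SentenceTuple makes tokenization results lazy: get() blocks on first access and caches the materialized SentenceTupleImpl, so consumers like CorpusIterator stay oblivious to the --data-threads thread pool. A self-contained sketch of the same pattern, using std::async in place of Marian's ThreadPool (names are illustrative stand-ins):

#include <future>
#include <memory>
#include <string>

struct Impl { std::string data; };  // stands in for SentenceTupleImpl

class LazyHandle {                  // stands in for SentenceTuple
  std::shared_ptr<std::future<Impl>> fut_;
  mutable std::shared_ptr<Impl> impl_;
public:
  explicit LazyHandle(std::future<Impl>&& f)
    : fut_(std::make_shared<std::future<Impl>>(std::move(f))) {}

  Impl& get() const {               // block once, then serve the cached copy
    if(!impl_)
      impl_ = std::make_shared<Impl>(fut_->get());
    return *impl_;
  }
};

int main() {
  LazyHandle h(std::async(std::launch::async, [] { return Impl{"tokenized"}; }));
  return h.get().data.empty() ? 1 : 0; // first access waits for the worker
}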
diff --git a/src/data/corpus_nbest.cpp b/src/data/corpus_nbest.cpp
index d5a48d8d..8029d351 100644
--- a/src/data/corpus_nbest.cpp
+++ b/src/data/corpus_nbest.cpp
@@ -43,7 +43,7 @@ SentenceTuple CorpusNBest::next() {
pos_++;
// fill up the sentence tuple with sentences from all input files
- SentenceTuple tup(curId);
+ SentenceTupleImpl tup(curId);
std::string line;
lastLines_.resize(files_.size() - 1);
@@ -74,9 +74,10 @@ SentenceTuple CorpusNBest::next() {
if(cont && std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
return words.size() > 0 && words.size() <= maxLength_;
}))
- return tup;
+ return SentenceTuple(tup);
}
- return SentenceTuple(0);
+
+ return SentenceTuple();
}
void CorpusNBest::reset() {
diff --git a/src/data/corpus_sqlite.cpp b/src/data/corpus_sqlite.cpp
index 297847c0..f7c577f2 100644
--- a/src/data/corpus_sqlite.cpp
+++ b/src/data/corpus_sqlite.cpp
@@ -109,7 +109,7 @@ SentenceTuple CorpusSQLite::next() {
while(select_->executeStep()) {
// fill up the sentence tuple with sentences from all input files
size_t curId = select_->getColumn(0).getInt();
- SentenceTuple tup(curId);
+ SentenceTupleImpl tup(curId);
for(size_t i = 0; i < files_.size(); ++i) {
auto line = select_->getColumn((int)(i + 1));
@@ -126,9 +126,9 @@ SentenceTuple CorpusSQLite::next() {
if(std::all_of(tup.begin(), tup.end(), [=](const Words& words) {
return words.size() > 0 && words.size() <= maxLength_;
}))
- return tup;
+ return SentenceTuple(tup);
}
- return SentenceTuple(0);
+ return SentenceTuple();
}
void CorpusSQLite::shuffle() {
diff --git a/src/data/sentencepiece_vocab.cpp b/src/data/sentencepiece_vocab.cpp
index 090d478b..8f774c2b 100644
--- a/src/data/sentencepiece_vocab.cpp
+++ b/src/data/sentencepiece_vocab.cpp
@@ -236,18 +236,20 @@ public:
return words;
}
- std::string decode(const Words& sentence, bool /*ignoreEOS*/) const override {
+ std::string decode(const Words& sentence, bool ignoreEOS) const override {
std::string line;
if(keepEncoded_) { // i.e. keep the sentence segmented into subword units
for(const Word& id : sentence)
- line += (*this)[id] + " ";
+ if(!ignoreEOS || id != getEosId())
+ line += (*this)[id] + " ";
line.pop_back(); // trim the trailing whitespace
} else {
// convert vector of Word to vector of int
std::vector<int> spmSentence;
spmSentence.reserve(sentence.size());
for(auto&& word : sentence)
- spmSentence.push_back(word.toWordIndex());
+ if(!ignoreEOS || word != getEosId())
+ spmSentence.push_back(word.toWordIndex());
spm_->Decode(spmSentence, &line);
}
return line;
diff --git a/src/data/text_input.cpp b/src/data/text_input.cpp
index 958190fc..b1f4cdd4 100644
--- a/src/data/text_input.cpp
+++ b/src/data/text_input.cpp
@@ -40,7 +40,7 @@ SentenceTuple TextInput::next() {
size_t curId = pos_++;
// fill up the sentence tuple with source and/or target sentences
- SentenceTuple tup(curId);
+ SentenceTupleImpl tup(curId);
for(size_t i = 0; i < files_.size(); ++i) {
std::string line;
if(io::getline(*files_[i], line)) {
@@ -57,9 +57,9 @@ SentenceTuple TextInput::next() {
}
if(tup.size() == files_.size()) // check if each input file provided an example
- return tup;
+ return SentenceTuple(tup);
else if(tup.size() == 0) // if no file provided examples we are done
- return SentenceTuple(0);
+ return SentenceTuple();
else // neither all nor none => we have at least one missing entry
ABORT("There are missing entries in the text tuples.");
}
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index 322a29ad..5294fca3 100644
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -357,6 +357,13 @@ Expr gather(Expr a, int axis, Expr indices) {
return Expression<GatherNodeOp>(a, axis, indices);
}
+// scatter() -- scatter arbitrary elements along an axis; batched or non-batched
+// This is the reverse operation to gather.
+Expr scatter(Expr a, int axis, Expr indices, Expr source) {
+ return Expression<ScatterNodeOp>(a, axis, indices, source);
+}
+
+
// index_select() -- gather arbitrary elements along an axis from an unbatched
// input 'a'. Indices are specified as a 1D vector.
// This is used e.g. for embedding lookup.
diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h
index dc756c7d..1e98047f 100644
--- a/src/graph/expression_operators.h
+++ b/src/graph/expression_operators.h
@@ -687,10 +687,23 @@ Expr stopGradient(Expr a);
* @param indices The indices to be gathered
* @returns Gathered expression with the same shape as @p indices
* @note @p a and @p indices must have the same rank
- * @note The non-target axes of @p a and @p indicies must have the same size, or be broadcastable.
+ * @note The non-target axes of @p a and @p indices must have the same size, or be broadcastable.
*/
Expr gather(Expr a, int axis, Expr indices);
+/**
+ * Scatter elements from source along an axis into a. Unindexed elements from a remain unchanged.
+ * This is the reverse operation to gather.
+ * @param a The input expression
+ * @param axis The axis along which to index
+ * @param indices The indices to be scattered
+ * @param source Expression with values to scatter.
+ * @returns Scattered expression with the same shape as @p a now containing values from @p source in positions @p indices
+ * @note @p source and @p indices must have the same rank
+ * @note In this version @p source and @p indices must have the same shape
+ */
+Expr scatter(Expr a, int axis, Expr indices, Expr source);
+
#if 0
// reverse operation to gather. a is the expression into which values from b are inserted at positions indices along axis.
// with broadcasting
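
For the last axis, scatter is the inverse of gather: a copy of a is made, the indexed positions are overwritten with source, and everything else is left alone. A schematic 2-D sketch of the semantics (illustrative only, not the actual kernel):

#include <vector>

using Matrix  = std::vector<std::vector<float>>;
using Indices = std::vector<std::vector<int>>;

// scatter(a, -1, idx, src): copy a, then overwrite the indexed positions.
Matrix scatterLastAxis(const Matrix& a, const Indices& idx, const Matrix& src) {
  Matrix out = a;                     // unindexed elements remain unchanged
  for(size_t r = 0; r < idx.size(); ++r)
    for(size_t j = 0; j < idx[r].size(); ++j)
      out[r][idx[r][j]] = src[r][j];  // overwrite; gather's backward adds instead
  return out;
}

// For comparison, gather(a, -1, idx) computes out[r][j] = a[r][idx[r][j]].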
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index a180bb5c..b2a646b1 100644
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -1033,12 +1033,14 @@ struct GatherNodeOp : public NaryNodeOp {
NodeOps forwardOps() override {
return {NodeOp(
+ // @TODO: rename to gather
Select(val_, child(0)->val(), child(1)->val(), axis_))};
}
NodeOps backwardOps() override {
return {NodeOp(
- Insert(child(0)->grad(), adj_, child(1)->val(), axis_))};
+ // @TODO: rename to scatter
+ Insert</*add=*/true>(child(0)->grad(), adj_, child(1)->val(), axis_))};
}
Shape newShape(Expr a, int axis, Expr indices) {
@@ -1046,7 +1048,6 @@ struct GatherNodeOp : public NaryNodeOp {
axis = shape.axis(axis);
auto rank = shape.size();
ABORT_IF(rank != indices->shape().size(), "Mismatching ranks for input ({}) and indices ({})", std::string(shape), std::string(indices->shape()));
- axis = a->shape().axis(axis);
shape.set(axis, indices->shape()[axis]);
for (size_t i = 0; i < rank; ++i) {
if (i != axis) {
@@ -1086,6 +1087,62 @@ private:
int axis_;
};
+struct ScatterNodeOp : public NaryNodeOp {
+ ScatterNodeOp(Expr a, int axis, Expr indices, Expr source)
+ : NaryNodeOp({a, indices, source}, newShape(a, axis, indices, source), a->value_type()),
+ axis_(a->shape().axis(axis)) {
+ matchOrAbort<IndexType>(indices->value_type());
+ }
+
+ NodeOps forwardOps() override {
+ return {NodeOp(
+ CopyCast(val_, child(0)->val()); // @TODO: use normal copy
+ Insert</*add=*/false>(val_, child(2)->val(), child(1)->val(), axis_)
+ )};
+ }
+
+ NodeOps backwardOps() override {
+ ABORT("backward for ScatterNodeOp not yet implemented");
+ }
+
+ Shape newShape(Expr a, int axis, Expr indices, Expr source) {
+ ABORT_IF(axis != -1, "only the last dimension is supported");
+ ABORT_IF(indices->shape() != source->shape(), "Shapes must match");
+
+ Shape shape = a->shape();
+ // @TODO: do proper checking
+ return shape;
+ }
+
+ const std::string type() override { return "scatter"; }
+
+ const std::string color() override { return "orange"; }
+
+ virtual size_t hash() override {
+ if(!hash_) {
+ size_t seed = NaryNodeOp::hash();
+ util::hash_combine(seed, axis_);
+ hash_ = seed;
+ }
+ return hash_;
+ }
+
+ virtual bool equal(Expr node) override {
+ if(!NaryNodeOp::equal(node))
+ return false;
+ auto cnode = std::dynamic_pointer_cast<ScatterNodeOp>(node);
+ if(!cnode)
+ return false;
+ if(axis_ != cnode->axis_)
+ return false;
+ return true;
+ }
+
+private:
+ friend class SerializationHelpers;
+ int axis_;
+};
+
struct ColsNodeOp : public NaryNodeOp {
ColsNodeOp(Expr a, Expr indices)
: NaryNodeOp({a, indices}, newShape(a, indices), a->value_type()) {
diff --git a/src/graph/node_operators_tuple.h b/src/graph/node_operators_tuple.h
index c7a9531a..8acb1bc8 100644
--- a/src/graph/node_operators_tuple.h
+++ b/src/graph/node_operators_tuple.h
@@ -133,7 +133,7 @@ public:
}
void backward() override {
- Insert(/*out*/child(0)->grad(), adj_, val_, axis_);
+ Insert</*add=*/true>(/*out*/child(0)->grad(), adj_, val_, axis_);
}
const std::string type() override { return "topk"; }
diff --git a/src/layers/output.cpp b/src/layers/output.cpp
index 4d6e488a..efff58df 100644
--- a/src/layers/output.cpp
+++ b/src/layers/output.cpp
@@ -309,14 +309,24 @@ Logits Output::applyAsLogits(Expr input) /*override final*/ {
}
return Logits(std::move(allLogits), factoredVocab_);
} else if(shortlist_) {
- return Logits(affineOrDot(input,
- shortlist_->getCachedShortWt(),
- shortlist_->getCachedShortb(),
+ const Shape &inputShape = input->shape();
+ assert(inputShape[1] == 1); // time dimension always 1 for decoding
+ input = reshape(input, {inputShape[0], inputShape[2], 1, inputShape[3]});
+
+ Expr Wt = shortlist_->getCachedShortWt();
+ Expr b = shortlist_->getCachedShortb();
+ Expr ret = affineShortlist(input,
+ Wt,
+ b,
false,
- /*transB=*/isLegacyUntransposedW ? false : true));
+ /*transB=*/isLegacyUntransposedW ? false : true);
+ const Shape &retShape = ret->shape();
+ assert(retShape[2] == 1); // time dimension always 1 for decoding
+ ret = reshape(ret, {retShape[0], 1, retShape[1], retShape[3]});
+ return Logits(ret);
} else {
- return Logits(
- affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true));
+ Expr ret = affineOrDot(input, Wt_, b_, false, /*transB=*/isLegacyUntransposedW ? false : true);
+ return Logits(ret);
}
}
diff --git a/src/models/costs.cpp b/src/models/costs.cpp
index c688b211..4b15bcb3 100644
--- a/src/models/costs.cpp
+++ b/src/models/costs.cpp
@@ -10,5 +10,40 @@ Ptr<DecoderState> LogSoftmaxStep::apply(Ptr<DecoderState> state) {
return state;
}
+Ptr<DecoderState> GumbelSoftmaxStep::apply(Ptr<DecoderState> state) {
+ state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
+ [](Expr logits) { // lemma gets gumbelled
+ return logsoftmax(logits + constant_like(logits, inits::gumbel()));
+ },
+ logsoftmax)); // factors don't
+ return state;
+}
+
+TopkGumbelSoftmaxStep::TopkGumbelSoftmaxStep(int k) : k_{k} {}
+
+Ptr<DecoderState> TopkGumbelSoftmaxStep::apply(Ptr<DecoderState> state) {
+ state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
+ [=](Expr logits) { // lemma gets gumbelled
+ // create logits-sized tensor consisting only of invalid path scores
+ float invalidPathScore = NumericLimits<float>(logits->value_type()).lowest;
+ Expr invalidLogits = constant_like(logits, inits::fromValue(invalidPathScore));
+
+ // select top-k values
+ Expr val, idx;
+ std::tie(val, idx) = topk(logits, k_, /*axis=*/-1, /*descending=*/true);
+
+ // uncomment below to display probability mass in top-k selection
+ // debug(sum(gather(softmax(logits), -1, idx), -1), "sum");
+
+ // Add Gumbel noise to top-k values only and compute logsoftmax, used for argmax sampling later in beam-search
+ Expr gumbelVal = logsoftmax(val + constant_like(val, inits::gumbel()));
+
+ // Scatter gumbelled values back into logits to fill with usable values
+ return scatter(invalidLogits, -1, idx, gumbelVal);
+ },
+ logsoftmax)); // factors don't
+ return state;
+}
+
} // namespace models
} // namespace marian
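
Both steps rely on the Gumbel-max trick: with independent noise g_i = -log(-log(u_i)), u_i ~ Uniform(0,1), the index argmax_i(l_i + g_i) is an exact sample from softmax(l). TopkGumbelSoftmaxStep applies the noise only to the k largest logits and leaves the rest at invalidPathScore, which turns the subsequent beam-1 argmax into top-k sampling. A toy illustration of the trick itself (a sketch, not Marian code):

#include <cmath>
#include <random>
#include <vector>

// Sample an index from softmax(logits) via the Gumbel-max trick.
int gumbelMaxSample(const std::vector<float>& logits, std::mt19937& rng) {
  std::uniform_real_distribution<float> uni(1e-20f, 1.f);
  int best = 0;
  float bestVal = -1e30f;
  for(int i = 0; i < (int)logits.size(); ++i) {
    float g = -std::log(-std::log(uni(rng))); // Gumbel(0,1) noise
    if(logits[i] + g > bestVal) {
      bestVal = logits[i] + g;
      best = i;
    }
  }
  return best;
}

At decode time this is selected via --output-sampling (full distribution) or --output-sampling topk k, as wired up in model_factory.cpp below.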
diff --git a/src/models/costs.h b/src/models/costs.h
index 982a13c5..9bb2b103 100644
--- a/src/models/costs.h
+++ b/src/models/costs.h
@@ -297,20 +297,30 @@ public:
virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override;
};
-// Gumbel-max noising for sampling during beam-search
-// Seems to work well enough with beam-size=1. Turn on
-// with --output-sampling during translation with marian-decoder
+// Gumbel-max noising for sampling during translation.
+// Produces accurate sampling with beam=1. Turn on
+// with --output-sampling [full] during translation
+// with marian-decoder for sampling from the full
+// softmax distribution.
class GumbelSoftmaxStep : public ILogProbStep {
public:
virtual ~GumbelSoftmaxStep() {}
- virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override {
- state->setLogProbs(state->getLogProbs().applyUnaryFunctions(
- [](Expr logits) { // lemma gets gumbelled
- return logsoftmax(logits + constant_like(logits, inits::gumbel()));
- },
- logsoftmax)); // factors don't
- return state;
- }
+ virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override;
+};
+
+
+// Gumbel-max noising for top-k sampling during translation.
+// Produces accurate sampling with beam=1. Turn on
+// with --output-sampling topk [10] during translation
+// with marian-decoder for top-10 sampling.
+class TopkGumbelSoftmaxStep : public ILogProbStep {
+private:
+ int k_{1};
+
+public:
+ TopkGumbelSoftmaxStep(int k);
+ virtual ~TopkGumbelSoftmaxStep() {}
+ virtual Ptr<DecoderState> apply(Ptr<DecoderState> state) override;
};
// class to wrap an IEncoderDecoder and a ILogProbStep that are executed in sequence,
diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp
index 5711ea1b..a6f4dd3d 100644
--- a/src/models/encoder_decoder.cpp
+++ b/src/models/encoder_decoder.cpp
@@ -38,7 +38,9 @@ EncoderDecoder::EncoderDecoder(Ptr<ExpressionGraph> graph, Ptr<Options> options)
modelFeatures_.insert("transformer-heads");
modelFeatures_.insert("transformer-no-projection");
modelFeatures_.insert("transformer-dim-ffn");
+ modelFeatures_.insert("transformer-decoder-dim-ffn");
modelFeatures_.insert("transformer-ffn-depth");
+ modelFeatures_.insert("transformer-decoder-ffn-depth");
modelFeatures_.insert("transformer-ffn-activation");
modelFeatures_.insert("transformer-dim-aan");
modelFeatures_.insert("transformer-aan-depth");
diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp
index e176e6a4..52a87e72 100644
--- a/src/models/model_factory.cpp
+++ b/src/models/model_factory.cpp
@@ -370,10 +370,25 @@ Ptr<IModel> createModelFromOptions(Ptr<Options> options, usage use) {
// add (log)softmax if requested
if (use == usage::translation) {
if(std::dynamic_pointer_cast<EncoderDecoder>(baseModel)) {
- if(options->get<bool>("output-sampling", false))
- return New<Stepwise>(std::dynamic_pointer_cast<EncoderDecoder>(baseModel), New<GumbelSoftmaxStep>());
- else
+ if(options->hasAndNotEmpty("output-sampling")) {
+ auto sampling = options->get<std::vector<std::string>>("output-sampling", {});
+ std::string method = sampling.size() > 0 ? sampling[0] : "full";
+
+ if(method == "full" || method == "1" /*for backwards-compat when output-sampling: true in yaml file*/) {
+ LOG(info, "Output sampling from the full softmax distribution");
+ return New<Stepwise>(std::dynamic_pointer_cast<EncoderDecoder>(baseModel), New<GumbelSoftmaxStep>());
+ } else if(method == "topk") {
+ int k = sampling.size() > 1 ? std::stoi(sampling[1]) : 10;
+ if(k == 1)
+ LOG(info, "Output sampling with k=1 is equivalent to beam search with beam size 1");
+ LOG(info, "Output sampling via top-{} sampling", k);
+ return New<Stepwise>(std::dynamic_pointer_cast<EncoderDecoder>(baseModel), New<TopkGumbelSoftmaxStep>(k));
+ } else {
+ ABORT("Unknown sampling method: {}", method);
+ }
+ } else {
return New<Stepwise>(std::dynamic_pointer_cast<EncoderDecoder>(baseModel), New<LogSoftmaxStep>());
+ }
}
#ifdef COMPILE_EXAMPLES
// note: 'usage::translation' here means 'inference'
diff --git a/src/models/transformer.h b/src/models/transformer.h
index ec68b801..95a55d3a 100644
--- a/src/models/transformer.h
+++ b/src/models/transformer.h
@@ -148,8 +148,7 @@ public:
int dimDepth = dimModel / dimHeads;
- auto output
- = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth});
+ auto output = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth});
return transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, dimHeads, dimSteps, dimDepth]
}
@@ -364,9 +363,9 @@ public:
Expr LayerAttention(std::string prefix,
Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
- const Expr& keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
- const Expr& values, // ...?
- const Expr& mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
+ Expr keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+ Expr values, // ...?
+ Expr mask, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
int dimHeads,
bool cache = false,
bool saveAttentionWeights = false) {
@@ -376,6 +375,12 @@ public:
auto opsPre = opt<std::string>("transformer-preprocess");
auto output = preProcess(prefix + "_Wo", opsPre, input, dropProb);
+ // fixes missing norm for keys and values in self-attention with pre-norm
+ if(input == keys)
+ keys = output;
+ if(input == values)
+ values = output;
+
// multi-head self-attention over previous input
output = MultiHead(prefix, dimModel, dimHeads, output, keys, values, mask, cache, saveAttentionWeights);
@@ -403,7 +408,7 @@ public:
opt<int>("transformer-heads"), /*cache=*/false);
}
- Expr LayerFFN(std::string prefix, Expr input) const {
+ Expr LayerFFN(std::string prefix, Expr input, bool isDecoder=false) const {
int dimModel = input->shape()[-1];
float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
@@ -411,13 +416,22 @@ public:
auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb);
auto actName = opt<std::string>("transformer-ffn-activation");
+
int dimFfn = opt<int>("transformer-dim-ffn");
int depthFfn = opt<int>("transformer-ffn-depth");
- float ffnDropProb
- = inference_ ? 0 : opt<float>("transformer-dropout-ffn");
-
+ if(isDecoder) {
+ int decDimFfn = opt<int>("transformer-decoder-dim-ffn", 0);
+ if(decDimFfn != 0)
+ dimFfn = decDimFfn;
+
+ int decDepthFfn = opt<int>("transformer-decoder-ffn-depth", 0);
+ if(decDepthFfn != 0)
+ depthFfn = decDepthFfn;
+ }
+
ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn);
-
+
+ float ffnDropProb = inference_ ? 0 : opt<float>("transformer-dropout-ffn");
auto initFn = inits::glorotUniform(true, true, depthScaling_ ? 1.f / sqrtf((float)depth_) : 1.f);
// the stack of FF layers
@@ -866,7 +880,7 @@ public:
// remember decoder state
decoderStates.push_back(decoderState);
- query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+ query = LayerFFN(prefix_ + "_l" + layerNo + "_ffn", query, /*isDecoder=*/true); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
checkpoint(query);
}
diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp
index 1afb8f64..1e1adc38 100755
--- a/src/tensors/cpu/tensor_operators.cpp
+++ b/src/tensors/cpu/tensor_operators.cpp
@@ -24,6 +24,10 @@ void IsNaN(const Tensor /*in*/, Ptr<Allocator> /*allocator*/, bool& /*isNaN*/, b
ABORT("Not implemented");
}
+bool SanitizeGradient(marian::Tensor /*in*/, Ptr<Allocator> /*allocator*/, bool /*pruneNaN*/, bool /*clipInf*/) {
+ ABORT("Not implemented");
+}
+
template <bool add, typename To, typename From>
void CopyCastTo(To* out, const From* in, int length) {
for(int i = 0; i < length; ++i)
@@ -735,6 +739,7 @@ void Select(Tensor out,
}
}
+template <bool add>
void Insert(Tensor out,
const Tensor in,
const Tensor indices,
@@ -756,10 +761,16 @@ void Insert(Tensor out,
int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
dims[axisCPU] = (int)indices->data<IndexType>()[idxIndex];
int outIndex = outShape.index(dims);
- out->data()[outIndex] += in->data()[index];
+ if(add)
+ out->data()[outIndex] += in->data()[index];
+ else
+ out->data()[outIndex] = in->data()[index];
}
}
+template void Insert<true>(Tensor out, const Tensor in, const Tensor indices, int axis);
+template void Insert<false>(Tensor out, const Tensor in, const Tensor indices, int axis);
+
void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
int rows = out_->shape().elements() / out_->shape().back();
int cols = out_->shape().back();
diff --git a/src/tensors/gpu/element.cu b/src/tensors/gpu/element.cu
index 6790efd4..e9cbe081 100755
--- a/src/tensors/gpu/element.cu
+++ b/src/tensors/gpu/element.cu
@@ -29,7 +29,9 @@ __global__ void gElement(
indices[i] = tensors[i].shape().bindex(dims);
}
- tensors[0].data()[index] = functional::apply(functor, tensors, indices);
+ // This performs the internal application of the functor in float32 regardless of the input type.
+ // There seems to be no speed penalty, while precision improves.
+ tensors[0].data()[index] = (T)functional::applyWithCast<float>(functor, tensors, indices);
}
}
}
@@ -65,13 +67,7 @@ void Element(Functor functor, Tensor out, Tensors... tensors) {
ElementTyped<float>(functor, out, tensors...);
} else if(out->type() == Type::float16) {
#if COMPILE_FP16
- std::vector<marian::Tensor> ts({out, tensors...});
- bool div2 = std::all_of(ts.cbegin(), ts.cend(), [](marian::Tensor t){ return t->shape()[-1] % 2 == 0; });
- if(div2) {
- ElementTyped<halfx2>(functor, out, tensors...);
- } else {
- ElementTyped<half>(functor, out, tensors...);
- }
+ ElementTyped<half>(functor, out, tensors...);
#else
ABORT("FP16 not supported with chosen current hardware or CUDA version");
#endif
diff --git a/src/tensors/gpu/prod.cpp b/src/tensors/gpu/prod.cpp
index bf0d2395..c72af4db 100755
--- a/src/tensors/gpu/prod.cpp
+++ b/src/tensors/gpu/prod.cpp
@@ -562,7 +562,11 @@ void ProdBatchedLegacy(marian::Tensor C,
ProdBatchedTypedLegacy<float, float>(C, allocator, A, B, transA, transB, beta, scalar);
#if COMPILE_FP16
} else if(C->type() == Type::float16) { // not a *.cu file
- ProdBatchedTypedLegacy<half, half>(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar));
+ // we use computeType=float here for fp16 training as this seems more stable and roughly as fast
+ ProdBatchedTypedLegacy<half, float>(C, allocator, A, B, transA, transB, beta, scalar);
+
+ // original for reference:
+ // ProdBatchedTypedLegacy<half, half>(C, allocator, A, B, transA, transB, __float2half(beta), __float2half(scalar));
#endif
} else {
ABORT("ProdBatchedLegacy not implemented for element type {}", C->type());
diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu
index d55214bc..2103ca9d 100644
--- a/src/tensors/gpu/tensor_operators.cu
+++ b/src/tensors/gpu/tensor_operators.cu
@@ -16,15 +16,12 @@ namespace gpu {
namespace atomics {
static inline __device__ void atomicAdd(float *address, float val) {
- //*address += val;
::atomicAdd(address, val);
}
#if COMPILE_FP16
// @TODO: copied from CuTorch, adapt this better, give credit.
static inline __device__ void atomicAdd(half *address, half val) {
- //*address += val;
-
#if __CUDA_ARCH__ >= 700 && CUDA_VERSION >= 10000 // compute capability 70 and higher with CUDA 10
::atomicAdd(address, val);
#else // __CUDA_ARCH__ < 700
@@ -50,7 +47,8 @@ static inline __device__ void atomicAdd(half *address, half val) {
} while (assumed != old);
#endif // __CUDA_ARCH__
}
-#endif
+#endif // COMPILE_FP16
+
}
@@ -96,6 +94,81 @@ void IsNaN(const Tensor in, Ptr<Allocator> allocator, bool& isNaN, bool& isInf)
cudaStreamSynchronize(0);
}
+template <typename T>
+__global__ void gSanitizeGradient(T* in, int length,
+ bool* isNaN, bool* isInf,
+ bool pruneNaN, bool clipInf,
+ float forNaN = 0.f, float forInf = 65504.f, float forInfNeg = -65504.f) {
+ for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
+ int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
+ if(index < length) {
+ float v = (float)in[index];
+ // handle NaN
+ if(isnan(v)) {
+ if(pruneNaN) {
+ in[index] = (T)forNaN;
+ } else {
+ *isNaN = true;
+ }
+ }
+ // handle +/- Inf
+ if(isinf(v)) {
+ if(clipInf) {
+ in[index] = v > 0 ? (T)forInf : (T)forInfNeg;
+ } else {
+ *isInf = true;
+ }
+ }
+ }
+ }
+}
+
+// This function is meant to clean gradients, i.e. clip infinities and prune NaNs if required.
+// If all NaNs and Infs have been removed we return `true` to indicate a sane gradient.
+// If `clipInf` is set, infinities are replaced with the maximum/minimum non-inf value for the
+// tensor's element type. In that case infinities do not result in a bad gradient, since they get clipped.
+// If `pruneNaN` is set, NaNs are replaced with 0; since the NaNs are removed, they no longer
+// result in a bad gradient.
+// If NaNs or infinities are detected but not removed (because `pruneNaN=false` or `clipInf=false`),
+// we return `false`, indicating a bad gradient.
+bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf) {
+ cudaSetDevice(in->getDeviceId().no);
+
+ int length = in->size();
+
+ int threads = std::min(MAX_THREADS, length);
+ int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
+
+ auto mem = allocator->alloc<bool>(2);
+ bool* dIsNaN = &mem->data<bool>()[0];
+ bool* dIsInf = &mem->data<bool>()[1];
+ fill(in->getBackend(), dIsNaN, dIsNaN + 2, false);
+
+ float forNaN = 0.f;
+ float forInf = NumericLimits<float>(in->type()).max;
+ float forInfNeg = NumericLimits<float>(in->type()).lowest;
+
+ if(in->type() == Type::float32) {
+ gSanitizeGradient<<<blocks, threads>>>(in->data<float>(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg);
+#if COMPILE_FP16
+ } else if(in->type() == Type::float16) {
+ gSanitizeGradient<<<blocks, threads>>>(in->data<half>(), length, dIsNaN, dIsInf, pruneNaN, clipInf, forNaN, forInf, forInfNeg);
+#endif
+ } else {
+ ABORT("gSanitizeGradient for type {} not implemented", in->type());
+ }
+
+ bool isNaN, isInf;
+ CudaCopy(dIsNaN, dIsNaN + 1, &isNaN);
+ CudaCopy(dIsInf, dIsInf + 1, &isInf);
+
+ allocator->free(mem);
+
+ cudaStreamSynchronize(0);
+
+ return !isNaN && !isInf;
+}
+
template <bool add, typename To, typename From>
__global__ void gCopyCastTo(To* out, const From* in, int length) {
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
@@ -1090,7 +1163,7 @@ void PasteRows(Tensor out,
size_t rowsToCopy = indices->size();
int threads = std::min(MAX_THREADS, (int)cols);
-#if 1 // @TODO: make this configurable with a 'deterministic' flag
+#if 0 // @TODO: make this configurable with a 'deterministic' flag
// If we only use one block, then each core operates on a different column,
// hence the summation becomes deterministic.
// However, we only use e.g. 512 cores out of possibly 3000+, so this will be
@@ -1236,7 +1309,7 @@ __global__ void gSelect(T* out,
}
}
-template <typename T>
+template <bool add, typename T>
__global__ void gInsert(T* out,
functional::Shape outShape,
const T* in,
@@ -1254,7 +1327,10 @@ __global__ void gInsert(T* out,
int idxIndex = idxShape.bindex(dims); // broadcast index into indices tensor
dims[axis] = (int)d_indices[idxIndex];
int outIndex = outShape.index(dims);
- out[outIndex] += in[index]; // this is probably wrong, atomicAdd?
+ if(add)
+ out[outIndex] += in[index]; // this is probably wrong, atomicAdd?
+ else
+ out[outIndex] = in[index];
}
}
}
@@ -1276,21 +1352,21 @@ void Select(Tensor out,
if(out->type() == Type::float32) {
gSelect<<<blocks, threads>>>(out->data<float>(),
- out->shape(),
- in->data<float>(),
- in->shape(),
- axisGPU,
- indices->data<IndexType>(),
- indices->shape());
+ out->shape(),
+ in->data<float>(),
+ in->shape(),
+ axisGPU,
+ indices->data<IndexType>(),
+ indices->shape());
#if COMPILE_FP16
} else if (out->type() == Type::float16) {
gSelect<<<blocks, threads>>>(out->data<half>(),
- out->shape(),
- in->data<half>(),
- in->shape(),
- axisGPU,
- indices->data<IndexType>(),
- indices->shape());
+ out->shape(),
+ in->data<half>(),
+ in->shape(),
+ axisGPU,
+ indices->data<IndexType>(),
+ indices->shape());
#endif
} else if(out->type() == Type::uint32) {
gSelect<<<blocks, threads>>>(out->data<IndexType>(),
@@ -1305,6 +1381,7 @@ void Select(Tensor out,
}
}
+template <bool add>
void Insert(Tensor out,
const Tensor in,
const Tensor indices,
@@ -1320,28 +1397,31 @@ void Insert(Tensor out,
int axisGPU = axis + functional::Shape::size() - out->shape().size();
if(out->type() == Type::float32) {
- gInsert<<<blocks, threads>>>(out->data<float>(),
- out->shape(),
- in->data<float>(),
- in->shape(),
- axisGPU,
- indices->data<IndexType>(),
- indices->shape());
+ gInsert<add><<<blocks, threads>>>(out->data<float>(),
+ out->shape(),
+ in->data<float>(),
+ in->shape(),
+ axisGPU,
+ indices->data<IndexType>(),
+ indices->shape());
#if COMPILE_FP16
} else if (out->type() == Type::float16) {
- gInsert<<<blocks, threads>>>(out->data<half>(),
- out->shape(),
- in->data<half>(),
- in->shape(),
- axisGPU,
- indices->data<IndexType>(),
- indices->shape());
+ gInsert<add><<<blocks, threads>>>(out->data<half>(),
+ out->shape(),
+ in->data<half>(),
+ in->shape(),
+ axisGPU,
+ indices->data<IndexType>(),
+ indices->shape());
#endif
} else {
ABORT("Insert not implemented for type {}", out->type());
}
}
+template void Insert<true>(Tensor out, const Tensor in, const Tensor indices, int axis);
+template void Insert<false>(Tensor out, const Tensor in, const Tensor indices, int axis);
+
template <typename T>
__global__ void gGRUFastForward(T* out,
const T* state,
@@ -1355,7 +1435,7 @@ __global__ void gGRUFastForward(T* out,
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
- T m = !mask || mask[j];
+ float m = !mask || mask[j];
T* rowOut = out + j * cols;
const T* rowState = state + j * cols;
@@ -1365,21 +1445,21 @@ __global__ void gGRUFastForward(T* out,
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
- T r = functional::Ops<T>::sigmoid(xWrow[i] + sUrow[i] + b[i]);
+ float r = functional::Ops<float>::sigmoid((float)xWrow[i] + (float)sUrow[i] + (float)b[i]);
int k = i + cols;
- T z = functional::Ops<T>::sigmoid(xWrow[k] + sUrow[k] + b[k]);
+ float z = functional::Ops<float>::sigmoid((float)xWrow[k] + (float)sUrow[k] + (float)b[k]);
int l = i + 2 * cols;
- T h;
+ float h;
if(final)
- h = functional::Ops<T>::tanh(xWrow[l] + (sUrow[l] + b[l]) * r);
+ h = functional::Ops<float>::tanh((float)xWrow[l] + ((float)sUrow[l] + (float)b[l]) * r);
else
- h = functional::Ops<T>::tanh(xWrow[l] + sUrow[l] * r + b[l]);
+ h = functional::Ops<float>::tanh((float)xWrow[l] + (float)sUrow[l] * r + (float)b[l]);
- T out = ((T)1.f - z) * h + z * rowState[i];
- rowOut[i] = m * out + ((T)1.f - m) * rowState[i];
+ float out = (1.f - z) * h + z * (float)rowState[i];
+ rowOut[i] = (T)(m * out + (1.f - m) * (float)rowState[i]);
}
}
}
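Written out, the kernel now evaluates the standard GRU cell internally in float32, casting only at loads and stores; with previous state s, mask m and the three gate blocks of xW, sU and b:

  r = \sigma(xW_r + sU_r + b_r)
  z = \sigma(xW_z + sU_z + b_z)
  h = \tanh(xW_h + (sU_h + b_h) \odot r)  [final]  or  \tanh(xW_h + sU_h \odot r + b_h)
  out = m \odot ((1 - z) \odot h + z \odot s) + (1 - m) \odot s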
@@ -1441,7 +1521,7 @@ __global__ void gGRUFastBackward(T* outState,
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
- T m = !mask || mask[j];
+ float m = !mask || mask[j];
T* rowOutState = outState + j * cols;
T* rowOutXW = outXW + j * cols * 3;
@@ -1459,56 +1539,56 @@ __global__ void gGRUFastBackward(T* outState,
int k = i + cols;
int l = i + 2 * cols;
- T r = functional::Ops<T>::sigmoid(rowXW[i] + rowSU[i] + b[i]);
- T z = functional::Ops<T>::sigmoid(rowXW[k] + rowSU[k] + b[k]);
+ float r = functional::Ops<float>::sigmoid((float)rowXW[i] + (float)rowSU[i] + (float)b[i]);
+ float z = functional::Ops<float>::sigmoid((float)rowXW[k] + (float)rowSU[k] + (float)b[k]);
- T h;
+ float h;
if(final)
- h = functional::Ops<T>::tanh(rowXW[l] + (rowSU[l] + b[l]) * r);
+ h = functional::Ops<float>::tanh((float)rowXW[l] + ((float)rowSU[l] + (float)b[l]) * r);
else
- h = functional::Ops<T>::tanh(rowXW[l] + rowSU[l] * r + b[l]);
+ h = functional::Ops<float>::tanh((float)rowXW[l] + (float)rowSU[l] * r + (float)b[l]);
- T adj = rowAdj[i];
+ float adj = rowAdj[i];
- T t = ((T)1.f - z) * ((T)1.f - h * h);
+ float t = (1.f - z) * (1.f - h * h);
// df/ds
if(outState)
- rowOutState[i] += (m * z - m + (T)1.f) * adj;
+ rowOutState[i] += (T)((m * z - m + 1.f) * adj);
// df/d(xW_r) ...
- T dfdxW_r = m * r * ((T)1.f - r) * t * adj;
+ float dfdxW_r = m * r * (1.f - r) * t * adj;
if(final)
- dfdxW_r *= rowSU[l] + b[l];
+ dfdxW_r *= (float)rowSU[l] + (float)b[l];
else
- dfdxW_r *= rowSU[l];
+ dfdxW_r *= (float)rowSU[l];
if(outXW)
- rowOutXW[i] += dfdxW_r;
+ rowOutXW[i] += (T)dfdxW_r;
if(outSU)
- rowOutSU[i] += dfdxW_r;
+ rowOutSU[i] += (T)dfdxW_r;
if(outB)
- rowOutB[i] += dfdxW_r;
+ rowOutB[i] += (T)dfdxW_r;
// df/d(xW_z) ...
- T dfdxW_z = m * ((T)1.f - z) * z * (rowState[i] - h) * adj;
+ float dfdxW_z = m * (1.f - z) * z * ((float)rowState[i] - h) * adj;
if(outXW)
- rowOutXW[k] += dfdxW_z;
+ rowOutXW[k] += (T)dfdxW_z;
if(outSU)
- rowOutSU[k] += dfdxW_z;
+ rowOutSU[k] += (T)dfdxW_z;
if(outB)
- rowOutB[k] += dfdxW_z;
+ rowOutB[k] += (T)dfdxW_z;
// df/d(xW_x) ...
- T dfdxW_x = m * t * adj;
+ float dfdxW_x = m * t * adj;
if(outXW)
- rowOutXW[l] += dfdxW_x;
+ rowOutXW[l] += (T)dfdxW_x;
if(outSU)
- rowOutSU[l] += dfdxW_x * r;
+ rowOutSU[l] += (T)(dfdxW_x * r);
if(outB)
if(final)
- rowOutB[l] += dfdxW_x * r;
+ rowOutB[l] += (T)(dfdxW_x * r);
else
- rowOutB[l] += dfdxW_x;
+ rowOutB[l] += (T)dfdxW_x;
}
}
}
diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h
index 6e587953..1fc4542d 100644
--- a/src/tensors/tensor_operators.h
+++ b/src/tensors/tensor_operators.h
@@ -41,6 +41,25 @@ DISPATCH2(CopyCast, marian::Tensor, const marian::Tensor);
DISPATCH2(AddCast, marian::Tensor, const marian::Tensor);
DISPATCH4(IsNaN, const Tensor, Ptr<Allocator>, bool&, bool&);
+#ifdef CUDA_FOUND
+namespace gpu {
+bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf);
+}
+#endif
+
+namespace cpu {
+bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf);
+}
+
+static inline bool SanitizeGradient(marian::Tensor in, Ptr<Allocator> allocator, bool pruneNaN, bool clipInf) {
+#ifdef CUDA_FOUND
+ if(in->getBackend()->getDeviceId().type == DeviceType::gpu)
+ return gpu::SanitizeGradient(in, allocator, pruneNaN, clipInf);
+ else
+#endif
+ return cpu::SanitizeGradient(in, allocator, pruneNaN, clipInf);
+}
+
template <class Functor, class... Tensors>
void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
#ifdef CUDA_FOUND
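A sketch of how a caller is expected to use this dispatcher (illustrative variable names; it mirrors the call added in graph_group.cpp below):

// Sanitize in-place; returns true iff no unhandled NaN/Inf remains.
bool sane = SanitizeGradient(grad, graph->allocator(), /*pruneNaN=*/true, /*clipInf=*/true);
if(!sane) {
  // skip this update, e.g. reduce the cost-scaling factor first
}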
@@ -278,7 +297,28 @@ DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const marian::Tensor)
DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const marian::Tensor)
DISPATCH4(Select, marian::Tensor, const marian::Tensor, const marian::Tensor, int)
-DISPATCH4(Insert, marian::Tensor, const marian::Tensor, const marian::Tensor, int)
+
+#ifdef CUDA_FOUND
+namespace gpu {
+ template <bool add>
+ void Insert(Tensor out, const Tensor in, const Tensor indices, int axis);
+}
+#endif
+
+namespace cpu {
+ template <bool add>
+ void Insert(Tensor out, const Tensor in, const Tensor indices, int axis);
+}
+
+template <bool add>
+static inline void Insert(Tensor out, const Tensor in, const Tensor indices, int axis) {
+#ifdef CUDA_FOUND
+ if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
+ gpu::Insert<add>(out, in, indices, axis);
+ else
+#endif
+ cpu::Insert<add>(out, in, indices, axis);
+}
DISPATCH7(TopK, marian::Tensor, marian::Tensor, Ptr<Allocator>, const marian::Tensor, int, int, bool);
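Since the boolean is now a template parameter, call sites pick the semantics at compile time, e.g. (illustrative tensors):

Insert</*add=*/true>(grad, adj, indices, /*axis=*/-1);  // accumulate (backward/scatter-add)
Insert</*add=*/false>(out, vals, indices, /*axis=*/-1); // overwrite (forward/scatter-write)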
diff --git a/src/training/graph_group.cpp b/src/training/graph_group.cpp
index e9c977b9..59cd4b6d 100644
--- a/src/training/graph_group.cpp
+++ b/src/training/graph_group.cpp
@@ -10,25 +10,19 @@ GraphGroup::GraphGroup(Ptr<Options> options, Ptr<IMPIWrapper> mpi)
mbRoundUp_(options_->get<bool>("mini-batch-round-up", true)) {
if(options_->hasAndNotEmpty("cost-scaling")) {
auto vcs = options_->get<std::vector<std::string>>("cost-scaling");
- costScale_ = true;
- float costExponent = std::stof(vcs[0]);
- costScaleFactor_ = std::pow(2.0f, costExponent);
-
- if(vcs.size() > 1) costScaleFreq_ = std::stoul(vcs[1]);
- if(vcs.size() > 2) costScaleMultiplier_ = std::stof(vcs[2]);
- if(vcs.size() > 3) costScaleNanTolerance_ = std::stof(vcs[3]);
- if(vcs.size() > 4) costScaleNanRange_ = std::stoul(vcs[4]);
- if(vcs.size() > 5) costScaleFactorMinimum_ = std::stof(vcs[5]);
+
+ costScaling_ = true;
+ costScalingFactor_ = std::stof( vcs[0]);
+ if(vcs.size() > 1) costScalingFreq_ = std::stoul(vcs[1]);
+ if(vcs.size() > 2) costScalingMultiplier_ = std::stof( vcs[2]);
+ if(vcs.size() > 3) costScalingFactorMinimum_ = std::stof( vcs[3]);
LOG_ONCE(info,
- "Training with cost scaling - factor: 2^{} = {}, frequency: {}, multiplier: {}, tolerance: {}, range: {}, minimum: {}",
- costExponent,
- costScaleFactor_,
- costScaleFreq_,
- costScaleMultiplier_,
- costScaleNanTolerance_,
- costScaleNanRange_,
- costScaleFactorMinimum_);
+ "Training with cost scaling - factor: {}, frequency: {}, multiplier: {}, minimum: {}",
+ costScalingFactor_,
+ costScalingFreq_,
+ costScalingMultiplier_,
+ costScalingFactorMinimum_);
}
if(options_->hasAndNotEmpty("dynamic-gradient-scaling")) {
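With the new layout the first value is the scaling factor itself rather than an exponent of 2, followed by frequency, multiplier and minimum. A hypothetical invocation:

--cost-scaling 256 1000 2 1

starts cost scaling at factor 256, multiplies the factor by 2 after every 1000 NaN-free updates, and never lets NaN-triggered decreases push it below the minimum of 1.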
@@ -37,11 +31,16 @@ GraphGroup::GraphGroup(Ptr<Options> options, Ptr<IMPIWrapper> mpi)
if(vgc.size() > 0) dynamicGradientScalingFactor_ = std::stof(vgc[0]);
if(vgc.size() > 1) dynamicGradientScalingUseLogs_ = vgc[1] == "log";
+ if(vgc.size() > 2) dynamicGradientScalingFadeout_ = std::stoul(vgc[2]);
LOG_ONCE(info,
"Re-scaling gradient to have average gradient norm if (log={}) gradient norm diverges from average by {} sigmas",
dynamicGradientScalingUseLogs_,
dynamicGradientScalingFactor_);
+ if(dynamicGradientScalingFadeout_ > 0)
+ LOG_ONCE(info,
+ "Dynamic gradient re-scaling will fade out linearly after {} updates",
+ dynamicGradientScalingFadeout_);
}
if(options_->get<bool>("check-gradient-nan")) {
@@ -96,21 +95,17 @@ void GraphGroup::initGraphsAndOpts() {
// given number of iterations. Usually we increase by 2 which adds
// one more bit for precision.
void GraphGroup::increaseCostScaleFactor() {
- if(!costScale_)
+ if(!costScaling_)
return;
noNanSeen_++;
size_t total = nanSeen_ + noNanSeen_;
- float nanPercent = noNanSeen_ == (float)nanSeen_ / (float)total; // total is at least 1 because of noNanSeen_++
- if(noNanSeen_ % costScaleFreq_ == 0) {
- costScaleFactor_ *= costScaleMultiplier_;
- LOG(debug,
- "NaN/Inf percentage {:.2f} after {} gradient updates. Increasing cost-scaling factor to {}",
- nanPercent,
- total,
- costScaleFactor_);
+ if(noNanSeen_ % costScalingFreq_ == 0) {
+ costScalingFactor_ *= costScalingMultiplier_;
+ if(isMainProcess())
+ LOG(debug, "No NaN/Inf after {} gradient updates. Increasing cost-scaling factor to {}", total, costScalingFactor_);
// Resetting counts after cost-scale change
noNanSeen_ = 0;
@@ -120,48 +115,56 @@ void GraphGroup::increaseCostScaleFactor() {
// call when a NaN was seen to decrease cost-scaling factor
void GraphGroup::decreaseCostScaleFactor() {
- if(!costScale_)
+ if(!costScaling_)
return;
nanSeen_++;
size_t total = nanSeen_ + noNanSeen_;
- float nanPercent = (float)nanSeen_ / (float)total; // total is at least 1 because of nanSeen_++
- if(total >= costScaleNanRange_ && nanPercent > costScaleNanTolerance_) {
- if(costScaleFactor_ > costScaleFactorMinimum_) {
- costScaleFactor_ /= costScaleMultiplier_;
- LOG(debug,
- "NaN/Inf percentage {:.2f} in {} gradient updates, reducing cost-scaling factor to {}",
- nanPercent,
- total,
- costScaleFactor_);
- } else {
- // @TODO: think if should this rather abort?
- LOG(warn,
- "NaN/Inf percentage {:.2f} in {} gradient updates, but cost-scaling factor {} is already at minimum",
- nanPercent,
- total,
- costScaleFactor_);
- }
- // Resetting counts after cost-scale change
- noNanSeen_ = 0;
- nanSeen_ = 0;
+ // do not reduce cost-scaling factor below minimum
+ if(costScalingFactor_ > costScalingFactorMinimum_)
+ costScalingFactor_ /= costScalingMultiplier_;
+
+ if(isMainProcess()) {
+ if(costScalingFactor_ > costScalingFactorMinimum_)
+ LOG(debug, "Seen NaN/Inf after {} gradient updates. Reduced cost-scaling factor to {}", total, costScalingFactor_);
+ else
+ LOG(debug, "Seen NaN/Inf after {} gradient updates, Reduced cost-scaling factor to minimum {}. Pruning NaNs now.", total, costScalingFactor_);
}
+
+ // Resetting counts after cost-scale change
+ noNanSeen_ = 0;
+ nanSeen_ = 0;
}
float GraphGroup::checkNanOrNorm(size_t i, size_t begin, size_t end) {
auto curGrad = graphs_[i]->params()->grads()->subtensor(begin, end-begin);
- if(checkGradientNan_ || costScale_) {
- bool hasNan = false, hasInf = false;
- IsNaN(curGrad, graphs_[i]->allocator(), hasNan, hasInf); // @TODO: make safe with different compiler options
- if(hasNan || hasInf) {
- LOG(debug, "Found Nan ({}) or Inf ({})", hasNan, hasInf);
+  // If costScaling_ is enabled, we check for NaN values as long as costScalingFactor_ is larger
+  // than the minimum. If a NaN value is seen we exit here; the factor is reduced next and this
+  // update is skipped.
+  // If costScalingFactor_ is already at the minimum, the NaN values are pruned away instead,
+  // i.e. replaced with 0, and updates are no longer skipped.
+  // Regardless of NaNs, we clip +/-inf to the largest corresponding values for the gradient value type.
+  // This changes the gradient but seems to be quite stable. In effect, for fp16 this is equivalent
+  // to gradient clipping at (65504.f / costScalingFactor_), which in most cases is still large.
+ if(costScaling_ || checkGradientNan_) {
+ bool pruneNaN = !checkGradientNan_ && costScalingFactor_ == costScalingFactorMinimum_;
+ bool clipInf = !checkGradientNan_;
+ bool saneGradient = SanitizeGradient(curGrad, graphs_[i]->allocator(), pruneNaN, clipInf);
+
+    // This should never happen; if it does, something is wrong with the kernel above and needs to be fixed.
+ ABORT_IF(pruneNaN && clipInf && !saneGradient, "We are removing NaNs and clipping Infs, but gradient is still not sane??");
+
+ if(!saneGradient) {
+ LOG(debug, "Found NaN");
return std::numeric_limits<float>::quiet_NaN();
}
}
-
+
+ // The optional clipping above will affect the norm here. The norm can be non-finite despite the above
+ // gradient sanitization, hence check again and propagate a NaN.
if(dynamicGradientScaling_) {
auto gNorm = L2Norm(curGrad, graphs_[i]->allocator());
if(isFinite(gNorm) && gNorm > 0.0)
@@ -197,8 +200,8 @@ float GraphGroup::executeAndCollectNorm(const std::function<float(size_t, size_t
float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords) {
float normalizationFactor = 1.f;
- if(costScale_)
- normalizationFactor *= costScaleFactor_;
+ if(costScaling_)
+ normalizationFactor *= costScalingFactor_;
if(options_->get<bool>("normalize-gradient"))
normalizationFactor *= updateTrgWords;
@@ -207,9 +210,9 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords)
return normalizationFactor;
if(dynamicGradientScaling_) {
- // make gradient norm invariant to changes in costScaleFactor_, luckily norm(c * g) = c * norm(g)
- if(costScale_)
- gNorm = gNorm / costScaleFactor_;
+ // make gradient norm invariant to changes in costScalingFactor_, luckily norm(c * g) = c * norm(g)
+ if(costScaling_)
+ gNorm = gNorm / costScalingFactor_;
// Normalize gradient norm w.r.t. number of labels in batch for statistics,
// there should be no gradient normalization before this point, @TODO: check this
@@ -231,11 +234,17 @@ float GraphGroup::computeNormalizationFactor(float gNorm, size_t updateTrgWords)
auto deltaTransform = gNormTransform - gNormAvgTransform; // compute the difference between the current transformer gradient norm and the running average.
auto gNormStdTransform = std::sqrt(gNormVarTransform); // compute STD for the running average of (log) gradient norms.
+ float fadeoutMultiplier = 1.f;
+ if(dynamicGradientScalingFadeout_ > 0ul) // fade out linearly after that many updates @TODO: allow units other than updates
+ fadeoutMultiplier = (float)std::max(dynamicGradientScalingFadeout_, scheduler_->numberOfBatches()) / (float)dynamicGradientScalingFadeout_;
+
+    float dynamicGradientScalingFactorWithFadeout = dynamicGradientScalingFactor_ * fadeoutMultiplier; // as fadeoutMultiplier grows, dynamic gradient scaling becomes less and less likely to trigger over time.
// delta of (log) gradient norm vs (log) gradient norm average is larger than N standard deviations
// hence rescale gradient using the average.
- if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactor_ * gNormStdTransform) {
- LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f}",
- dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactor_, gNormStdTransform);
+ if(scheduler_->numberOfBatches() >= window && deltaTransform > dynamicGradientScalingFactorWithFadeout * gNormStdTransform) {
+ if(isMainProcess())
+ LOG(debug, "log gradient norms: {} :: {:.4f} - {:.4f} = {:.4f} > {:.4f} * {:.4f} - scaling gradient by {:.4f}",
+ dynamicGradientScalingUseLogs_, gNormTransform, gNormAvgTransform, deltaTransform, dynamicGradientScalingFactorWithFadeout, gNormStdTransform, gNormAvg / gNorm);
normalizationFactor *= gNorm / gNormAvg; // since we later do gradient / normalizationFactor this divides by norm and multiplies by the average, rescaling to the average.
}
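A worked example of the fadeout (hypothetical numbers): with dynamicGradientScalingFadeout_ = 100000, the multiplier is max(100000, n) / 100000, i.e. 1 for the first 100k updates, 2 at update 200k and 3 at 300k; the threshold dynamicGradientScalingFactorWithFadeout * gNormStdTransform therefore grows linearly and re-scaling triggers increasingly rarely.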
@@ -288,9 +297,7 @@ void GraphGroup::load(const OptimizerBase::ScatterStateFunc& scatterFn) {
restoreFromCheckpoint(modelFileName, scatterFn);
} else if(options_->hasAndNotEmpty("pretrained-model")) {
std::string nameInit = options_->get<std::string>("pretrained-model");
- LOG(info,
- "[training] Initializing model weights with pre-trained model {}",
- nameInit);
+ LOG(info, "[training] Initializing model weights with pre-trained model {}", nameInit);
size_t i = 0;
for(auto graph : graphs_)
diff --git a/src/training/graph_group.h b/src/training/graph_group.h
index 0e4a68dc..9f1362e7 100644
--- a/src/training/graph_group.h
+++ b/src/training/graph_group.h
@@ -60,21 +60,21 @@ protected:
double typicalTrgBatchWords_{0}; // for dynamic batch sizing: typical batch size in words
bool mbRoundUp_{true}; // round up batches for more efficient training but can make batch size less stable, disable with --mini-batch-round-up=false
- bool costScale_{false};
- float costScaleFactor_{1.f}; // @TODO, add current costScaleFactor_ to trainingState for serialization
- size_t costScaleFreq_{2000};
- float costScaleMultiplier_{2.f};
- float costScaleNanTolerance_{0.f};
- size_t costScaleNanRange_{1};
- float costScaleFactorMinimum_{1.f}; // @TODO make this configureable
+ bool costScaling_{false};
+ float costScalingFactor_{1.f}; // @TODO, add current costScalingFactor_ to trainingState for serialization
+ size_t costScalingFreq_{2000};
+ float costScalingMultiplier_{2.f};
+ float costScalingFactorMinimum_{1.f};
+
size_t noNanSeen_{0}; // @TODO, add current noNanSeen_ to trainingState for serialization
size_t nanSeen_{0};
+ bool checkGradientNan_{false};
+
bool dynamicGradientScaling_{false};
float dynamicGradientScalingFactor_{2.f};
bool dynamicGradientScalingUseLogs_{false};
-
- bool checkGradientNan_{false};
+ size_t dynamicGradientScalingFadeout_{0ul};
// determines the number of input streams (i.e. input files or fields in the TSV input) that need
// to be included in the batch, i.e. without alignments and weights
diff --git a/src/training/graph_group_async.cpp b/src/training/graph_group_async.cpp
index 72b06e48..f85f9cf8 100644
--- a/src/training/graph_group_async.cpp
+++ b/src/training/graph_group_async.cpp
@@ -143,13 +143,13 @@ void AsyncGraphGroup::execute(Ptr<data::Batch> batch) {
thread_local Tensor accGradients;
thread_local Ptr<TensorAllocator> accAlloc;
- ABORT_IF(costScale_ ,"Cost-scaling not implemented for AsyncSGD");
+ ABORT_IF(costScaling_ ,"Cost-scaling not implemented for AsyncSGD");
auto graph = graphs_[tid];
Ptr<RationalLoss> dynamicLoss = models_[tid]->build(graph, batch);
- if(costScaleFactor_ != 1.f) {
+ if(costScalingFactor_ != 1.f) {
// it's ok to go out of scope, this will still insert the new top node into the graph
- auto costNode = dynamicLoss->loss() * costScaleFactor_;
+ auto costNode = dynamicLoss->loss() * costScalingFactor_;
}
if(t % optimizerDelay_ == 0) {
diff --git a/src/training/graph_group_singleton.cpp b/src/training/graph_group_singleton.cpp
index 7dc86137..16261070 100644
--- a/src/training/graph_group_singleton.cpp
+++ b/src/training/graph_group_singleton.cpp
@@ -16,16 +16,16 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
auto opt = optimizerShards_[0];
auto lossNode = model->build(graph, batch);
- if(costScaleFactor_ != 1.f) {
+ if(costScalingFactor_ != 1.f) {
// for fp16 training, it's ok to go out of scope, we do not use the scaled version for anything
- auto scaledLoss = lossNode->loss() * costScaleFactor_;
+ auto scaledLoss = lossNode->loss() * costScalingFactor_;
}
graph->forward();
graph->backward();
bool noNanOrInf = true;
- if(costScale_) {
+ if(costScaling_) {
// Are there NaNs in the gradient?
bool hasNan = false, hasInf = false;
IsNaN(graph->params()->grads(), graph->allocator(), hasNan, hasInf);
@@ -39,7 +39,7 @@ void SingletonGraph::execute(Ptr<data::Batch> batch) {
opt->update(graph->params()->vals(),
graph->params()->grads(),
batch->wordsTrg(),
- costScaleFactor_);
+ costScalingFactor_);
if(scheduler_) {
scheduler_->update(*lossNode, batch);
diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp
index 8c06761e..c90a384e 100644
--- a/src/training/graph_group_sync.cpp
+++ b/src/training/graph_group_sync.cpp
@@ -252,8 +252,8 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
{ // let loss go out of scope, frees memory
auto rationalLoss = models_[localDeviceIndex]->build(graph, subBatch);
- if(costScaleFactor_ != 1.f)
- rationalLoss->loss() * costScaleFactor_;
+ if(costScalingFactor_ != 1.f)
+ rationalLoss->loss() * costScalingFactor_;
graph->forward();
localDeviceLosses[localDeviceIndex] += *rationalLoss;
@@ -262,7 +262,7 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
graph->backward(/*zero=*/false); // (gradients are reset before we get here)
}
-#if 1
+#if 0 // @TODO: this can probably be removed now, keep around until confirmed.
// experimental and should eventually be somewhere else
// Handle local gradient explosion but only clip to largest possible value
// given number of GPUs and type. Should clip rarely. Also clips inf
@@ -284,7 +284,7 @@ void SyncGraphGroup::update(std::vector<Ptr<data::Batch>> subBatches, size_t num
comm_->scatterReduceAndResetGrads(); // reduce gradients across all devices (globally) into shards
float gradNorm = 0.f;
- if(costScale_ || dynamicGradientScaling_ || checkGradientNan_) {
+ if(costScaling_ || dynamicGradientScaling_ || checkGradientNan_) {
// Wrapping member function
auto checkNanOrNorm = [&](size_t i, size_t begin, size_t end) {
return GraphGroup::checkNanOrNorm(i, begin, end);
diff --git a/src/translator/beam_search.cpp b/src/translator/beam_search.cpp
index 2a0d3947..580895f2 100644
--- a/src/translator/beam_search.cpp
+++ b/src/translator/beam_search.cpp
@@ -94,7 +94,7 @@ Beams BeamSearch::toHyps(const std::vector<unsigned int>& nBestKeys, // [current
// For factored decoding, the word is built over multiple decoding steps,
// starting with the lemma, then adding factors one by one.
if (factorGroup == 0) {
- word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx); // @BUGBUG: reverseMap is only correct if factoredVocab_->getGroupRange(0).first == 0
+ word = factoredVocab->lemma2Word(shortlist ? shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx) : wordIdx);
std::vector<size_t> factorIndices; factoredVocab->word2factors(word, factorIndices);
//LOG(info, "{} + {} ({}) -> {} -> {}",
// factoredVocab->decode(prevHyp->tracebackWords()),
@@ -115,7 +115,7 @@ Beams BeamSearch::toHyps(const std::vector<unsigned int>& nBestKeys, // [current
}
}
else if (shortlist)
- word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) origBatchIdx, wordIdx));
+ word = Word::fromWordIndex(shortlist->reverseMap((int) prevBeamHypIdx, (int) currentBatchIdx, wordIdx));
else
word = Word::fromWordIndex(wordIdx);
@@ -330,6 +330,7 @@ Histories BeamSearch::search(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch>
auto prevBatchIdxMap = batchIdxMap; // [origBatchIdx -> currentBatchIdx] but shifted by one time step
// main loop over output time steps
for (size_t t = 0; ; t++) {
+ //std::cerr << "\nstep=" << t << std::endl;
ABORT_IF(origDimBatch != beams.size(), "Lost a batch entry??");
// determine beam size for next output time step, as max over still-active sentences
// E.g. if all batch entries are down from beam 5 to no more than 4 surviving hyps, then
diff --git a/src/translator/nth_element.cpp b/src/translator/nth_element.cpp
index 237d9b9d..dbcceec4 100644
--- a/src/translator/nth_element.cpp
+++ b/src/translator/nth_element.cpp
@@ -3,7 +3,9 @@
* SPDX-License-Identifier: MIT
*/
+#include "common/utils.h"
#include "translator/nth_element.h"
+
#include <algorithm>
#include <iterator>
#include <limits>
diff --git a/src/translator/translator.h b/src/translator/translator.h
index 4084ced9..0621fc8c 100644
--- a/src/translator/translator.h
+++ b/src/translator/translator.h
@@ -122,7 +122,7 @@ public:
threadPool.enqueue(task, device, id++);
}
- if(options_->get<bool>("output-sampling", false)) {
+ if(options_->hasAndNotEmpty("output-sampling")) {
if(options_->get<size_t>("beam-size") > 1)
LOG(warn,
"[warning] Output sampling and beam search (beam-size > 1) are contradictory methods "