github.com/marian-nmt/marian.git
author     Martin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com>  2020-11-04 20:31:59 +0300
committer  Martin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com>  2020-11-04 20:31:59 +0300
commit     f3e4cbf705b35a40ad104b35ef7dfcbf184f6b2a (patch)
tree       60f8ed6304a91031487daac3d894cb51489f700b
parent     fabbe203091dec345165dc63528072b86d177d19 (diff)
Merged PR 16219: Allow to set epoch display width
Allow setting the display width of the fractional part of a logical epoch.
-rw-r--r--  CHANGELOG.md                  2
-rwxr-xr-x  src/common/config_parser.cpp  7
-rwxr-xr-x  src/training/scheduler.h     75
3 files changed, 50 insertions, 34 deletions
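
For orientation, the quantity whose display this PR refines is the logical epoch: the epoch counter redefined as a multiple of data epochs, gradient updates, or target labels (see calculateLogicalEpoch() in the scheduler.h hunk below). The following minimal sketch is not Marian code and uses purely illustrative names; assuming "1Gt" denotes 10^9 target labels, a run that has seen 2.5 * 10^9 labels is at logical epoch 2.5.

#include <cstdio>

// Illustrative stand-in for Marian's SchedulingUnit / calculateLogicalEpoch().
enum class Unit { epochs, updates, trgLabels };

float logicalEpoch(Unit unit, unsigned long long n,
                   unsigned long long epochs, unsigned long long batches,
                   unsigned long long labelsTotal) {
  switch(unit) {
    case Unit::epochs:    return (float)epochs      / (float)n; // multiple of n data epochs
    case Unit::updates:   return (float)batches     / (float)n; // multiple of n gradient updates
    case Unit::trgLabels: return (float)labelsTotal / (float)n; // multiple of n target labels
  }
  return 0.f;
}

int main() {
  // 2,500,000,000 labels seen with a period of 1,000,000,000 labels -> 2.500
  std::printf("%.3f\n", logicalEpoch(Unit::trgLabels, 1000000000ull, 0, 0, 2500000000ull));
}
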
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 420f0445..279ba35f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
### Added
-- Add --logical-epoch that allows redefining the displayed epoch counter as a multiple of n data epochs, updates or labels.
+- Add --logical-epoch that allows redefining the displayed epoch counter as a multiple of n data epochs, updates or labels. Also allows defining the width of the fractional part with a second argument.
- Add --metrics chrf for computing ChrF according to https://www.aclweb.org/anthology/W15-3049/ and SacreBLEU reference implementation
- Add --after option which is meant to replace --after-batches and --after-epochs and can take label based criteria
- Add --transformer-postprocess-top option to enable correctly normalized prenorm behavior
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index e67c9b2c..31e9a7f1 100755
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -398,9 +398,10 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
cli.add<std::string/*SchedulerPeriod*/>("--save-freq",
"Save model file every arg updates (append 't' for every arg target labels)",
"10000u");
- cli.add<std::string>("--logical-epoch",
- "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt)",
- "1e");
+ cli.add<std::vector<std::string>>("--logical-epoch",
+ "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt). "
+ "Second parameter defines width of fractional display, 0 by default.",
+ {"1e", "0"});
addSuboptionsInputLength(cli);
addSuboptionsTSV(cli);
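
The config_parser.cpp hunk above changes --logical-epoch from a single string with default "1e" into a vector of up to two strings with default {"1e", "0"}: the first element is the scheduling period, the optional second element the fractional display width. A minimal self-contained sketch of how such a value would be interpreted; the actual parsing lives in the Scheduler constructor in the next file, and the names below are illustrative, not Marian's API.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // e.g. "--logical-epoch 100Ku 2" would arrive as {"100Ku", "2"};
  // the default {"1e", "0"} keeps the old integer-looking epoch display.
  std::vector<std::string> logicalEpoch = {"100Ku", "2"};

  std::string period = logicalEpoch[0]; // parsed by SchedulingParameter::parse() in Marian
  std::size_t width  = logicalEpoch.size() > 1 ? std::stoul(logicalEpoch[1]) : 0;

  std::cout << "period=" << period << " display width=" << width << "\n";
}
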
diff --git a/src/training/scheduler.h b/src/training/scheduler.h
index 14f68510..b9e203d6 100755
--- a/src/training/scheduler.h
+++ b/src/training/scheduler.h
@@ -17,6 +17,7 @@ private:
bool first_{true}; // true if this is the first update after renewing the training
SchedulingParameter logicalEpoch_;
+ size_t logicalEpochWidth_{0};
timer::Timer timer_;
timer::Timer heartBeatTimer_;
@@ -26,29 +27,6 @@ private:
// which indicates the end of the training data stream from STDIN
bool endOfStdin_{false}; // true at the end of the epoch if training from STDIN;
- // Here we calculate the logical epoch as defined by the user; by default this will be just a traditional data epoch.
- // We understand a data epoch as a complete pass through the training data, as far as that information is available.
- // By contrast, a logical epoch is defined somewhat independently of the number of data passes, e.g. by the number of seen updates or labels
- // or as a multiple of data epochs.
- float calculateLogicalEpoch() {
- if(logicalEpoch_.unit == SchedulingUnit::epochs)
- return (float)state_->epochs / (float)logicalEpoch_.n; // logical epoch as multiple of n data epochs
- else if(logicalEpoch_.unit == SchedulingUnit::trgLabels)
- return (float)state_->labelsTotal / (float)logicalEpoch_.n; // logical epoch as multiple of n labels
- else if(logicalEpoch_.unit == SchedulingUnit::updates)
- return (float)state_->batches / (float)logicalEpoch_.n; // logical epoch as multiple of n gradient updates (not actually batches @TODO: change name)
- else
- ABORT("Unknown scheduling unit occurred in logical epoch"); // shouldn't really happen unless we add a new unit in the corresponding enum
- }
-
- // Formatting for logical epochs
- std::string formatLogicalEpoch() {
- if(logicalEpoch_.unit == SchedulingUnit::epochs && logicalEpoch_.n == 1)
- return fmt::format("{}", calculateLogicalEpoch()); // for a data epoch, output is an integer and looks like before this feature was introduced
- else
- return fmt::format("{:.4f}", calculateLogicalEpoch()); // all other outputs can be fractional, hence floating point format
- }
-
// determine scheduled LR decay factor (--lr-decay-inv-sqrt option)
float getScheduledLRDecayFactor(const TrainingState& state) const {
auto args = options_->get<std::vector<std::string>>("lr-decay-inv-sqrt");
@@ -134,7 +112,51 @@ private:
return ss.str();
}
+ // Here we calculate the logical epoch as defined by the user; by default this will be just a traditional data epoch.
+ // We understand a data epoch as a complete pass through the training data, as far as that information is available.
+ // By contrast, a logical epoch is defined somewhat independently of the number of data passes, e.g. by the number of seen updates or labels
+ // or as a multiple of data epochs.
+ float calculateLogicalEpoch() {
+ if(logicalEpoch_.unit == SchedulingUnit::epochs)
+ return (float)state_->epochs / (float)logicalEpoch_.n; // logical epoch as multiple of n data epochs
+ else if(logicalEpoch_.unit == SchedulingUnit::trgLabels)
+ return (float)state_->labelsTotal / (float)logicalEpoch_.n; // logical epoch as multiple of n labels
+ else if(logicalEpoch_.unit == SchedulingUnit::updates)
+ return (float)state_->batches / (float)logicalEpoch_.n; // logical epoch as multiple of n gradient updates (not actually batches @TODO: change name)
+ else
+ ABORT("Unknown scheduling unit occurred in logical epoch"); // shouldn't really happen unless we add a new unit in the corresponding enum
+ }
+
+ // Formatting for logical epochs
+ std::string formatLogicalEpoch() {
+ return fmt::format("{:." + std::to_string(logicalEpochWidth_) + "f}", calculateLogicalEpoch());
+ }
+
public:
+ Scheduler(Ptr<Options> options, Ptr<TrainingState> state)
+ : options_(options), state_(state) {
+
+ // parse logical-epoch parameters
+ auto logicalEpochStr = options->get<std::vector<std::string>>("logical-epoch", {"1e", "0"});
+ ABORT_IF(logicalEpochStr.empty(), "Logical epoch information is missing?");
+
+ logicalEpoch_ = SchedulingParameter::parse(logicalEpochStr[0]);
+
+ // here we deduce the floating point width to be used in formatLogicalEpoch()
+ if(logicalEpochStr.size() > 1) { // if the width is given, just use that
+ logicalEpochWidth_ = std::stoul(logicalEpochStr[1]);
+ } else { // the width is not given so we deduce a suitable display width
+ if(logicalEpoch_.unit == SchedulingUnit::epochs && logicalEpoch_.n == 1)
+ logicalEpochWidth_ = 0; // for a data epoch, output is an integer and looks like before this feature was introduced
+ else
+ logicalEpochWidth_ = 3; // all other outputs can be fractional, hence floating point format. We choose
+ // 3 as a default which corresponds to a multiplier of 1000 (3 orders of magnitude).
+ }
+
+ ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??");
+ updateLearningRate(*state);
+ }
+
// test if any parameters specify dynamic MB scaling
bool isDynamicMBSizeScaling() const {
auto mbWarmup = SchedulingParameter::parse(options_->get<std::string>("mini-batch-warmup"));
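
The new formatLogicalEpoch() above splices the precision into the format string at run time instead of hard-coding "{:.4f}". Below is a standalone sketch of that formatting, assuming an fmt version that accepts run-time format strings (the fmt bundled with Marian's spdlog does; fmt 8 and later would need fmt::runtime() around the built string). With the deduced default width of 3 a logical epoch of 2.5 prints as "2.500", while width 0 reproduces the old integer-looking display.

#include <cstddef>
#include <iostream>
#include <string>
#include <fmt/format.h>

// Standalone stand-in for Scheduler::formatLogicalEpoch(); not Marian code.
std::string formatEpoch(float logicalEpoch, std::size_t width) {
  // Precision is spliced into the format string, e.g. "{:.3f}" for width 3.
  // Newer fmt releases require fmt::runtime() for format strings built at run time.
  return fmt::format("{:." + std::to_string(width) + "f}", logicalEpoch);
}

int main() {
  std::cout << formatEpoch(3.0f, 0) << "\n"; // "3"    (old-style integer look)
  std::cout << formatEpoch(2.5f, 3) << "\n"; // "2.500"
}
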
@@ -172,13 +194,6 @@ public:
return ratio;
}
- Scheduler(Ptr<Options> options, Ptr<TrainingState> state)
- : options_(options), state_(state),
- logicalEpoch_(SchedulingParameter::parse(options->get<std::string>("logical-epoch", "1e"))) {
- ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??");
- updateLearningRate(*state);
- }
-
bool keepGoing() {
if(saveAndExitRequested()) // via SIGTERM
return false;