diff options
author | Martin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2020-11-04 20:31:59 +0300 |
---|---|---|
committer | Martin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2020-11-04 20:31:59 +0300 |
commit | f3e4cbf705b35a40ad104b35ef7dfcbf184f6b2a (patch) | |
tree | 60f8ed6304a91031487daac3d894cb51489f700b | |
parent | fabbe203091dec345165dc63528072b86d177d19 (diff) |
Merged PR 16219: Allow to set epoch display width
Allow setting the display width of the fractional part of a logical epoch.
-rw-r--r-- | CHANGELOG.md | 2 | ||||
-rwxr-xr-x | src/common/config_parser.cpp | 7 | ||||
-rwxr-xr-x | src/training/scheduler.h | 75 |
3 files changed, 50 insertions, 34 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 420f0445..279ba35f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added -- Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. +- Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. Also allows to define width of fractional part with second argument. - Add --metrics chrf for computing ChrF according to https://www.aclweb.org/anthology/W15-3049/ and SacreBLEU reference implementation - Add --after option which is meant to replace --after-batches and --after-epochs and can take label based criteria - Add --transformer-postprocess-top option to enable correctly normalized prenorm behavior diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index e67c9b2c..31e9a7f1 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -398,9 +398,10 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add<std::string/*SchedulerPeriod*/>("--save-freq", "Save model file every arg updates (append 't' for every arg target labels)", "10000u"); - cli.add<std::string>("--logical-epoch", - "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt)", - "1e"); + cli.add<std::vector<std::string>>("--logical-epoch", + "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt). 
" + "Second parameter defines width of fractional display, 0 by default.", + {"1e", "0"}); addSuboptionsInputLength(cli); addSuboptionsTSV(cli); diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 14f68510..b9e203d6 100755 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -17,6 +17,7 @@ private: bool first_{true}; // true if this is the first update after renewing the training SchedulingParameter logicalEpoch_; + size_t logicalEpochWidth_{0}; timer::Timer timer_; timer::Timer heartBeatTimer_; @@ -26,29 +27,6 @@ private: // which indicates the end of the training data stream from STDIN bool endOfStdin_{false}; // true at the end of the epoch if training from STDIN; - // Here we calculate the logical epoch as defined by the user, by default this will be just a traditional data epoch. - // We understand a data epoch as a complete pass throught the training data as far as that information is available. - // By contrast, a logical epoch is defined somewhat indepdently of the number of data passes as by the number of seen updates or labels - // or as a multitude of data epochs. 
- float calculateLogicalEpoch() { - if(logicalEpoch_.unit == SchedulingUnit::epochs) - return (float)state_->epochs / (float)logicalEpoch_.n; // logical epoch as multiple of n data epochs - else if(logicalEpoch_.unit == SchedulingUnit::trgLabels) - return (float)state_->labelsTotal / (float)logicalEpoch_.n; // logical epoch as multiple of n labels - else if(logicalEpoch_.unit == SchedulingUnit::updates) - return (float)state_->batches / (float)logicalEpoch_.n; // logical epoch as multiple of n gradient updates (not actually batches @TODO: change name) - else - ABORT("Unknown scheduling unit occurred in logical epoch"); // shouldn't really happen unless we add a new unit in the corresponding enum - } - - // Formatting for logical epochs - std::string formatLogicalEpoch() { - if(logicalEpoch_.unit == SchedulingUnit::epochs && logicalEpoch_.n == 1) - return fmt::format("{}", calculateLogicalEpoch()); // for a data epoch, output is an integer and looks like before this feature was introduced - else - return fmt::format("{:.4f}", calculateLogicalEpoch()); // all other outputs can be fractional, hence floating point format - } - // determine scheduled LR decay factor (--lr-decay-inv-sqrt option) float getScheduledLRDecayFactor(const TrainingState& state) const { auto args = options_->get<std::vector<std::string>>("lr-decay-inv-sqrt"); @@ -134,7 +112,51 @@ private: return ss.str(); } + // Here we calculate the logical epoch as defined by the user, by default this will be just a traditional data epoch. + // We understand a data epoch as a complete pass throught the training data as far as that information is available. + // By contrast, a logical epoch is defined somewhat indepdently of the number of data passes as by the number of seen updates or labels + // or as a multitude of data epochs. 
+ float calculateLogicalEpoch() { + if(logicalEpoch_.unit == SchedulingUnit::epochs) + return (float)state_->epochs / (float)logicalEpoch_.n; // logical epoch as multiple of n data epochs + else if(logicalEpoch_.unit == SchedulingUnit::trgLabels) + return (float)state_->labelsTotal / (float)logicalEpoch_.n; // logical epoch as multiple of n labels + else if(logicalEpoch_.unit == SchedulingUnit::updates) + return (float)state_->batches / (float)logicalEpoch_.n; // logical epoch as multiple of n gradient updates (not actually batches @TODO: change name) + else + ABORT("Unknown scheduling unit occurred in logical epoch"); // shouldn't really happen unless we add a new unit in the corresponding enum + } + + // Formatting for logical epochs + std::string formatLogicalEpoch() { + return fmt::format("{:." + std::to_string(logicalEpochWidth_) + "f}", calculateLogicalEpoch()); + } + public: + Scheduler(Ptr<Options> options, Ptr<TrainingState> state) + : options_(options), state_(state) { + + // parse logical-epoch parameters + auto logicalEpochStr = options->get<std::vector<std::string>>("logical-epoch", {"1e", "0"}); + ABORT_IF(logicalEpochStr.empty(), "Logical epoch information is missing?"); + + logicalEpoch_ = SchedulingParameter::parse(logicalEpochStr[0]); + + // here we deduce the floating point width to be used in formatLogicalEpoch() + if(logicalEpochStr.size() > 1) { // if the width is given, just use that + logicalEpochWidth_ = std::stoul(logicalEpochStr[1]); + } else { // the width is not given so we deduce a suitable display width + if(logicalEpoch_.unit == SchedulingUnit::epochs && logicalEpoch_.n == 1) + logicalEpochWidth_ = 0; // for a data epoch, output is an integer and looks like before this feature was introduced + else + logicalEpochWidth_ = 3; // all other outputs can be fractional, hence floating point format. We choose + // 3 as a default which corresponds to a multiplier of 1000 (3 orders of magnitude). 
+ } + + ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??"); + updateLearningRate(*state); + } + // test if any parameters specify dynamic MB scaling bool isDynamicMBSizeScaling() const { auto mbWarmup = SchedulingParameter::parse(options_->get<std::string>("mini-batch-warmup")); @@ -172,13 +194,6 @@ public: return ratio; } - Scheduler(Ptr<Options> options, Ptr<TrainingState> state) - : options_(options), state_(state), - logicalEpoch_(SchedulingParameter::parse(options->get<std::string>("logical-epoch", "1e"))) { - ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??"); - updateLearningRate(*state); - } - bool keepGoing() { if(saveAndExitRequested()) // via SIGTERM return false; |