diff options
author | Martin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2020-11-04 20:31:59 +0300 |
---|---|---|
committer | Martin Junczys-Dowmunt <Marcin.JunczysDowmunt@microsoft.com> | 2020-11-04 20:31:59 +0300 |
commit | f3e4cbf705b35a40ad104b35ef7dfcbf184f6b2a (patch) | |
tree | 60f8ed6304a91031487daac3d894cb51489f700b | |
parent | fabbe203091dec345165dc63528072b86d177d19 (diff) |
Merged PR 16219: Allow to set epoch display width
Allow setting the display width of the fractional part of a logical epoch.
-rw-r--r-- | CHANGELOG.md | 2 | ||||
-rwxr-xr-x | src/common/config_parser.cpp | 7 | ||||
-rwxr-xr-x | src/training/scheduler.h | 75 |
3 files changed, 50 insertions, 34 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index 420f0445..279ba35f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added -- Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. +- Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. Also allows to define width of fractional part with second argument. - Add --metrics chrf for computing ChrF according to https://www.aclweb.org/anthology/W15-3049/ and SacreBLEU reference implementation - Add --after option which is meant to replace --after-batches and --after-epochs and can take label based criteria - Add --transformer-postprocess-top option to enable correctly normalized prenorm behavior diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index e67c9b2c..31e9a7f1 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -398,9 +398,10 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { cli.add<std::string/*SchedulerPeriod*/>("--save-freq", "Save model file every arg updates (append 't' for every arg target labels)", "10000u"); - cli.add<std::string>("--logical-epoch", - "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt)", - "1e"); + cli.add<std::vector<std::string>>("--logical-epoch", + "Redefine logical epoch counter as multiple of data epochs (e.g. 1e), updates (e.g. 100Ku) or labels (e.g. 1Gt). 
" + "Second parameter defines width of fractional display, 0 by default.", + {"1e", "0"}); addSuboptionsInputLength(cli); addSuboptionsTSV(cli); diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 14f68510..b9e203d6 100755 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -17,6 +17,7 @@ private: bool first_{true}; // true if this is the first update after renewing the training SchedulingParameter logicalEpoch_; + size_t logicalEpochWidth_{0}; timer::Timer timer_; timer::Timer heartBeatTimer_; @@ -26,29 +27,6 @@ private: // which indicates the end of the training data stream from STDIN bool endOfStdin_{false}; // true at the end of the epoch if training from STDIN; - // Here we calculate the logical epoch as defined by the user, by default this will be just a traditional data epoch. - // We understand a data epoch as a complete pass throught the training data as far as that information is available. - // By contrast, a logical epoch is defined somewhat indepdently of the number of data passes as by the number of seen updates or labels - // or as a multitude of data epochs. 
- float calculateLogicalEpoch() { - if(logicalEpoch_.unit == SchedulingUnit::epochs) - return (float)state_->epochs / (float)logicalEpoch_.n; // logical epoch as multiple of n data epochs - else if(logicalEpoch_.unit == SchedulingUnit::trgLabels) - return (float)state_->labelsTotal / (float)logicalEpoch_.n; // logical epoch as multiple of n labels - else if(logicalEpoch_.unit == SchedulingUnit::updates) - return (float)state_->batches / (float)logicalEpoch_.n; // logical epoch as multiple of n gradient updates (not actually batches @TODO: change name) - else - ABORT("Unknown scheduling unit occurred in logical epoch"); // shouldn't really happen unless we add a new unit in the corresponding enum - } - - // Formatting for logical epochs - std::string formatLogicalEpoch() { - if(logicalEpoch_.unit == SchedulingUnit::epochs && logicalEpoch_.n == 1) - return fmt::format("{}", calculateLogicalEpoch()); // for a data epoch, output is an integer and looks like before this feature was introduced - else - return fmt::format("{:.4f}", calculateLogicalEpoch()); // all other outputs can be fractional, hence floating point format - } - // determine scheduled LR decay factor (--lr-decay-inv-sqrt option) float getScheduledLRDecayFactor(const TrainingState& state) const { auto args = options_->get<std::vector<std::string>>("lr-decay-inv-sqrt"); @@ -134,7 +112,51 @@ private: return ss.str(); } + // Here we calculate the logical epoch as defined by the user, by default this will be just a traditional data epoch. + // We understand a data epoch as a complete pass throught the training data as far as that information is available. + // By contrast, a logical epoch is defined somewhat indepdently of the number of data passes as by the number of seen updates or labels + // or as a multitude of data epochs. 
+ float calculateLogicalEpoch() { + if(logicalEpoch_.unit == SchedulingUnit::epochs) + return (float)state_->epochs / (float)logicalEpoch_.n; // logical epoch as multiple of n data epochs + else if(logicalEpoch_.unit == SchedulingUnit::trgLabels) + return (float)state_->labelsTotal / (float)logicalEpoch_.n; // logical epoch as multiple of n labels + else if(logicalEpoch_.unit == SchedulingUnit::updates) + return (float)state_->batches / (float)logicalEpoch_.n; // logical epoch as multiple of n gradient updates (not actually batches @TODO: change name) + else + ABORT("Unknown scheduling unit occurred in logical epoch"); // shouldn't really happen unless we add a new unit in the corresponding enum + } + + // Formatting for logical epochs + std::string formatLogicalEpoch() { + return fmt::format("{:." + std::to_string(logicalEpochWidth_) + "f}", calculateLogicalEpoch()); + } + public: + Scheduler(Ptr<Options> options, Ptr<TrainingState> state) + : options_(options), state_(state) { + + // parse logical-epoch parameters + auto logicalEpochStr = options->get<std::vector<std::string>>("logical-epoch", {"1e", "0"}); + ABORT_IF(logicalEpochStr.empty(), "Logical epoch information is missing?"); + + logicalEpoch_ = SchedulingParameter::parse(logicalEpochStr[0]); + + // here we deduce the floating point width to be used in formatLogicalEpoch() + if(logicalEpochStr.size() > 1) { // if the width is given, just use that + logicalEpochWidth_ = std::stoul(logicalEpochStr[1]); + } else { // the width is not given so we deduce a suitable display width + if(logicalEpoch_.unit == SchedulingUnit::epochs && logicalEpoch_.n == 1) + logicalEpochWidth_ = 0; // for a data epoch, output is an integer and looks like before this feature was introduced + else + logicalEpochWidth_ = 3; // all other outputs can be fractional, hence floating point format. We choose + // 3 as a default which corresponds to a multiplier of 1000 (3 orders of magnitude). 
+ } + + ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??"); + updateLearningRate(*state); + } + // test if any parameters specify dynamic MB scaling bool isDynamicMBSizeScaling() const { auto mbWarmup = SchedulingParameter::parse(options_->get<std::string>("mini-batch-warmup")); @@ -172,13 +194,6 @@ public: return ratio; } - Scheduler(Ptr<Options> options, Ptr<TrainingState> state) - : options_(options), state_(state), - logicalEpoch_(SchedulingParameter::parse(options->get<std::string>("logical-epoch", "1e"))) { - ABORT_IF(state_->factor != 1, "state.factor unexpectedly not 1 at this point??"); - updateLearningRate(*state); - } - bool keepGoing() { if(saveAndExitRequested()) // via SIGTERM return false; |