
github.com/moses-smt/mosesdecoder.git
Diffstat (limited to 'scripts/training/bilingual-lm/train_nplm.py')
-rwxr-xr-x  scripts/training/bilingual-lm/train_nplm.py | 195
1 file changed, 107 insertions(+), 88 deletions(-)
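From the hunks below, this looks like a pure PEP 8 cleanup of the bilingual-LM training wrapper: long calls are wrapped to fit the line limit, the leading-comma style in parser.set_defaults is rewritten with conventional trailing commas, and the body of main() is re-indented from two spaces to four. No behavioral change appears intended.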
diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py
index 356fd798d..7bc74429e 100755
--- a/scripts/training/bilingual-lm/train_nplm.py
+++ b/scripts/training/bilingual-lm/train_nplm.py
@@ -8,7 +8,9 @@ import subprocess
import sys
import os
-logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+logging.basicConfig(
+ format='%(asctime)s %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-c", "--corpus", dest="corpus_stem")
@@ -18,8 +20,10 @@ parser.add_argument("-n", "--ngram-size", dest="ngram_size", type=int)
parser.add_argument("-b", "--minibatch-size", dest="minibatch_size", type=int)
parser.add_argument("-s", "--noise", dest="noise", type=int)
parser.add_argument("-d", "--hidden", dest="hidden", type=int)
-parser.add_argument("-i", "--input-embedding", dest="input_embedding", type=int)
-parser.add_argument("-o", "--output-embedding", dest="output_embedding", type=int)
+parser.add_argument(
+ "-i", "--input-embedding", dest="input_embedding", type=int)
+parser.add_argument(
+ "-o", "--output-embedding", dest="output_embedding", type=int)
parser.add_argument("-t", "--threads", dest="threads", type=int)
parser.add_argument("-m", "--output-model", dest="output_model")
parser.add_argument("-r", "--output-dir", dest="output_dir")
@@ -35,94 +39,109 @@ parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int)
parser.set_defaults(
- working_dir = "working"
- ,corpus_stem = "train.10k"
- ,nplm_home = "/home/bhaddow/tools/nplm"
- ,epochs = 10
- ,ngram_size = 14
- ,minibatch_size=1000
- ,noise=100
- ,hidden=750
- ,input_embedding=150
- ,output_embedding=150
- ,threads=1
- ,output_model = "train.10k"
- ,output_dir = None
- ,config_options_file = "config"
- ,log_file = "log"
- ,validation_file = None
- ,activation_fn = "rectifier"
- ,learning_rate = 1
- ,input_words_file = None
- ,output_words_file = None
- ,input_vocab_size = 0
- ,output_vocab_size = 0
+ working_dir="working",
+ corpus_stem="train.10k",
+ nplm_home="/home/bhaddow/tools/nplm",
+ epochs=10,
+ ngram_size=14,
+ minibatch_size=1000,
+ noise=100,
+ hidden=750,
+ input_embedding=150,
+ output_embedding=150,
+ threads=1,
+ output_model="train.10k",
+ output_dir=None,
+ config_options_file="config",
+ log_file="log",
+ validation_file=None,
+ activation_fn="rectifier",
+ learning_rate=1,
+ input_words_file=None,
+ output_words_file=None,
+ input_vocab_size=0,
+ output_vocab_size=0
)
+
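The set_defaults rewrite is likewise cosmetic: leading and trailing commas build the same keyword arguments. A small sketch (using a subset of the keys above) confirming the defaults come through parse_args unchanged:

    import argparse

    parser = argparse.ArgumentParser()
    parser.set_defaults(
        working_dir="working",
        corpus_stem="train.10k",
        ngram_size=14,
    )
    opts = parser.parse_args([])
    assert opts.working_dir == "working" and opts.ngram_size == 14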
def main(options):
- vocab_command = []
- if options.input_words_file is not None:
- vocab_command += ['--input_words_file', options.input_words_file]
- if options.output_words_file is not None:
- vocab_command += ['--output_words_file', options.output_words_file]
- if options.input_vocab_size:
- vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
- if options.output_vocab_size:
- vocab_command += ['--output_vocab_size', str(options.output_vocab_size)]
-
- # Set up validation command variable to use with validation set.
- validations_command = []
- if options.validation_file is not None:
- validations_command =["--validation_file", (options.validation_file + ".numberized")]
-
- # In order to allow for different models to be trained after the same
- # preparation step, we should provide an option for multiple output directories
- # If we have not set output_dir, set it to the same thing as the working dir
-
- if options.output_dir is None:
- options.output_dir = options.working_dir
- else:
- # Create output dir if necessary
- if not os.path.exists(options.output_dir):
- os.makedirs(options.output_dir)
-
- config_file = os.path.join(options.output_dir, options.config_options_file + '-' + options.output_model)
- log_file = os.path.join(options.output_dir, options.log_file + '-' + options.output_model)
- log_file_write = open(log_file, 'w')
- config_file_write = open(config_file, 'w')
-
- config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
-
- in_file = os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + ".numberized")
-
- model_prefix = os.path.join(options.output_dir, options.output_model + ".model.nplm")
- train_args = [options.nplm_home + "/src/trainNeuralNetwork",
- "--train_file", in_file,
- "--num_epochs", str(options.epochs),
- "--model_prefix", model_prefix,
- "--learning_rate", str(options.learning_rate),
- "--minibatch_size", str(options.minibatch_size),
- "--num_noise_samples", str(options.noise),
- "--num_hidden", str(options.hidden),
- "--input_embedding_dimension", str(options.input_embedding),
- "--output_embedding_dimension", str(options.output_embedding),
- "--num_threads", str(options.threads),
- "--activation_function", options.activation_fn] + validations_command + vocab_command
- print("Train model command: ")
- print(', '.join(train_args))
-
- config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
- config_file_write.close()
-
- log_file_write.write("Training output:\n")
- ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
- if ret:
- raise Exception("Training failed")
-
- log_file_write.close()
+ vocab_command = []
+ if options.input_words_file is not None:
+ vocab_command += ['--input_words_file', options.input_words_file]
+ if options.output_words_file is not None:
+ vocab_command += ['--output_words_file', options.output_words_file]
+ if options.input_vocab_size:
+ vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
+ if options.output_vocab_size:
+ vocab_command += [
+ '--output_vocab_size', str(options.output_vocab_size)]
+
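The pattern in the block above is plain flag accumulation: each optional setting appends a flag/value pair to the trainer's command line only when the user supplied it. A self-contained sketch with a stand-in options object (SimpleNamespace and the values are illustrative, not part of the script):

    from types import SimpleNamespace

    options = SimpleNamespace(
        input_words_file="vocab.src", output_words_file=None,
        input_vocab_size=16000, output_vocab_size=0)

    vocab_command = []
    if options.input_words_file is not None:
        vocab_command += ['--input_words_file', options.input_words_file]
    if options.input_vocab_size:
        vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]

    # -> ['--input_words_file', 'vocab.src', '--input_vocab_size', '16000']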
+ # Set up validation command variable to use with validation set.
+ validations_command = []
+ if options.validation_file is not None:
+ validations_command = [
+ "--validation_file", (options.validation_file + ".numberized")]
+
+ # In order to allow for different models to be trained after the same
+ # preparation step, we should provide an option for multiple output
+ # directories.
+ # If we have not set output_dir, set it to the same thing as the working
+ # dir.
+
+ if options.output_dir is None:
+ options.output_dir = options.working_dir
+ else:
+ # Create output dir if necessary
+ if not os.path.exists(options.output_dir):
+ os.makedirs(options.output_dir)
+
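One note on this block: the exists()-then-makedirs() sequence can race if two training jobs start against the same output directory at once. On Python 3.2+ the same intent can be expressed atomically (a sketch, not part of this commit):

    import os

    working_dir, output_dir = "working", None  # illustrative values
    if output_dir is None:
        output_dir = working_dir
    else:
        # exist_ok=True makes this a no-op if the directory exists.
        os.makedirs(output_dir, exist_ok=True)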
+ config_file = os.path.join(
+ options.output_dir,
+ options.config_options_file + '-' + options.output_model)
+ log_file = os.path.join(
+ options.output_dir, options.log_file + '-' + options.output_model)
+ log_file_write = open(log_file, 'w')
+ config_file_write = open(config_file, 'w')
+
+ config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
+
+ in_file = os.path.join(
+ options.working_dir,
+ os.path.basename(options.corpus_stem) + ".numberized")
+
+ model_prefix = os.path.join(
+ options.output_dir, options.output_model + ".model.nplm")
+ train_args = [
+ options.nplm_home + "/src/trainNeuralNetwork",
+ "--train_file", in_file,
+ "--num_epochs", str(options.epochs),
+ "--model_prefix", model_prefix,
+ "--learning_rate", str(options.learning_rate),
+ "--minibatch_size", str(options.minibatch_size),
+ "--num_noise_samples", str(options.noise),
+ "--num_hidden", str(options.hidden),
+ "--input_embedding_dimension", str(options.input_embedding),
+ "--output_embedding_dimension", str(options.output_embedding),
+ "--num_threads", str(options.threads),
+ "--activation_function",
+ options.activation_fn,
+ ] + validations_command + vocab_command
+ print("Train model command: ")
+ print(', '.join(train_args))
+
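A small nit that survives the reformat: the command is echoed with ', '.join(train_args), so the console shows comma-separated tokens rather than a shell-ready line; ' '.join(train_args), as already used when writing the config file just below, would print the literal command.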
+ config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
+ config_file_write.close()
+
+ log_file_write.write("Training output:\n")
+ ret = subprocess.call(
+ train_args, stdout=log_file_write, stderr=log_file_write)
+ if ret:
+ raise Exception("Training failed")
+
+ log_file_write.close()
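The manual return-code check above works; for what it's worth, subprocess.check_call expresses the same thing and raises CalledProcessError on failure. A self-contained sketch (the echo command stands in for the real trainNeuralNetwork binary):

    import subprocess

    train_args = ["echo", "training"]  # stand-in for the real command
    with open("log-train", "w") as log:
        log.write("Training output:\n")
        # Raises subprocess.CalledProcessError on a non-zero exit status.
        subprocess.check_call(train_args, stdout=log, stderr=log)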
-if __name__ == "__main__":
- options = parser.parse_args()
- main(options)
+if __name__ == "__main__":
+ options = parser.parse_args()
+ main(options)
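For completeness, a plausible invocation of the reformatted script, using only flags visible in the hunks above (paths and values are illustrative):

    python train_nplm.py \
        --working-dir working \
        --corpus train.10k \
        --threads 4 \
        --output-dir models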