
github.com/moses-smt/mosesdecoder.git
author    Jeroen Vermeulen <jtv@precisiontranslationtools.com>  2015-05-16 10:58:03 +0300
committer Jeroen Vermeulen <jtv@precisiontranslationtools.com>  2015-05-16 10:58:03 +0300
commit    0ffe79579eca183161d86ad38bb34ba8bab3c855 (patch)
tree      426ef93e43acef1ff9ffa1ad5e0c9efeb3142a8d /scripts/training/bilingual-lm/train_nplm.py
parent    f1ed14eb33c86611a9d9355caf6439a087d71d03 (diff)
Fix some Python lint.
I mainly used pocketlint, a very good Python linter, but also Syntastic, a vim plugin. I didn't get anywhere near fixing all of Syntastic's complaints, though. Once I've cleaned up all (or at least most) of the Python lint, we can start doing regular automated lint checks and keep the code clean.
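
As a rough illustration of what such an automated lint check could look like (not part of this commit), here is a minimal Python sketch that walks the scripts/ tree and lints every .py file. flake8 is used purely as an example tool and is assumed to be installed; the commit itself used pocketlint and Syntastic, and the exact tool and invocation for the automated check were left open.

    import os
    import subprocess
    import sys

    def lint_tree(root="scripts"):
        """Run flake8 on every .py file under `root`; return the failure count."""
        failures = 0
        for dirpath, _dirnames, filenames in os.walk(root):
            for name in filenames:
                if name.endswith(".py"):
                    path = os.path.join(dirpath, name)
                    # flake8 exits with a non-zero status when it reports problems.
                    if subprocess.call(["flake8", path]) != 0:
                        failures += 1
        return failures

    if __name__ == "__main__":
        sys.exit(1 if lint_tree() else 0)

Such a script could be run from a cron job or a continuous-integration hook once the existing lint has been cleaned up.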
Diffstat (limited to 'scripts/training/bilingual-lm/train_nplm.py')
-rwxr-xr-x  scripts/training/bilingual-lm/train_nplm.py | 195
1 file changed, 107 insertions(+), 88 deletions(-)
diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py
index 356fd798d..7bc74429e 100755
--- a/scripts/training/bilingual-lm/train_nplm.py
+++ b/scripts/training/bilingual-lm/train_nplm.py
@@ -8,7 +8,9 @@ import subprocess
import sys
import os
-logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+logging.basicConfig(
+ format='%(asctime)s %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-c", "--corpus", dest="corpus_stem")
@@ -18,8 +20,10 @@ parser.add_argument("-n", "--ngram-size", dest="ngram_size", type=int)
parser.add_argument("-b", "--minibatch-size", dest="minibatch_size", type=int)
parser.add_argument("-s", "--noise", dest="noise", type=int)
parser.add_argument("-d", "--hidden", dest="hidden", type=int)
-parser.add_argument("-i", "--input-embedding", dest="input_embedding", type=int)
-parser.add_argument("-o", "--output-embedding", dest="output_embedding", type=int)
+parser.add_argument(
+ "-i", "--input-embedding", dest="input_embedding", type=int)
+parser.add_argument(
+ "-o", "--output-embedding", dest="output_embedding", type=int)
parser.add_argument("-t", "--threads", dest="threads", type=int)
parser.add_argument("-m", "--output-model", dest="output_model")
parser.add_argument("-r", "--output-dir", dest="output_dir")
@@ -35,94 +39,109 @@ parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int)
parser.set_defaults(
- working_dir = "working"
- ,corpus_stem = "train.10k"
- ,nplm_home = "/home/bhaddow/tools/nplm"
- ,epochs = 10
- ,ngram_size = 14
- ,minibatch_size=1000
- ,noise=100
- ,hidden=750
- ,input_embedding=150
- ,output_embedding=150
- ,threads=1
- ,output_model = "train.10k"
- ,output_dir = None
- ,config_options_file = "config"
- ,log_file = "log"
- ,validation_file = None
- ,activation_fn = "rectifier"
- ,learning_rate = 1
- ,input_words_file = None
- ,output_words_file = None
- ,input_vocab_size = 0
- ,output_vocab_size = 0
+ working_dir="working",
+ corpus_stem="train.10k",
+ nplm_home="/home/bhaddow/tools/nplm",
+ epochs=10,
+ ngram_size=14,
+ minibatch_size=1000,
+ noise=100,
+ hidden=750,
+ input_embedding=150,
+ output_embedding=150,
+ threads=1,
+ output_model="train.10k",
+ output_dir=None,
+ config_options_file="config",
+ log_file="log",
+ validation_file=None,
+ activation_fn="rectifier",
+ learning_rate=1,
+ input_words_file=None,
+ output_words_file=None,
+ input_vocab_size=0,
+ output_vocab_size=0
)
+
def main(options):
- vocab_command = []
- if options.input_words_file is not None:
- vocab_command += ['--input_words_file', options.input_words_file]
- if options.output_words_file is not None:
- vocab_command += ['--output_words_file', options.output_words_file]
- if options.input_vocab_size:
- vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
- if options.output_vocab_size:
- vocab_command += ['--output_vocab_size', str(options.output_vocab_size)]
-
- # Set up validation command variable to use with validation set.
- validations_command = []
- if options.validation_file is not None:
- validations_command =["--validation_file", (options.validation_file + ".numberized")]
-
- # In order to allow for different models to be trained after the same
- # preparation step, we should provide an option for multiple output directories
- # If we have not set output_dir, set it to the same thing as the working dir
-
- if options.output_dir is None:
- options.output_dir = options.working_dir
- else:
- # Create output dir if necessary
- if not os.path.exists(options.output_dir):
- os.makedirs(options.output_dir)
-
- config_file = os.path.join(options.output_dir, options.config_options_file + '-' + options.output_model)
- log_file = os.path.join(options.output_dir, options.log_file + '-' + options.output_model)
- log_file_write = open(log_file, 'w')
- config_file_write = open(config_file, 'w')
-
- config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
-
- in_file = os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + ".numberized")
-
- model_prefix = os.path.join(options.output_dir, options.output_model + ".model.nplm")
- train_args = [options.nplm_home + "/src/trainNeuralNetwork",
- "--train_file", in_file,
- "--num_epochs", str(options.epochs),
- "--model_prefix", model_prefix,
- "--learning_rate", str(options.learning_rate),
- "--minibatch_size", str(options.minibatch_size),
- "--num_noise_samples", str(options.noise),
- "--num_hidden", str(options.hidden),
- "--input_embedding_dimension", str(options.input_embedding),
- "--output_embedding_dimension", str(options.output_embedding),
- "--num_threads", str(options.threads),
- "--activation_function", options.activation_fn] + validations_command + vocab_command
- print("Train model command: ")
- print(', '.join(train_args))
-
- config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
- config_file_write.close()
-
- log_file_write.write("Training output:\n")
- ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
- if ret:
- raise Exception("Training failed")
-
- log_file_write.close()
+ vocab_command = []
+ if options.input_words_file is not None:
+ vocab_command += ['--input_words_file', options.input_words_file]
+ if options.output_words_file is not None:
+ vocab_command += ['--output_words_file', options.output_words_file]
+ if options.input_vocab_size:
+ vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
+ if options.output_vocab_size:
+ vocab_command += [
+ '--output_vocab_size', str(options.output_vocab_size)]
+
+ # Set up validation command variable to use with validation set.
+ validations_command = []
+ if options.validation_file is not None:
+ validations_command = [
+ "--validation_file", (options.validation_file + ".numberized")]
+
+ # In order to allow for different models to be trained after the same
+ # preparation step, we should provide an option for multiple output
+ # directories.
+ # If we have not set output_dir, set it to the same thing as the working
+ # dir.
+
+ if options.output_dir is None:
+ options.output_dir = options.working_dir
+ else:
+ # Create output dir if necessary
+ if not os.path.exists(options.output_dir):
+ os.makedirs(options.output_dir)
+
+ config_file = os.path.join(
+ options.output_dir,
+ options.config_options_file + '-' + options.output_model)
+ log_file = os.path.join(
+ options.output_dir, options.log_file + '-' + options.output_model)
+ log_file_write = open(log_file, 'w')
+ config_file_write = open(config_file, 'w')
+
+ config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
+
+ in_file = os.path.join(
+ options.working_dir,
+ os.path.basename(options.corpus_stem) + ".numberized")
+
+ model_prefix = os.path.join(
+ options.output_dir, options.output_model + ".model.nplm")
+ train_args = [
+ options.nplm_home + "/src/trainNeuralNetwork",
+ "--train_file", in_file,
+ "--num_epochs", str(options.epochs),
+ "--model_prefix", model_prefix,
+ "--learning_rate", str(options.learning_rate),
+ "--minibatch_size", str(options.minibatch_size),
+ "--num_noise_samples", str(options.noise),
+ "--num_hidden", str(options.hidden),
+ "--input_embedding_dimension", str(options.input_embedding),
+ "--output_embedding_dimension", str(options.output_embedding),
+ "--num_threads", str(options.threads),
+ "--activation_function",
+ options.activation_fn,
+ ] + validations_command + vocab_command
+ print("Train model command: ")
+ print(', '.join(train_args))
+
+ config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
+ config_file_write.close()
+
+ log_file_write.write("Training output:\n")
+ ret = subprocess.call(
+ train_args, stdout=log_file_write, stderr=log_file_write)
+ if ret:
+ raise Exception("Training failed")
+
+ log_file_write.close()
-if __name__ == "__main__":
- options = parser.parse_args()
- main(options)
+if __name__ == "__main__":
+ options = parser.parse_args()
+ main(options)
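
For reference, a minimal usage sketch (not part of the commit) of how the reformatted script might be driven from another Python wrapper. The working and output directories below are hypothetical, only options visible in the diff above are passed, and the direct "./train_nplm.py" invocation assumes the script's executable mode (100755 per the diffstat) and shebang.

    import subprocess

    # Hypothetical invocation of the refactored training script.
    ret = subprocess.call([
        "./train_nplm.py",
        "--working-dir", "work.blm",    # hypothetical working directory
        "--corpus", "train.10k",        # matches the script's default corpus stem
        "--threads", "4",
        "--output-model", "train.10k",
        "--output-dir", "models",       # hypothetical output directory
    ])
    if ret:
        raise Exception("Training failed")

Keeping the `if __name__ == "__main__":` guard, as the diff does, also leaves main(options) importable from other driver scripts instead of going through a subprocess.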