diff options
author | Jeroen Vermeulen <jtv@precisiontranslationtools.com> | 2015-05-16 10:58:03 +0300 |
---|---|---|
committer | Jeroen Vermeulen <jtv@precisiontranslationtools.com> | 2015-05-16 10:58:03 +0300 |
commit | 0ffe79579eca183161d86ad38bb34ba8bab3c855 (patch) | |
tree | 426ef93e43acef1ff9ffa1ad5e0c9efeb3142a8d /scripts/training/bilingual-lm/train_nplm.py | |
parent | f1ed14eb33c86611a9d9355caf6439a087d71d03 (diff) |
Fix some python lint.
I used mainly pocketlint, a very good Python linter, but also Syntastic,
a vim plugin. Didn't get anywhere near fixing all of Syntastic's complaints
though.
Once I've cleaned up all (or at least most) of the Python lint, we can
start doing regular automated lint checks and keep the code clean.
Diffstat (limited to 'scripts/training/bilingual-lm/train_nplm.py')
-rwxr-xr-x | scripts/training/bilingual-lm/train_nplm.py | 195 |
1 file changed, 107 insertions, 88 deletions
diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py index 356fd798d..7bc74429e 100755 --- a/scripts/training/bilingual-lm/train_nplm.py +++ b/scripts/training/bilingual-lm/train_nplm.py @@ -8,7 +8,9 @@ import subprocess import sys import os -logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG) +logging.basicConfig( + format='%(asctime)s %(levelname)s: %(message)s', + datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG) parser = argparse.ArgumentParser() parser.add_argument("-w", "--working-dir", dest="working_dir") parser.add_argument("-c", "--corpus", dest="corpus_stem") @@ -18,8 +20,10 @@ parser.add_argument("-n", "--ngram-size", dest="ngram_size", type=int) parser.add_argument("-b", "--minibatch-size", dest="minibatch_size", type=int) parser.add_argument("-s", "--noise", dest="noise", type=int) parser.add_argument("-d", "--hidden", dest="hidden", type=int) -parser.add_argument("-i", "--input-embedding", dest="input_embedding", type=int) -parser.add_argument("-o", "--output-embedding", dest="output_embedding", type=int) +parser.add_argument( + "-i", "--input-embedding", dest="input_embedding", type=int) +parser.add_argument( + "-o", "--output-embedding", dest="output_embedding", type=int) parser.add_argument("-t", "--threads", dest="threads", type=int) parser.add_argument("-m", "--output-model", dest="output_model") parser.add_argument("-r", "--output-dir", dest="output_dir") @@ -35,94 +39,109 @@ parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int) parser.set_defaults( - working_dir = "working" - ,corpus_stem = "train.10k" - ,nplm_home = "/home/bhaddow/tools/nplm" - ,epochs = 10 - ,ngram_size = 14 - ,minibatch_size=1000 - ,noise=100 - ,hidden=750 - ,input_embedding=150 - ,output_embedding=150 - ,threads=1 - ,output_model = "train.10k" - ,output_dir = None - ,config_options_file = "config" - ,log_file = "log" - 
,validation_file = None - ,activation_fn = "rectifier" - ,learning_rate = 1 - ,input_words_file = None - ,output_words_file = None - ,input_vocab_size = 0 - ,output_vocab_size = 0 + working_dir="working", + corpus_stem="train.10k", + nplm_home="/home/bhaddow/tools/nplm", + epochs=10, + ngram_size=14, + minibatch_size=1000, + noise=100, + hidden=750, + input_embedding=150, + output_embedding=150, + threads=1, + output_model="train.10k", + output_dir=None, + config_options_file="config", + log_file="log", + validation_file=None, + activation_fn="rectifier", + learning_rate=1, + input_words_file=None, + output_words_file=None, + input_vocab_size=0, + output_vocab_size=0 ) + def main(options): - vocab_command = [] - if options.input_words_file is not None: - vocab_command += ['--input_words_file', options.input_words_file] - if options.output_words_file is not None: - vocab_command += ['--output_words_file', options.output_words_file] - if options.input_vocab_size: - vocab_command += ['--input_vocab_size', str(options.input_vocab_size)] - if options.output_vocab_size: - vocab_command += ['--output_vocab_size', str(options.output_vocab_size)] - - # Set up validation command variable to use with validation set. 
- validations_command = [] - if options.validation_file is not None: - validations_command =["--validation_file", (options.validation_file + ".numberized")] - - # In order to allow for different models to be trained after the same - # preparation step, we should provide an option for multiple output directories - # If we have not set output_dir, set it to the same thing as the working dir - - if options.output_dir is None: - options.output_dir = options.working_dir - else: - # Create output dir if necessary - if not os.path.exists(options.output_dir): - os.makedirs(options.output_dir) - - config_file = os.path.join(options.output_dir, options.config_options_file + '-' + options.output_model) - log_file = os.path.join(options.output_dir, options.log_file + '-' + options.output_model) - log_file_write = open(log_file, 'w') - config_file_write = open(config_file, 'w') - - config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n') - - in_file = os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + ".numberized") - - model_prefix = os.path.join(options.output_dir, options.output_model + ".model.nplm") - train_args = [options.nplm_home + "/src/trainNeuralNetwork", - "--train_file", in_file, - "--num_epochs", str(options.epochs), - "--model_prefix", model_prefix, - "--learning_rate", str(options.learning_rate), - "--minibatch_size", str(options.minibatch_size), - "--num_noise_samples", str(options.noise), - "--num_hidden", str(options.hidden), - "--input_embedding_dimension", str(options.input_embedding), - "--output_embedding_dimension", str(options.output_embedding), - "--num_threads", str(options.threads), - "--activation_function", options.activation_fn] + validations_command + vocab_command - print("Train model command: ") - print(', '.join(train_args)) - - config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n') - config_file_write.close() - - log_file_write.write("Training output:\n") - ret = subprocess.call(train_args, 
stdout=log_file_write, stderr=log_file_write) - if ret: - raise Exception("Training failed") - - log_file_write.close() + vocab_command = [] + if options.input_words_file is not None: + vocab_command += ['--input_words_file', options.input_words_file] + if options.output_words_file is not None: + vocab_command += ['--output_words_file', options.output_words_file] + if options.input_vocab_size: + vocab_command += ['--input_vocab_size', str(options.input_vocab_size)] + if options.output_vocab_size: + vocab_command += [ + '--output_vocab_size', str(options.output_vocab_size)] + + # Set up validation command variable to use with validation set. + validations_command = [] + if options.validation_file is not None: + validations_command = [ + "--validation_file", (options.validation_file + ".numberized")] + + # In order to allow for different models to be trained after the same + # preparation step, we should provide an option for multiple output + # directories. + # If we have not set output_dir, set it to the same thing as the working + # dir. 
+ + if options.output_dir is None: + options.output_dir = options.working_dir + else: + # Create output dir if necessary + if not os.path.exists(options.output_dir): + os.makedirs(options.output_dir) + + config_file = os.path.join( + options.output_dir, + options.config_options_file + '-' + options.output_model) + log_file = os.path.join( + options.output_dir, options.log_file + '-' + options.output_model) + log_file_write = open(log_file, 'w') + config_file_write = open(config_file, 'w') + + config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n') + + in_file = os.path.join( + options.working_dir, + os.path.basename(options.corpus_stem) + ".numberized") + + model_prefix = os.path.join( + options.output_dir, options.output_model + ".model.nplm") + train_args = [ + options.nplm_home + "/src/trainNeuralNetwork", + "--train_file", in_file, + "--num_epochs", str(options.epochs), + "--model_prefix", model_prefix, + "--learning_rate", str(options.learning_rate), + "--minibatch_size", str(options.minibatch_size), + "--num_noise_samples", str(options.noise), + "--num_hidden", str(options.hidden), + "--input_embedding_dimension", str(options.input_embedding), + "--output_embedding_dimension", str(options.output_embedding), + "--num_threads", str(options.threads), + "--activation_function", + options.activation_fn, + ] + validations_command + vocab_command + print("Train model command: ") + print(', '.join(train_args)) + + config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n') + config_file_write.close() + + log_file_write.write("Training output:\n") + ret = subprocess.call( + train_args, stdout=log_file_write, stderr=log_file_write) + if ret: + raise Exception("Training failed") + + log_file_write.close() -if __name__ == "__main__": - options = parser.parse_args() - main(options) +if __name__ == "__main__": + options = parser.parse_args() + main(options) |