
github.com/moses-smt/mosesdecoder.git
Diffstat (limited to 'scripts/training/bilingual-lm/train_nplm.py')
-rwxr-xr-x  scripts/training/bilingual-lm/train_nplm.py | 195
1 file changed, 107 insertions(+), 88 deletions(-)
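From the hunks below, this looks like a pure PEP 8 cleanup of the bilingual-LM training wrapper: long calls are wrapped to fit the line limit, the leading-comma style in parser.set_defaults is rewritten with conventional trailing commas, and the body of main() is re-indented from two spaces to four. No behavioral change appears intended.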
diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py
index 356fd798d..7bc74429e 100755
--- a/scripts/training/bilingual-lm/train_nplm.py
+++ b/scripts/training/bilingual-lm/train_nplm.py
@@ -8,7 +8,9 @@ import subprocess
import sys
import os
-logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+logging.basicConfig(
+ format='%(asctime)s %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = argparse.ArgumentParser()
parser.add_argument("-w", "--working-dir", dest="working_dir")
parser.add_argument("-c", "--corpus", dest="corpus_stem")
@@ -18,8 +20,10 @@ parser.add_argument("-n", "--ngram-size", dest="ngram_size", type=int)
parser.add_argument("-b", "--minibatch-size", dest="minibatch_size", type=int)
parser.add_argument("-s", "--noise", dest="noise", type=int)
parser.add_argument("-d", "--hidden", dest="hidden", type=int)
-parser.add_argument("-i", "--input-embedding", dest="input_embedding", type=int)
-parser.add_argument("-o", "--output-embedding", dest="output_embedding", type=int)
+parser.add_argument(
+ "-i", "--input-embedding", dest="input_embedding", type=int)
+parser.add_argument(
+ "-o", "--output-embedding", dest="output_embedding", type=int)
parser.add_argument("-t", "--threads", dest="threads", type=int)
parser.add_argument("-m", "--output-model", dest="output_model")
parser.add_argument("-r", "--output-dir", dest="output_dir")
@@ -35,94 +39,109 @@ parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int)
parser.set_defaults(
- working_dir = "working"
- ,corpus_stem = "train.10k"
- ,nplm_home = "/home/bhaddow/tools/nplm"
- ,epochs = 10
- ,ngram_size = 14
- ,minibatch_size=1000
- ,noise=100
- ,hidden=750
- ,input_embedding=150
- ,output_embedding=150
- ,threads=1
- ,output_model = "train.10k"
- ,output_dir = None
- ,config_options_file = "config"
- ,log_file = "log"
- ,validation_file = None
- ,activation_fn = "rectifier"
- ,learning_rate = 1
- ,input_words_file = None
- ,output_words_file = None
- ,input_vocab_size = 0
- ,output_vocab_size = 0
+ working_dir="working",
+ corpus_stem="train.10k",
+ nplm_home="/home/bhaddow/tools/nplm",
+ epochs=10,
+ ngram_size=14,
+ minibatch_size=1000,
+ noise=100,
+ hidden=750,
+ input_embedding=150,
+ output_embedding=150,
+ threads=1,
+ output_model="train.10k",
+ output_dir=None,
+ config_options_file="config",
+ log_file="log",
+ validation_file=None,
+ activation_fn="rectifier",
+ learning_rate=1,
+ input_words_file=None,
+ output_words_file=None,
+ input_vocab_size=0,
+ output_vocab_size=0
)
+
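The set_defaults rewrite is likewise cosmetic: leading and trailing commas build the same keyword arguments. A small sketch (using a subset of the keys above) confirming the defaults come through parse_args unchanged:

    import argparse

    parser = argparse.ArgumentParser()
    parser.set_defaults(
        working_dir="working",
        corpus_stem="train.10k",
        ngram_size=14,
    )
    opts = parser.parse_args([])
    assert opts.working_dir == "working" and opts.ngram_size == 14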
def main(options):
- vocab_command = []
- if options.input_words_file is not None:
- vocab_command += ['--input_words_file', options.input_words_file]
- if options.output_words_file is not None:
- vocab_command += ['--output_words_file', options.output_words_file]
- if options.input_vocab_size:
- vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
- if options.output_vocab_size:
- vocab_command += ['--output_vocab_size', str(options.output_vocab_size)]
-
- # Set up validation command variable to use with validation set.
- validations_command = []
- if options.validation_file is not None:
- validations_command =["--validation_file", (options.validation_file + ".numberized")]
-
- # In order to allow for different models to be trained after the same
- # preparation step, we should provide an option for multiple output directories
- # If we have not set output_dir, set it to the same thing as the working dir
-
- if options.output_dir is None:
- options.output_dir = options.working_dir
- else:
- # Create output dir if necessary
- if not os.path.exists(options.output_dir):
- os.makedirs(options.output_dir)
-
- config_file = os.path.join(options.output_dir, options.config_options_file + '-' + options.output_model)
- log_file = os.path.join(options.output_dir, options.log_file + '-' + options.output_model)
- log_file_write = open(log_file, 'w')
- config_file_write = open(config_file, 'w')
-
- config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
-
- in_file = os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + ".numberized")
-
- model_prefix = os.path.join(options.output_dir, options.output_model + ".model.nplm")
- train_args = [options.nplm_home + "/src/trainNeuralNetwork",
- "--train_file", in_file,
- "--num_epochs", str(options.epochs),
- "--model_prefix", model_prefix,
- "--learning_rate", str(options.learning_rate),
- "--minibatch_size", str(options.minibatch_size),
- "--num_noise_samples", str(options.noise),
- "--num_hidden", str(options.hidden),
- "--input_embedding_dimension", str(options.input_embedding),
- "--output_embedding_dimension", str(options.output_embedding),
- "--num_threads", str(options.threads),
- "--activation_function", options.activation_fn] + validations_command + vocab_command
- print("Train model command: ")
- print(', '.join(train_args))
-
- config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
- config_file_write.close()
-
- log_file_write.write("Training output:\n")
- ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
- if ret:
- raise Exception("Training failed")
-
- log_file_write.close()
+ vocab_command = []
+ if options.input_words_file is not None:
+ vocab_command += ['--input_words_file', options.input_words_file]
+ if options.output_words_file is not None:
+ vocab_command += ['--output_words_file', options.output_words_file]
+ if options.input_vocab_size:
+ vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
+ if options.output_vocab_size:
+ vocab_command += [
+ '--output_vocab_size', str(options.output_vocab_size)]
+
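The pattern in the block above is plain flag accumulation: each optional setting appends a flag/value pair to the trainer's command line only when the user supplied it. A self-contained sketch with a stand-in options object (SimpleNamespace and the values are illustrative, not part of the script):

    from types import SimpleNamespace

    options = SimpleNamespace(
        input_words_file="vocab.src", output_words_file=None,
        input_vocab_size=16000, output_vocab_size=0)

    vocab_command = []
    if options.input_words_file is not None:
        vocab_command += ['--input_words_file', options.input_words_file]
    if options.input_vocab_size:
        vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]

    # -> ['--input_words_file', 'vocab.src', '--input_vocab_size', '16000']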
+ # Set up validation command variable to use with validation set.
+ validations_command = []
+ if options.validation_file is not None:
+ validations_command = [
+ "--validation_file", (options.validation_file + ".numberized")]
+
+ # In order to allow for different models to be trained after the same
+ # preparation step, we should provide an option for multiple output
+ # directories.
+ # If we have not set output_dir, set it to the same thing as the working
+ # dir.
+
+ if options.output_dir is None:
+ options.output_dir = options.working_dir
+ else:
+ # Create output dir if necessary
+ if not os.path.exists(options.output_dir):
+ os.makedirs(options.output_dir)
+
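One note on this block: the exists()-then-makedirs() sequence can race if two training jobs start against the same output directory at once. On Python 3.2+ the same intent can be expressed atomically (a sketch, not part of this commit):

    import os

    working_dir, output_dir = "working", None  # illustrative values
    if output_dir is None:
        output_dir = working_dir
    else:
        # exist_ok=True makes this a no-op if the directory exists.
        os.makedirs(output_dir, exist_ok=True)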
+ config_file = os.path.join(
+ options.output_dir,
+ options.config_options_file + '-' + options.output_model)
+ log_file = os.path.join(
+ options.output_dir, options.log_file + '-' + options.output_model)
+ log_file_write = open(log_file, 'w')
+ config_file_write = open(config_file, 'w')
+
+ config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
+
+ in_file = os.path.join(
+ options.working_dir,
+ os.path.basename(options.corpus_stem) + ".numberized")
+
+ model_prefix = os.path.join(
+ options.output_dir, options.output_model + ".model.nplm")
+ train_args = [
+ options.nplm_home + "/src/trainNeuralNetwork",
+ "--train_file", in_file,
+ "--num_epochs", str(options.epochs),
+ "--model_prefix", model_prefix,
+ "--learning_rate", str(options.learning_rate),
+ "--minibatch_size", str(options.minibatch_size),
+ "--num_noise_samples", str(options.noise),
+ "--num_hidden", str(options.hidden),
+ "--input_embedding_dimension", str(options.input_embedding),
+ "--output_embedding_dimension", str(options.output_embedding),
+ "--num_threads", str(options.threads),
+ "--activation_function",
+ options.activation_fn,
+ ] + validations_command + vocab_command
+ print("Train model command: ")
+ print(', '.join(train_args))
+
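A small nit that survives the reformat: the command is echoed with ', '.join(train_args), so the console shows comma-separated tokens rather than a shell-ready line; ' '.join(train_args), as already used when writing the config file just below, would print the literal command.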
+ config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
+ config_file_write.close()
+
+ log_file_write.write("Training output:\n")
+ ret = subprocess.call(
+ train_args, stdout=log_file_write, stderr=log_file_write)
+ if ret:
+ raise Exception("Training failed")
+
+ log_file_write.close()
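The manual return-code check above works; for what it's worth, subprocess.check_call expresses the same thing and raises CalledProcessError on failure. A self-contained sketch (the echo command stands in for the real trainNeuralNetwork binary):

    import subprocess

    train_args = ["echo", "training"]  # stand-in for the real command
    with open("log-train", "w") as log:
        log.write("Training output:\n")
        # Raises subprocess.CalledProcessError on a non-zero exit status.
        subprocess.check_call(train_args, stdout=log, stderr=log)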
-if __name__ == "__main__":
- options = parser.parse_args()
- main(options)
+if __name__ == "__main__":
+ options = parser.parse_args()
+ main(options)
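For completeness, a plausible invocation of the reformatted script, using only flags visible in the hunks above (paths and values are illustrative):

    python train_nplm.py \
        --working-dir working \
        --corpus train.10k \
        --threads 4 \
        --output-dir models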