
github.com/moses-smt/mosesdecoder.git
author     Rico Sennrich <rico.sennrich@gmx.ch>  2015-05-29 18:07:26 +0300
committer  Rico Sennrich <rico.sennrich@gmx.ch>  2015-05-29 18:07:26 +0300
commit     5d8af9c2896d86785c5db2fd3a8029ae9b741e26 (patch)
tree       b99868426a8c941b995d85a39d9378801e66f6a9 /scripts/training
parent     ef028446f3640e007215b4576a4dc52a9c9de6db (diff)
support memory-mapped files for NPLM training
Diffstat (limited to 'scripts/training')
-rwxr-xr-x  scripts/training/bilingual-lm/train_nplm.py  14
-rwxr-xr-x  scripts/training/rdlm/train_rdlm.py          33
-rwxr-xr-x  scripts/training/train-neurallm.py           33
3 files changed, 68 insertions, 12 deletions
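
In outline, all three scripts gain an --mmap flag. When it is set, the numberized n-gram file is converted with NPLM's createMmap tool and training is pointed at the resulting .mmap file (train_nplm.py additionally passes --mmap_file 1 to the trainer); per the option's help text, the goal is lower memory consumption. Below is a minimal sketch of the conversion step that train_rdlm.py and train-neurallm.py share, distilled from the hunks that follow; the helper function and its parameters are illustrative, not part of the commit.

import os
import subprocess
import sys

def create_mmap(nplm_home, working_dir, numberized_file, train_file):
    # Drop a stale .mmap file left over from an earlier run, if any.
    try:
        os.remove(os.path.join(working_dir, train_file))
    except OSError:
        pass
    # createMmap ships with NPLM and turns the numberized n-gram file into
    # a memory-mapped file that trainNeuralNetwork can read in place.
    mmap_cmd = [os.path.join(nplm_home, 'src', 'createMmap'),
                '--input_file', os.path.join(working_dir, numberized_file),
                '--output_file', os.path.join(working_dir, train_file)]
    sys.stderr.write('creating memory-mapped file\n')
    if subprocess.call(mmap_cmd):
        raise Exception("creating memory-mapped file failed")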
diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py
index cb5980a91..572076006 100755
--- a/scripts/training/bilingual-lm/train_nplm.py
+++ b/scripts/training/bilingual-lm/train_nplm.py
@@ -39,7 +39,8 @@ parser.add_argument("--input-words-file", dest="input_words_file")
parser.add_argument("--output-words-file", dest="output_words_file")
parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int)
parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int)
-
+parser.add_argument("--mmap", dest="mmap", action="store_true",
+ help="Use memory-mapped file (for lower memory consumption).")
parser.set_defaults(
working_dir="working",
@@ -113,6 +114,11 @@ def main(options):
options.working_dir,
os.path.basename(options.corpus_stem) + ".numberized")
+ mmap_command = []
+ if options.mmap:
+ in_file += '.mmap'
+ mmap_command = ['--mmap_file', '1']
+
model_prefix = os.path.join(
options.output_dir, options.output_model + ".model.nplm")
train_args = [
@@ -127,9 +133,9 @@ def main(options):
"--input_embedding_dimension", str(options.input_embedding),
"--output_embedding_dimension", str(options.output_embedding),
"--num_threads", str(options.threads),
- "--activation_function",
- options.activation_fn,
- ] + validations_command + vocab_command
+ "--activation_function", options.activation_fn,
+ "--ngram_size", str(options.ngram_size),
+ ] + validations_command + vocab_command + mmap_command
print("Train model command: ")
print(', '.join(train_args))
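
For illustration, the effect of the new option on the argument list assembled above, reduced to the lines this hunk touches (paths and values are placeholders, not from the commit):

use_mmap = True                        # corresponds to options.mmap
in_file = 'working/corpus.numberized'  # placeholder path

mmap_command = []
if use_mmap:
    in_file += '.mmap'                 # the trainer reads the memory-mapped copy
    mmap_command = ['--mmap_file', '1']

# mmap_command is appended to the trainNeuralNetwork arguments, after the
# existing validation and vocabulary options; --ngram_size is now passed
# explicitly as well.
train_args = ['--ngram_size', '5'] + mmap_command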
diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py
index a7edbab36..289ab405c 100755
--- a/scripts/training/rdlm/train_rdlm.py
+++ b/scripts/training/rdlm/train_rdlm.py
@@ -94,11 +94,14 @@ parser.add_argument(
"--output-words-file", dest="output_words_file", metavar="PATH",
help="Output vocabulary (default: %(default)s).")
parser.add_argument(
- "--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT",
+ "--input-vocab-size", dest="input_vocab_size", type=int, metavar="INT",
help="Input vocabulary size (default: %(default)s).")
parser.add_argument(
"--output-vocab-size", dest="output_vocab_size", type=int, metavar="INT",
help="Output vocabulary size (default: %(default)s).")
+parser.add_argument(
+ "--mmap", dest="mmap", action="store_true",
+ help="Use memory-mapped file (for lower memory consumption).")
parser.set_defaults(
@@ -195,11 +198,14 @@ def main(options):
"extracting vocabulary from training text.\n")
prepare_vocabulary(options)
+ numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
+ train_file = numberized_file
+ if options.mmap:
+ train_file += '.mmap'
+
extract_options = extract_syntactic_ngrams.create_parser().parse_args([
'--input', options.corpus_stem,
- '--output', os.path.join(
- options.working_dir,
- os.path.basename(options.corpus_stem) + '.numberized'),
+ '--output', os.path.join(options.working_dir, numberized_file),
'--vocab', options.input_words_file,
'--output_vocab', options.output_words_file,
'--right_context', str(options.right_context_size),
@@ -222,6 +228,23 @@ def main(options):
else:
options.validation_file = None
+ if options.mmap:
+ try:
+ os.remove(os.path.join(options.working_dir, train_file))
+ except OSError:
+ pass
+ mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'),
+ '--input_file',
+ os.path.join(options.working_dir, numberized_file),
+ '--output_file',
+ os.path.join(options.working_dir, train_file)
+ ]
+ sys.stderr.write('creating memory-mapped file\n')
+ sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
+ ret = subprocess.call(mmap_cmd)
+ if ret:
+ raise Exception("creating memory-mapped file failed")
+
sys.stderr.write('training neural network\n')
train_nplm.main(options)
@@ -234,7 +257,7 @@ def main(options):
options.output_model + '.model.nplm.' + str(options.epochs)),
os.path.join(
options.working_dir,
- os.path.basename(options.corpus_stem) + '.numberized'),
+ numberized_file),
os.path.join(options.output_dir, options.output_model + '.model.nplm')
])
if ret:
diff --git a/scripts/training/train-neurallm.py b/scripts/training/train-neurallm.py
index fec859611..ae77a42af 100755
--- a/scripts/training/train-neurallm.py
+++ b/scripts/training/train-neurallm.py
@@ -87,6 +87,9 @@ parser.add_argument(
parser.add_argument(
"--vocab-size", dest="vocab_size", type=int, metavar="INT",
help="Vocabulary size (default: %(default)s).")
+parser.add_argument(
+ "--mmap", dest="mmap", action="store_true",
+ help="Use memory-mapped file (for lower memory consumption).")
parser.set_defaults(
working_dir="working",
@@ -121,20 +124,43 @@ def main(options):
if not os.path.exists(options.output_dir):
os.makedirs(options.output_dir)
+ numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
+ train_file = numberized_file
+ if options.mmap:
+ train_file += '.mmap'
+
extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
'--train_text', options.corpus_stem,
'--ngramize', '1',
'--ngram_size', str(options.ngram_size),
'--vocab_size', str(options.vocab_size),
'--write_words_file', os.path.join(options.working_dir, options.words_file),
- '--train_file', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized')
+ '--train_file', os.path.join(options.working_dir, numberized_file)
]
sys.stderr.write('extracting n-grams\n')
+ sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
ret = subprocess.call(extraction_cmd)
if ret:
raise Exception("preparing neural LM failed")
-
+
+ if options.mmap:
+ try:
+ os.remove(os.path.join(options.working_dir, train_file))
+ except OSError:
+ pass
+ mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'),
+ '--input_file',
+ os.path.join(options.working_dir, numberized_file),
+ '--output_file',
+ os.path.join(options.working_dir, train_file)
+ ]
+ sys.stderr.write('creating memory-mapped file\n')
+ sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
+ ret = subprocess.call(mmap_cmd)
+ if ret:
+ raise Exception("creating memory-mapped file failed")
+
if options.validation_corpus:
extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
@@ -147,6 +173,7 @@ def main(options):
]
sys.stderr.write('extracting n-grams (validation file)\n')
+ sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
ret = subprocess.call(extraction_cmd)
if ret:
raise Exception("preparing neural LM failed")
@@ -166,7 +193,7 @@ def main(options):
average_options = averageNullEmbedding.parser.parse_args(
['-i', os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
'-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'),
- '-t', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
+ '-t', os.path.join(options.working_dir, numberized_file),
'-p', os.path.join(options.nplm_home, 'python')])
averageNullEmbedding.main(average_options)
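
A detail shared by train_rdlm.py and train-neurallm.py: only the training step consumes the memory-mapped file, while the final averageNullEmbedding pass still reads the plain numberized file. A small sketch of the naming logic, with an illustrative function name and sample path:

import os

def data_files(corpus_stem, use_mmap):
    # Mirrors the naming used in both scripts: the .mmap copy exists only
    # when --mmap is given, and only the trainer consumes it.
    numberized_file = os.path.basename(corpus_stem) + '.numberized'
    train_file = numberized_file + '.mmap' if use_mmap else numberized_file
    return numberized_file, train_file

numberized, train = data_files('data/corpus.en', use_mmap=True)
# trainNeuralNetwork reads `train` ('corpus.en.numberized.mmap');
# averageNullEmbedding (the '-t' argument in train-neurallm.py) still
# reads `numberized`.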