author     Ulrich Germann <ugermann@inf.ed.ac.uk>    2015-03-20 21:57:00 +0300
committer  Ulrich Germann <ugermann@inf.ed.ac.uk>    2015-03-20 21:57:00 +0300
commit     0854d3339e6972411cbc73ebb50b6b7be4bd14bd (patch)
tree       2cc9d660f14a61aa593546d2f5c67a070054fb55
parent     b1c9d8a5284ec505a112df52281639ff044830dd (diff)
parent     3a673fc8dcd32e63c4539d72ee0261f6e7aa8a37 (diff)
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder
-rw-r--r--   scripts/ems/experiment.meta                        |  32
-rwxr-xr-x   scripts/training/bilingual-lm/train_nplm.py        | 137
-rw-r--r--   scripts/training/rdlm/README                       |  38
-rwxr-xr-x   scripts/training/rdlm/extract_syntactic_ngrams.py  |  43
-rwxr-xr-x   scripts/training/rdlm/extract_vocab.py             |  60
-rwxr-xr-x   scripts/training/rdlm/train_model_head.sh          |  65
-rwxr-xr-x   scripts/training/rdlm/train_model_label.sh         |  72
-rwxr-xr-x   scripts/training/rdlm/train_rdlm.py                | 158
-rwxr-xr-x   scripts/training/reduce-factors.perl               |  78
-rwxr-xr-x   scripts/training/wrappers/mosesxml2brackets.py     |  51

10 files changed, 436 insertions(+), 298 deletions(-)
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index f9a400eef..9ce378a1a 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -940,19 +940,34 @@ truecase-reference-devtest
 	template: $output-truecaser -model IN1.$output-extension < IN > OUT
 split-reference
 	in: truecased-reference SPLITTER:splitter-model
-	out: reference
+	out: split-ref
 	default-name: tuning/reference.split
 	pass-unless: output-splitter
 	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
 	template: $output-splitter -model IN1.$output-extension < IN > OUT
 split-reference-devtest
 	in: truecased-reference-devtest SPLITTER:splitter-model
-	out: reference-devtest
+	out: split-ref-devtest
 	default-name: tuning/reference.devtest.split
 	pass-unless: output-splitter
 	ignore-unless: use-mira
 	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
 	template: $output-splitter -model IN1.$output-extension < IN > OUT
+reduce-reference
+	in: split-ref
+	out: reference
+	default-name: tuning/reference.reduced
+	pass-unless: mock-output-parser-references
+	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
+	template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
+reduce-reference-devtest
+	in: split-ref-devtest
+	out: reference
+	default-name: tuning/reference.devtest.reduced
+	pass-unless: mock-output-parser-references
+	ignore-unless: use-mira
+	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
+	template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
 filter
 	in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
 	out: filtered-dir
@@ -1203,12 +1218,19 @@ mock-parse-reference
 	template: $mock-output-parser-references < IN > OUT
 lowercase-reference
 	in: mock-parsed-reference
-	out: reference
-	default-name: evaluation/reference
+	out: lowercased-reference
+	default-name: evaluation/reference.lowercased
 	pass-unless: output-lowercaser
-	pass-if: recaser
+	pass-if: recaser
 	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
 	template: $output-lowercaser < IN > OUT
+reduce-reference
+	in: lowercased-reference
+	out: reference
+	default-name: evaluation/reference
+	pass-unless: mock-output-parser-references
+	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
+	template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
 wade
 	in: filtered-dir truecased-input tokenized-reference alignment system-output
 	out: wade-analysis
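The new reduce-reference steps chain the two tools this commit also touches (reduce-factors.perl and mosesxml2brackets.py). With the IN/OUT placeholders substituted, the template expands to a command of roughly this shape; the .ref0 file names below are illustrative, since EMS chooses the actual names per reference set:

    $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 \
        --corpus tuning/reference.split.ref0 \
        --reduced-corpus tuning/reference.reduced.ref0 \
    && $moses-script-dir/training/wrappers/mosesxml2brackets.py \
        < tuning/reference.split.ref0 > tuning/reference.reduced.ref0.trees

That is, OUT receives the plain factor-0 text of the reference, and OUT.trees a bracketed version of the same parses; both steps only run when mock-output-parser-references is set.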
diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py
index b19e7d94c..356fd798d 100755
--- a/scripts/training/bilingual-lm/train_nplm.py
+++ b/scripts/training/bilingual-lm/train_nplm.py
@@ -1,34 +1,40 @@
 #!/usr/bin/env python

+from __future__ import print_function, unicode_literals
+
 import logging
-import optparse
+import argparse
 import subprocess
 import sys
 import os

-def main():
-  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
-                      datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
-  parser = optparse.OptionParser("%prog [options]")
-  parser.add_option("-w", "--working-dir", dest="working_dir")
-  parser.add_option("-c", "--corpus", dest="corpus_stem")
-  parser.add_option("-l", "--nplm-home", dest="nplm_home")
-  parser.add_option("-e", "--epochs", dest="epochs", type="int")
-  parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
-  parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
-  parser.add_option("-s", "--noise", dest="noise", type="int")
-  parser.add_option("-d", "--hidden", dest="hidden", type="int")
-  parser.add_option("-i", "--input-embedding", dest="input_embedding", type="int")
-  parser.add_option("-o", "--output-embedding", dest="output_embedding", type="int")
-  parser.add_option("-t", "--threads", dest="threads", type="int")
-  parser.add_option("-m", "--output-model", dest="output_model")
-  parser.add_option("-r", "--output-dir", dest="output_dir")
-  parser.add_option("-f", "--config-options-file", dest="config_options_file")
-  parser.add_option("-g", "--log-file", dest="log_file")
-  parser.add_option("-v", "--validation-ngrams", dest="validation_file")
-  parser.add_option("-a", "--activation-function", dest="activation_fn")
-  parser.add_option("-z", "--learning-rate", dest="learning_rate")
-
-  parser.set_defaults(
+logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
+                    datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+parser = argparse.ArgumentParser()
+parser.add_argument("-w", "--working-dir", dest="working_dir")
+parser.add_argument("-c", "--corpus", dest="corpus_stem")
+parser.add_argument("-l", "--nplm-home", dest="nplm_home")
+parser.add_argument("-e", "--epochs", dest="epochs", type=int)
+parser.add_argument("-n", "--ngram-size", dest="ngram_size", type=int)
+parser.add_argument("-b", "--minibatch-size", dest="minibatch_size", type=int)
+parser.add_argument("-s", "--noise", dest="noise", type=int)
+parser.add_argument("-d", "--hidden", dest="hidden", type=int)
+parser.add_argument("-i", "--input-embedding", dest="input_embedding", type=int)
+parser.add_argument("-o", "--output-embedding", dest="output_embedding", type=int)
+parser.add_argument("-t", "--threads", dest="threads", type=int)
+parser.add_argument("-m", "--output-model", dest="output_model")
+parser.add_argument("-r", "--output-dir", dest="output_dir")
+parser.add_argument("-f", "--config-options-file", dest="config_options_file")
+parser.add_argument("-g", "--log-file", dest="log_file")
+parser.add_argument("-v", "--validation-ngrams", dest="validation_file")
+parser.add_argument("-a", "--activation-function", dest="activation_fn")
+parser.add_argument("-z", "--learning-rate", dest="learning_rate")
+parser.add_argument("--input-words-file", dest="input_words_file")
+parser.add_argument("--output-words-file", dest="output_words_file")
+parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int)
+parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int)
+
+parser.set_defaults(
     working_dir = "working"
     ,corpus_stem = "train.10k"
     ,nplm_home = "/home/bhaddow/tools/nplm"
@@ -46,16 +52,29 @@ def main():
     ,log_file = "log"
     ,validation_file = None
     ,activation_fn = "rectifier"
-    ,learning_rate = "1"
-    )
+    ,learning_rate = 1
+    ,input_words_file = None
+    ,output_words_file = None
+    ,input_vocab_size = 0
+    ,output_vocab_size = 0
+    )
+
+def main(options):
+
+  vocab_command = []
+  if options.input_words_file is not None:
+    vocab_command += ['--input_words_file', options.input_words_file]
+  if options.output_words_file is not None:
+    vocab_command += ['--output_words_file', options.output_words_file]
+  if options.input_vocab_size:
+    vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
+  if options.output_vocab_size:
+    vocab_command += ['--output_vocab_size', str(options.output_vocab_size)]

-  options,args = parser.parse_args(sys.argv)
-
   # Set up validation command variable to use with validation set.
   validations_command = []
   if options.validation_file is not None:
     validations_command = ["--validation_file", (options.validation_file + ".numberized")]
-
   # In order to allow for different models to be trained after the same
   # preparation step, we should provide an option for multiple output directories
@@ -68,56 +87,42 @@ def main():
   if not os.path.exists(options.output_dir):
     os.makedirs(options.output_dir)

-  config_file = options.output_dir + "/" + options.config_options_file + '-' + options.output_model
-  log_file = options.output_dir + "/" + options.log_file + '-' + options.output_model
+  config_file = os.path.join(options.output_dir, options.config_options_file + '-' + options.output_model)
+  log_file = os.path.join(options.output_dir, options.log_file + '-' + options.output_model)
   log_file_write = open(log_file, 'w')
   config_file_write = open(config_file, 'w')
   config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')

-  in_file = options.working_dir + "/" + options.corpus_stem + ".numberized"
-
-  model_prefix = options.output_dir + "/" + options.output_model + ".model.nplm"
-  train_args = [options.nplm_home + "/src/trainNeuralNetwork", "--train_file", in_file,
-    "--num_epochs", str(options.epochs), "--model_prefix", model_prefix,
-    "--learning_rate", options.learning_rate, "--minibatch_size", str(options.minibatch_size),
-    "--num_noise_samples", str(options.noise), "--num_hidden", str(options.hidden),
-    "--input_embedding_dimension", str(options.input_embedding),
-    "--output_embedding_dimension", str(options.output_embedding),
-    "--num_threads", str(options.threads),
-    "--activation_function", options.activation_fn] + validations_command
-  print "Train model command: "
-  print ', '.join(train_args)
+  in_file = os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + ".numberized")
+
+  model_prefix = os.path.join(options.output_dir, options.output_model + ".model.nplm")
+  train_args = [options.nplm_home + "/src/trainNeuralNetwork",
+                "--train_file", in_file,
+                "--num_epochs", str(options.epochs),
+                "--model_prefix", model_prefix,
+                "--learning_rate", str(options.learning_rate),
+                "--minibatch_size", str(options.minibatch_size),
+                "--num_noise_samples", str(options.noise),
+                "--num_hidden", str(options.hidden),
+                "--input_embedding_dimension", str(options.input_embedding),
+                "--output_embedding_dimension", str(options.output_embedding),
+                "--num_threads", str(options.threads),
+                "--activation_function", options.activation_fn] + validations_command + vocab_command
+  print("Train model command: ")
+  print(', '.join(train_args))
   config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
   config_file_write.close()

   log_file_write.write("Training output:\n")
   ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
-  if ret: raise Exception("Training failed")
+  if ret:
+    raise Exception("Training failed")

   log_file_write.close()

 if __name__ == "__main__":
-  main()
-
-
-
-#EPOCHS=10
-#NGRAM_SIZE=14
-#MINIBATCH_SIZE=1000
-#NOISE=100
-#HIDDEN=750
-#INPUT_EMBEDDING=150
-#OUTPUT_EMBEDDING=150
-#THREADS=8
-#
-
-#$ROOT/src/prepareNeuralLM --train_text $INFILE --ngram_size $NGRAM_SIZE --ngramize 0 --words_file $VOCAB --train_file $WORKDIR/train.ngrams || exit 1
-
-#$ROOT/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams \
-#  --num_epochs $EPOCHS --input_words_file $VOCAB --output_words_file $VOCAB --model_prefix $WORKDIR/$PREFIX \
-#  --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
-#  --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
-
+  options = parser.parse_args()
+  main(options)
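Hoisting the parser and its set_defaults() to module level is what lets other scripts drive this trainer without going through the command line; train_rdlm.py later in this commit imports train_nplm and calls train_nplm.main(options) with a compatible options object. A minimal sketch of that pattern (paths and the model name are placeholders, not recommended settings):

    import train_nplm

    # build an options namespace from the module-level parser; any flag
    # not given here falls back to the set_defaults() values above
    options = train_nplm.parser.parse_args([
        '--working-dir', 'working',
        '--corpus', 'train.10k',
        '--nplm-home', '/path/to/nplm',
        '--output-model', 'bilingual',
    ])
    train_nplm.main(options)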
diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README
index 347e71f6d..209daf1c0 100644
--- a/scripts/training/rdlm/README
+++ b/scripts/training/rdlm/README
@@ -1,10 +1,11 @@
 RDLM: relational dependency language model
 ------------------------------------------

-This is a language model for the string-to-tree decoder with a dependency grammar.
-It should work with any corpus with projective dependency annotation in ConLL format,
-converted into the Moses format with the script mosesdecoder/scripts/training/wrappers/conll2mosesxml.py
-It depends on NPLM for neural network training and querying.
+This is a language model for the string-to-tree decoder with a dependency
+grammar. It should work with any corpus with projective dependency annotation in
+ConLL format, converted into the Moses format with the script
+mosesdecoder/scripts/training/wrappers/conll2mosesxml.py. It depends on NPLM for
+neural network training and querying.

 Prerequisites
 -------------
@@ -16,20 +17,27 @@ Install NPLM and compile moses with it. See the instructions in the Moses docume

 Training
 --------

-RDLM is designed for string-to-tree decoding with dependency annotation on the target side.
-If you have such a system, you can train RDLM on the target side of the same parallel corpus
-that is used for training the translation model.
+RDLM is designed for string-to-tree decoding with dependency annotation on the
+target side. If you have such a system, you can train RDLM on the target side of
+the same parallel corpus that is used for training the translation model.

-To train the model on additional monolingual data, or test it on some held-out test/dev data,
-parse and process it in the same way that the parallel corpus has been processed.
-This includes tokenization, parsing, truecasing, compound splitting etc.
+To train the model on additional monolingual data, or test it on some held-out
+test/dev data, parse and process it in the same way that the parallel corpus has
+been processed. This includes tokenization, parsing, truecasing, compound
+splitting etc.

-RDLM is split into two neural network models, which can be trained with `train_model_head.sh` and `train_model_label.sh`
-set the paths to NPLM, Moses, and the training/test files in the respective files, then execute:
+RDLM is split into two neural network models, which can be trained with
+`train_rdlm.py`. An example command for training follows:

-  ./train_model_head.sh rdlm_head.nnlm working_dir_head
-  ./train_model_label.sh rdlm_label.nnlm working_dir_label
+  mkdir working_dir_head
+  mkdir working_dir_label
+  ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output_vocab_size 500000 --noise 100
+  ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output_vocab_size 75 --noise 50
+
+For more options, run `train_rdlm.py --help`. Parameters you may want to adjust
+include the vocabulary size of the label model (depending on the number of
+dependency relations in the grammar), the size of the models, and the number of
+training epochs.

 Decoding
 --------
@@ -37,7 +45,7 @@ Decoding
 To use RDLM during decoding, add the following line to your moses.ini config:

   [feature]
-  RDLM path_head_lm=/path/to/rdlm_head.nnlm path_label_lm=/path/to/rdlm_label.nnlm context_up=2 context_left=3 context_right=0
+  RDLM path_head_lm=/path/to/output_directory/rdlm_head.model.nplm path_label_lm=/path/to/output_directory/rdlm_label.model.nplm context_up=2 context_left=3 context_right=0

   [weight]
   RDLM 0.1 0.1
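All RDLM tools in this commit consume one parse per line in Moses XML format: nested <tree> elements carrying a label attribute, with surface tokens as element text (this is what mosesxml2brackets.py below reads via xml.get('label') and xml.text). A hand-made example line, not taken from any corpus:

    <tree label="S"><tree label="NP"><tree label="PRON">he</tree></tree><tree label="VP"><tree label="VERB">sleeps</tree></tree></tree>

For dependency trees as used by RDLM, conll2mosesxml.py produces this shape from ConLL input.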
diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py
index 12d62d1e6..eca1b3a49 100755
--- a/scripts/training/rdlm/extract_syntactic_ngrams.py
+++ b/scripts/training/rdlm/extract_syntactic_ngrams.py
@@ -9,17 +9,24 @@ from __future__ import print_function, unicode_literals, division

 import sys
 import codecs
-import io
 import argparse

+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
+
 try:
     from lxml import etree as ET
 except ImportError:
     from xml.etree import cElementTree as ET

-def parse_arguments():
+def create_parser():
     parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")

+    parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
+                        help='input file (default: standard input).')
+    parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH',
+                        help='output file (default: standard output).')
     parser.add_argument('--mode', type=str,
                         help='predict terminals (head) or dependency labels (label)',
                         choices=['label', 'head'], required=True)
     parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
@@ -40,7 +47,7 @@ def parse_arguments():
                         help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
     parser.add_argument('--ptkvz', action='store_true',
                         help='special rule for German dependency trees: concatenate separable verb prefix and verb')
-    return parser.parse_args()
+    return parser

 def escape_text(s):
@@ -203,7 +210,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
             int_list.append(vocab.get(labels[i], 0))
             int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))

-        sys.stdout.write(' '.join(map(str, int_list)) + '\n')
+        options.output.write(' '.join(map(str, int_list)) + '\n')

         parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
         parent_labels.append(vocab.get(labels[i], 0))
@@ -216,18 +223,11 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p

 def load_vocab(path):
     v = {}
-    for i,line in enumerate(io.open(path, encoding="UTF-8")):
+    for i,line in enumerate(open(path, encoding="UTF-8")):
         v[line.strip()] = i
     return v

-if __name__ == '__main__':
-
-    if sys.version_info < (3, 0):
-        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
-        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
-
-    options = parse_arguments()
-
+def main(options):
     vocab = load_vocab(options.vocab)

     if options.output_vocab is None:
@@ -236,13 +236,17 @@ if __name__ == '__main__':
     else:
         output_vocab = load_vocab(options.output_vocab)

+    global start_head_idx
+    global start_label_idx
+    global stop_head_idx
+    global stop_label_idx
     start_head_idx = vocab.get("<start_head>", 0)
     start_label_idx = vocab.get("<start_label>", 0)
     stop_head_idx = vocab.get("<stop_head>", 0)
     stop_label_idx = vocab.get("<stop_label>", 0)

     i = 0
-    for line in sys.stdin:
+    for line in options.input:
         if i and not i % 50000:
             sys.stderr.write('.')
         if i and not i % 1000000:
@@ -260,3 +264,14 @@ if __name__ == '__main__':
             xml = ET.fromstring(line)
             get_syntactic_ngrams(xml, options, vocab, output_vocab)
         i += 1
+
+if __name__ == '__main__':
+
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+
+    parser = create_parser()
+    options = parser.parse_args()
+
+    main(options)
\ No newline at end of file
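With the new --input/--output options (defaulting to stdin/stdout) and the create_parser()/main() split, the extractor can run either as a stand-alone filter or be driven programmatically, which is how train_rdlm.py below uses it. A sketch of the filter use; file names are placeholders:

    ./extract_syntactic_ngrams.py --mode head \
        --vocab vocab.input --output_vocab vocab.output \
        < corpus.moses-xml > corpus.numberized

Each output line is one syntactic n-gram encoded as space-separated vocabulary indices; tokens missing from the vocabulary fall back to index 0, which by convention is <unk>.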
diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
index 684fdcd32..55ecbe554 100755
--- a/scripts/training/rdlm/extract_vocab.py
+++ b/scripts/training/rdlm/extract_vocab.py
@@ -7,16 +7,19 @@ from __future__ import print_function, unicode_literals, division

 import sys
 import codecs
-import io
 import argparse
 from collections import Counter

+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
+
 try:
     from lxml import etree as ET
 except ImportError:
     from xml.etree import cElementTree as ET

-def parse_arguments():
+def create_parser():

     help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
     help_text += "  [PREFIX].special: around 40 symbols reserved for RDLM\n";
@@ -34,9 +37,7 @@ def parse_arguments():
     parser.add_argument('--ptkvz', action="store_true",
                         help='special rule for German dependency trees: attach separable verb prefixes to verb')

-    args = parser.parse_args()
-
-    return args
+    return parser

 def escape_text(s):
@@ -48,7 +49,7 @@ def escape_text(s):
     return s

 # deterministic heuristic to get head of subtree
-def get_head(xml):
+def get_head(xml, args):
     head = None
     preterminal = None
     for child in xml:
@@ -70,11 +71,11 @@ def get_head(xml):

     return head, preterminal

-def get_vocab(xml):
+def get_vocab(xml, args):

     if len(xml):

-        head, preterminal = get_head(xml)
+        head, preterminal = get_head(xml, args)
         if not head:
             head = '<null>'
             preterminal = '<null>'
@@ -89,18 +90,13 @@ def get_vocab(xml):
         for child in xml:
             if not len(child):
                 continue
-            get_vocab(child)
-
-
-
-if __name__ == '__main__':
+            get_vocab(child, args)

-    if sys.version_info < (3, 0):
-        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
-        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
-        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+def main(args):

-    args = parse_arguments()
+    global heads
+    global preterminals
+    global nonterminals

     heads = Counter()
     preterminals = Counter()
@@ -115,11 +111,8 @@ if __name__ == '__main__':
         if line == '\n':
             continue

-        # hack for older moses versions with inconsistent encoding of "|"
-        line = line.replace('&bar;', '|')
-
         xml = ET.fromstring(line)
-        get_vocab(xml)
+        get_vocab(xml, args)
         i += 1

     special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
@@ -127,27 +120,27 @@
     for i in range(30):
         special_tokens.append('<null_{0}>'.format(i))

-    f = io.open(args.output + '.special', 'w', encoding='UTF-8')
+    f = open(args.output + '.special', 'w', encoding='UTF-8')
     for item in special_tokens:
         f.write(item + '\n')
     f.close()

-    f = io.open(args.output + '.preterminals', 'w', encoding='UTF-8')
+    f = open(args.output + '.preterminals', 'w', encoding='UTF-8')
     for item in sorted(preterminals, key=preterminals.get, reverse=True):
         f.write(item + '\n')
     f.close()

-    f = io.open(args.output + '.nonterminals', 'w', encoding='UTF-8')
+    f = open(args.output + '.nonterminals', 'w', encoding='UTF-8')
     for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
         f.write(item + '\n')
     f.close()

-    f = io.open(args.output + '.terminals', 'w', encoding='UTF-8')
+    f = open(args.output + '.terminals', 'w', encoding='UTF-8')
     for item in sorted(heads, key=heads.get, reverse=True):
         f.write(item + '\n')
     f.close()

-    f = io.open(args.output + '.all', 'w', encoding='UTF-8')
+    f = open(args.output + '.all', 'w', encoding='UTF-8')
     special_tokens_set = set(special_tokens)
     for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
         if item not in special_tokens:
@@ -167,3 +160,16 @@ if __name__ == '__main__':
         i += 1
         f.write(item + '\n')
     f.close()
+
+
+
+if __name__ == '__main__':
+
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+    parser = create_parser()
+    args = parser.parse_args()
+    main(args)
diff --git a/scripts/training/rdlm/train_model_head.sh b/scripts/training/rdlm/train_model_head.sh
deleted file mode 100755
index fdead9061..000000000
--- a/scripts/training/rdlm/train_model_head.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-if [ $# -eq 2 ]; then
-  OUTFILE=$1
-  WORKDIR=$2
-else
-  echo "usage: $0 <outfile> <working_directory>"
-  exit 1
-fi
-
-NPLM=/path/to/nplm
-MOSES_ROOT=/path/to/mosesdecoder
-
-INFILE=/path/to/file/in/moses/xml/format
-VALIDATIONFILE=/path/to/file/in/moses/xml/format
-#TESTFILE1=/path/to/file/in/moses/xml/format
-#TESTFILE2=/path/to/file/in/moses/xml/format
-PREFIX=$(basename $OUTFILE)
-
-EPOCHS=2
-INPUT_VOCAB_SIZE=500000
-OUTPUT_VOCAB_SIZE=500000
-MINIBATCH_SIZE=1000
-NOISE=100
-HIDDEN=0
-INPUT_EMBEDDING=150
-OUTPUT_EMBEDDING=750
-THREADS=4
-MODE=head
-UP_CONTEXT=2
-LEFT_CONTEXT=3
-RIGHT_CONTEXT=0
-
-
-mkdir -p $WORKDIR
-
-python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
-
-head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
-head -n $OUTPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.output
-
-python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
-  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
-python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
-  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
-
-$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
-  --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
-  --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
-  --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
-  --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
-
-python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
-
-if [[ $TESTFILE1 ]]; then
-  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
-    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
-  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
-fi
-
-if [[ $TESTFILE2 ]]; then
-  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
-    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
-  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
-fi
diff --git a/scripts/training/rdlm/train_model_label.sh b/scripts/training/rdlm/train_model_label.sh
deleted file mode 100755
index 371c69a3b..000000000
--- a/scripts/training/rdlm/train_model_label.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-
-if [ $# -eq 2 ]; then
-  OUTFILE=$1
-  WORKDIR=$2
-else
-  echo "usage: $0 <outfile> <working_directory>"
-  exit 1
-fi
-
-NPLM=/path/to/nplm
-MOSES_ROOT=/path/to/mosesdecoder
-
-INFILE=/path/to/file/in/moses/xml/format
-VALIDATIONFILE=/path/to/file/in/moses/xml/format
-#TESTFILE1=/path/to/file/in/moses/xml/format
-#TESTFILE2=/path/to/file/in/moses/xml/format
-PREFIX=$(basename $OUTFILE)
-
-EPOCHS=1
-INPUT_VOCAB_SIZE=500000
-OUTPUT_VOCAB_SIZE=75
-MINIBATCH_SIZE=1000
-NOISE=50
-HIDDEN=0
-INPUT_EMBEDDING=150
-OUTPUT_EMBEDDING=750
-THREADS=4
-MODE=label
-UP_CONTEXT=2
-LEFT_CONTEXT=3
-RIGHT_CONTEXT=0
-
-
-mkdir -p $WORKDIR
-
-python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
-
-head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
-cat $WORKDIR/vocab_target.special $WORKDIR/vocab_target.nonterminals |
-  grep -v "^<null" |
-  grep -v "^<root" |
-  grep -v "^<start_head" |
-  grep -v "^<dummy" |
-  grep -v "^<head_head" |
-  grep -v "^<stop_head" |
-  head -n $OUTPUT_VOCAB_SIZE > $WORKDIR/vocab.output
-
-python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
-  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
-python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
-  --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
-
-$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
-  --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
-  --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
-  --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
-  --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
-
-python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
-
-if [[ $TESTFILE1 ]]; then
-  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
-    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
-  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
-fi
-
-if [[ $TESTFILE2 ]]; then
-  python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
-    --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
-  $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
-fi
\ No newline at end of file
diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py
new file mode 100755
index 000000000..1e7ecac52
--- /dev/null
+++ b/scripts/training/rdlm/train_rdlm.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function, unicode_literals
+
+import logging
+import argparse
+import subprocess
+import sys
+import os
+import codecs
+import copy
+
+# ../bilingual-lm
+sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'bilingual-lm'))
+import train_nplm
+import extract_vocab
+import extract_syntactic_ngrams
+
+logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
+                    datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+parser = argparse.ArgumentParser()
+parser.add_argument("--working-dir", dest="working_dir", metavar="PATH")
+parser.add_argument("--corpus", dest="corpus_stem", metavar="PATH", help="input file")
+parser.add_argument("--nplm-home", dest="nplm_home", metavar="PATH", help="location of NPLM", required=True)
+parser.add_argument("--epochs", dest="epochs", type=int, metavar="INT", help="number of training epochs (default: %(default)s)")
+parser.add_argument("--up-context-size", dest="up_context_size", type=int, metavar="INT", help="size of ancestor context (default: %(default)s)")
+parser.add_argument("--left-context-size", dest="left_context_size", type=int, metavar="INT", help="size of sibling context (left) (default: %(default)s)")
+parser.add_argument("--right-context-size", dest="right_context_size", type=int, metavar="INT", help="size of sibling context (right) (default: %(default)s)")
+parser.add_argument("--mode", dest="mode", choices=['head', 'label'], help="type of RDLM to train (both are required for decoding)", required=True)
+parser.add_argument("--minibatch-size", dest="minibatch_size", type=int, metavar="INT", help="minibatch size (default: %(default)s)")
+parser.add_argument("--noise", dest="noise", type=int, metavar="INT", help="number of noise samples for NCE (default: %(default)s)")
+parser.add_argument("--hidden", dest="hidden", type=int, metavar="INT", help="size of hidden layer (0 for single hidden layer) (default: %(default)s)")
+parser.add_argument("--input-embedding", dest="input_embedding", type=int, metavar="INT", help="size of input embedding layer (default: %(default)s)")
+parser.add_argument("--output-embedding", dest="output_embedding", type=int, metavar="INT", help="size of output embedding layer (default: %(default)s)")
+parser.add_argument("--threads", "-t", dest="threads", type=int, metavar="INT", help="number of threads (default: %(default)s)")
+parser.add_argument("--output-model", dest="output_model", metavar="PATH", help="name of output model (default: %(default)s)")
+parser.add_argument("--output-dir", dest="output_dir", metavar="PATH", help="output directory (default: same as working-dir)")
+parser.add_argument("--config-options-file", dest="config_options_file", metavar="PATH")
+parser.add_argument("--log-file", dest="log_file", metavar="PATH", help="log file to write to (default: %(default)s)")
+parser.add_argument("--validation-corpus", dest="validation_corpus", metavar="PATH", help="validation file (default: %(default)s)")
+parser.add_argument("--activation-function", dest="activation_fn", choices=['identity', 'rectifier', 'tanh', 'hardtanh'], help="activation function (default: %(default)s)")
+parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar="FLOAT", help="learning rate (default: %(default)s)")
+parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)")
+parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)")
+parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)")
+parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
+
+parser.set_defaults(
+    working_dir = "working"
+    ,corpus_stem = "train"
+    ,nplm_home = "/home/bhaddow/tools/nplm"
+    ,epochs = 2
+    ,up_context_size = 2
+    ,left_context_size = 3
+    ,right_context_size = 0
+    ,minibatch_size = 1000
+    ,noise = 100
+    ,hidden = 0
+    ,mode = 'head'
+    ,input_embedding = 150
+    ,output_embedding = 750
+    ,threads = 4
+    ,output_model = "train"
+    ,output_dir = None
+    ,config_options_file = "config"
+    ,log_file = "log"
+    ,validation_corpus = None
+    ,activation_fn = "rectifier"
+    ,learning_rate = 1
+    ,input_words_file = None
+    ,output_words_file = None
+    ,input_vocab_size = 500000
+    ,output_vocab_size = 500000
+    )
+
+def prepare_vocabulary(options):
+    vocab_prefix = os.path.join(options.working_dir, 'vocab')
+    extract_vocab_options = extract_vocab.create_parser().parse_args(['--input', options.corpus_stem, '--output', vocab_prefix])
+    extract_vocab.main(extract_vocab_options)
+
+    if options.input_words_file is None:
+        options.input_words_file = vocab_prefix + '.input'
+        orig = vocab_prefix + '.all'
+        filtered_vocab = open(orig).readlines()
+        if options.input_vocab_size:
+            filtered_vocab = filtered_vocab[:options.input_vocab_size]
+        open(options.input_words_file, 'w').writelines(filtered_vocab)
+
+    if options.output_words_file is None:
+        options.output_words_file = vocab_prefix + '.output'
+        if options.mode == 'label':
+            blacklist = ['<null', '<root', '<start_head', '<dummy', '<head_head', '<stop_head']
+            orig = vocab_prefix + '.special'
+            filtered_vocab = open(orig).readlines()
+            orig = vocab_prefix + '.nonterminals'
+            filtered_vocab += open(orig).readlines()
+            filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)]
+            if options.output_vocab_size:
+                filtered_vocab = filtered_vocab[:options.output_vocab_size]
+        else:
+            orig = vocab_prefix + '.all'
+            filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
+        open(options.output_words_file, 'w').writelines(filtered_vocab)
+
+def main(options):
+
+    options.ngram_size = 2*options.up_context_size + 2*options.left_context_size + 2*options.right_context_size
+    if options.mode == 'head':
+        options.ngram_size += 2
+    elif options.mode == 'label':
+        options.ngram_size += 1
+
+    if options.input_words_file is None or options.output_words_file is None:
+        sys.stderr.write('either input vocabulary or output vocabulary not specified: extracting vocabulary from training text\n')
+        prepare_vocabulary(options)
+
+    extract_options = extract_syntactic_ngrams.create_parser().parse_args(['--input', options.corpus_stem,
+        '--output', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
+        '--vocab', options.input_words_file,
+        '--output_vocab', options.output_words_file,
+        '--right_context', str(options.right_context_size),
+        '--left_context', str(options.left_context_size),
+        '--up_context', str(options.up_context_size),
+        '--mode', options.mode
+        ])
+    sys.stderr.write('extracting syntactic n-grams\n')
+    extract_syntactic_ngrams.main(extract_options)
+
+    if options.validation_corpus:
+        extract_options.input = open(options.validation_corpus)
+        options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized')
+        extract_options.output = open(options.validation_file, 'w')
+        sys.stderr.write('extracting syntactic n-grams (validation file)\n')
+        extract_syntactic_ngrams.main(extract_options)
+
+    sys.stderr.write('training neural network\n')
+    train_nplm.main(options)
+
+    sys.stderr.write('averaging null words\n')
+    ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'),
+        options.nplm_home,
+        os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
+        os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
+        os.path.join(options.output_dir, options.output_model + '.model.nplm')
+        ])
+    if ret:
+        raise Exception("averaging null words failed")
+
+if __name__ == "__main__":
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+    options = parser.parse_args()
+    main(options)
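A quick sanity check of the ngram_size computation in main(): every context position contributes two inputs (a head and a label), and the prediction site adds two more in head mode (the current node's label plus its head, as written by the extractor above) or one in label mode. With the defaults:

    # worked example, using the default context sizes
    up_context_size, left_context_size, right_context_size = 2, 3, 0
    ngram_size = 2*up_context_size + 2*left_context_size + 2*right_context_size  # 10
    ngram_size += 2   # mode == 'head'  -> 12 inputs per training example
    # mode == 'label' would instead add 1 -> 11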
diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl
index c7269abf9..24c9be829 100755
--- a/scripts/training/reduce-factors.perl
+++ b/scripts/training/reduce-factors.perl
@@ -10,11 +10,12 @@ my $___FACTOR_DELIMITER = "|";
 my $ZCAT = "gzip -cd";
 my $BZCAT = "bzcat";

-my ($CORPUS,$REDUCED,$FACTOR);
+my ($CORPUS,$REDUCED,$FACTOR,$_XML);
 die("ERROR: wrong syntax when invoking reduce-factors")
     unless &GetOptions('corpus=s' => \$CORPUS,
                        'reduced-corpus=s' => \$REDUCED,
-                       'factor=s' => \$FACTOR);
+                       'factor=s' => \$FACTOR,
+                       'xml' => \$_XML);

 &reduce_factors($CORPUS,$REDUCED,$FACTOR);

@@ -24,9 +25,9 @@ sub reduce_factors {

     my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);

-    print "Reducing factors to produce $reduced @ ".`date`;
+    print STDERR "(1.0.5) reducing factors to produce $reduced @ ".`date`;
     while(-e $reduced.".lock") {
-        sleep(10);
+        sleep(10);
     }
     if (-e $reduced) {
         print STDERR "  $reduced in place, reusing\n";
@@ -37,29 +38,31 @@ sub reduce_factors {
         return;
     }

-    # peek at input, to check if we are asked to produce exactly the
-    # available factors
-    my $inh = open_or_zcat($full);
-    my $firstline = <$inh>;
-    die "Corpus file $full is empty" unless $firstline;
-    close $inh;
-    # pick first word
-    $firstline =~ s/^\s*//;
-    $firstline =~ s/\s.*//;
-    # count factors
-    my @WORD = split(/ /,$firstline);
-    my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
-    my $maxfactorindex = scalar(@FACTOR)-1;
-    if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
-        # create just symlink; preserving compression
-        my $realfull = $full;
-        if (!-e $realfull && -e $realfull.".gz") {
-            $realfull .= ".gz";
-            $reduced =~ s/(\.gz)?$/.gz/;
-        }
-        safesystem("ln -s '$realfull' '$reduced'")
-            or die "Failed to create symlink $realfull -> $reduced";
-        return;
-    }
+    unless ($_XML) {
+        # peek at input, to check if we are asked to produce exactly the
+        # available factors
+        my $inh = open_or_zcat($full);
+        my $firstline = <$inh>;
+        die "Corpus file $full is empty" unless $firstline;
+        close $inh;
+        # pick first word
+        $firstline =~ s/^\s*//;
+        $firstline =~ s/\s.*//;
+        # count factors
+        my @WORD = split(/ /,$firstline);
+        my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
+        my $maxfactorindex = scalar(@FACTOR)-1;
+        if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
+            # create just symlink; preserving compression
+            my $realfull = $full;
+            if (!-e $realfull && -e $realfull.".gz") {
+                $realfull .= ".gz";
+                $reduced =~ s/(\.gz)?$/.gz/;
+            }
+            safesystem("ln -s '$realfull' '$reduced'")
+                or die "Failed to create symlink $realfull -> $reduced";
+            return;
+        }
+    }

     # The default is to select the needed factors
@@ -71,23 +74,30 @@ sub reduce_factors {
         $nr++;
         print STDERR "." if $nr % 10000 == 0;
         print STDERR "($nr)" if $nr % 100000 == 0;
-        chomp; s/ +/ /g; s/^ //; s/ $//;
-        my $first = 1;
-        foreach (split) {
-            my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
+        s/<\S[^>]*>/ /g if $_XML; # remove xml
+        chomp; s/ +/ /g; s/^ //; s/ $//;
+        my $first = 1;
+        foreach (split) {
+            my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
             # \Q causes to disable metacharacters in regex
-            print OUT " " unless $first;
-            $first = 0;
-            my $first_factor = 1;
+            print OUT " " unless $first;
+            $first = 0;
+            my $first_factor = 1;
             foreach my $outfactor (@INCLUDE) {
-                print OUT "|" unless $first_factor;
+                print OUT $___FACTOR_DELIMITER unless $first_factor;
                 $first_factor = 0;
                 my $out = $FACTOR[$outfactor];
                 die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
                 print OUT $out;
             }
-        }
-        print OUT "\n";
+            # for(my $factor=0;$factor<=$#FACTOR;$factor++) {
+            #     next unless defined($INCLUDE{$factor});
+            #     print OUT "|" unless $first_factor;
+            #     $first_factor = 0;
+            #     print OUT $FACTOR[$factor];
+            # }
+        }
+        print OUT "\n";
     }
     print STDERR "\n";
     close(OUT);
diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py
new file mode 100755
index 000000000..b82685638
--- /dev/null
+++ b/scripts/training/wrappers/mosesxml2brackets.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+
+# convert trees in moses XML format to PTB-style bracketed format
+
+from __future__ import print_function, unicode_literals
+import sys
+import codecs
+
+from lxml import etree as ET
+
+def escape(word):
+    word = word.replace('|', '&#124;')  # factor separator
+    word = word.replace('[', '&#91;')   # syntax non-terminal
+    word = word.replace(']', '&#93;')   # syntax non-terminal
+    word = word.replace('\'', '&apos;')
+    word = word.replace('"', '&quot;')
+
+    return word
+
+def make_brackets(xml):
+
+    out = ' [' + xml.get('label')
+
+    if xml.text and xml.text.strip():
+        word = escape(xml.text.strip())
+        out += ' ' + word + ']'
+
+    else:
+        for child in xml:
+            out += make_brackets(child)
+
+        out += ']'
+
+    return out
+
+
+if __name__ == '__main__':
+
+    if sys.version_info < (3, 0):
+        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+    for line in sys.stdin:
+        if line == '\n':
+            sys.stdout.write(line)
+            continue
+        out = make_brackets(ET.fromstring(line)).strip()
+        sys.stdout.write(out + '\n')
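Taken together, the two new reference-processing tools behave as follows on a factored tree reference. A hand-made example (word|POS factors, not taken from any corpus):

    # ref.xml contains one parse per line, e.g.:
    #   <tree label="NP"><tree label="DT">the|DT</tree><tree label="NN">house|NN</tree></tree>

    reduce-factors.perl --factor 0 --xml 1 --corpus ref.xml --reduced-corpus ref.txt
    #   ref.txt:   the house
    #   (--xml first blanks out the tags, then factor 0 of each token is kept)

    mosesxml2brackets.py < ref.xml > ref.trees
    #   ref.trees: [NP [DT the&#124;DT] [NN house&#124;NN]]
    #   (the converter keeps the full token text, escaping "|" as &#124;)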