github.com/moses-smt/mosesdecoder.git
author     Ulrich Germann <ugermann@inf.ed.ac.uk>  2015-03-20 21:57:00 +0300
committer  Ulrich Germann <ugermann@inf.ed.ac.uk>  2015-03-20 21:57:00 +0300
commit     0854d3339e6972411cbc73ebb50b6b7be4bd14bd (patch)
tree       2cc9d660f14a61aa593546d2f5c67a070054fb55
parent     b1c9d8a5284ec505a112df52281639ff044830dd (diff)
parent     3a673fc8dcd32e63c4539d72ee0261f6e7aa8a37 (diff)
Merge branch 'master' of https://github.com/moses-smt/mosesdecoder
-rw-r--r--  scripts/ems/experiment.meta                          32
-rwxr-xr-x  scripts/training/bilingual-lm/train_nplm.py         137
-rw-r--r--  scripts/training/rdlm/README                          38
-rwxr-xr-x  scripts/training/rdlm/extract_syntactic_ngrams.py     43
-rwxr-xr-x  scripts/training/rdlm/extract_vocab.py                60
-rwxr-xr-x  scripts/training/rdlm/train_model_head.sh             65
-rwxr-xr-x  scripts/training/rdlm/train_model_label.sh            72
-rwxr-xr-x  scripts/training/rdlm/train_rdlm.py                  158
-rwxr-xr-x  scripts/training/reduce-factors.perl                  78
-rwxr-xr-x  scripts/training/wrappers/mosesxml2brackets.py        51
10 files changed, 436 insertions, 298 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index f9a400eef..9ce378a1a 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -940,19 +940,34 @@ truecase-reference-devtest
template: $output-truecaser -model IN1.$output-extension < IN > OUT
split-reference
in: truecased-reference SPLITTER:splitter-model
- out: reference
+ out: split-ref
default-name: tuning/reference.split
pass-unless: output-splitter
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
split-reference-devtest
in: truecased-reference-devtest SPLITTER:splitter-model
- out: reference-devtest
+ out: split-ref-devtest
default-name: tuning/reference.devtest.split
pass-unless: output-splitter
ignore-unless: use-mira
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
+reduce-reference
+ in: split-ref
+ out: reference
+ default-name: tuning/reference.reduced
+ pass-unless: mock-output-parser-references
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
+ template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
+reduce-reference-devtest
+ in: split-ref-devtest
+ out: reference-devtest
+ default-name: tuning/reference.devtest.reduced
+ pass-unless: mock-output-parser-references
+ ignore-unless: use-mira
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
+ template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
filter
in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains TRAINING:transliteration-table
out: filtered-dir
@@ -1203,12 +1218,19 @@ mock-parse-reference
template: $mock-output-parser-references < IN > OUT
lowercase-reference
in: mock-parsed-reference
- out: reference
- default-name: evaluation/reference
+ out: lowercased-reference
+ default-name: evaluation/reference.lowercased
pass-unless: output-lowercaser
- pass-if: recaser
+ pass-if: recaser
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
+reduce-reference
+ in: lowercased-reference
+ out: reference
+ default-name: evaluation/reference
+ pass-unless: mock-output-parser-references
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
+ template: $moses-script-dir/training/reduce-factors.perl --factor 0 --xml 1 --corpus IN --reduced-corpus OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
wade
in: filtered-dir truecased-input tokenized-reference alignment system-output
out: wade-analysis
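The new reduce-reference and reduce-reference-devtest steps above all expand the same two-stage template: reduce-factors.perl strips the mock-parser XML markup and keeps only surface factor 0, then mosesxml2brackets.py writes a PTB-style bracketed copy of the trees next to the reduced file. A rough Python sketch of what one expanded template invocation does for a single reference file (all paths here are hypothetical placeholders, and EMS's multi-reference wrapper is left out):

    import subprocess

    MOSES_SCRIPTS = "/path/to/mosesdecoder/scripts"   # hypothetical checkout
    IN = "tuning/reference.split"                     # hypothetical mock-parsed reference
    OUT = "tuning/reference.reduced"                  # hypothetical reduced output

    # Stage 1: drop XML markup and keep only factor 0 of every token.
    subprocess.check_call([
        MOSES_SCRIPTS + "/training/reduce-factors.perl",
        "--factor", "0", "--xml", "1",
        "--corpus", IN, "--reduced-corpus", OUT,
    ])

    # Stage 2: also keep a bracketed (PTB-style) copy of the parse trees.
    with open(IN) as fin, open(OUT + ".trees", "w") as fout:
        subprocess.check_call(
            [MOSES_SCRIPTS + "/training/wrappers/mosesxml2brackets.py"],
            stdin=fin, stdout=fout)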
diff --git a/scripts/training/bilingual-lm/train_nplm.py b/scripts/training/bilingual-lm/train_nplm.py
index b19e7d94c..356fd798d 100755
--- a/scripts/training/bilingual-lm/train_nplm.py
+++ b/scripts/training/bilingual-lm/train_nplm.py
@@ -1,34 +1,40 @@
#!/usr/bin/env python
+from __future__ import print_function, unicode_literals
+
import logging
-import optparse
+import argparse
import subprocess
import sys
import os
-def main():
- logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
- parser = optparse.OptionParser("%prog [options]")
- parser.add_option("-w", "--working-dir", dest="working_dir")
- parser.add_option("-c", "--corpus", dest="corpus_stem")
- parser.add_option("-l", "--nplm-home", dest="nplm_home")
- parser.add_option("-e", "--epochs", dest="epochs", type="int")
- parser.add_option("-n", "--ngram-size", dest="ngram_size", type="int")
- parser.add_option("-b", "--minibatch-size", dest="minibatch_size", type="int")
- parser.add_option("-s", "--noise", dest="noise", type="int")
- parser.add_option("-d", "--hidden", dest="hidden", type="int")
- parser.add_option("-i", "--input-embedding", dest="input_embedding", type="int")
- parser.add_option("-o", "--output-embedding", dest="output_embedding", type="int")
- parser.add_option("-t", "--threads", dest="threads", type="int")
- parser.add_option("-m", "--output-model", dest="output_model")
- parser.add_option("-r", "--output-dir", dest="output_dir")
- parser.add_option("-f", "--config-options-file", dest="config_options_file")
- parser.add_option("-g", "--log-file", dest="log_file")
- parser.add_option("-v", "--validation-ngrams", dest="validation_file")
- parser.add_option("-a", "--activation-function", dest="activation_fn")
- parser.add_option("-z", "--learning-rate", dest="learning_rate")
-
- parser.set_defaults(
+logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+parser = argparse.ArgumentParser()
+parser.add_argument("-w", "--working-dir", dest="working_dir")
+parser.add_argument("-c", "--corpus", dest="corpus_stem")
+parser.add_argument("-l", "--nplm-home", dest="nplm_home")
+parser.add_argument("-e", "--epochs", dest="epochs", type=int)
+parser.add_argument("-n", "--ngram-size", dest="ngram_size", type=int)
+parser.add_argument("-b", "--minibatch-size", dest="minibatch_size", type=int)
+parser.add_argument("-s", "--noise", dest="noise", type=int)
+parser.add_argument("-d", "--hidden", dest="hidden", type=int)
+parser.add_argument("-i", "--input-embedding", dest="input_embedding", type=int)
+parser.add_argument("-o", "--output-embedding", dest="output_embedding", type=int)
+parser.add_argument("-t", "--threads", dest="threads", type=int)
+parser.add_argument("-m", "--output-model", dest="output_model")
+parser.add_argument("-r", "--output-dir", dest="output_dir")
+parser.add_argument("-f", "--config-options-file", dest="config_options_file")
+parser.add_argument("-g", "--log-file", dest="log_file")
+parser.add_argument("-v", "--validation-ngrams", dest="validation_file")
+parser.add_argument("-a", "--activation-function", dest="activation_fn")
+parser.add_argument("-z", "--learning-rate", dest="learning_rate")
+parser.add_argument("--input-words-file", dest="input_words_file")
+parser.add_argument("--output-words-file", dest="output_words_file")
+parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int)
+parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int)
+
+
+parser.set_defaults(
working_dir = "working"
,corpus_stem = "train.10k"
,nplm_home = "/home/bhaddow/tools/nplm"
@@ -46,16 +52,29 @@ def main():
,log_file = "log"
,validation_file = None
,activation_fn = "rectifier"
- ,learning_rate = "1"
- )
+ ,learning_rate = 1
+ ,input_words_file = None
+ ,output_words_file = None
+ ,input_vocab_size = 0
+ ,output_vocab_size = 0
+ )
+
+def main(options):
+
+ vocab_command = []
+ if options.input_words_file is not None:
+ vocab_command += ['--input_words_file', options.input_words_file]
+ if options.output_words_file is not None:
+ vocab_command += ['--output_words_file', options.output_words_file]
+ if options.input_vocab_size:
+ vocab_command += ['--input_vocab_size', str(options.input_vocab_size)]
+ if options.output_vocab_size:
+ vocab_command += ['--output_vocab_size', str(options.output_vocab_size)]
- options,args = parser.parse_args(sys.argv)
-
# Set up validation command variable to use with validation set.
validations_command = []
if options.validation_file is not None:
validations_command =["--validation_file", (options.validation_file + ".numberized")]
-
# In order to allow for different models to be trained after the same
# preparation step, we should provide an option for multiple output directories
@@ -68,56 +87,42 @@ def main():
if not os.path.exists(options.output_dir):
os.makedirs(options.output_dir)
- config_file = options.output_dir + "/" + options.config_options_file + '-' + options.output_model
- log_file = options.output_dir + "/" + options.log_file + '-' + options.output_model
+ config_file = os.path.join(options.output_dir, options.config_options_file + '-' + options.output_model)
+ log_file = os.path.join(options.output_dir, options.log_file + '-' + options.output_model)
log_file_write = open(log_file, 'w')
config_file_write = open(config_file, 'w')
config_file_write.write("Called: " + ' '.join(sys.argv) + '\n\n')
- in_file = options.working_dir + "/" + options.corpus_stem + ".numberized"
-
-
- model_prefix = options.output_dir + "/" + options.output_model + ".model.nplm"
- train_args = [options.nplm_home + "/src/trainNeuralNetwork", "--train_file", in_file, "--num_epochs", str(options.epochs),
- "--model_prefix",
- model_prefix, "--learning_rate", options.learning_rate, "--minibatch_size", str(options.minibatch_size),
- "--num_noise_samples", str(options.noise), "--num_hidden", str(options.hidden), "--input_embedding_dimension",
- str(options.input_embedding), "--output_embedding_dimension", str(options.output_embedding), "--num_threads",
- str(options.threads), "--activation_function", options.activation_fn] + validations_command
- print "Train model command: "
- print ', '.join(train_args)
+ in_file = os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + ".numberized")
+
+ model_prefix = os.path.join(options.output_dir, options.output_model + ".model.nplm")
+ train_args = [options.nplm_home + "/src/trainNeuralNetwork",
+ "--train_file", in_file,
+ "--num_epochs", str(options.epochs),
+ "--model_prefix", model_prefix,
+ "--learning_rate", str(options.learning_rate),
+ "--minibatch_size", str(options.minibatch_size),
+ "--num_noise_samples", str(options.noise),
+ "--num_hidden", str(options.hidden),
+ "--input_embedding_dimension", str(options.input_embedding),
+ "--output_embedding_dimension", str(options.output_embedding),
+ "--num_threads", str(options.threads),
+ "--activation_function", options.activation_fn] + validations_command + vocab_command
+ print("Train model command: ")
+ print(', '.join(train_args))
config_file_write.write("Training step:\n" + ' '.join(train_args) + '\n')
config_file_write.close()
log_file_write.write("Training output:\n")
ret = subprocess.call(train_args, stdout=log_file_write, stderr=log_file_write)
- if ret: raise Exception("Training failed")
+ if ret:
+ raise Exception("Training failed")
log_file_write.close()
if __name__ == "__main__":
- main()
-
-
-
-
-#EPOCHS=10
-#NGRAM_SIZE=14
-#MINIBATCH_SIZE=1000
-#NOISE=100
-#HIDDEN=750
-#INPUT_EMBEDDING=150
-#OUTPUT_EMBEDDING=150
-#THREADS=8
-#
-
-#$ROOT/src/prepareNeuralLM --train_text $INFILE --ngram_size $NGRAM_SIZE --ngramize 0 --words_file $VOCAB --train_file $WORKDIR/train.ngrams || exit 1
-
-#$ROOT/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams \
-# --num_epochs $EPOCHS --input_words_file $VOCAB --output_words_file $VOCAB --model_prefix $WORKDIR/$PREFIX \
-# --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
-# --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
-
+ options = parser.parse_args()
+ main(options)
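With the option parser and its defaults now at module level, train_nplm.py doubles as a library: train_rdlm.py (added further down) imports it and hands a compatible options namespace straight to train_nplm.main(). A minimal sketch of that calling pattern, with hypothetical values:

    # Minimal sketch of programmatic reuse; all values are hypothetical.
    import train_nplm

    options = train_nplm.parser.parse_args([
        "--working-dir", "working_dir_head",
        "--corpus", "train",
        "--nplm-home", "/path/to/nplm",
        "--output-dir", "/path/to/output_directory",
        "--output-model", "rdlm_head",
    ])
    train_nplm.main(options)  # builds the trainNeuralNetwork command and runs it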
diff --git a/scripts/training/rdlm/README b/scripts/training/rdlm/README
index 347e71f6d..209daf1c0 100644
--- a/scripts/training/rdlm/README
+++ b/scripts/training/rdlm/README
@@ -1,10 +1,11 @@
RDLM: relational dependency language model
------------------------------------------
-This is a language model for the string-to-tree decoder with a dependency grammar.
-It should work with any corpus with projective dependency annotation in ConLL format,
-converted into the Moses format with the script mosesdecoder/scripts/training/wrappers/conll2mosesxml.py
-It depends on NPLM for neural network training and querying.
+This is a language model for the string-to-tree decoder with a dependency
+grammar. It should work with any corpus with projective dependency annotation in
+CoNLL format, converted into the Moses format with the script
+mosesdecoder/scripts/training/wrappers/conll2mosesxml.py. It depends on NPLM for
+neural network training and querying.
Prerequisites
-------------
@@ -16,20 +17,27 @@ Install NPLM and compile moses with it. See the instructions in the Moses docume
Training
--------
-RDLM is designed for string-to-tree decoding with dependency annotation on the target side.
-If you have such a system, you can train RDLM on the target side of the same parallel corpus
-that is used for training the translation model.
+RDLM is designed for string-to-tree decoding with dependency annotation on the
+target side. If you have such a system, you can train RDLM on the target side of
+the same parallel corpus that is used for training the translation model.
-To train the model on additional monolingual data, or test it on some held-out test/dev data,
-parse and process it in the same way that the parallel corpus has been processed.
-This includes tokenization, parsing, truecasing, compound splitting etc.
+To train the model on additional monolingual data, or test it on some held-out
+test/dev data, parse and process it in the same way that the parallel corpus has
+been processed. This includes tokenization, parsing, truecasing, compound
+splitting etc.
-RDLM is split into two neural network models, which can be trained with `train_model_head.sh` and `train_model_label.sh`
-set the paths to NPLM, Moses, and the training/test files in the respective files, then execute:
+RDLM is split into two neural network models, which can be trained with
+`train_rdlm.py`. An example command for training follows:
- ./train_model_head.sh rdlm_head.nnlm working_dir_head
- ./train_model_label.sh rdlm_label.nnlm working_dir_label
+ mkdir working_dir_head
+ mkdir working_dir_label
+ ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_head --output-dir /path/to/output_directory --output-model rdlm_head --mode head --output-vocab-size 500000 --noise-samples 100
+ ./train_rdlm.py --nplm-home /path/to/nplm --working-dir working_dir_label --output-dir /path/to/output_directory --output-model rdlm_label --mode label --output-vocab-size 75 --noise-samples 50
+For more options, run `train_rdlm.py --help`. Parameters you may want to adjust
+include the vocabulary size of the label model (depending on the number of
+dependency relations in the grammar), the size of the models, and the number of
+training epochs.
Decoding
--------
@@ -37,7 +45,7 @@ Decoding
To use RDLM during decoding, add the following line to your moses.ini config:
[feature]
- RDLM path_head_lm=/path/to/rdlm_head.nnlm path_label_lm=/path/to/rdlm_label.nnlm context_up=2 context_left=3 context_right=0
+ RDLM path_head_lm=/path/to/output_directory/rdlm_head.model.nplm path_label_lm=/path/to/output_directory/rdlm_label.model.nplm context_up=2 context_left=3 context_right=0
[weight]
RDLM 0.1 0.1
diff --git a/scripts/training/rdlm/extract_syntactic_ngrams.py b/scripts/training/rdlm/extract_syntactic_ngrams.py
index 12d62d1e6..eca1b3a49 100755
--- a/scripts/training/rdlm/extract_syntactic_ngrams.py
+++ b/scripts/training/rdlm/extract_syntactic_ngrams.py
@@ -9,17 +9,24 @@
from __future__ import print_function, unicode_literals, division
import sys
import codecs
-import io
import argparse
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
+
try:
from lxml import etree as ET
except ImportError:
from xml.etree import cElementTree as ET
-def parse_arguments():
+def create_parser():
parser = argparse.ArgumentParser(description="extract syntactic n-grams from parsed corpus in Moses XML format for training RDLM")
+ parser.add_argument('--input', '-i', type=argparse.FileType('r'), default=sys.stdin, metavar='PATH',
+ help='input file (default: standard input).')
+ parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, metavar='PATH',
+ help='output file (default: standard output).')
parser.add_argument('--mode', type=str, help='predict terminals (head) or dependency labels (label)',
choices=['label', 'head'], required=True)
parser.add_argument('--vocab', metavar='PATH', type=str, required=True,
@@ -40,7 +47,7 @@ def parse_arguments():
help='sentence end symbol. Will be skipped during extraction (default: %(default)s)')
parser.add_argument('--ptkvz', action='store_true',
help='special rule for German dependency trees: concatenate separable verb prefix and verb')
- return parser.parse_args()
+ return parser
def escape_text(s):
@@ -203,7 +210,7 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
int_list.append(vocab.get(labels[i], 0))
int_list.append(output_vocab.get(heads[i], output_vocab.get(preterminals[i], 0)))
- sys.stdout.write(' '.join(map(str, int_list)) + '\n')
+ options.output.write(' '.join(map(str, int_list)) + '\n')
parent_heads.append(vocab.get(heads[i], vocab.get(preterminals[i], 0)))
parent_labels.append(vocab.get(labels[i], 0))
@@ -216,18 +223,11 @@ def get_syntactic_ngrams(xml, options, vocab, output_vocab, parent_heads=None, p
def load_vocab(path):
v = {}
- for i,line in enumerate(io.open(path, encoding="UTF-8")):
+ for i,line in enumerate(open(path, encoding="UTF-8")):
v[line.strip()] = i
return v
-if __name__ == '__main__':
-
- if sys.version_info < (3, 0):
- sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
- sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
-
- options = parse_arguments()
-
+def main(options):
vocab = load_vocab(options.vocab)
if options.output_vocab is None:
@@ -236,13 +236,17 @@ if __name__ == '__main__':
else:
output_vocab = load_vocab(options.output_vocab)
+ global start_head_idx
+ global start_label_idx
+ global stop_head_idx
+ global stop_label_idx
start_head_idx = vocab.get("<start_head>", 0)
start_label_idx = vocab.get("<start_label>", 0)
stop_head_idx = vocab.get("<stop_head>", 0)
stop_label_idx = vocab.get("<stop_label>", 0)
i = 0
- for line in sys.stdin:
+ for line in options.input:
if i and not i % 50000:
sys.stderr.write('.')
if i and not i % 1000000:
@@ -260,3 +264,14 @@ if __name__ == '__main__':
xml = ET.fromstring(line)
get_syntactic_ngrams(xml, options, vocab, output_vocab)
i += 1
+
+if __name__ == '__main__':
+
+ if sys.version_info < (3, 0):
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+
+ parser = create_parser()
+ options = parser.parse_args()
+
+ main(options)
\ No newline at end of file
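Splitting argument handling (create_parser) from the actual extraction (main) is what lets train_rdlm.py call this script in-process instead of shelling out. A minimal sketch of that programmatic use, mirroring the call made in train_rdlm.py below (file names are hypothetical):

    # Minimal sketch of programmatic use; paths are hypothetical.
    import extract_syntactic_ngrams

    opts = extract_syntactic_ngrams.create_parser().parse_args([
        "--input", "corpus.xml",
        "--output", "corpus.numberized",
        "--vocab", "vocab.input",
        "--output_vocab", "vocab.output",
        "--mode", "head",
    ])
    extract_syntactic_ngrams.main(opts)  # writes numberized syntactic n-grams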
diff --git a/scripts/training/rdlm/extract_vocab.py b/scripts/training/rdlm/extract_vocab.py
index 684fdcd32..55ecbe554 100755
--- a/scripts/training/rdlm/extract_vocab.py
+++ b/scripts/training/rdlm/extract_vocab.py
@@ -7,16 +7,19 @@
from __future__ import print_function, unicode_literals, division
import sys
import codecs
-import io
import argparse
from collections import Counter
+# hack for python2/3 compatibility
+from io import open
+argparse.open = open
+
try:
from lxml import etree as ET
except ImportError:
from xml.etree import cElementTree as ET
-def parse_arguments():
+def create_parser():
help_text = "generate 5 vocabulary files from parsed corpus in moses XML format\n"
help_text += " [PREFIX].special: around 40 symbols reserved for RDLM\n";
@@ -34,9 +37,7 @@ def parse_arguments():
parser.add_argument('--ptkvz', action="store_true",
help='special rule for German dependency trees: attach separable verb prefixes to verb')
- args = parser.parse_args()
-
- return args
+ return parser
def escape_text(s):
@@ -48,7 +49,7 @@ def escape_text(s):
return s
# deterministic heuristic to get head of subtree
-def get_head(xml):
+def get_head(xml, args):
head = None
preterminal = None
for child in xml:
@@ -70,11 +71,11 @@ def get_head(xml):
return head, preterminal
-def get_vocab(xml):
+def get_vocab(xml, args):
if len(xml):
- head, preterminal = get_head(xml)
+ head, preterminal = get_head(xml, args)
if not head:
head = '<null>'
preterminal = '<null>'
@@ -89,18 +90,13 @@ def get_vocab(xml):
for child in xml:
if not len(child):
continue
- get_vocab(child)
-
-
-
-if __name__ == '__main__':
+ get_vocab(child, args)
- if sys.version_info < (3, 0):
- sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
- sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
- sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+def main(args):
- args = parse_arguments()
+ global heads
+ global preterminals
+ global nonterminals
heads = Counter()
preterminals = Counter()
@@ -115,11 +111,8 @@ if __name__ == '__main__':
if line == '\n':
continue
- # hack for older moses versions with inconsistent encoding of "|"
- line = line.replace('&bar;', '&#124;')
-
xml = ET.fromstring(line)
- get_vocab(xml)
+ get_vocab(xml, args)
i += 1
special_tokens = ['<unk>', '<null>', '<null_label>', '<null_head>', '<head_label>', '<root_label>', '<start_label>', '<stop_label>', '<head_head>', '<root_head>', '<start_head>', '<dummy_head>', '<stop_head>']
@@ -127,27 +120,27 @@ if __name__ == '__main__':
for i in range(30):
special_tokens.append('<null_{0}>'.format(i))
- f = io.open(args.output + '.special', 'w', encoding='UTF-8')
+ f = open(args.output + '.special', 'w', encoding='UTF-8')
for item in special_tokens:
f.write(item + '\n')
f.close()
- f = io.open(args.output + '.preterminals', 'w', encoding='UTF-8')
+ f = open(args.output + '.preterminals', 'w', encoding='UTF-8')
for item in sorted(preterminals, key=preterminals.get, reverse=True):
f.write(item + '\n')
f.close()
- f = io.open(args.output + '.nonterminals', 'w', encoding='UTF-8')
+ f = open(args.output + '.nonterminals', 'w', encoding='UTF-8')
for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
f.write(item + '\n')
f.close()
- f = io.open(args.output + '.terminals', 'w', encoding='UTF-8')
+ f = open(args.output + '.terminals', 'w', encoding='UTF-8')
for item in sorted(heads, key=heads.get, reverse=True):
f.write(item + '\n')
f.close()
- f = io.open(args.output + '.all', 'w', encoding='UTF-8')
+ f = open(args.output + '.all', 'w', encoding='UTF-8')
special_tokens_set = set(special_tokens)
for item in sorted(nonterminals, key=nonterminals.get, reverse=True):
if item not in special_tokens:
@@ -167,3 +160,16 @@ if __name__ == '__main__':
i += 1
f.write(item + '\n')
f.close()
+
+
+
+if __name__ == '__main__':
+
+ if sys.version_info < (3, 0):
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+ parser = create_parser()
+ args = parser.parse_args()
+ main(args)
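extract_vocab.py gets the same create_parser()/main() split, which train_rdlm.py relies on when no vocabulary files are passed in. A minimal sketch of that use (prefix and input path are hypothetical):

    # Minimal sketch of programmatic use; paths are hypothetical.
    import extract_vocab

    args = extract_vocab.create_parser().parse_args([
        "--input", "corpus.xml",
        "--output", "working_dir/vocab",
    ])
    extract_vocab.main(args)
    # expected outputs: working_dir/vocab.special, .preterminals, .nonterminals,
    # .terminals and .all (see the script's help text)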
diff --git a/scripts/training/rdlm/train_model_head.sh b/scripts/training/rdlm/train_model_head.sh
deleted file mode 100755
index fdead9061..000000000
--- a/scripts/training/rdlm/train_model_head.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-if [ $# -eq 2 ]; then
- OUTFILE=$1
- WORKDIR=$2
-else
- echo "usage: $0 <outfile> <working_directory>"
- exit 1
-fi
-
-NPLM=/path/to/nplm
-MOSES_ROOT=/path/to/mosesdecoder
-
-INFILE=/path/to/file/in/moses/xml/format
-VALIDATIONFILE=/path/to/file/in/moses/xml/format
-#TESTFILE1=/path/to/file/in/moses/xml/format
-#TESTFILE2=/path/to/file/in/moses/xml/format
-PREFIX=$(basename $OUTFILE)
-
-EPOCHS=2
-INPUT_VOCAB_SIZE=500000
-OUTPUT_VOCAB_SIZE=500000
-MINIBATCH_SIZE=1000
-NOISE=100
-HIDDEN=0
-INPUT_EMBEDDING=150
-OUTPUT_EMBEDDING=750
-THREADS=4
-MODE=head
-UP_CONTEXT=2
-LEFT_CONTEXT=3
-RIGHT_CONTEXT=0
-
-
-mkdir -p $WORKDIR
-
-python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
-
-head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
-head -n $OUTPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.output
-
-python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
- --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
-python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
- --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
-
-$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
- --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
- --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
- --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
- --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
-
-python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
-
-if [[ $TESTFILE1 ]]; then
- python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
- --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
- $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
-fi
-
-if [[ $TESTFILE2 ]]; then
- python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
- --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
- $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
-fi
diff --git a/scripts/training/rdlm/train_model_label.sh b/scripts/training/rdlm/train_model_label.sh
deleted file mode 100755
index 371c69a3b..000000000
--- a/scripts/training/rdlm/train_model_label.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-
-if [ $# -eq 2 ]; then
- OUTFILE=$1
- WORKDIR=$2
-else
- echo "usage: $0 <outfile> <working_directory>"
- exit 1
-fi
-
-NPLM=/path/to/nplm
-MOSES_ROOT=/path/to/mosesdecoder
-
-INFILE=/path/to/file/in/moses/xml/format
-VALIDATIONFILE=/path/to/file/in/moses/xml/format
-#TESTFILE1=/path/to/file/in/moses/xml/format
-#TESTFILE2=/path/to/file/in/moses/xml/format
-PREFIX=$(basename $OUTFILE)
-
-EPOCHS=1
-INPUT_VOCAB_SIZE=500000
-OUTPUT_VOCAB_SIZE=75
-MINIBATCH_SIZE=1000
-NOISE=50
-HIDDEN=0
-INPUT_EMBEDDING=150
-OUTPUT_EMBEDDING=750
-THREADS=4
-MODE=label
-UP_CONTEXT=2
-LEFT_CONTEXT=3
-RIGHT_CONTEXT=0
-
-
-mkdir -p $WORKDIR
-
-python $MOSES_ROOT/scripts/training/rdlm/extract_vocab.py --output $WORKDIR/vocab < $INFILE || exit 1
-
-head -n $INPUT_VOCAB_SIZE $WORKDIR/vocab.all > $WORKDIR/vocab.input
-cat $WORKDIR/vocab_target.special $WORKDIR/vocab_target.nonterminals |
- grep -v "^<null" |
- grep -v "^<root" |
- grep -v "^<start_head" |
- grep -v "^<dummy" |
- grep -v "^<head_head" |
- grep -v "^<stop_head" |
- head -n $OUTPUT_VOCAB_SIZE > $WORKDIR/vocab.output
-
-python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
- --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $INFILE > $WORKDIR/train.ngrams || exit 1
-python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
- --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $VALIDATIONFILE > $WORKDIR/validation.ngrams || exit 1
-
-$NPLM/src/trainNeuralNetwork --train_file $WORKDIR/train.ngrams --validation_file $WORKDIR/validation.ngrams \
- --num_epochs $EPOCHS --input_words_file $WORKDIR/vocab.input --output_words_file $WORKDIR/vocab.output --model_prefix $WORKDIR/$PREFIX \
- --input_vocab_size $INPUT_VOCAB_SIZE --output_vocab_size $OUTPUT_VOCAB_SIZE \
- --learning_rate 1 --minibatch_size $MINIBATCH_SIZE --num_noise_samples $NOISE --num_hidden $HIDDEN \
- --input_embedding_dimension $INPUT_EMBEDDING --output_embedding_dimension $OUTPUT_EMBEDDING --num_threads $THREADS || exit 1
-
-python $MOSES_ROOT/scripts/training/rdlm/average_null_embedding.py $NPLM $WORKDIR/$PREFIX.$(($EPOCHS)) $WORKDIR/train.ngrams $OUTFILE || exit 1
-
-if [[ $TESTFILE1 ]]; then
- python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
- --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE1 > $WORKDIR/test1.ngrams || exit 1
- $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test1.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
-fi
-
-if [[ $TESTFILE2 ]]; then
- python $MOSES_ROOT/scripts/training/rdlm/extract_syntactic_ngrams.py --vocab $WORKDIR/vocab.input --output_vocab $WORKDIR/vocab.output \
- --mode $MODE --left_context $LEFT_CONTEXT --right_context $RIGHT_CONTEXT --up_context $UP_CONTEXT < $TESTFILE2 > $WORKDIR/test2.ngrams || exit 1
- $NPLM/src/testNeuralNetwork --test_file $WORKDIR/test2.ngrams --model_file $OUTFILE --minibatch_size $MINIBATCH_SIZE --num_threads $THREADS || exit 1
-fi
\ No newline at end of file
diff --git a/scripts/training/rdlm/train_rdlm.py b/scripts/training/rdlm/train_rdlm.py
new file mode 100755
index 000000000..1e7ecac52
--- /dev/null
+++ b/scripts/training/rdlm/train_rdlm.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import print_function, unicode_literals
+
+import logging
+import argparse
+import subprocess
+import sys
+import os
+import codecs
+import copy
+
+# ../bilingual-lm
+sys.path.append(os.path.join(os.path.dirname(sys.path[0]), 'bilingual-lm'))
+import train_nplm
+import extract_vocab
+import extract_syntactic_ngrams
+
+logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
+parser = argparse.ArgumentParser()
+parser.add_argument("--working-dir", dest="working_dir", metavar="PATH")
+parser.add_argument("--corpus", dest="corpus_stem", metavar="PATH", help="input file")
+parser.add_argument("--nplm-home", dest="nplm_home", metavar="PATH", help="location of NPLM", required=True)
+parser.add_argument("--epochs", dest="epochs", type=int, metavar="INT", help="number of training epochs (default: %(default)s)")
+parser.add_argument("--up-context-size", dest="up_context_size", type=int, metavar="INT", help="size of ancestor context (default: %(default)s)")
+parser.add_argument("--left-context-size", dest="left_context_size", type=int, metavar="INT", help="size of sibling context (left) (default: %(default)s)")
+parser.add_argument("--right-context-size", dest="right_context_size", type=int, metavar="INT", help="size of sibling context (right) (default: %(default)s)")
+parser.add_argument("--mode", dest="mode", choices=['head', 'label'], help="type of RDLM to train (both are required for decoding)", required=True)
+parser.add_argument("--minibatch-size", dest="minibatch_size", type=int, metavar="INT", help="minibatch size (default: %(default)s)")
+parser.add_argument("--noise", dest="noise", type=int, metavar="INT", help="number of noise samples for NCE (default: %(default)s)")
+parser.add_argument("--hidden", dest="hidden", type=int, metavar="INT", help="size of hidden layer (0 for single hidden layer) (default: %(default)s)")
+parser.add_argument("--input-embedding", dest="input_embedding", type=int, metavar="INT", help="size of input embedding layer (default: %(default)s)")
+parser.add_argument("--output-embedding", dest="output_embedding", type=int, metavar="INT", help="size of output embedding layer (default: %(default)s)")
+parser.add_argument("--threads", "-t", dest="threads", type=int, metavar="INT", help="number of threads (default: %(default)s)")
+parser.add_argument("--output-model", dest="output_model", metavar="PATH", help="name of output model (default: %(default)s)")
+parser.add_argument("--output-dir", dest="output_dir", metavar="PATH", help="output directory (default: same as working-dir)")
+parser.add_argument("--config-options-file", dest="config_options_file", metavar="PATH")
+parser.add_argument("--log-file", dest="log_file", metavar="PATH", help="log file to write to (default: %(default)s)")
+parser.add_argument("--validation-corpus", dest="validation_corpus", metavar="PATH", help="validation file (default: %(default)s)")
+parser.add_argument("--activation-function", dest="activation_fn", choices=['identity', 'rectifier', 'tanh', 'hardtanh'], help="activation function (default: %(default)s)")
+parser.add_argument("--learning-rate", dest="learning_rate", type=float, metavar="FLOAT", help="learning rate (default: %(default)s)")
+parser.add_argument("--input-words-file", dest="input_words_file", metavar="PATH", help="input vocabulary (default: %(default)s)")
+parser.add_argument("--output-words-file", dest="output_words_file", metavar="PATH", help="output vocabulary (default: %(default)s)")
+parser.add_argument("--input_vocab_size", dest="input_vocab_size", type=int, metavar="INT", help="input vocabulary size (default: %(default)s)")
+parser.add_argument("--output_vocab_size", dest="output_vocab_size", type=int, metavar="INT", help="output vocabulary size (default: %(default)s)")
+
+
+parser.set_defaults(
+ working_dir = "working"
+ ,corpus_stem = "train"
+ ,nplm_home = "/home/bhaddow/tools/nplm"
+ ,epochs = 2
+ ,up_context_size = 2
+ ,left_context_size = 3
+ ,right_context_size = 0
+ ,minibatch_size=1000
+ ,noise=100
+ ,hidden=0
+ ,mode='head'
+ ,input_embedding=150
+ ,output_embedding=750
+ ,threads=4
+ ,output_model = "train"
+ ,output_dir = None
+ ,config_options_file = "config"
+ ,log_file = "log"
+ ,validation_corpus = None
+ ,activation_fn = "rectifier"
+ ,learning_rate = 1
+ ,input_words_file = None
+ ,output_words_file = None
+ ,input_vocab_size = 500000
+ ,output_vocab_size = 500000
+ )
+
+def prepare_vocabulary(options):
+ vocab_prefix = os.path.join(options.working_dir, 'vocab')
+ extract_vocab_options = extract_vocab.create_parser().parse_args(['--input', options.corpus_stem, '--output', vocab_prefix])
+ extract_vocab.main(extract_vocab_options)
+
+ if options.input_words_file is None:
+ options.input_words_file = vocab_prefix + '.input'
+ orig = vocab_prefix + '.all'
+ filtered_vocab = open(orig).readlines()
+ if options.input_vocab_size:
+ filtered_vocab = filtered_vocab[:options.input_vocab_size]
+ open(options.input_words_file,'w').writelines(filtered_vocab)
+
+ if options.output_words_file is None:
+ options.output_words_file = vocab_prefix + '.output'
+ if options.mode == 'label':
+ blacklist = ['<null', '<root', '<start_head', '<dummy', '<head_head', '<stop_head']
+ orig = vocab_prefix + '.special'
+ filtered_vocab = open(orig).readlines()
+ orig = vocab_prefix + '.nonterminals'
+ filtered_vocab += open(orig).readlines()
+ filtered_vocab = [word for word in filtered_vocab if not any(word.startswith(prefix) for prefix in blacklist)]
+ if options.output_vocab_size:
+ filtered_vocab = filtered_vocab[:options.output_vocab_size]
+ else:
+ orig = vocab_prefix + '.all'
+ filtered_vocab = open(orig).readlines()[:options.output_vocab_size]
+ open(options.output_words_file,'w').writelines(filtered_vocab)
+
+def main(options):
+
+ options.ngram_size = 2*options.up_context_size + 2*options.left_context_size + 2*options.right_context_size
+ if options.mode == 'head':
+ options.ngram_size += 2
+ elif options.mode == 'label':
+ options.ngram_size += 1
+
+ if options.input_words_file is None or options.output_words_file is None:
+ sys.stderr.write('either input vocabulary or output vocabulary not specified: extracting vocabulary from training text\n')
+ prepare_vocabulary(options)
+
+ extract_options = extract_syntactic_ngrams.create_parser().parse_args(['--input', options.corpus_stem,
+ '--output', os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
+ '--vocab', options.input_words_file,
+ '--output_vocab', options.output_words_file,
+ '--right_context', str(options.right_context_size),
+ '--left_context', str(options.left_context_size),
+ '--up_context', str(options.up_context_size),
+ '--mode', options.mode
+ ])
+ sys.stderr.write('extracting syntactic n-grams\n')
+ extract_syntactic_ngrams.main(extract_options)
+
+ if options.validation_corpus:
+ extract_options.input = options.validation_corpus
+ options.validation_file = os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized')
+ extract_options.output = options.validation_file
+ sys.stderr.write('extracting syntactic n-grams (validation file)\n')
+ extract_syntactic_ngrams.main(extract_options)
+
+ sys.stderr.write('training neural network\n')
+ train_nplm.main(options)
+
+ sys.stderr.write('averaging null words\n')
+ ret = subprocess.call([os.path.join(sys.path[0], 'average_null_embedding.py'),
+ options.nplm_home,
+ os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
+ os.path.join(options.working_dir, os.path.basename(options.corpus_stem) + '.numberized'),
+ os.path.join(options.output_dir, options.output_model + '.model.nplm')
+ ])
+ if ret:
+ raise Exception("averaging null words failed")
+
+if __name__ == "__main__":
+ if sys.version_info < (3, 0):
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+ options = parser.parse_args()
+ main(options)
+
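Going by the argparse definitions in the new driver, a head-model training run might look like the sketch below; every path is a placeholder and the numeric values simply echo the README example:

    # Hedged sketch of invoking the new driver; paths are placeholders.
    import subprocess

    subprocess.check_call([
        "scripts/training/rdlm/train_rdlm.py",
        "--nplm-home", "/path/to/nplm",
        "--working-dir", "working_dir_head",
        "--output-dir", "/path/to/output_directory",
        "--output-model", "rdlm_head",
        "--mode", "head",
        "--corpus", "/path/to/parsed/corpus.xml",
        "--output_vocab_size", "500000",
        "--noise", "100",
    ])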
diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl
index c7269abf9..24c9be829 100755
--- a/scripts/training/reduce-factors.perl
+++ b/scripts/training/reduce-factors.perl
@@ -10,11 +10,12 @@ my $___FACTOR_DELIMITER = "|";
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";
-my ($CORPUS,$REDUCED,$FACTOR);
+my ($CORPUS,$REDUCED,$FACTOR,$_XML);
die("ERROR: wrong syntax when invoking reduce-factors")
unless &GetOptions('corpus=s' => \$CORPUS,
'reduced-corpus=s' => \$REDUCED,
- 'factor=s' => \$FACTOR);
+ 'factor=s' => \$FACTOR,
+ 'xml' => \$_XML);
&reduce_factors($CORPUS,$REDUCED,$FACTOR);
@@ -24,9 +25,9 @@ sub reduce_factors {
my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);
- print "Reducing factors to produce $reduced @ ".`date`;
+ print STDERR "(1.0.5) reducing factors to produce $reduced @ ".`date`;
while(-e $reduced.".lock") {
- sleep(10);
+ sleep(10);
}
if (-e $reduced) {
print STDERR " $reduced in place, reusing\n";
@@ -37,29 +38,31 @@ sub reduce_factors {
return;
}
- # peek at input, to check if we are asked to produce exactly the
- # available factors
- my $inh = open_or_zcat($full);
- my $firstline = <$inh>;
- die "Corpus file $full is empty" unless $firstline;
- close $inh;
- # pick first word
- $firstline =~ s/^\s*//;
- $firstline =~ s/\s.*//;
- # count factors
- my @WORD = split(/ /,$firstline);
- my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
- my $maxfactorindex = scalar(@FACTOR)-1;
- if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
- # create just symlink; preserving compression
- my $realfull = $full;
- if (!-e $realfull && -e $realfull.".gz") {
+ unless ($_XML) {
+ # peek at input, to check if we are asked to produce exactly the
+ # available factors
+ my $inh = open_or_zcat($full);
+ my $firstline = <$inh>;
+ die "Corpus file $full is empty" unless $firstline;
+ close $inh;
+ # pick first word
+ $firstline =~ s/^\s*//;
+ $firstline =~ s/\s.*//;
+ # count factors
+ my @WORD = split(/ /,$firstline);
+ my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
+ my $maxfactorindex = scalar(@FACTOR)-1;
+ if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
+ # create just symlink; preserving compression
+ my $realfull = $full;
+ if (!-e $realfull && -e $realfull.".gz") {
$realfull .= ".gz";
$reduced =~ s/(\.gz)?$/.gz/;
- }
- safesystem("ln -s '$realfull' '$reduced'")
+ }
+ safesystem("ln -s '$realfull' '$reduced'")
or die "Failed to create symlink $realfull -> $reduced";
- return;
+ return;
+ }
}
# The default is to select the needed factors
@@ -71,23 +74,30 @@ sub reduce_factors {
$nr++;
print STDERR "." if $nr % 10000 == 0;
print STDERR "($nr)" if $nr % 100000 == 0;
- chomp; s/ +/ /g; s/^ //; s/ $//;
- my $first = 1;
- foreach (split) {
- my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
+ s/<\S[^>]*>/ /g if $_XML; # remove xml
+ chomp; s/ +/ /g; s/^ //; s/ $//;
+ my $first = 1;
+ foreach (split) {
+ my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
# \Q causes to disable metacharacters in regex
- print OUT " " unless $first;
- $first = 0;
- my $first_factor = 1;
+ print OUT " " unless $first;
+ $first = 0;
+ my $first_factor = 1;
foreach my $outfactor (@INCLUDE) {
- print OUT "|" unless $first_factor;
+ print OUT $___FACTOR_DELIMITER unless $first_factor;
$first_factor = 0;
my $out = $FACTOR[$outfactor];
die "ERROR: Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
print OUT $out;
}
- }
- print OUT "\n";
+ # for(my $factor=0;$factor<=$#FACTOR;$factor++) {
+ # next unless defined($INCLUDE{$factor});
+ # print OUT "|" unless $first_factor;
+ # $first_factor = 0;
+ # print OUT $FACTOR[$factor];
+ # }
+ }
+ print OUT "\n";
}
print STDERR "\n";
close(OUT);
diff --git a/scripts/training/wrappers/mosesxml2brackets.py b/scripts/training/wrappers/mosesxml2brackets.py
new file mode 100755
index 000000000..b82685638
--- /dev/null
+++ b/scripts/training/wrappers/mosesxml2brackets.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Author: Rico Sennrich
+
+# convert trees in moses XML format to PTB-style bracketed format
+
+from __future__ import print_function, unicode_literals
+import sys
+import codecs
+
+from lxml import etree as ET
+
+def escape(word):
+ word = word.replace('|','&#124;') # factor separator
+ word = word.replace('[','&#91;') # syntax non-terminal
+ word = word.replace(']','&#93;') # syntax non-terminal
+ word = word.replace('\'','&apos;')
+ word = word.replace('\"','&quot;')
+
+ return word
+
+def make_brackets(xml):
+
+ out = ' [' + xml.get('label')
+
+ if xml.text and xml.text.strip():
+ word = escape(xml.text.strip())
+ out += ' ' + word + ']'
+
+ else:
+ for child in xml:
+ out += make_brackets(child)
+
+ out += ']'
+
+ return out
+
+
+if __name__ == '__main__':
+
+ if sys.version_info < (3, 0):
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+
+ for line in sys.stdin:
+ if line == '\n':
+ sys.stdout.write(line)
+ continue
+ out = make_brackets(ET.fromstring(line)).strip()
+ sys.stdout.write(out + '\n')
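To make the output format concrete, here is a hedged example of running the converter on one invented Moses-XML tree (the tokens and labels are made up, and the sys.path entry assumes a mosesdecoder checkout as the working directory):

    # Hedged illustration; tree, tokens and paths are invented for the example.
    import sys
    sys.path.append('scripts/training/wrappers')  # assumed checkout-relative path
    import mosesxml2brackets as m2b

    xml = m2b.ET.fromstring(
        '<tree label="NP"><tree label="DT">the</tree><tree label="NN">cat</tree></tree>')
    print(m2b.make_brackets(xml).strip())
    # expected: [NP [DT the] [NN cat]]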