
github.com/moses-smt/mosesdecoder.git
author     Barry Haddow <barry.haddow@gmail.com>  2013-02-21 21:34:59 +0400
committer  Barry Haddow <barry.haddow@gmail.com>  2013-02-21 21:34:59 +0400
commit     51ab9aa19dccefea54b45dc81a929301ba5d6ea5 (patch)
tree       364018d46af083e52b0658359d535a055bdd92ac /scripts/ems
parent     87d7294d50d69da1833b6a78829154c444f2be6e (diff)
parent     5844fb21a758a492b0847ba0939a7856a9a5cb68 (diff)

Merge remote branch 'origin/master' into phrase-weighting
Diffstat (limited to 'scripts/ems')
-rw-r--r--  scripts/ems/example/config.basic                          73
-rw-r--r--  scripts/ems/example/config.factored                       72
-rw-r--r--  scripts/ems/example/config.hierarchical                   71
-rw-r--r--  scripts/ems/example/config.syntax                         71
-rw-r--r--  scripts/ems/example/config.toy                            72
-rw-r--r--  scripts/ems/experiment.machines                            2
-rw-r--r--  scripts/ems/experiment.meta                              225
-rwxr-xr-x  scripts/ems/experiment.perl                              170
-rwxr-xr-x  scripts/ems/support/build-sparse-lexical-features.perl     1
-rw-r--r--  scripts/ems/support/defaultconfig.py                      53
-rwxr-xr-x  scripts/ems/support/interpolate-lm.perl                    2
-rwxr-xr-x  scripts/ems/support/mml-filter.perl                       46
-rwxr-xr-x  scripts/ems/support/mml-filter.py                        156
-rwxr-xr-x  scripts/ems/support/mml-score.perl                       118
-rwxr-xr-x  scripts/ems/support/mml-train.perl                        84
-rwxr-xr-x  scripts/ems/support/split-sentences.perl                   8
-rw-r--r--  scripts/ems/support/train-irstlm.perl                     22
17 files changed, 1107 insertions, 139 deletions
diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic
index 9542026f8..86dad3c15 100644
--- a/scripts/ems/example/config.basic
+++ b/scripts/ems/example/config.basic
@@ -139,9 +139,10 @@ raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
-# irstlm
-#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
-#settings = "-s improved-kneser-ney"
+# irstlm training
+# msb = modified kneser ney; p=0 no singleton pruning
+#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
+#settings = "-s msb -p 0"
# order of the language model
order = 5
@@ -251,6 +252,35 @@ type = 8
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
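
For illustration only, an activated [MML] section might look like the sketch below (corpus names reuse the toy/giga defaults from the commented lines above; drop the IGNORE tag to enable the module). The same commented block is added to all five example configs in this commit.

    [MML]

    lm-training = $srilm-dir/ngram-count
    lm-settings = "-interpolate -kndiscount -unk"
    lm-binarizer = $moses-src-dir/bin/build_binary
    lm-query = $moses-src-dir/bin/query
    order = 5
    indomain-stem = [CORPUS:toy:clean-split-stem]
    outdomain-stem = [CORPUS:giga:clean-split-stem]
    settings = "--line-count 100000"
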
@@ -310,11 +340,29 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
+### use of baseline alignment model (incremental training)
+#
+#baseline = 68
+#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
+# $working-dir/training/prepared.$baseline/$output-extension.vcb \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
+
### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
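
To make the proportion settings concrete: following the "ratio to be kept" comment above, -proportion 0.9 would keep the best-scoring 90% of sentence pairs in each listed corpus, e.g. roughly 90,000 pairs out of a 100,000-pair corpus (an illustration of the switch's apparent semantics, not a documented guarantee).
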
@@ -331,6 +379,16 @@ lexicalized-reordering = msd-bidirectional-fe
### settings for rule extraction
#
#extract-settings = ""
+max-phrase-length = 5
+
+### add extracted phrases from baseline model
+#
+#baseline-extract = $working-dir/model/extract.$baseline
+#
+# requires aligned parallel corpus for re-estimating lexical translation probabilities
+#baseline-corpus = $working-dir/training/corpus.$baseline
+#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method
+
### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
@@ -369,6 +427,12 @@ score-settings = "--GoodTuring"
#
# reordering-table =
+### filtering the phrase table based on significance tests
+# Johnson, Martin, Foster and Kuhn. (2007): "Improving Translation Quality by Discarding Most of the Phrasetable"
+# options: -n number of translations; -l 'a+e', 'a-e', or a positive real value (a -log prob threshold)
+#salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64
+#sigtest-filter = "-l a+e -n 50"
+
### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
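
The sigtest-filter settings above feed the new define_training_sigtest_filter step in experiment.perl (further down in this diff), which generates a pipeline of roughly this shape, with illustrative file names:

    zcat phrase-table.gz \
      | $moses-src-dir/contrib/sigtest-filter/filter-pt \
          -e corpus.en -f corpus.fr -l a+e -n 50 \
      | gzip - > phrase-table.sigtest-filter.gz

Here corpus.en and corpus.fr stand for the SALM suffix-array stems produced by the sigtest-filter-suffix-array step in experiment.meta.
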
@@ -543,6 +607,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:newstest2011]
diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored
index db990dd80..dbc783189 100644
--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@@ -139,9 +139,10 @@ raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
-# irstlm
-#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
-#settings = "-s improved-kneser-ney"
+# irstlm training
+# msb = modified kneser ney; p=0 no singleton pruning
+#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
+#settings = "-s msb -p 0"
# order of the language model
order = 5
@@ -271,6 +272,35 @@ mxpost = /home/pkoehn/bin/mxpost
factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.perl -mxpost $mxpost"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
@@ -330,11 +360,29 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
+### use of baseline alignment model (incremental training)
+#
+#baseline = 68
+#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
+# $working-dir/training/prepared.$baseline/$output-extension.vcb \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
+
### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
@@ -351,6 +399,15 @@ lexicalized-reordering = msd-bidirectional-fe
### settings for rule extraction
#
#extract-settings = ""
+max-phrase-length = 5
+
+### add extracted phrases from baseline model
+#
+#baseline-extract = $working-dir/model/extract.$baseline
+#
+# requires aligned parallel corpus for re-estimating lexical translation probabilities
+#baseline-corpus = $working-dir/training/corpus.$baseline
+#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method
### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
@@ -389,6 +446,12 @@ score-settings = "--GoodTuring"
#
# reordering-table =
+### filtering the phrase table based on significance tests
+# Johnson, Martin, Foster and Kuhn. (2007): "Improving Translation Quality by Discarding Most of the Phrasetable"
+# options: -n number of translations; -l 'a+e', 'a-e', or a positive real value (a -log prob threshold)
+#salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64
+#sigtest-filter = "-l a+e -n 50"
+
### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
@@ -559,6 +622,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:newstest2011]
diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical
index 67578885d..90ae19432 100644
--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@@ -139,9 +139,10 @@ raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
-# irstlm
-#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
-#settings = "-s improved-kneser-ney"
+# irstlm training
+# msb = modified kneser ney; p=0 no singleton pruning
+#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
+#settings = "-s msb -p 0"
# order of the language model
order = 5
@@ -251,6 +252,35 @@ type = 8
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
@@ -310,11 +340,29 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
+### use of baseline alignment model (incremental training)
+#
+#baseline = 68
+#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
+# $working-dir/training/prepared.$baseline/$output-extension.vcb \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
+
### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
@@ -332,6 +380,14 @@ hierarchical-rule-set = true
#
#extract-settings = ""
+### add extracted phrases from baseline model
+#
+#baseline-extract = $working-dir/model/extract.$baseline
+#
+# requires aligned parallel corpus for re-estimating lexical translation probabilities
+#baseline-corpus = $working-dir/training/corpus.$baseline
+#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method
+
### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
# label file is generated during rule extraction
@@ -369,6 +425,12 @@ score-settings = "--GoodTuring"
#
# reordering-table =
+### filtering the phrase table based on significance tests
+# Johnson, Martin, Foster and Kuhn. (2007): "Improving Translation Quality by Discarding Most of the Phrasetable"
+# options: -n number of translations; -l 'a+e', 'a-e', or a positive real value (a -log prob threshold)
+#salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64
+#sigtest-filter = "-l a+e -n 50"
+
### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
@@ -539,6 +601,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:newstest2011]
diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax
index 68a6eb4e8..2a0cce137 100644
--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@@ -143,9 +143,10 @@ raw-stem = $wmt12-data/training/undoc.2000.$pair-extension
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
-# irstlm
-#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
-#settings = "-s improved-kneser-ney"
+# irstlm training
+# msb = modified kneser ney; p=0 no singleton pruning
+#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
+#settings = "-s msb -p 0"
# order of the language model
order = 5
@@ -255,6 +256,35 @@ type = 8
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
@@ -314,11 +344,29 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
+### use of baseline alignment model (incremental training)
+#
+#baseline = 68
+#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
+# $working-dir/training/prepared.$baseline/$output-extension.vcb \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
+
### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
@@ -336,6 +384,14 @@ hierarchical-rule-set = true
#
extract-settings = "--MinHoleSource 1 --NonTermConsecSource"
+### add extracted phrases from baseline model
+#
+#baseline-extract = $working-dir/model/extract.$baseline
+#
+# requires aligned parallel corpus for re-estimating lexical translation probabilities
+#baseline-corpus = $working-dir/training/corpus.$baseline
+#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method
+
### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
# label file is generated during rule extraction
@@ -373,6 +429,12 @@ score-settings = "--GoodTuring"
#
# reordering-table =
+### filtering the phrase table based on significance tests
+# Johnson, Martin, Foster and Kuhn. (2007): "Improving Translation Quality by Discarding Most of the Phrasetable"
+# options: -n number of translations; -l 'a+e', 'a-e', or a positive real value (a -log prob threshold)
+#salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64
+#sigtest-filter = "-l a+e -n 50"
+
### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
@@ -543,6 +605,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:newstest2011]
diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy
index 20575e1dc..17678b31f 100644
--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@@ -133,9 +133,10 @@ raw-stem = $toy-data/nc-5k
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
-# irstlm
-#lm-training = "$moses-script-dir/generic/trainlm-irst.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/lm"
-#settings = "-s improved-kneser-ney"
+# irstlm training
+# msb = modified kneser ney; p=0 no singleton pruning
+#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
+#settings = "-s msb -p 0"
# order of the language model
order = 5
@@ -235,6 +236,35 @@ type = 8
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (if used at all, should be small as a percentage of corpus)
+#settings = "--line-count 100000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
@@ -294,11 +324,29 @@ alignment-symmetrization-method = grow-diag-final-and
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5
+### use of baseline alignment model (incremental training)
+#
+#baseline = 68
+#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
+# $working-dir/training/prepared.$baseline/$output-extension.vcb \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
+# $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
+# $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"
+
### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
@@ -315,6 +363,15 @@ lexicalized-reordering = msd-bidirectional-fe
### settings for rule extraction
#
#extract-settings = ""
+max-phrase-length = 5
+
+### add extracted phrases from baseline model
+#
+#baseline-extract = $working-dir/model/extract.$baseline
+#
+# requires aligned parallel corpus for re-estimating lexical translation probabilities
+#baseline-corpus = $working-dir/training/corpus.$baseline
+#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method
### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
@@ -353,6 +410,12 @@ score-settings = "--GoodTuring"
#
# reordering-table =
+### filtering the phrase table based on significance tests
+# Johnson, Martin, Foster and Kuhn. (2007): "Improving Translation Quality by Discarding Most of the Phrasetable"
+# options: -n number of translations; -l 'a+e', 'a-e', or a positive real value (a -log prob threshold)
+#salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64
+#sigtest-filter = "-l a+e -n 50"
+
### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
@@ -519,6 +582,9 @@ report-segmentation = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
+#
+# visualization of the search graph in tree-based models
+#analyze-search-graph = yes
[EVALUATION:test]
diff --git a/scripts/ems/experiment.machines b/scripts/ems/experiment.machines
index 7fdecd9cd..6459be2c4 100644
--- a/scripts/ems/experiment.machines
+++ b/scripts/ems/experiment.machines
@@ -1,4 +1,4 @@
cluster: townhill seville hermes lion seville sannox lutzow frontend
multicore-8: tyr thor odin crom
multicore-16: saxnot vali vili freyja bragi hoenir
-multicore-24: syn hel skaol saga
+multicore-24: syn hel skaol saga buri loki sif magni
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 6df2701a0..214569206 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -129,7 +129,7 @@ train
[LM] multiple
get-corpus
- in: get-corpus-script
+ in: get-corpus-script
out: raw-corpus
pass-unless: get-corpus-script
default-name: lm/txt
@@ -166,7 +166,7 @@ truecase
default-name: lm/truecased
ignore-unless: output-truecaser
only-factor-0: yes
- template: $output-truecaser -model IN1.$output-extension < IN > OUT
+ template: $output-truecaser -model IN1.$output-extension < IN > OUT
parallelizable: yes
split
in: lowercased-corpus SPLITTER:splitter-model
@@ -208,7 +208,7 @@ binarize
rerun-on-change: lm
default-name: lm/binlm
template: $lm-binarizer IN OUT
- error: set KENLM_MAX_ORDER to at least this value
+ error: set KENLM_MAX_ORDER to at least this value
[INTERPOLATED-LM] single
tuning-from-sgm
@@ -276,6 +276,104 @@ binarize
rerun-on-change: lm
default-name: lm/interpolated-binlm
error: set kMaxOrder to at least this value
+[MML] single
+tokenize-indomain-source
+ in: raw-indomain-source
+ out: tokenized-indomain-source
+ default-name: mml/indomain-source.tok
+ pass-unless: input-tokenizer
+ template: $input-tokenizer < IN > OUT
+ parallelizable: yes
+factorize-indomain-source
+ in: tokenized-indomain-source
+ out: factorized-indomain-source
+ rerun-on-change: TRAINING:input-factors
+ default-name: mml/indomain-source.factored
+ pass-unless: factors
+ parallelizable: yes
+ error: can't open
+ error: incompatible number of words in factor
+lowercase-indomain-source
+ in: factorized-indomain-source
+ out: lowercased-indomain-source
+ default-name: mml/indomain-source.lowercased
+ pass-unless: input-lowercaser
+ ignore-if: input-truecaser
+ only-factor-0: yes
+ template: $input-lowercaser < IN > OUT
+ parallelizable: yes
+truecase-indomain-source
+ in: factorized-indomain-source TRUECASER:truecase-model
+ out: lowercased-indomain-source
+ rerun-on-change: input-truecaser
+ default-name: mml/indomain-source.truecased
+ ignore-unless: input-truecaser
+ only-factor-0: yes
+ template: $input-truecaser -model IN1.$input-extension < IN > OUT
+ parallelizable: yes
+split-indomain-source
+ in: lowercased-indomain-source SPLITTER:splitter-model
+ out: indomain-source
+ rerun-on-change: input-splitter
+ default-name: mml/indomain-source.split
+ pass-unless: input-splitter
+ template: $input-splitter -model IN1.$input-extension < IN > OUT
+tokenize-indomain-target
+ in: raw-indomain-target
+ out: tokenized-indomain-target
+ default-name: mml/indomain-target.tok
+ pass-unless: output-tokenizer
+ template: $output-tokenizer < IN > OUT
+ parallelizable: yes
+factorize-indomain-target
+ in: tokenized-indomain-target
+ out: factorized-indomain-target
+ rerun-on-change: TRAINING:output-factors
+ default-name: mml/indomain-target.factored
+ pass-unless: factors
+ parallelizable: yes
+ error: can't open
+ error: incompatible number of words in factor
+lowercase-indomain-target
+ in: factorized-indomain-target
+ out: lowercased-indomain-target
+ default-name: mml/indomain-target.lowercased
+ pass-unless: output-lowercaser
+ ignore-if: output-truecaser
+ only-factor-0: yes
+ template: $output-lowercaser < IN > OUT
+ parallelizable: yes
+truecase-indomain-target
+ in: factorized-indomain-target TRUECASER:truecase-model
+ out: lowercased-indomain-target
+ rerun-on-change: output-truecaser
+ default-name: mml/indomain-target.truecased
+ ignore-unless: output-truecaser
+ only-factor-0: yes
+ template: $output-truecaser -model IN1.$output-extension < IN > OUT
+ parallelizable: yes
+split-indomain-target
+ in: lowercased-indomain-target SPLITTER:splitter-model
+ out: indomain-target
+ rerun-on-change: output-splitter
+ default-name: mml/indomain-target.split
+ pass-unless: output-splitter
+ template: $output-splitter -model IN1.$output-extension < IN > OUT
+train
+ in: indomain-stem outdomain-stem
+ out: model
+ rerun-on-change: settings
+ ignore-unless: indomain-stem
+ default-name: mml/model
+ template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
+train-in-mono
+ in: indomain-source indomain-target outdomain-stem
+ out: model
+ rerun-on-change: settings
+ ignore-if: indomain-stem
+ default-name: mml/model
+ template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
+
[TRAINING] single
consolidate
in: CORPUS:clean-split-stem
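
Expanding the train template above with concrete values gives a command along these lines (all file names illustrative; the four language models are in/out-of-domain crossed with source/target):

    $moses-script-dir/ems/support/mml-train.perl \
      -in-source indomain.fr -in-target indomain.en \
      -out-source outdomain.fr -out-target outdomain.en \
      -model mml/model \
      -lm-training $srilm-dir/ngram-count -order 5 \
      -lm-settings "-interpolate -kndiscount -unk" \
      -lm-binarizer $moses-src-dir/bin/build_binary \
      --line-count 100000
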
@@ -286,30 +384,46 @@ build-domains
in: CORPUS:clean-split-stem
out: domains
default-name: model/domains
- ignore-unless: domain-features
+ ignore-unless: domain-features mml-filter-corpora
template: $moses-script-dir/ems/support/build-domain-file-from-subcorpora.perl $input-extension IN > OUT
+mml-score
+ in: MML:model corpus domains
+ out: mml-scores
+ ignore-unless: mml-before-wa mml-after-wa
+ rerun-on-change: mml-filter-corpora
+ default-name: training/corpus-mml-score
+ template: $moses-script-dir/ems/support/mml-score.perl -model IN -corpus IN1 -domains IN2 -input-extension $input-extension -output-extension $output-extension -query $MML:lm-query -filter-domains "$mml-filter-corpora" > OUT
+mml-filter-before-wa
+ in: corpus mml-scores domains
+ out: corpus-mml-prefilter
+ ignore-unless: mml-before-wa
+ rerun-on-change: mml-filter-corpora mml-before-wa
+ default-name: training/corpus-mml
+ template: $moses-script-dir/ems/support/mml-filter.perl -in IN -out OUT -score IN1 -domain IN2 -input-extension $input-extension -output-extension $output-extension $mml-before-wa
prepare-data
- in: corpus
+ in: corpus-mml-prefilter=OR=corpus
out: prepared-data
- rerun-on-change: alignment-factors training-options script
+ rerun-on-change: alignment-factors training-options script baseline-alignment-model external-bin-dir
ignore-if: use-berkeley
default-name: prepared
run-giza
in: prepared-data
out: giza-alignment
ignore-if: use-berkeley
- rerun-on-change: giza-settings training-options script
+ rerun-on-change: giza-settings training-options script baseline-alignment-model external-bin-dir
default-name: giza
error: not found
+ not-error: 0 not found
run-giza-inverse
in: prepared-data
out: giza-alignment-inverse
- rerun-on-change: giza-settings training-options script
+ rerun-on-change: giza-settings training-options script baseline-alignment-model external-bin-dir
ignore-if: use-berkeley
default-name: giza-inverse
error: not found
+ not-error: 0 not found
run-berkeley
- in: corpus
+ in: corpus-mml-prefilter
out: berkeley-alignment
ignore-unless: use-berkeley
rerun-on-change: berkeley-train berkeley-jar berkeley-training-options
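
The corpus-mml-prefilter=OR=corpus notation used here reads as a fallback chain: take corpus-mml-prefilter as the step's input when MML pre-filtering is configured, otherwise the plain corpus. The relaxed =OR= resolution that makes this work is in find_steps_for_module in experiment.perl below.
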
@@ -317,7 +431,7 @@ run-berkeley
template: $berkeley-train " $berkeley-java-options " $berkeley-jar IN OUT $input-extension $output-extension $berkeley-training-options
not-error: 0 errors,
process-berkeley
- in: corpus berkeley-alignment
+ in: corpus-mml-prefilter berkeley-alignment
out: word-alignment
default-name: model/aligned
rerun-on-change: berkeley-process berkeley-jar berkeley-posterior berkeley-process-options
@@ -331,47 +445,54 @@ symmetrize-giza
rerun-on-change: alignment-symmetrization-method training-options script
default-name: model/aligned
error: skip=<[1-9]
+mml-filter-after-wa
+ in: corpus-mml-prefilter=OR=corpus word-alignment mml-scores corpus-mml-prefilter=OR=domains
+ out: corpus-mml-postfilter
+ ignore-unless: mml-after-wa
+ rerun-on-change: mml-filter-corpora mml-after-wa
+ default-name: model/corpus-mml
+ template: $moses-script-dir/ems/support/mml-filter.perl -in IN -out OUT -alignment IN1 -score IN2 -domain IN3 -input-extension $input-extension -output-extension $output-extension $mml-after-wa
build-biconcor
- in: word-alignment corpus
+ in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: biconcor-model
default-name: model/biconcor
ignore-unless: biconcor
error: usage
build-suffix-array
- in: word-alignment corpus
+ in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: phrase-translation-table
default-name: model/suffix-array
ignore-unless: suffix-array
error: usage
build-lex-trans
- in: word-alignment corpus
+ in: corpus-mml-postfilter=OR=word-alignment corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: lexical-translation-table
rerun-on-change: translation-factors training-options script
default-name: model/lex
parse-relax
- in: corpus
+ in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: parse-relaxed-corpus
default-name: model/parsed-relaxed
pass-unless: input-parse-relaxer output-parse-relaxer
template-if: input-parse-relaxer IN.$input-extension OUT.$input-extension
template-if: output-parse-relaxer IN.$output-extension OUT.$output-extension
pcfg-extract
- in: parse-relaxed-corpus
- out: pcfg
- default-name: model/pcfg
- ignore-unless: use-pcfg-feature
- rerun-on-change: use-pcfg-feature
- template: $moses-bin-dir/pcfg-extract < IN.$output-extension > OUT.$output-extension
+ in: parse-relaxed-corpus
+ out: pcfg
+ default-name: model/pcfg
+ ignore-unless: use-pcfg-feature
+ rerun-on-change: use-pcfg-feature
+ template: $moses-bin-dir/pcfg-extract < IN.$output-extension > OUT.$output-extension
pcfg-score
- in: parse-relaxed-corpus pcfg
- out: scored-corpus
- default-name: model/scored-corpus
- pass-unless: use-pcfg-feature
- template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
+ in: parse-relaxed-corpus pcfg
+ out: scored-corpus
+ default-name: model/scored-corpus
+ pass-unless: use-pcfg-feature
+ template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
extract-phrases
- in: word-alignment scored-corpus
+ in: corpus-mml-postfilter=OR=word-alignment scored-corpus
out: extracted-phrases
- rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm domain-features
+ rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm domain-features baseline-extract
only-existence-matters: domain-features
default-name: model/extract
ignore-if: suffix-array
@@ -382,13 +503,39 @@ build-reordering
rerun-on-change: lexicalized-reordering reordering-factors
default-name: model/reordering-table
build-ttable
- in: extracted-phrases lexical-translation-table domains
+ in: extracted-phrases lexical-translation-table corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains
out: phrase-translation-table
rerun-on-change: translation-factors hierarchical-rule-set score-settings training-options script EVALUATION:report-precision-by-coverage include-word-alignment-in-rules domain-features
default-name: model/phrase-table
ignore-if: suffix-array
+sigtest-filter-suffix-array
+ in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
+ out: sigtest-filter-suffix-array
+ default-name: training/corpus
+ template: $salm-index IN.$input-extension ; \
+ mv IN.${input-extension}.id_voc OUT.${input-extension}.id_voc ; \
+ mv IN.${input-extension}.sa_corpus OUT.${input-extension}.sa_corpus ; \
+ mv IN.${input-extension}.sa_offset OUT.${input-extension}.sa_offset ; \
+ mv IN.${input-extension}.sa_suffix OUT.${input-extension}.sa_suffix ; \
+ $salm-index IN.$output-extension ; \
+ mv IN.${output-extension}.id_voc OUT.${output-extension}.id_voc ; \
+ mv IN.${output-extension}.sa_corpus OUT.${output-extension}.sa_corpus ; \
+ mv IN.${output-extension}.sa_offset OUT.${output-extension}.sa_offset ; \
+ mv IN.${output-extension}.sa_suffix OUT.${output-extension}.sa_suffix
+ ignore-unless: sigtest-filter
+sigtest-filter-ttable
+ in: phrase-translation-table sigtest-filter-suffix-array
+ out: sigtest-filter-phrase-translation-table
+ default-name: model/phrase-table-sigtest-filter
+ pass-unless: sigtest-filter
+sigtest-filter-reordering
+ in: reordering-table sigtest-filter-suffix-array
+ out: sigtest-filter-reordering-table
+ default-name: model/reordering-table-sigtest-filter
+ pass-unless: sigtest-filter
+ ignore-unless: lexicalized-reordering
build-generation
- in: corpus
+ in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: generation-table
rerun-on-change: generation-factors generation-type training-options script
ignore-unless: generation-factors
@@ -401,14 +548,14 @@ build-generation-custom
ignore-unless: AND generation-factors generation-corpus
default-name: model/generation-table
build-sparse-lexical
- in: corpus
+ in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: sparse-lexical
ignore-unless: sparse-lexical-features
rerun-on-change: sparse-lexical-features
default-name: model/most-frequent-words
template: $moses-script-dir/ems/support/build-sparse-lexical-features.perl IN $input-extension $output-extension OUT "$sparse-lexical-features"
create-config
- in: reordering-table phrase-translation-table generation-table sparse-lexical domains INTERPOLATED-LM:binlm LM:binlm
+ in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table generation-table sparse-lexical corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains INTERPOLATED-LM:binlm LM:binlm
out: config
ignore-if: use-hiero
rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini
@@ -422,19 +569,19 @@ binarize-config
default-name: model/moses.bin.ini
template: $binarize-all $ttable-binarizer $rtable-binarizer OUT IN
hiero-compile-source-suffix-array
- in: corpus
+ in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: hiero-source-suffix-array
ignore-unless: use-hiero
default-name: hiero-model/f.sa.bin
template: $hiero-decode-dir/compile_bin.py -s IN.$input-extension OUT
hiero-compile-target
- in: corpus
+ in: corpus-mml-postfilter=OR=corpus-mml-prefilter=OR=corpus
out: hiero-target-array
ignore-unless: use-hiero
default-name: hiero-model/e.bin
template: $hiero-decode-dir/compile_bin.py IN.$output-extension OUT
hiero-compile-alignment
- in: word-alignment
+ in: corpus-mml-postfilter=OR=word-alignment
out: hiero-alignment-array
ignore-unless: use-hiero
default-name: hiero-model/a.bin
@@ -653,7 +800,7 @@ split-reference-devtest
multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
filter
- in: input TRAINING:phrase-translation-table TRAINING:reordering-table TRAINING:domains
+ in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains
out: filtered-dir
default-name: tuning/filtered
rerun-on-change: filter-settings
@@ -661,7 +808,7 @@ filter
ignore-if: use-hiero
error: already exists. Please delete
filter-devtest
- in: input-devtest TRAINING:phrase-translation-table TRAINING:reordering-table
+ in: input-devtest TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table
out: filtered-dir-devtest
default-name: tuning/filtered.devtest
rerun-on-change: filter-settings
@@ -772,7 +919,7 @@ split-input
pass-unless: input-splitter
template: $input-splitter -model IN1.$input-extension < IN > OUT
filter
- in: input TRAINING:phrase-translation-table TRAINING:reordering-table TRAINING:domains
+ in: input TRAINING:sigtest-filter-phrase-translation-table TRAINING:sigtest-filter-reordering-table TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus-mml-postfilter=OR=TRAINING:domains
out: filtered-dir
default-name: evaluation/filtered
rerun-on-change: filter-settings report-precision-by-coverage
@@ -941,13 +1088,13 @@ analysis
ignore-unless: analysis
rerun-on-change: analyze-search-graph
analysis-coverage
- in: input TRAINING:corpus TRAINING:phrase-translation-table
+ in: input TRAINING:corpus-mml-postfilter=OR=TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus TRAINING:sigtest-filter-phrase-translation-table
out: analysis-coverage
default-name: evaluation/analysis
ignore-unless: AND analysis analyze-coverage
rerun-on-change: score-settings
analysis-precision
- in: recased-output reference input TRAINING:corpus TRAINING:phrase-translation-table analysis-coverage
+ in: recased-output reference input TRAINING:corpus-mml-postfilter=OR=TRAINING:corpus-mml-prefilter=OR=TRAINING:corpus TRAINING:sigtest-filter-phrase-translation-table analysis-coverage
out: analysis
default-name: evaluation/analysis
ignore-unless: AND analysis analyze-coverage report-precision-by-coverage
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 0fac94f4b..ff619b0a3 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -17,7 +17,7 @@ sub trim($)
my $host = `hostname`; chop($host);
print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`;
-my ($CONFIG_FILE,$EXECUTE,$NO_GRAPH,$CONTINUE,$VERBOSE,$IGNORE_TIME);
+my ($CONFIG_FILE,$EXECUTE,$NO_GRAPH,$CONTINUE,$FINAL,$VERBOSE,$IGNORE_TIME);
my $SLEEP = 2;
my $META = "$RealBin/experiment.meta";
@@ -38,6 +38,7 @@ die("experiment.perl -config config-file [-exec] [-no-graph]")
'exec' => \$EXECUTE,
'cluster' => \$CLUSTER,
'multicore' => \$MULTICORE,
+ 'final=s' => \$FINAL,
'meta=s' => \$META,
'verbose' => \$VERBOSE,
'sleep=i' => \$SLEEP,
@@ -194,6 +195,10 @@ sub read_meta {
while(<META>) {
s/\#.*$//; # strip comments
next if /^\s*$/;
+ while (/\\\s*$/) {
+ $_ .= <META>;
+ s/\s*\\\s*[\n\r]*\s+/ /;
+ }
if (/^\[(.+)\]\s+(\S+)/) {
$module = $1;
push @MODULE,$module;
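
A self-contained sketch of the new continuation handling (not part of the commit; the sample lines are illustrative), showing how a backslash-continued meta entry is spliced into one logical line:

    my @lines = (
        "template: \$salm-index IN ; \\\n",
        "    mv IN.id_voc OUT.id_voc\n",
    );
    my $i = 0;
    $_ = $lines[$i++];
    while (/\\\s*$/ && $i <= $#lines) {
        $_ .= $lines[$i++];               # pull in the continuation line
        s/\s*\\\s*[\n\r]*\s+/ /;          # collapse backslash-newline-indent into one space
    }
    print;   # template: $salm-index IN ; mv IN.id_voc OUT.id_voc
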
@@ -242,10 +247,18 @@ sub read_meta {
$MULTIREF{"$module:$step"} = $2;
}
elsif ($1 eq "template") {
- $TEMPLATE{"$module:$step"} = $2;
+ my $escaped_template = $2;
+ $escaped_template =~ s/^IN/EMS_IN_EMS/;
+ $escaped_template =~ s/ IN(\d*)/ EMS_IN$1_EMS/g;
+ $escaped_template =~ s/ OUT/ EMS_OUT_EMS/g;
+ $TEMPLATE{"$module:$step"} = $escaped_template;
}
elsif ($1 eq "template-if") {
- my @IF = split(/\s+/,$2);
+ my $escaped_template = $2;
+ $escaped_template =~ s/^IN/EMS_IN_EMS/;
+ $escaped_template =~ s/ IN(\d*)/ EMS_IN$1_EMS/g;
+ $escaped_template =~ s/ OUT/ EMS_OUT_EMS/g;
+ my @IF = split(/\s+/,$escaped_template);
push @{$TEMPLATE_IF{"$module:$step"}}, \@IF;
}
elsif ($1 eq "parallelizable") {
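
The switch to EMS_IN_EMS/EMS_OUT_EMS tokens replaces the fragile substitutions further down that had to dodge the IN inside words like TRAINING and RECASING. A toy demonstration of the hazard (plain Perl, not part of the commit):

    my $naive = "run TRAINING:corpus with IN";
    $naive =~ s/IN/input.txt/;        # first match is the IN inside TRAINING
    print "$naive\n";                 # run TRAinput.txtING:corpus with IN

    my $tokenized = "run TRAINING:corpus with EMS_IN_EMS";
    $tokenized =~ s/EMS_IN_EMS/input.txt/;
    print "$tokenized\n";             # run TRAINING:corpus with input.txt
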
@@ -284,6 +297,10 @@ sub read_config {
$line_count++;
s/\#.*$//; # strip comments
next if /^\#/ || /^\s*$/;
+ while (/\\\s*$/) { # merge with next line
+ s/\s*\\\s*$/ /;
+ $_ .= <INI>;
+ }
if (/^\[(.+)\]/) {
$module = $1;
$ignore = /ignore/i;
@@ -316,7 +333,7 @@ sub read_config {
# resolve parameters used in values
my $resolve = 1;
my $loop_count = 0;
- while($resolve && $loop_count++ < 10) {
+ while($resolve && $loop_count++ < 100) {
$resolve = 0;
foreach my $parameter (keys %CONFIG) {
foreach (@{$CONFIG{$parameter}}) {
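
Raising the cap from 10 to 100 matters for long chains of indirection in the config: each pass of the loop resolves one level of $parameter references, so a chain like $a = $b, $b = $c, ... more than ten levels deep previously stopped resolving.
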
@@ -395,7 +412,12 @@ sub log_config {
sub find_steps {
# find final output to be produced by the experiment
- push @{$NEEDED{"REPORTING:report"}}, "final";
+ if (defined($FINAL)) {
+ push @{$NEEDED{$FINAL}}, "final";
+ }
+ else {
+ push @{$NEEDED{"REPORTING:report"}}, "final";
+ }
# go through each module
for(my $m=$#MODULE; $m>=0; $m--) {
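
With the new switch an experiment can be driven toward a single target step instead of the full REPORTING:report, e.g. (step name illustrative):

    experiment.perl -config config.toy -exec -final TRAINING:create-config
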
@@ -531,14 +553,16 @@ sub find_steps_for_module {
# if multiple potential inputs, find first that matches
if ($in =~ /=OR=/) {
- foreach my $potential_in (split(/=OR=/,$in)) {
+ my @POTENTIAL_IN = split(/=OR=/,$in);
+ foreach my $potential_in (@POTENTIAL_IN) {
if (&check_producability($module,$set,$potential_in)) {
$in = $potential_in;
last;
}
+
}
- die("ERROR: none of potential inputs $in possible for $step")
- if $in =~ /=OR=/;
+ #die("ERROR: none of potential inputs $in possible for $step")
+ $in = $POTENTIAL_IN[$#POTENTIAL_IN] if $in =~ /=OR=/;
}
# define input(s) as needed by this step
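
Note the behavioral change here: when none of the =OR= alternatives is producible, the step no longer dies but falls back to the last alternative in the chain, which for the entries added to experiment.meta above is the unfiltered corpus or word alignment.
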
@@ -947,6 +971,10 @@ sub define_step {
elsif ($DO_STEP[$i] eq 'TRAINING:build-generation') {
&define_training_build_generation($i);
}
+ elsif ($DO_STEP[$i] eq 'TRAINING:sigtest-filter-ttable' ||
+ $DO_STEP[$i] eq 'TRAINING:sigtest-filter-reordering') {
+ &define_training_sigtest_filter($i);
+ }
elsif ($DO_STEP[$i] eq 'TRAINING:create-config' || $DO_STEP[$i] eq 'TRAINING:create-config-interpolated-lm') {
&define_training_create_config($i);
}
@@ -1322,6 +1350,7 @@ sub check_if_crashed {
'no such file or directory','unknown option',
'died at','exit code','permission denied',
'segmentation fault','abort',
+ 'no space left on device',
'can\'t locate', 'unrecognized option') {
if (/$pattern/i) {
my $not_error = 0;
@@ -1635,7 +1664,6 @@ sub define_tuning_tune {
my $decoder_settings = &backoff_and_get("TUNING:decoder-settings");
$decoder_settings = "" unless $decoder_settings;
$decoder_settings .= " -v 0 " unless $CLUSTER && $jobs;
- $decoder_settings .= " -use-alignment-info " unless $hierarchical || defined($word_alignment) && $word_alignment eq "no";
my $tuning_settings = &backoff_and_get("TUNING:tuning-settings");
$tuning_settings = "" unless $tuning_settings;
@@ -1899,11 +1927,16 @@ sub define_training_build_lex_trans {
my ($step_id) = @_;
my ($lex, $aligned,$corpus) = &get_output_and_input($step_id);
+ my $baseline_alignment = &get("TRAINING:baseline-alignment");
+ my $baseline_corpus = &get("TRAINING:baseline-corpus");
+
my $cmd = &get_training_setting(4);
$cmd .= "-lexical-file $lex ";
$cmd .= "-alignment-file $aligned ";
$cmd .= "-alignment-stem ".&versionize(&long_file_name("aligned","model",""))." ";
$cmd .= "-corpus $corpus ";
+ $cmd .= "-baseline-corpus $baseline_corpus " if defined($baseline_corpus) && defined($baseline_alignment);
+ $cmd .= "-baseline-alignment $baseline_alignment " if defined($baseline_corpus) && defined($baseline_alignment);
&create_step($step_id,$cmd);
}
@@ -1938,6 +1971,9 @@ sub define_training_extract_phrases {
$extract_settings .= " --IncludeSentenceId " if &get("TRAINING:domain-features");
$cmd .= "-extract-options '".$extract_settings."' " if defined($extract_settings);
+ my $baseline_extract = &get("TRAINING:baseline-extract");
+ $cmd .= "-baseline-extract $baseline_extract" if defined($baseline_extract);
+
&create_step($step_id,$cmd);
}
@@ -1953,9 +1989,9 @@ sub define_training_build_ttable {
$cmd .= "-lexical-file $lex ";
$cmd .= &get_table_name_settings("translation-factors","phrase-translation-table",$phrase_table);
- $cmd .= "-phrase-word-alignment " unless (defined($word_alignment) && $word_alignment eq "no");
+ $cmd .= "-no-word-alignment " if defined($word_alignment) && $word_alignment eq "no";
- $cmd .= &define_domain_feature_score_option($domains) if $domains;
+ $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
&create_step($step_id,$cmd);
}
@@ -2009,6 +2045,37 @@ sub define_training_build_custom_generation {
&create_step($step_id,$cmd);
}
+sub define_training_sigtest_filter {
+ my ($step_id) = @_;
+ my ($filtered_table, $raw_table,$suffix_array) = &get_output_and_input($step_id);
+
+ my $hierarchical_flag = &get("TRAINING:hierarchical-rule-set") ? "-h" : "";
+ my $sigtest_filter = &get("TRAINING:sigtest-filter");
+ my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
+ my $output_extension = &check_backoff_and_get("TRAINING:output-extension");
+ my $moses_src_dir = &check_and_get("GENERAL:moses-src-dir");
+
+ if ($DO_STEP[$step_id] =~ /reordering/) {
+ $raw_table = &get_table_name_settings("reordering-factors","reordering-table", $raw_table);
+ $filtered_table = &get_table_name_settings("reordering-factors","reordering-table", $filtered_table);
+ chop($raw_table);
+ chop($filtered_table);
+ $raw_table .= ".wbe-".&get("TRAINING:lexicalized-reordering"); # a bit of a hack
+ $filtered_table .= ".wbe-".&get("TRAINING:lexicalized-reordering");
+ }
+ else {
+ $raw_table = &get_table_name_settings("translation-factors","phrase-translation-table", $raw_table);
+ $filtered_table = &get_table_name_settings("translation-factors","phrase-translation-table", $filtered_table);
+ chop($raw_table);
+ chop($filtered_table);
+ }
+ $raw_table =~ s/\s*\-\S+\s*//; # remove switch
+ $filtered_table =~ s/\s*\-\S+\s*//;
+
+ my $cmd = "zcat $raw_table.gz | $moses_src_dir/contrib/sigtest-filter/filter-pt -e $suffix_array.$output_extension -f $suffix_array.$input_extension $sigtest_filter $hierarchical_flag | gzip - > $filtered_table.gz\n";
+ &create_step($step_id,$cmd);
+}
+
sub define_training_create_config {
my ($step_id) = @_;
@@ -2040,7 +2107,7 @@ sub define_training_create_config {
$ptCmd .= ":$ptImpl" if $ptImpl>0;
$ptCmd .= ":$numFF" if defined($numFF);
$cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
- $cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table) if $reordering_table;
+ $cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table) if $reordering_table;
$cmd .= &get_table_name_settings("generation-factors","generation-table",$generation_table) if $generation_table;
$cmd .= "-config $config ";
@@ -2144,7 +2211,7 @@ sub define_training_create_config {
# sparse lexical features provide additional content for config file
$cmd .= "-additional-ini-file $sparse_lexical_features.ini " if $sparse_lexical_features;
- $cmd .= &define_domain_feature_score_option($domains) if $domains;
+ $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
&create_step($step_id,$cmd);
}
@@ -2300,6 +2367,7 @@ sub get_training_setting {
my $score_settings = &get("TRAINING:score-settings");
my $parallel = &get("TRAINING:parallel");
my $pcfg = &get("TRAINING:use-pcfg-feature");
+ my $baseline_alignment = &get("TRAINING:baseline-alignment-model");
my $xml = $source_syntax || $target_syntax;
@@ -2323,6 +2391,7 @@ sub get_training_setting {
$cmd .= "-score-options '".$score_settings."' " if $score_settings;
$cmd .= "-parallel " if $parallel;
$cmd .= "-pcfg " if $pcfg;
+ $cmd .= "-baseline-alignment-model $baseline_alignment " if defined($baseline_alignment) && ($step == 1 || $step == 2);
# factored training
if (&backoff_and_get("TRAINING:input-factors")) {
@@ -2450,7 +2519,7 @@ sub define_tuningevaluation_filter {
$settings = &get("TUNING:filter-settings") if $tuning_flag;
$settings = "" unless $settings;
- $binarizer .= " -alignment-info" unless !defined ($binarizer) || $hierarchical || ( defined $word_alignment && $word_alignment eq "no");
+ $binarizer .= " -no-alignment-info" if defined ($binarizer) && !$hierarchical && defined $word_alignment && $word_alignment eq "no";
$settings .= " -Binarizer \"$binarizer\"" if $binarizer;
$settings .= " --Hierarchical" if $hierarchical;
@@ -2485,13 +2554,13 @@ sub define_tuningevaluation_filter {
$config = $tuning_flag ? "$dir/tuning/moses.table.ini.$VERSION" : "$dir/evaluation/$set.moses.table.ini.$VERSION";
$delete_config = 1;
$cmd = &get_training_setting(9);
- $cmd .= &define_domain_feature_score_option($domains) if $domains;
+ $cmd .= &define_domain_feature_score_option($domains) if &get("TRAINING:domain-features");
my $ptCmd = $phrase_translation_table;
$ptCmd .= ":$ptImpl" if $ptImpl>0;
$ptCmd .= ":$numFF" if defined($numFF);
$cmd .= &get_table_name_settings("translation-factors","phrase-translation-table", $ptCmd);
- $cmd .= &get_table_name_settings("reordering-factors","reordering-table",$reordering_table)
+ $cmd .= &get_table_name_settings("reordering-factors","reordering-table", $reordering_table)
if $reordering_table;
# additional settings for hierarchical models
if (&get("TRAINING:hierarchical-rule-set")) {
@@ -2552,7 +2621,6 @@ sub define_evaluation_decode {
my $hierarchical = &get("TRAINING:hierarchical-rule-set");
my $word_alignment = &backoff_and_get("TRAINING:include-word-alignment-in-rules");
- $settings .= " -use-alignment-info" unless $hierarchical || ( defined($word_alignment) && $word_alignment eq "no");
# specify additional output for analysis
if (defined($report_precision_by_coverage) && $report_precision_by_coverage eq "yes") {
@@ -2656,7 +2724,10 @@ sub define_evaluation_analysis_precision {
my $factors = &encode_factor_definition("translation-factors",\%IN,\%OUT);
my @FACTOR = split(/\+/,$factors);
my @SPECIFIED_NAME;
- if (&backoff_and_get("TRAINING:phrase-translation-table")) {
+ if (&backoff_and_get("TRAINING:sigtest-filter-phrase-translation-table")) {
+ @SPECIFIED_NAME = @{$CONFIG{"TRAINING:sigtest-filter-phrase-translation-table"}};
+ }
+ elsif (&backoff_and_get("TRAINING:phrase-translation-table")) {
@SPECIFIED_NAME = @{$CONFIG{"TRAINING:phrase-translation-table"}};
}
for(my $i=0;$i<scalar(split(/\+/,$factors));$i++) {
@@ -2706,7 +2777,10 @@ sub define_evaluation_analysis_coverage {
my $factors = &encode_factor_definition("translation-factors",\%IN,\%OUT);
my @FACTOR = split(/\+/,$factors);
my @SPECIFIED_NAME;
- if (&backoff_and_get("TRAINING:phrase-translation-table")) {
+ if (&backoff_and_get("TRAINING:sigtest-filter-phrase-translation-table")) {
+ @SPECIFIED_NAME = @{$CONFIG{"TRAINING:sigtest-filter-phrase-translation-table"}};
+ }
+ elsif (&backoff_and_get("TRAINING:phrase-translation-table")) {
@SPECIFIED_NAME = @{$CONFIG{"TRAINING:phrase-translation-table"}};
}
my $surface_ttable;
@@ -2855,31 +2929,17 @@ sub define_template {
$new_cmd .= $single_cmd."\n";
}
elsif ($single_cmd =~ /^.+$/) {
- # find IN and OUT files
- my $in;
- if ($single_cmd =~ /(IN)$/ ||
- $single_cmd =~ /(IN) / ||
- $single_cmd =~ /(IN[^\d]\S*)/) {
- $in = $1;
- }
- else {
- die("ERROR: could not find IN in $single_cmd");
- }
- $single_cmd =~ /(OUT\S*)/
+ # find IN and OUT files
+ $single_cmd =~ /(EMS_IN_EMS\S*)/
+ || die("ERROR: could not find EMS_IN_EMS in $single_cmd");
+ my $in = $1;
+ $single_cmd =~ /(EMS_OUT_EMS\S*)/
|| die("ERROR: could not find OUT in $single_cmd");
my $out = $1;
- # replace IN* and OUT* with %s
- if ($single_cmd =~ /IN$/) {
- $single_cmd =~ s/IN$/\%s/;
- }
- elsif ($single_cmd =~ /IN /) {
- $single_cmd =~ s/IN /\%s /;
- }
- else {
- $single_cmd =~ s/IN[^\d]\S*/\%s/;
- }
- $single_cmd =~ s/OUT\S*/\%s/;
- # build tmp
+ # replace IN and OUT with %s
+ $single_cmd =~ s/EMS_IN_EMS\S*/\%s/;
+ $single_cmd =~ s/EMS_OUT_EMS\S*/\%s/;
+ # build tmp
my $tmp_dir = $module;
$tmp_dir =~ tr/A-Z/a-z/;
$tmp_dir .= "/tmp.$set.$stepname.$VERSION-".($i++);
@@ -2901,30 +2961,28 @@ sub define_template {
# command to be run on multiple reference translations
if (defined($multiref)) {
- $cmd =~ s/^(.+)IN (.+)OUT(.*)$/$multiref '$1 mref-input-file $2 mref-output-file $3' IN OUT/;
- $cmd =~ s/^(.+)OUT(.+)IN (.*)$/$multiref '$1 mref-output-file $2 mref-input-file $3' IN OUT/;
+ $cmd =~ s/^(.*)EMS_IN_EMS (.+)EMS_OUT_EMS(.*)$/$multiref '$1 mref-input-file $2 mref-output-file $3' EMS_IN_EMS EMS_OUT_EMS/;
+ $cmd =~ s/^(.+)EMS_OUT_EMS(.+)EMS_IN_EMS (.*)$/$multiref '$1 mref-output-file $2 mref-input-file $3' EMS_IN_EMS EMS_OUT_EMS/;
}
# input is array, but just specified as IN
- if ($cmd !~ /IN1/ && (scalar @INPUT) > 1 ) {
+ if ($cmd !~ /EMS_IN1_EMS/ && (scalar @INPUT) > 1 ) {
my $in = join(" ",@INPUT);
- $cmd =~ s/([^AN])IN/$1$in/;
- $cmd =~ s/^IN/$in/;
+ $cmd =~ s/EMS_IN_EMS/$in/;
}
# input is defined as IN or IN0, IN1, IN2
else {
- if ($cmd =~ /([^ANS])IN/ && scalar(@INPUT) == 0) {
- die("ERROR: Step $step requires input from prior steps, but none defined.");
- }
- $cmd =~ s/([^ANS])IN(\d+)/$1$INPUT[$2]/g; # a bit trickier to
- $cmd =~ s/([^ANS])IN/$1$INPUT[0]/g; # avoid matching TRAINING, RECASING
- $cmd =~ s/^IN(\d+)/$INPUT[$2]/g;
- $cmd =~ s/^IN/$INPUT[0]/g;
+ if ($cmd =~ /EMS_IN\d*_EMS/ && scalar(@INPUT) == 0) {
+ die("ERROR: Step $step requires input from prior steps, but none defined.");
+ }
+ $cmd =~ s/EMS_IN(\d)_EMS/$INPUT[$1]/g;
+ $cmd =~ s/EMS_IN_EMS/$INPUT[0]/g;
}
- $cmd =~ s/OUT/$output/g;
+ $cmd =~ s/EMS_OUT_EMS/$output/g;
$cmd =~ s/VERSION/$VERSION/g;
print "\tcmd is $cmd\n" if $VERBOSE;
- while ($cmd =~ /^([\S\s]*)\$([^\s\/\"\']+)([\S\s]*)$/) {
+ while ($cmd =~ /^([\S\s]*)\$\{([^\s\/\"\']+)\}([\S\s]*)$/ ||
+ $cmd =~ /^([\S\s]*)\$([^\s\/\"\']+)([\S\s]*)$/) {
my ($pre,$variable,$post) = ($1,$2,$3);
$cmd = $pre
. &check_backoff_and_get(&extend_local_name($module,$set,$variable))
diff --git a/scripts/ems/support/build-sparse-lexical-features.perl b/scripts/ems/support/build-sparse-lexical-features.perl
index 3ba6ceb7e..ab8627d20 100755
--- a/scripts/ems/support/build-sparse-lexical-features.perl
+++ b/scripts/ems/support/build-sparse-lexical-features.perl
@@ -58,7 +58,6 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
open(INI,">$outfile_prefix.ini");
print INI $ini;
print INI "\n[report-sparse-features]\n$report\n";
-print INI "\n[use-alignment-info]\ntrue\n\n";
close(INI);
sub create_top_words {
diff --git a/scripts/ems/support/defaultconfig.py b/scripts/ems/support/defaultconfig.py
new file mode 100644
index 000000000..5d5187c47
--- /dev/null
+++ b/scripts/ems/support/defaultconfig.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+
+#
+# Version of ConfigParser which accepts default values
+#
+
+
+import ConfigParser
+
+
+class Config:
+    def __init__(self,filename):
+        self.config = ConfigParser.SafeConfigParser()
+        cfh = open(filename)
+        self.config.readfp(cfh)
+        cfh.close()
+
+    def get(self,section,name,default=None):
+        if default == None or self.config.has_option(section,name):
+            return self.config.get(section,name)
+        else:
+            return default
+
+    def getint(self,section,name,default=None):
+        if default == None or self.config.has_option(section,name):
+            return self.config.getint(section,name)
+        else:
+            return default
+
+    def getboolean(self,section,name,default=None):
+        if default == None or self.config.has_option(section,name):
+            return self.config.getboolean(section,name)
+        else:
+            return default
+
+    def getfloat(self,section,name,default=None):
+        if default == None or self.config.has_option(section,name):
+            return self.config.getfloat(section,name)
+        else:
+            return default
+
+    def __str__(self):
+        ret = ""
+        for section in self.config.sections():
+            for option in self.config.options(section):
+                ret = ret + "%s:%s = %s\n" % (section,option,self.config.get(section,option))
+        return ret
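
A minimal usage sketch for the Config wrapper above, under the same Python 2 environment the script targets (file name, section, and option names hypothetical):

from defaultconfig import Config

# assuming filter.ini contains:
#   [score]
#   score_file = scores.txt
#   proportion = 0.2
config = Config("filter.ini")
print(config.get("score", "score_file"))            # -> scores.txt
print(config.getfloat("score", "proportion", 1.0))  # -> 0.2
print(config.getint("score", "count", 10000))       # option absent -> default 10000
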
diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl
index 39eb1483b..155829556 100755
--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@@ -45,7 +45,7 @@ foreach my $lm (@LM) {
open(LM,$lm) || die("ERROR: could not find language model file '$lm'");
}
while(<LM>) {
- $lm_order = $1 if /ngram (\d+)/;
+ $lm_order = $1 if /ngram\s+(\d+)/;
last if /1-grams/;
}
close(LM);
diff --git a/scripts/ems/support/mml-filter.perl b/scripts/ems/support/mml-filter.perl
new file mode 100755
index 000000000..f46b132a3
--- /dev/null
+++ b/scripts/ems/support/mml-filter.perl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl -w
+
+use strict;
+use FindBin qw($RealBin);
+
+my ($in,$out,$score,$source_lang,$target_lang,$proportion,$domain,$alignment);
+
+use Getopt::Long;
+GetOptions('in=s' => \$in,
+ 'out=s' => \$out,
+ 'score=s' => \$score,
+ 'domain=s' => \$domain,
+ 'alignment=s' => \$alignment,
+ 'input-extension=s' => \$source_lang,
+ 'output-extension=s' => \$target_lang,
+ 'proportion=f' => \$proportion
+ ) or exit(1);
+
+die("ERROR: input corpus stem not specified (-in FILESTEM)") unless defined($in);
+die("ERROR: output corpus stem not specified (-out FILESTEM)") unless defined($out);
+die("ERROR: score file not specified (-score FILE)") unless defined($score);
+die("ERROR: domain file not specified (-domain FILE)") unless defined($domain);
+die("ERROR: input extension not specified (-input-extension STRING)") unless defined($source_lang);
+die("ERROR: output extension not specified (-output-extension STRING)") unless defined($target_lang);
+die("ERROR: proportion not specified (-proportion RATIO)") unless defined($proportion);
+
+open(CONFIG,">$out.ini");
+print CONFIG "[general]
+strategy = Score
+source_language = $source_lang
+target_language = $target_lang
+input_stem = $in
+".(defined($alignment) ? "alignment_stem = $alignment\n" : "").
+"output_stem = $out
+domain_file = $domain
+domain_file_out = $out
+
+[score]
+score_file = $score
+proportion = $proportion\n";
+close(CONFIG);
+
+my $cmd = "$RealBin/mml-filter.py $out.ini";
+print STDERR "$cmd\n";
+print STDERR `$cmd`;
+
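
With hypothetical values filled in, the control file this wrapper writes for mml-filter.py looks like the following; note that domain_file_out reuses the output stem, as in the Perl above:

[general]
strategy = Score
source_language = fr
target_language = en
input_stem = corpus/giga.clean
output_stem = corpus/giga.mml-filtered
domain_file = corpus/giga.domains
domain_file_out = corpus/giga.mml-filtered

[score]
score_file = mml/giga.scores
proportion = 0.2
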
diff --git a/scripts/ems/support/mml-filter.py b/scripts/ems/support/mml-filter.py
new file mode 100755
index 000000000..437c9dade
--- /dev/null
+++ b/scripts/ems/support/mml-filter.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+#
+# Filter a parallel corpus
+#
+
+import heapq
+import logging
+import math
+import optparse
+import random
+import sys
+
+from defaultconfig import Config
+
+logging.basicConfig(format = "%(asctime)-15s %(message)s")
+log = logging.getLogger("filter")
+log.setLevel(logging.DEBUG)
+
+class FilterStrategy(object):
+    def __init__(self,config):
+        pass
+
+    def filter(self,source,target):
+        return True
+
+
+class RandomFilterStrategy(FilterStrategy):
+    def __init__(self,config):
+        self.threshold = config.getfloat("random", "threshold", 0.1)
+        random.seed()
+
+    def filter(self, source, target):
+        return random.random() < self.threshold
+
+
+class ScoreFilterStrategy(FilterStrategy):
+    """Filter strategy based on a file of sentence scores. There are three
+    mutually exclusive ways of specifying what to keep:
+      i) threshold - retain all sentence pairs whose score is at least the threshold
+      ii) proportion - retain a given proportion (e.g. a tenth) of the sentences
+      iii) count - retain a given number of the sentences.
+    """
+    def __init__(self,config):
+        section = "score"
+        self.score_file = config.get(section,"score_file")
+        self.ignore_score = config.get(section, "ignore_score", "99999")
+        option_names = ("threshold", "proportion", "count")
+        options = [config.config.has_option(section,o) for o in option_names]
+        if sum(options) != 1:
+            raise RuntimeError("Must specify exactly one of %s for score filter" % str(option_names))
+        if options[0]:
+            # threshold given directly
+            self.threshold = config.getfloat(section,option_names[0])
+        else:
+            # proportion or count
+            ignore_count = 0
+            if options[2]:
+                count = config.getint(section,option_names[2])
+            else:
+                # need to count the scored entries, skipping ignore-marked lines
+                count = 0
+                for line in open(self.score_file):
+                    if line[:-1] != self.ignore_score:
+                        count = count + 1
+                    else:
+                        ignore_count = ignore_count + 1
+                count = int(count * config.getfloat(section,option_names[1]))
+            log.info("Retaining at least %d entries and ignoring %d" % (count, ignore_count))
+            # Find the threshold; ignore-marked lines carry a very large score,
+            # so they sort first and the index offset skips past them
+            self.threshold = sorted(
+                [float(line[:-1]) for line in open(self.score_file)], reverse=True)[ignore_count + count]
+            #self.threshold = heapq.nlargest(count,
+            #    [float(line[:-1]) for line in open(self.score_file)])[-1]
+
+        self.sfh = open(self.score_file)
+        log.info("Thresholding scores at " + str(self.threshold))
+
+    def filter(self,source,target):
+        score = self.sfh.readline()
+        if not score:
+            raise RuntimeError("score file truncated")
+        return score[:-1] == self.ignore_score or float(score[:-1]) >= self.threshold
+
+
+def main():
+    parser = optparse.OptionParser(usage = "Usage: %prog [options] config-file")
+    (options,args) = parser.parse_args()
+    if len(args) < 1:
+        parser.error("No configuration file specified")
+
+    log.info("Loading configuration from " + args[0])
+    config = Config(args[0])
+    log.debug("Configuration:\n" + str(config))
+
+    # Required general parameters
+    source_lang = config.get("general", "source_language")
+    target_lang = config.get("general", "target_language")
+    input_stem = config.get("general", "input_stem")
+    output_stem = config.get("general", "output_stem")
+    strategy = config.get("general", "strategy", "")
+
+    # Optional general parameters
+    alignment_stem = config.get("general", "alignment_stem", "")
+    alignment_type = config.get("general", "alignment_type", "grow-diag-final-and")
+    domain_file_in = config.get("general", "domain_file", "")
+    domain_file_out = config.get("general", "domain_file_out", "")
+
+    strategy_class = globals()[strategy + "FilterStrategy"]
+    strategy = strategy_class(config)
+
+    source_input_fh = open(input_stem + "." + source_lang)
+    target_input_fh = open(input_stem + "." + target_lang)
+    source_output_fh = open(output_stem + "." + source_lang, "w")
+    target_output_fh = open(output_stem + "." + target_lang, "w")
+
+    alignment_input_fh = None
+    alignment_output_fh = None
+    if alignment_stem:
+        alignment_input_fh = open(alignment_stem + "." + alignment_type)
+        alignment_output_fh = open(output_stem + "." + alignment_type,"w")
+
+    domain_boundaries = {}
+    if domain_file_in:
+        dfh = open(domain_file_in)
+        for line in dfh:
+            line_no,name = line[:-1].split()
+            domain_boundaries[int(line_no)] = name
+
+    domain_output_fh = None
+    if domain_file_out:
+        domain_output_fh = open(domain_file_out, "w")
+
+    #log.info(str(domain_boundaries))
+
+    retained = 0
+    line_no = 0
+    for source_line in source_input_fh:
+        target_line = target_input_fh.readline()
+        if alignment_input_fh:
+            align_line = alignment_input_fh.readline()
+        if strategy.filter(source_line,target_line):
+            retained = retained + 1
+            print>>source_output_fh, source_line,
+            print>>target_output_fh, target_line,
+            if alignment_input_fh:
+                print>>alignment_output_fh, align_line,
+        line_no = line_no + 1
+        # check if this is a domain boundary
+        if domain_boundaries and domain_boundaries.has_key(line_no):
+            print>>domain_output_fh,"%d %s" % (retained,domain_boundaries[line_no])
+    log.info("Lines retained: %d" % retained)
+
+if __name__ == "__main__":
+    main()
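
To make the proportion case in ScoreFilterStrategy concrete, a toy worked example (scores invented; 99999 is the default ignore marker that mml-score.perl writes for sentences exempt from filtering):

scores = [99999.0, 0.9, 0.5, 0.2, 0.1]    # one ignored line, four scored lines
ignore_count, count = 1, int(4 * 0.5)     # proportion = 0.5 -> count = 2
threshold = sorted(scores, reverse=True)[ignore_count + count]
# sorted descending: [99999.0, 0.9, 0.5, 0.2, 0.1]; index 3 -> threshold = 0.2
# filter() then keeps scores >= 0.2 (0.9, 0.5, 0.2) plus the ignored line,
# which is why the log message above promises "at least" count entries.
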
diff --git a/scripts/ems/support/mml-score.perl b/scripts/ems/support/mml-score.perl
new file mode 100755
index 000000000..3e209c24b
--- /dev/null
+++ b/scripts/ems/support/mml-score.perl
@@ -0,0 +1,118 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+#
+# Calculate the perplexity or the modified Moore-Lewis scores
+# written by Barry Haddow
+# adapted for EMS by Philipp Koehn
+#
+
+my ($model,$corpus_stem,$query,$source_lang,$target_lang,$domain_file,$filter_domains);
+
+use Getopt::Long;
+GetOptions('corpus=s' => \$corpus_stem,
+ 'model=s' => \$model,
+ 'query=s' => \$query,
+ 'input-extension=s' => \$source_lang,
+ 'output-extension=s' => \$target_lang,
+ 'domains=s' => \$domain_file,
+ 'filter-domains=s' => \$filter_domains
+ ) or exit(1);
+
+die("ERROR: corpus not specified (-corpus FILE)") unless defined($corpus_stem);
+die("ERROR: model not specified (-model FILE)") unless defined($model);
+die("ERROR: query command not specified (-query CMD)") unless defined($query);
+die("ERROR: input extension not specified (-input-extension STRING)") unless defined($source_lang);
+die("ERROR: output extension not specified (-output-extension STRING)") unless defined($target_lang);
+
+my $source_inlm = "$model.in-source.binlm";
+my $target_inlm = "$model.in-target.binlm";
+my $source_outlm = "$model.out-source.binlm";
+my $target_outlm = "$model.out-target.binlm";
+
+my $source_corpus = "$corpus_stem.$source_lang";
+my $target_corpus = "$corpus_stem.$target_lang";
+
+print STDERR "querying language models...
+$query $source_inlm < $source_corpus
+$query $target_inlm < $target_corpus
+$query $source_outlm < $source_corpus
+$query $target_outlm < $target_corpus\n";
+
+open(INSOURCE, "$query $source_inlm < $source_corpus |") || die "Failed to open in lm query on source";
+open(INTARGET, "$query $target_inlm < $target_corpus |") || die "Failed to open in lm query on target";
+open(OUTSOURCE, "$query $source_outlm < $source_corpus |") || die "Failed to open out lm query on source";
+open(OUTTARGET, "$query $target_outlm < $target_corpus |") || die "Failed to open out lm query on target";
+
+open(SOURCE, "$source_corpus") || die "Unable to open source corpus";
+open(TARGET, "$target_corpus") || die "Unable to open target corpus";
+
+&load_domains() if defined($filter_domains);
+
+sub score {
+ my $fd = shift;
+ my $line = <$fd>;
+ #print "$line";
+ return 1 if !defined($line);
+ $line =~ /Total: ([\.\-0-9]+) /;
+ return $1;
+}
+
+sub line_length {
+ local *FH = shift;
+ my $line = <FH>;
+ chomp $line;
+ my @tokens = split /\s+/, $line;
+ return $#tokens+1;
+}
+
+my %DOMAIN_FILTERED;
+my %DOMAIN_NAME;
+my @DOMAIN;
+sub load_domains {
+ my %FILTER_DOMAIN;
+ foreach (split(/ /,$filter_domains)) {
+ $FILTER_DOMAIN{$_}++;
+ }
+ open(DOMAIN,$domain_file) || die("ERROR: could not open domain file '$domain_file'");
+ while(<DOMAIN>) {
+ chop;
+ my ($line_number,$name) = split;
+ push @DOMAIN, $line_number;
+ $DOMAIN_NAME{$line_number} = $name;
+ $DOMAIN_FILTERED{$line_number} = defined($FILTER_DOMAIN{$name});
+ }
+ close(DOMAIN);
+}
+
+sub check_sentence_filtered {
+ my ($sentence_number) = @_;
+ foreach my $last_sentence_number_of_domain (@DOMAIN) {
+ if ($sentence_number <= $last_sentence_number_of_domain) {
+ return $DOMAIN_FILTERED{$last_sentence_number_of_domain};
+ }
+ }
+ die("ERROR: domain file incomplete -- could not find sentence $sentence_number");
+}
+
+my $i=1;
+while(1) {
+ # This is actually the negative of the modified Moore-Lewis score, so we keep
+ # the sentences with the highest scores
+ my $insource = score(*INSOURCE);
+ my $intarget = score(*INTARGET);
+ my $outsource = score(*OUTSOURCE);
+ my $outtarget = score(*OUTTARGET);
+ last if ($insource == 1 || $outsource == 1 || $intarget == 1 || $outtarget == 1);
+ my $source_length = line_length(*SOURCE);
+ my $target_length = line_length(*TARGET);
+ if (defined($filter_domains) && !&check_sentence_filtered($i)) {
+ print "99999\n"; # keep it
+ }
+ else {
+ my $total = $insource/$source_length - $outsource/$source_length + $intarget/$target_length - $outtarget/$target_length;
+ print "$total\n";
+ }
+ $i++;
+}
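
Written out, the value printed for each sentence pair (s,t) is the per-word log-probability difference between the in-domain and out-of-domain models, summed over both sides:

score(s,t) = (log P_in(s) - log P_out(s)) / |s| + (log P_in(t) - log P_out(t)) / |t|

This is the negation of the bilingual cross-entropy difference used by modified Moore-Lewis selection, so sorting by descending score is the same as sorting by ascending cross-entropy difference, and the downstream filter keeps the highest-scoring (most in-domain-like) pairs.
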
diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl
new file mode 100755
index 000000000..f68e0163f
--- /dev/null
+++ b/scripts/ems/support/mml-train.perl
@@ -0,0 +1,84 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+my ($indomain_source,$indomain_target,$outdomain_source,$outdomain_target,$lm_training,$lm_binarizer,$order,$lm_settings,$line_count,$model);
+
+use Getopt::Long;
+GetOptions('in-source=s' => \$indomain_source,
+ 'in-target=s' => \$indomain_target,
+ 'out-source=s' => \$outdomain_source,
+ 'out-target=s' => \$outdomain_target,
+ 'model=s' => \$model,
+ 'lm-training=s' => \$lm_training,
+ 'lm-binarizer=s' => \$lm_binarizer,
+ 'order=s' => \$order,
+ 'lm-settings=s' => \$lm_settings,
+ 'line-count=i' => \$line_count
+ ) or exit(1);
+
+die("ERROR: in-domain source file not specified (-in-source FILE)") unless defined($indomain_source);
+die("ERROR: in-domain target file not specified (-in-target FILE)") unless defined($indomain_target);
+die("ERROR: out-of-domain source file not specified (-out-source FILE)") unless defined($outdomain_source);
+die("ERROR: out-of-domain target file not specified (-out-target FILE)") unless defined($outdomain_target);
+
+die("ERROR: in-domain source file '$indomain_source' not found") unless -e $indomain_source || -e $indomain_source.".gz";
+die("ERROR: in-domain target file '$indomain_target' not found") unless -e $indomain_target || -e $indomain_target.".gz";
+die("ERROR: out-of-domain source file '$outdomain_source' not found") unless -e $outdomain_source || -e $outdomain_source.".gz";
+die("ERROR: out-of-domain target file '$outdomain_target' not found") unless -e $outdomain_target || -e $outdomain_target.".gz";
+
+die("ERROR: language model order not specified (-order NUM)") unless defined($order);
+die("ERROR: language model settings not specified (-lm-settings STRING)") unless defined($lm_settings);
+die("ERROR: language model command not specified (-lm-training CMD)") unless defined($lm_training);
+die("ERROR: language model binarizer not specified (-lm-binarizer CMD)") unless defined($lm_binarizer);
+die("ERROR: model not specified (-model FILESTEM)") unless defined($model);
+
+&train_lm($indomain_source,"in-source");
+&train_lm($indomain_target,"in-target");
+&extract_vocabulary("in-source");
+&extract_vocabulary("in-target");
+&train_lm($outdomain_source,"out-source","in-source");
+&train_lm($outdomain_target,"out-target","in-target");
+
+sub extract_vocabulary {
+ my ($type) = @_;
+ print STDERR "extracting vocabulary from $type language model\n";
+ open(LM,"$model.$type.lm");
+ open(VOCAB,">$model.$type.vocab");
+ my $unigrams = 0;
+ while(<LM>) {
+ $unigrams = 1 if /^\\1-grams:/;
+ last if /^\\2-grams:/;
+ next unless $unigrams;
+ my @TOKEN = split(/\s/);
+ next unless @TOKEN == 3;
+ next if $TOKEN[1] eq '<s>';
+ next if $TOKEN[1] eq '<unk>';
+ next if $TOKEN[1] eq '</s>';
+ print VOCAB $TOKEN[1]."\n";
+ }
+ close(LM);
+ close(VOCAB);
+}
+
+sub train_lm {
+ my ($file,$type,$vocab) = @_;
+ print STDERR "training $type language model\n";
+ if (defined($line_count)) {
+ my $cmd = (-e $file.".gz" ? "zcat $file.gz" : "cat $file");
+ $cmd .= " | shuf -n $line_count --random-source ".(-e $file.".gz" ? "$file.gz" : $file)." > $model.$type.tok";
+ print STDERR "extracting $line_count random lines from $file\n$cmd\n";
+ print STDERR `$cmd`;
+ $file = "$model.$type.tok";
+ }
+
+ my $cmd = "$lm_training -order $order $lm_settings -text $file -lm $model.$type.lm";
+ $cmd .= " -vocab $model.$vocab.vocab" if defined($vocab);
+ print STDERR $cmd."\n";
+ print STDERR `$cmd`;
+
+ $cmd = "$lm_binarizer $model.$type.lm $model.$type.binlm";
+ print STDERR $cmd."\n";
+ print STDERR `$cmd`;
+}
+
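
A typical invocation, with all paths hypothetical (lm-training and lm-binarizer would normally point at SRILM's ngram-count and KenLM's build_binary respectively):

mml-train.perl -in-source indomain.fr -in-target indomain.en \
    -out-source outdomain.fr -out-target outdomain.en \
    -model mml/model -order 5 \
    -lm-training /path/to/srilm/ngram-count \
    -lm-settings "-interpolate -kndiscount -unk" \
    -lm-binarizer /path/to/moses/bin/build_binary \
    -line-count 100000

This leaves four binarized models (mml/model.in-source.binlm and the in-target, out-source, out-target counterparts) plus the two in-domain vocabularies, which is exactly the layout mml-score.perl expects through its -model argument.
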
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index b366d3d7e..d73e58742 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -84,16 +84,16 @@ sub do_it_for {
}
sub preprocess {
+ #this is one paragraph
+ my($text) = @_;
+
# clean up spaces at head and tail of each line as well as any double-spacing
$text =~ s/ +/ /g;
$text =~ s/\n /\n/g;
$text =~ s/ \n/\n/g;
$text =~ s/^ //g;
$text =~ s/ $//g;
-
- #this is one paragraph
- my($text) = @_;
-
+
#####add sentence breaks as needed#####
#non-period end of sentence markers (?!) followed by sentence starters.
diff --git a/scripts/ems/support/train-irstlm.perl b/scripts/ems/support/train-irstlm.perl
deleted file mode 100644
index 5d2c05ce2..000000000
--- a/scripts/ems/support/train-irstlm.perl
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-
-# wrapper for irstlm training
-
-my $IRSTLM = shift @ARGV;
-
-my $settings = join(" ",@ARGV);
-$settings =~ s/\-order/\-n/;
-$settings =~ s/\-text/\-i/;
-$settings =~ s/\-lm/\-o/;
-
-if ($settings !~ /\-o +(\S+)/) {
- die("ERROR: no output file specified");
-}
-my $lm = $1;
-$settings =~ s/(\-o +\S+)/$1.iarpa.gz/;
-
-my $cmd = "IRSTLM=$IRSTLM $IRSTLM/scripts/build-lm.sh $settings ; ~/moses/irstlm/bin/compile-lm --text yes $lm.iarpa.gz $lm";
-print STDERR $cmd."\n";
-print `$cmd`;