github.com/moses-smt/mosesdecoder.git
-rw-r--r--  scripts/ems/example/config.basic        | 35
-rw-r--r--  scripts/ems/example/config.factored     | 35
-rw-r--r--  scripts/ems/example/config.hierarchical | 35
-rw-r--r--  scripts/ems/example/config.syntax       | 35
-rw-r--r--  scripts/ems/example/config.toy          | 35
-rw-r--r--  scripts/ems/experiment.meta             | 93
-rwxr-xr-x  scripts/ems/support/mml-train.perl      |  2
7 files changed, 267 insertions(+), 3 deletions(-)
diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic
index 1fce185df..4af8664aa 100644
--- a/scripts/ems/example/config.basic
+++ b/scripts/ems/example/config.basic
@@ -252,6 +252,35 @@ type = 8
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
@@ -316,6 +345,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
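The same commented-out block is added to each of the example configurations below. As a usage illustration (not part of the diff itself), an enabled setup might look roughly like the following sketch: the [MML] section is activated by removing the IGNORE flag from its header and uncommenting the settings, and the filtering itself is requested in the [TRAINING] section. The toy/giga corpus names and the 0.9 proportion are taken from the commented examples above; the SRILM and KenLM paths are placeholders that depend on the local installation.

[MML]

lm-training = $srilm-dir/ngram-count
lm-settings = "-interpolate -kndiscount -unk"
lm-binarizer = $moses-src-dir/bin/build_binary
lm-query = $moses-src-dir/bin/query
order = 5

# in-domain parallel corpus (toy) and out-of-domain parallel corpus (giga)
indomain-stem = [CORPUS:toy:clean-split-stem]
outdomain-stem = [CORPUS:giga:clean-split-stem]

# sample about a million lines from each corpus for the language models
settings = "--line-count 1000000"

[TRAINING]
# (among the existing TRAINING settings)
# filter the toy corpus before word alignment, keeping the best 90%
mml-filter-corpora = toy
mml-before-wa = "-proportion 0.9"

If no in-domain parallel corpus is available, indomain-stem can be replaced by the monolingual pair indomain-target / raw-indomain-source, as the commented examples show.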
diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored
index 433fbb5de..5e3b51f4e 100644
--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@@ -272,6 +272,35 @@ mxpost = /home/pkoehn/bin/mxpost
factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.perl -mxpost $mxpost"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
@@ -336,6 +365,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical
index d97027274..c3e3f5044 100644
--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@@ -252,6 +252,35 @@ type = 8
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
@@ -316,6 +345,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax
index 754c68fb1..ebc4b6ec9 100644
--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@@ -256,6 +256,35 @@ type = 8
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
@@ -320,6 +349,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy
index 8a012e4f2..a3f7fcbd8 100644
--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@@ -236,6 +236,35 @@ type = 8
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"
#################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
# TRANSLATION MODEL TRAINING
[TRAINING]
@@ -300,6 +329,12 @@ alignment-symmetrization-method = grow-diag-final-and
#
#word-alignment = $working-dir/model/aligned.1
+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index d1363feb7..f9ffafa3a 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -277,12 +277,101 @@ binarize
default-name: lm/interpolated-binlm
error: set kMaxOrder to at least this value
[MML] single
+tokenize-indomain-source
+ in: raw-indomain-source
+ out: tokenized-indomain-source
+ default-name: mml/indomain-source.tok
+ pass-unless: input-tokenizer
+ template: $input-tokenizer < IN > OUT
+ parallelizable: yes
+factorize-indomain-source
+ in: tokenized-indomain-source
+ out: factorized-indomain-source
+ rerun-on-change: TRAINING:input-factors
+ default-name: mml/indomain-source.factored
+ pass-unless: factors
+ parallelizable: yes
+ error: can't open
+ error: incompatible number of words in factor
+lowercase-indomain-source
+ in: factorized-indomain-source
+ out: lowercased-indomain-source
+ default-name: mml/indomain-source.lowercased
+ pass-unless: input-lowercaser
+ ignore-if: input-truecaser
+ only-factor-0: yes
+ template: $input-lowercaser < IN > OUT
+ parallelizable: yes
+truecase-indomain-source
+ in: factorized-indomain-source TRUECASER:truecase-model
+ out: lowercased-indomain-source
+ rerun-on-change: input-truecaser
+ default-name: mml/indomain-source.truecased
+ ignore-unless: input-truecaser
+ only-factor-0: yes
+ template: $input-truecaser -model IN1.$input-extension < IN > OUT
+ parallelizable: yes
+split-indomain-source
+ in: lowercased-indomain-source SPLITTER:splitter-model
+ out: indomain-source
+ rerun-on-change: input-splitter
+ default-name: mml/indomain-source.split
+ pass-unless: input-splitter
+ template: $input-splitter -model IN1.$input-extension < IN > OUT
+tokenize-indomain-target
+ in: raw-indomain-target
+ out: tokenized-indomain-target
+ default-name: mml/indomain-target.tok
+ pass-unless: output-tokenizer
+ template: $output-tokenizer < IN > OUT
+ parallelizable: yes
+factorize-indomain-target
+ in: tokenized-indomain-target
+ out: factorized-indomain-target
+ rerun-on-change: TRAINING:output-factors
+ default-name: mml/indomain-target.factored
+ pass-unless: factors
+ parallelizable: yes
+ error: can't open
+ error: incompatible number of words in factor
+lowercase-indomain-target
+ in: factorized-indomain-target
+ out: lowercased-indomain-target
+ default-name: mml/indomain-target.lowercased
+ pass-unless: output-lowercaser
+ ignore-if: output-truecaser
+ only-factor-0: yes
+ template: $output-lowercaser < IN > OUT
+ parallelizable: yes
+truecase-indomain-target
+ in: factorized-indomain-target TRUECASER:truecase-model
+ out: lowercased-indomain-target
+ rerun-on-change: output-truecaser
+ default-name: mml/indomain-target.truecased
+ ignore-unless: output-truecaser
+ only-factor-0: yes
+ template: $output-truecaser -model IN1.$output-extension < IN > OUT
+ parallelizable: yes
+split-indomain-target
+ in: lowercased-indomain-target SPLITTER:splitter-model
+ out: indomain-target
+ rerun-on-change: output-splitter
+ default-name: mml/indomain-target.split
+ pass-unless: output-splitter
+ template: $output-splitter -model IN1.$output-extension < IN > OUT
train
in: indomain-stem outdomain-stem
out: model
- ignore-unless: settings
- default-name: model/mml
+ ignore-unless: AND settings indomain-stem
+ default-name: mml/model
template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
+train-in-mono
+ in: indomain-source indomain-target outdomain-stem
+ out: model
+ ignore-unless: settings
+ ignore-if: indomain-stem
+ default-name: mml/model
+ template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
[TRAINING] single
consolidate
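To make the train template above concrete: with the toy corpus as in-domain stem and giga as out-of-domain stem, EMS would expand it to roughly the command below. The fr/en file extensions, corpus paths, and tool locations are hypothetical placeholders for whatever the configuration substitutes; this is a sketch of the expansion, not output copied from a run.

$moses-script-dir/ems/support/mml-train.perl \
    -in-source corpus/toy.clean-split.fr -in-target corpus/toy.clean-split.en \
    -out-source corpus/giga.clean-split.fr -out-target corpus/giga.clean-split.en \
    -model mml/model \
    -lm-training "$srilm-dir/ngram-count" -order 5 \
    -lm-settings "-interpolate -kndiscount -unk" \
    -lm-binarizer $moses-src-dir/bin/build_binary \
    --line-count 1000000

The new train-in-mono step covers the case where the in-domain data exists only as separate monolingual files (the indomain-source and indomain-target produced by the preprocessing steps above); it passes those files directly instead of a parallel stem, and is skipped whenever indomain-stem is set.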
diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl
index bacf70823..57ff5ab83 100755
--- a/scripts/ems/support/mml-train.perl
+++ b/scripts/ems/support/mml-train.perl
@@ -36,7 +36,7 @@ die("ERROR: model not specified (-model FILESTEM)") unless defined($model);
&train_lm($indomain_source,"in-source");
&train_lm($indomain_target,"in-target");
&train_lm($outdomain_source,"out-source");
-&train_lm($outdomain_source,"out-target");
+&train_lm($outdomain_target,"out-target");
sub train_lm {
my ($file,$type) = @_;
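The one-line fix above corrects which corpus the out-of-domain target language model is trained on: previously it was built from the out-of-domain source file, so the fourth model never saw target-language text. For orientation, these four language models feed the modified Moore-Lewis criterion (Axelrod et al., 2011), which scores each sentence pair by the sum of the source-side and target-side cross-entropy differences between the in-domain and out-of-domain models and keeps the lowest-scoring, most in-domain-like fraction. The standalone Perl sketch below illustrates only that ranking step on precomputed per-sentence cross-entropies; it is not the actual mml-filter implementation, and the one-score-per-line input files are an assumed format chosen for the example.

#!/usr/bin/env perl
# Illustrative modified Moore-Lewis ranking on precomputed cross-entropies.
# Usage: mml-rank-sketch.perl in-src.ce out-src.ce in-tgt.ce out-tgt.ce [proportion]
# Each input file holds one per-sentence cross-entropy per line, in corpus order.
use strict;
use warnings;

my ($in_src, $out_src, $in_tgt, $out_tgt, $proportion) = @ARGV;
$proportion = 0.9 unless defined $proportion;

sub read_scores {
    my ($file) = @_;
    open(my $fh, "<", $file) or die "ERROR: cannot open $file: $!";
    chomp(my @scores = <$fh>);
    close($fh);
    return \@scores;
}

my ($is, $os, $it, $ot) = map { read_scores($_) } ($in_src, $out_src, $in_tgt, $out_tgt);
die "ERROR: score files differ in length"
    unless @$is == @$os && @$is == @$it && @$is == @$ot;

# score(i) = (H_in(src_i) - H_out(src_i)) + (H_in(tgt_i) - H_out(tgt_i));
# lower means the pair looks more like the in-domain data
my @scored = map { [ $_, ($is->[$_] - $os->[$_]) + ($it->[$_] - $ot->[$_]) ] } 0 .. $#$is;

# keep the best-scoring fraction, then report 1-based line numbers in corpus order
my @kept = sort { $a->[1] <=> $b->[1] } @scored;
@kept = @kept[0 .. int($proportion * scalar(@kept)) - 1];
print map { ($_->[0] + 1) . "\n" } sort { $a->[0] <=> $b->[0] } @kept;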