-rw-r--r--  scripts/ems/example/config.basic         | 35
-rw-r--r--  scripts/ems/example/config.factored      | 35
-rw-r--r--  scripts/ems/example/config.hierarchical  | 35
-rw-r--r--  scripts/ems/example/config.syntax        | 35
-rw-r--r--  scripts/ems/example/config.toy           | 35
-rw-r--r--  scripts/ems/experiment.meta              | 93
-rwxr-xr-x  scripts/ems/support/mml-train.perl       |  2
7 files changed, 267 insertions, 3 deletions
diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic
index 1fce185df..4af8664aa 100644
--- a/scripts/ems/example/config.basic
+++ b/scripts/ems/example/config.basic
@@ -252,6 +252,35 @@ type = 8
 #lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 #################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
 # TRANSLATION MODEL TRAINING

 [TRAINING]
@@ -316,6 +345,12 @@ alignment-symmetrization-method = grow-diag-final-and
 #
 #word-alignment = $working-dir/model/aligned.1

+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
 ### create a bilingual concordancer for the model
 #
 #biconcor = $moses-script-dir/ems/biconcor/biconcor
diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored
index 433fbb5de..5e3b51f4e 100644
--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@@ -272,6 +272,35 @@ mxpost = /home/pkoehn/bin/mxpost
 factor-script = "$moses-script-dir/training/wrappers/make-factor-en-pos.mxpost.perl -mxpost $mxpost"

 #################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
 # TRANSLATION MODEL TRAINING

 [TRAINING]
@@ -336,6 +365,12 @@ alignment-symmetrization-method = grow-diag-final-and
 #
 #word-alignment = $working-dir/model/aligned.1

+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
 ### create a bilingual concordancer for the model
 #
 #biconcor = $moses-script-dir/ems/biconcor/biconcor
diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical
index d97027274..c3e3f5044 100644
--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@@ -252,6 +252,35 @@ type = 8
 #lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 #################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
 # TRANSLATION MODEL TRAINING

 [TRAINING]
@@ -316,6 +345,12 @@ alignment-symmetrization-method = grow-diag-final-and
 #
 #word-alignment = $working-dir/model/aligned.1

+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
 ### create a bilingual concordancer for the model
 #
 #biconcor = $moses-script-dir/ems/biconcor/biconcor
diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax
index 754c68fb1..ebc4b6ec9 100644
--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@@ -256,6 +256,35 @@ type = 8
 #lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 #################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
 # TRANSLATION MODEL TRAINING

 [TRAINING]
@@ -320,6 +349,12 @@ alignment-symmetrization-method = grow-diag-final-and
 #
 #word-alignment = $working-dir/model/aligned.1

+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
 ### create a bilingual concordancer for the model
 #
 #biconcor = $moses-script-dir/ems/biconcor/biconcor
diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy
index 8a012e4f2..a3f7fcbd8 100644
--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@@ -236,6 +236,35 @@ type = 8
 #lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

 #################################################################
+# MODIFIED MOORE LEWIS FILTERING
+
+[MML] IGNORE
+
+### specifications for language models to be trained
+#
+#lm-training = $srilm-dir/ngram-count
+#lm-settings = "-interpolate -kndiscount -unk"
+#lm-binarizer = $moses-src-dir/bin/build_binary
+#lm-query = $moses-src-dir/bin/query
+#order = 5
+
+### in-/out-of-domain source/target corpora to train the 4 language models
+#
+# in-domain: point either to a parallel corpus
+#indomain-stem = [CORPUS:toy:clean-split-stem]
+
+# ... or to two separate monolingual corpora
+#indomain-target = [LM:toy:lowercased-corpus]
+#raw-indomain-source = $toy-data/nc-5k.$input-extension
+
+# point to out-of-domain parallel corpus
+#outdomain-stem = [CORPUS:giga:clean-split-stem]
+
+# settings: number of lines sampled from the corpora to train each language model on
+# (typically a million or so)
+#settings = "--line-count 1000000"
+
+#################################################################
 # TRANSLATION MODEL TRAINING

 [TRAINING]
@@ -300,6 +329,12 @@ alignment-symmetrization-method = grow-diag-final-and
 #
 #word-alignment = $working-dir/model/aligned.1

+### filtering some corpora with modified Moore-Lewis
+# specify corpora to be filtered and ratio to be kept, either before or after word alignment
+#mml-filter-corpora = toy
+#mml-before-wa = "-proportion 0.9"
+#mml-after-wa = "-proportion 0.9"
+
 ### create a bilingual concordancer for the model
 #
 #biconcor = $moses-script-dir/ems/biconcor/biconcor
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index d1363feb7..f9ffafa3a 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -277,12 +277,101 @@ binarize
  default-name: lm/interpolated-binlm
  error: set kMaxOrder to at least this value
 [MML] single
+tokenize-indomain-source
+ in: raw-indomain-source
+ out: tokenized-indomain-source
+ default-name: mml/indomain-source.tok
+ pass-unless: input-tokenizer
+ template: $input-tokenizer < IN > OUT
+ parallelizable: yes
+factorize-indomain-source
+ in: tokenized-indomain-source
+ out: factorized-indomain-source
+ rerun-on-change: TRAINING:input-factors
+ default-name: mml/indomain-source.factored
+ pass-unless: factors
+ parallelizable: yes
+ error: can't open
+ error: incompatible number of words in factor
+lowercase-indomain-source
+ in: factorized-indomain-source
+ out: lowercased-indomain-source
+ default-name: mml/indomain-source.lowercased
+ pass-unless: input-lowercaser
+ ignore-if: input-truecaser
+ only-factor-0: yes
+ template: $input-lowercaser < IN > OUT
+ parallelizable: yes
+truecase-indomain-source
+ in: factorized-indomain-source TRUECASER:truecase-model
+ out: lowercased-indomain-source
+ rerun-on-change: input-truecaser
+ default-name: mml/indomain-source.truecased
+ ignore-unless: input-truecaser
+ only-factor-0: yes
+ template: $input-truecaser -model IN1.$input-extension < IN > OUT
+ parallelizable: yes
+split-indomain-source
+ in: lowercased-indomain-source SPLITTER:splitter-model
+ out: indomain-source
+ rerun-on-change: input-splitter
+ default-name: mml/indomain-source.split
+ pass-unless: input-splitter
+ template: $input-splitter -model IN1.$input-extension < IN > OUT
+tokenize-indomain-target
+ in: raw-indomain-target
+ out: tokenized-indomain-target
+ default-name: mml/indomain-target.tok
+ pass-unless: output-tokenizer
+ template: $output-tokenizer < IN > OUT
+ parallelizable: yes
+factorize-indomain-target
+ in: tokenized-indomain-target
+ out: factorized-indomain-target
+ rerun-on-change: TRAINING:output-factors
+ default-name: mml/indomain-target.factored
+ pass-unless: factors
+ parallelizable: yes
+ error: can't open
+ error: incompatible number of words in factor
+lowercase-indomain-target
+ in: factorized-indomain-target
+ out: lowercased-indomain-target
+ default-name: mml/indomain-target.lowercased
+ pass-unless: output-lowercaser
+ ignore-if: output-truecaser
+ only-factor-0: yes
+ template: $output-lowercaser < IN > OUT
+ parallelizable: yes
+truecase-indomain-target
+ in: factorized-indomain-target TRUECASER:truecase-model
+ out: lowercased-indomain-target
+ rerun-on-change: output-truecaser
+ default-name: mml/indomain-target.truecased
+ ignore-unless: output-truecaser
+ only-factor-0: yes
+ template: $output-truecaser -model IN1.$output-extension < IN > OUT
+ parallelizable: yes
+split-indomain-target
+ in: lowercased-indomain-target SPLITTER:splitter-model
+ out: indomain-target
+ rerun-on-change: output-splitter
+ default-name: mml/indomain-target.split
+ pass-unless: output-splitter
+ template: $output-splitter -model IN1.$output-extension < IN > OUT
 train
  in: indomain-stem outdomain-stem
  out: model
- ignore-unless: settings
- default-name: model/mml
+ ignore-unless: AND settings indomain-stem
+ default-name: mml/model
  template: $moses-script-dir/ems/support/mml-train.perl -in-source IN.$input-extension -in-target IN.$output-extension -out-source IN1.$input-extension -out-target IN1.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings
+train-in-mono
+ in: indomain-source indomain-target outdomain-stem
+ out: model
+ ignore-unless: settings
+ ignore-if: indomain-stem
+ default-name: mml/model
+ template: $moses-script-dir/ems/support/mml-train.perl -in-source IN -in-target IN1 -out-source IN2.$input-extension -out-target IN2.$output-extension -model OUT -lm-training "$lm-training" -order $order -lm-settings "$lm-settings" -lm-binarizer $lm-binarizer $settings

 [TRAINING] single
 consolidate
diff --git a/scripts/ems/support/mml-train.perl b/scripts/ems/support/mml-train.perl
index bacf70823..57ff5ab83 100755
--- a/scripts/ems/support/mml-train.perl
+++ b/scripts/ems/support/mml-train.perl
@@ -36,7 +36,7 @@ die("ERROR: model not specified (-model FILESTEM)") unless defined($model);
 &train_lm($indomain_source,"in-source");
 &train_lm($indomain_target,"in-target");
 &train_lm($outdomain_source,"out-source");
-&train_lm($outdomain_source,"out-target");
+&train_lm($outdomain_target,"out-target");

 sub train_lm {
   my ($file,$type) = @_;
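
Note that the new [MML] section in the example configs ships disabled ([MML] IGNORE) with every setting commented out. As a usage sketch only — assuming $srilm-dir is set in [GENERAL] and reusing the toy and giga corpora already defined in these example configs — an activated section might read:

    [MML]

    # language models used for scoring
    lm-training = $srilm-dir/ngram-count
    lm-settings = "-interpolate -kndiscount -unk"
    lm-binarizer = $moses-src-dir/bin/build_binary
    lm-query = $moses-src-dir/bin/query
    order = 5

    # in-domain and out-of-domain parallel corpora
    indomain-stem = [CORPUS:toy:clean-split-stem]
    outdomain-stem = [CORPUS:giga:clean-split-stem]

    # lines sampled per language model
    settings = "--line-count 1000000"

together with the new switches in [TRAINING], e.g. to keep 90% of the filtered corpus before word alignment:

    mml-filter-corpora = toy
    mml-before-wa = "-proportion 0.9"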
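
The one-line change to mml-train.perl is the substantive bug fix: the script trains the four language models of bilingual modified Moore-Lewis filtering (Axelrod et al., 2011), and the out-of-domain target model was mistakenly trained on the out-of-domain source text. To make the role of the four models concrete, here is a minimal standalone sketch of the scoring step; the input format (one per-word cross-entropy per line and per model, as could be produced with the $lm-query binary) and the script itself are assumptions for illustration, not an interface added by this commit:

    #!/usr/bin/env perl
    # Sketch: rank sentence pairs by bilingual modified Moore-Lewis score.
    # Inputs: four files with one per-word cross-entropy per line, from the
    # in-source, out-source, in-target, and out-target language models.
    use strict;
    use warnings;

    my ($in_src, $out_src, $in_tgt, $out_tgt, $proportion) = @ARGV;
    $proportion = 0.9 unless defined $proportion;

    open(my $is, "<", $in_src)  or die "cannot open $in_src: $!";
    open(my $os, "<", $out_src) or die "cannot open $out_src: $!";
    open(my $it, "<", $in_tgt)  or die "cannot open $in_tgt: $!";
    open(my $ot, "<", $out_tgt) or die "cannot open $out_tgt: $!";

    my @score;
    while (defined(my $hi_s = <$is>)) {
      my ($ho_s, $hi_t, $ho_t) = (scalar <$os>, scalar <$it>, scalar <$ot>);
      chomp($hi_s, $ho_s, $hi_t, $ho_t);
      # lower score = more in-domain-like on both source and target side
      push @score, ($hi_s - $ho_s) + ($hi_t - $ho_t);
    }

    # keep the best-scoring fraction of sentence pairs; print their
    # 1-based line numbers in original corpus order
    my @ranked = sort { $score[$a] <=> $score[$b] } 0 .. $#score;
    my $keep   = int($proportion * scalar @score);
    print "$_\n" for sort { $a <=> $b } map { $_ + 1 } @ranked[0 .. $keep - 1];

This is why -proportion 0.9 in the mml-before-wa / mml-after-wa settings means keeping the 90% of sentence pairs that look most in-domain on both sides, and why training the out-target model on the wrong file silently skewed the target-side half of the score.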