Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-05-18 21:39:16 +0400
committer phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-05-18 21:39:16 +0400
commit b271862d7cbb476ec58f9ea5e7432f29f4c4a5c9 (patch)
tree b6788d0dc7271647e9a2a4c43b44e20d653fc5ef /scripts/ems
parent 9fec69ce312e518d7160b4d8fc3df8de8047c8d4 (diff)
various updates, mostly related to experiment.perl
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3262 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/ems')
-rw-r--r-- scripts/ems/example/config.basic 14
-rw-r--r-- scripts/ems/example/config.factored 14
-rw-r--r-- scripts/ems/example/config.hierarchical 16
-rw-r--r-- scripts/ems/example/config.syntax 16
-rw-r--r-- scripts/ems/example/config.toy 10
-rw-r--r-- scripts/ems/experiment.meta 9
-rwxr-xr-x scripts/ems/experiment.perl 63
-rwxr-xr-x scripts/ems/support/interpolate-lm.perl 6
-rwxr-xr-x scripts/ems/support/reference-from-sgm.perl 3
-rwxr-xr-x scripts/ems/support/run-command-on-multiple-refsets.perl 14
10 files changed, 152 insertions, 13 deletions
diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic
index 337a27e65..954b9c2a2 100644
--- a/scripts/ems/example/config.basic
+++ b/scripts/ems/example/config.basic
@@ -402,10 +402,16 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
### number of jobs (if parallel execution on cluster)
#
-jobs = 10
+#jobs = 10
+### additional decoder settings
+# switches for the Moses decoder
+#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
+### multiple reference translations
+#
+multiref = yes
### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
@@ -436,6 +442,12 @@ nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
+#
+# also report on input coverage
+analyze-coverage = yes
+#
+# also report on phrase mappings used
+report-segmentation = yes
[EVALUATION:newstest2009]
diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored
index a576b9f9a..967fbdea5 100644
--- a/scripts/ems/example/config.factored
+++ b/scripts/ems/example/config.factored
@@ -422,10 +422,16 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
### number of jobs (if parallel execution on cluster)
#
-jobs = 10
+#jobs = 10
+### additional decoder settings
+# switches for the Moses decoder
+#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
+### multiple reference translations
+#
+multiref = yes
### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
@@ -456,6 +462,12 @@ nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
+#
+# also report on input coverage
+analyze-coverage = yes
+#
+# also report on phrase mappings used
+report-segmentation = yes
[EVALUATION:newstest2009]
diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical
index 2f3f65634..682e8572b 100644
--- a/scripts/ems/example/config.hierarchical
+++ b/scripts/ems/example/config.hierarchical
@@ -402,10 +402,16 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
### number of jobs (if parallel execution on cluster)
#
-jobs = 10
+#jobs = 10
-decoder-settings = ""
+### additional decoder settings
+# switches for the Moses decoder
+#
+#decoder-settings = ""
+### multiple reference translations
+#
+multiref = yes
### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
@@ -436,6 +442,12 @@ nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
+#
+# also report on input coverage
+analyze-coverage = yes
+#
+# also report on phrase mappings used
+report-segmentation = yes
[EVALUATION:newstest2009]
diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax
index f6b1d8165..b0d791c14 100644
--- a/scripts/ems/example/config.syntax
+++ b/scripts/ems/example/config.syntax
@@ -406,10 +406,16 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
### number of jobs (if parallel execution on cluster)
#
-jobs = 10
+#jobs = 10
-decoder-settings = ""
+### additional decoder settings
+# switches for the Moses decoder
+#
+#decoder-settings = ""
+### multiple reference translations
+#
+multiref = yes
### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
@@ -440,6 +446,12 @@ nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
+#
+# also report on input coverage
+analyze-coverage = yes
+#
+# also report on phrase mappings used
+report-segmentation = yes
[EVALUATION:newstest2009]
diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy
index 8ed7176fd..dbae0dd9c 100644
--- a/scripts/ems/example/config.toy
+++ b/scripts/ems/example/config.toy
@@ -389,6 +389,10 @@ trainer = $moses-script-dir/recaser/train-truecaser.perl
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
+### multiple reference translations
+#
+multiref = yes
+
### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
@@ -418,6 +422,12 @@ nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
+#
+# also report on input coverage
+analyze-coverage = yes
+#
+# also report on phrase mappings used
+report-segmentation = yes
[EVALUATION:test]
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index a4670a9d6..37b559e5f 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -473,6 +473,7 @@ tokenize-reference
out: tokenized-reference
default-name: tuning/reference.tok
pass-unless: output-tokenizer
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
lowercase-reference
in: tokenized-reference
@@ -480,6 +481,7 @@ lowercase-reference
default-name: tuning/reference.lc
pass-unless: output-lowercaser
ignore-if: output-truecaser
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
truecase-reference
in: tokenized-reference TRUECASER:truecase-model
@@ -487,12 +489,14 @@ truecase-reference
rerun-on-change: output-truecaser
default-name: tuning/reference.tc
ignore-unless: output-truecaser
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-truecaser -model IN1.$output-extension < IN > OUT
split-reference
in: cased-reference SPLITTER:splitter-model
out: reference
default-name: tuning/reference.split
pass-unless: output-splitter
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-splitter -model IN1.$output-extension < IN > OUT
tune
in: TRAINING:config input reference
@@ -641,12 +645,14 @@ tokenize-reference
out: tokenized-reference
default-name: evaluation/reference.tok
pass-unless: output-tokenizer
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-tokenizer < IN > OUT
lowercase-reference
in: tokenized-reference
out: reference
default-name: evaluation/reference
pass-unless: output-lowercaser
+ multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
template: $output-lowercaser < IN > OUT
nist-bleu
in: wrapped-output reference-sgm
@@ -713,8 +719,7 @@ analysis-coverage
in: input TRAINING:corpus TRAINING:phrase-translation-table
out: analysis-coverage
default-name: evaluation/analysis
- ignore-unless: analysis analyze-coverage
- template: $analysis -input IN -input-corpus IN1.$input-extension -ttable IN2 -dir OUT
+ ignore-unless: AND analysis analyze-coverage
[REPORTING] single
report
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 8aaaeca67..48281ea5b 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -54,6 +54,7 @@ my (@MODULE,
%QSUB_SCRIPT, # flag if script contains qsub's when run on cluster
%QSUB_STEP, # flag if step contains qsub's when run on cluster
%RERUN_ON_CHANGE, # config parameter whose change invalidates old runs
+ %MULTIREF, # flag if step may be run on multiple sets (reference translations)
%TEMPLATE, # template if step follows a simple pattern
%TEMPLATE_IF, # part of template that is conditionally executed
%ONLY_FACTOR_0, # only run on a corpus that includes surface word
@@ -219,6 +220,9 @@ sub read_meta {
elsif ($1 eq "rerun-on-change") {
push @{$RERUN_ON_CHANGE{"$module:$step"}}, split(/\s+/,$2);
}
+ elsif ($1 eq "multiref") {
+ $MULTIREF{"$module:$step"} = $2;
+ }
elsif ($1 eq "template") {
$TEMPLATE{"$module:$step"} = $2;
}
@@ -434,7 +438,12 @@ sub find_steps_for_module {
# not needed, if optional and not specified
if (defined($STEP_IGNORE{$defined_step})) {
my $next = 0;
+ my $and = 0;
my @IGNORE = split(/ /,$STEP_IGNORE{$defined_step});
+ if ($IGNORE[0] eq "AND") {
+ $and = 1;
+ shift @IGNORE;
+ }
foreach my $ignore (@IGNORE) {
my $extended_name = &extend_local_name($module,$set,$ignore);
if (! &backoff_and_get($extended_name)) {
@@ -442,7 +451,8 @@ sub find_steps_for_module {
$next++;
}
}
- next if $next == scalar @IGNORE;
+ next if !$and && ($next == scalar @IGNORE); # OR: all parameters have to be missing
+ next if $and && $next; # AND: any parameter has to be missing
print "\t\t=> not all non-existant, not ignored" if $next && $VERBOSE;
}
@@ -919,6 +929,9 @@ sub define_step {
elsif ($DO_STEP[$i] =~ /^EVALUATION:(.+):analysis$/) {
&define_evaluation_analysis($1,$i);
}
+ elsif ($DO_STEP[$i] =~ /^EVALUATION:(.+):analysis-coverage$/) {
+ &define_evaluation_analysis_coverage($1,$i);
+ }
elsif ($DO_STEP[$i] =~ /^EVALUATION:(.+):meteor$/) {
# &define_evaluation_meteor($1);
}
@@ -2015,6 +2028,41 @@ sub define_evaluation_analysis {
&create_step($step_id,$cmd);
}
+sub define_evaluation_analysis_coverage {
+ my ($set,$step_id) = @_;
+ # Builds the coverage-analysis command for evaluation set $set and registers it as step $step_id.
+ my ($analysis,
+ $input,$corpus,$ttable) = &get_output_and_input($step_id);
+ my $script = &backoff_and_get("EVALUATION:$set:analysis");
+ my $input_extension = &check_backoff_and_get("TRAINING:input-extension");
+ # if TRAINING:input-factors is set, the table name is adjusted to the first mapping that begins with "0-"
+ # translation table name
+ if (&backoff_and_get("TRAINING:input-factors")) {
+ my %IN = &get_factor_id("input");
+ my %OUT = &get_factor_id("output");
+ my $factors = &encode_factor_definition("translation-factors",\%IN,\%OUT);
+ my @FACTOR = split(/\+/,$factors); # one entry per "+"-separated mapping
+ my @SPECIFIED_NAME;
+ if (&backoff_and_get("TRAINING:phrase-translation-table")) {
+ @SPECIFIED_NAME = @{$CONFIG{"TRAINING:phrase-translation-table"}};
+ }
+ for(my $i=0;$i<scalar(@FACTOR);$i++) {
+ if ($FACTOR[$i] =~ /^0-/) { # presumably the mapping that translates from factor 0 -- TODO confirm
+ if (scalar(@SPECIFIED_NAME) > $i) {
+ $ttable = $SPECIFIED_NAME[$i]; # name given in the configuration at the same index
+ }
+ else {
+ $ttable .= ".".$FACTOR[$i]; # otherwise append ".<mapping>" to the name from the step inputs
+ }
+ last; # only the first matching mapping is used
+ }
+ }
+ }
+ # assemble the call: corpus path gets the TRAINING:input-extension suffix, -dir is the step output
+ my $cmd = "$script -input $input -input-corpus $corpus.$input_extension -ttable $ttable -dir $analysis";
+ &create_step($step_id,$cmd);
+}
+
sub define_reporting_report {
my ($step_id) = @_;
@@ -2101,6 +2149,12 @@ sub define_template {
my ($module,$set,$stepname) = &deconstruct_name($step);
+ my $multiref = undef;
+ if ($MULTIREF{$defined_step} && # step needs to be run differently if multiple ref
+ &backoff_and_get(&extend_local_name($module,$set,"multiref"))) { # there are multiple ref
+ $multiref = $MULTIREF{$defined_step};
+ }
+
my ($output,@INPUT) = &get_output_and_input($step_id);
my $cmd;
@@ -2162,6 +2216,13 @@ sub define_template {
$cmd = $new_cmd;
$QSUB_STEP{$step_id}++;
}
+
+ # command to be run on multiple reference translations
+ if (defined($multiref)) {
+ $cmd =~ s/^(.+)IN (.+)OUT(.*)$/$multiref '$1 mref-input-file $2 mref-output-file $3' IN OUT/;
+ $cmd =~ s/^(.+)OUT(.+)IN (.*)$/$multiref '$1 mref-output-file $2 mref-input-file $3' IN OUT/;
+ }
+
# input is array, but just specified as IN
if ($cmd !~ /IN1/ && (scalar @INPUT) > 1 ) {
my $in = join(" ",@INPUT);
diff --git a/scripts/ems/support/interpolate-lm.perl b/scripts/ems/support/interpolate-lm.perl
index ff5f7289c..2c79d9b84 100755
--- a/scripts/ems/support/interpolate-lm.perl
+++ b/scripts/ems/support/interpolate-lm.perl
@@ -37,10 +37,10 @@ foreach my $lm (@LM) {
my $lm_order;
$lm .= ".gz" if (! -e $lm && -e "$lm.gz");
if ($lm =~ /gz$/) {
- open(LM,"zcat $lm|") || die("could not find language model file '$lm'");
+ open(LM,"zcat $lm|") || die("ERROR: could not find language model file '$lm'");
}
else {
- open(LM,$lm) || die("could not find language model file '$lm'");
+ open(LM,$lm) || die("ERROR: could not find language model file '$lm'");
}
while(<LM>) {
$lm_order = $1 if /ngram (\d+)/;
@@ -73,7 +73,7 @@ my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
die "Failed to mix models: $mixerr" if $mixexitcode != 0;
my $mix = $mixout;
`rm $tmp/iplm.$$.*`;
-$mix =~ /best lambda \(([\d\. ]+)\)/ || die("computing lambdas failed: $mix");
+$mix =~ /best lambda \(([\d\. ]+)\)/ || die("ERROR: computing lambdas failed: $mix");
my @LAMBDA = split(/ /,$1);
# create new language models
diff --git a/scripts/ems/support/reference-from-sgm.perl b/scripts/ems/support/reference-from-sgm.perl
index a02d755a0..87987b264 100755
--- a/scripts/ems/support/reference-from-sgm.perl
+++ b/scripts/ems/support/reference-from-sgm.perl
@@ -37,7 +37,7 @@ foreach my $system (keys %DOC) {
my $outfile = $txt;
if (scalar keys %DOC > 1) {
if ($outfile =~ /\.\d+$/) {
- $outfile =~ s/(\.\d+)$/$i$1/;
+ $outfile .= ".ref$i";
}
else {
$outfile .= $i;
@@ -51,4 +51,5 @@ foreach my $system (keys %DOC) {
}
}
close(TXT);
+ $i++;
}
diff --git a/scripts/ems/support/run-command-on-multiple-refsets.perl b/scripts/ems/support/run-command-on-multiple-refsets.perl
new file mode 100755
index 000000000..1a7ef9e8e
--- /dev/null
+++ b/scripts/ems/support/run-command-on-multiple-refsets.perl
@@ -0,0 +1,14 @@
+#!/usr/bin/perl -w
+# Runs cmd once per reference set: for every existing file in.refN (N = 0,1,...),
+use strict;
+# the placeholders mref-input-file / mref-output-file in cmd are replaced by in.refN / out.refN.
+die("ERROR: syntax: run-command-on-multiple-refsets.perl cmd in out")
+ unless scalar @ARGV == 3;
+my ($cmd,$in,$out) = @ARGV;
+# The loop ends at the first missing in.refN; a failing run aborts with an ERROR message instead of being ignored.
+for(my $i=0;-e "$in.ref$i";$i++) {
+ my $single_cmd = $cmd;
+ $single_cmd =~ s/mref-input-file/$in.ref$i/g;
+ $single_cmd =~ s/mref-output-file/$out.ref$i/g;
+ system($single_cmd) == 0 || die("ERROR: command failed (status $?) on reference set $i: $single_cmd");
+}