Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBarry Haddow <barry.haddow@gmail.com>2013-02-21 21:34:59 +0400
committerBarry Haddow <barry.haddow@gmail.com>2013-02-21 21:34:59 +0400
commit51ab9aa19dccefea54b45dc81a929301ba5d6ea5 (patch)
tree364018d46af083e52b0658359d535a055bdd92ac /scripts/training
parent87d7294d50d69da1833b6a78829154c444f2be6e (diff)
parent5844fb21a758a492b0847ba0939a7856a9a5cb68 (diff)
Merge remote branch 'origin/master' into phrase-weighting
Diffstat (limited to 'scripts/training')
-rw-r--r--scripts/training/LexicalTranslationModel.pm94
-rwxr-xr-xscripts/training/filter-model-given-input.pl28
-rwxr-xr-xscripts/training/mert-moses.pl12
-rwxr-xr-xscripts/training/train-model.perl175
-rwxr-xr-xscripts/training/wrappers/make-factor-en-pos.mxpost.perl5
-rwxr-xr-xscripts/training/zmert-moses.pl1121
6 files changed, 218 insertions, 1217 deletions
diff --git a/scripts/training/LexicalTranslationModel.pm b/scripts/training/LexicalTranslationModel.pm
index c0570df5c..08d161cc1 100644
--- a/scripts/training/LexicalTranslationModel.pm
+++ b/scripts/training/LexicalTranslationModel.pm
@@ -38,20 +38,59 @@ sub fix_spaces {
}
sub get_lexical {
- my ($alignment_file_f,$alignment_file_e,$alignment_file_a,$lexical_file,$write_counts) = @_;
+ my ($alignment_file_f,$alignment_file_e,$alignment_file_a,$lexical_file,$write_counts,$baseline_corpus_f,$baseline_corpus_e,$baseline_alignment, $instance_weights_file) = @_;
print STDERR "($alignment_file_f,$alignment_file_e,$lexical_file)\n";
+ print STDERR "baseline ($baseline_corpus_f,$baseline_corpus_e,$baseline_alignment)\n" if defined $baseline_alignment;
+ print STDERR "instance weights ($instance_weights_file)\n" if defined $instance_weights_file;
# my $alignment_file_a = $___ALIGNMENT_FILE.".".$___ALIGNMENT;
- my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);
if (-e "$lexical_file.f2e" && -e "$lexical_file.e2f" && (!$write_counts || -e "$lexical_file.counts.f2e" && -e "$lexical_file.counts.e2f")) {
print STDERR " reusing: $lexical_file.f2e and $lexical_file.e2f\n";
return;
}
+ my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);
+ &get_lexical_counts($alignment_file_e,$alignment_file_f,$alignment_file_a,$instance_weights_file,\%WORD_TRANSLATION,\%TOTAL_FOREIGN,\%TOTAL_ENGLISH);
+ if (defined($baseline_alignment)) {
+ &get_lexical_counts($baseline_corpus_e,$baseline_corpus_f,$baseline_alignment,undef,\%WORD_TRANSLATION,\%TOTAL_FOREIGN,\%TOTAL_ENGLISH);
+ }
+
+ open(F2E,">$lexical_file.f2e") or die "ERROR: Can't write $lexical_file.f2e";
+ open(E2F,">$lexical_file.e2f") or die "ERROR: Can't write $lexical_file.e2f";
+ if ($write_counts) {
+ open(F2E2,">$lexical_file.counts.f2e") or die "ERROR: Can't write $lexical_file.counts.f2e";
+ open(E2F2,">$lexical_file.counts.e2f") or die "ERROR: Can't write $lexical_file.counts.e2f";
+ }
+
+ foreach my $f (keys %WORD_TRANSLATION) {
+ foreach my $e (keys %{$WORD_TRANSLATION{$f}}) {
+ printf F2E "%s %s %.7f\n",$e,$f,$WORD_TRANSLATION{$f}{$e}/$TOTAL_FOREIGN{$f};
+ printf E2F "%s %s %.7f\n",$f,$e,$WORD_TRANSLATION{$f}{$e}/$TOTAL_ENGLISH{$e};
+ if ($write_counts) {
+ printf F2E2 "%s %s %i %i\n",$e,$f,$WORD_TRANSLATION{$f}{$e},$TOTAL_FOREIGN{$f};
+ printf E2F2 "%s %s %i %i\n",$f,$e,$WORD_TRANSLATION{$f}{$e},$TOTAL_ENGLISH{$e};
+ }
+ }
+ }
+ close(E2F);
+ close(F2E);
+ if ($write_counts) {
+ close(E2F2);
+ close(F2E2);
+ }
+ print STDERR "Saved: $lexical_file.f2e and $lexical_file.e2f\n";
+}
+
+sub get_lexical_counts {
+ my ($alignment_file_e,$alignment_file_f,$alignment_file_a,$instance_weights_file,$WORD_TRANSLATION,$TOTAL_FOREIGN,$TOTAL_ENGLISH) = @_;
open(E,&open_compressed($alignment_file_e)) or die "ERROR: Can't read $alignment_file_e";
open(F,&open_compressed($alignment_file_f)) or die "ERROR: Can't read $alignment_file_f";
open(A,&open_compressed($alignment_file_a)) or die "ERROR: Can't read $alignment_file_a";
+ my $W = undef;
+ if (defined($instance_weights_file) && $instance_weights_file) {
+ open($W, $instance_weights_file) or die "ERROR: Can't read $instance_weights_file";
+ }
my $alignment_id = 0;
while(my $e = <E>) {
@@ -61,7 +100,8 @@ sub get_lexical {
my $f = <F>; chomp($f); fix_spaces(\$f);
my @FOREIGN = split(/ /,$f);
my $a = <A>; chomp($a); fix_spaces(\$a);
-
+ my $iw = 1; # instance weight
+ $iw = <$W> if defined $W;
my (%FOREIGN_ALIGNED,%ENGLISH_ALIGNED);
foreach (split(/ /,$a)) {
my ($fi,$ei) = split(/\-/);
@@ -70,62 +110,36 @@ sub get_lexical {
}
else {
# local counts
- $FOREIGN_ALIGNED{$fi}++;
- $ENGLISH_ALIGNED{$ei}++;
+ $FOREIGN_ALIGNED{$fi}+=$iw;
+ $ENGLISH_ALIGNED{$ei}+=$iw;
# global counts
- $WORD_TRANSLATION{$FOREIGN[$fi]}{$ENGLISH[$ei]}++;
- $TOTAL_FOREIGN{$FOREIGN[$fi]}++;
- $TOTAL_ENGLISH{$ENGLISH[$ei]}++;
+ $$WORD_TRANSLATION{$FOREIGN[$fi]}{$ENGLISH[$ei]}+=$iw;
+ $$TOTAL_FOREIGN{$FOREIGN[$fi]}+=$iw;
+ $$TOTAL_ENGLISH{$ENGLISH[$ei]}+=$iw;
}
}
# unaligned words
for(my $ei=0;$ei<scalar(@ENGLISH);$ei++) {
next if defined($ENGLISH_ALIGNED{$ei});
- $WORD_TRANSLATION{"NULL"}{$ENGLISH[$ei]}++;
- $TOTAL_ENGLISH{$ENGLISH[$ei]}++;
- $TOTAL_FOREIGN{"NULL"}++;
+ $$WORD_TRANSLATION{"NULL"}{$ENGLISH[$ei]}+=$iw;
+ $$TOTAL_ENGLISH{$ENGLISH[$ei]}+=$iw;
+ $$TOTAL_FOREIGN{"NULL"}+=$iw;
}
for(my $fi=0;$fi<scalar(@FOREIGN);$fi++) {
next if defined($FOREIGN_ALIGNED{$fi});
- $WORD_TRANSLATION{$FOREIGN[$fi]}{"NULL"}++;
- $TOTAL_FOREIGN{$FOREIGN[$fi]}++;
- $TOTAL_ENGLISH{"NULL"}++;
+ $$WORD_TRANSLATION{$FOREIGN[$fi]}{"NULL"}+=$iw;
+ $$TOTAL_FOREIGN{$FOREIGN[$fi]}+=$iw;
+ $$TOTAL_ENGLISH{"NULL"}+=$iw;
}
}
print STDERR "\n";
close(A);
close(F);
close(E);
-
- open(F2E,">$lexical_file.f2e") or die "ERROR: Can't write $lexical_file.f2e";
- open(E2F,">$lexical_file.e2f") or die "ERROR: Can't write $lexical_file.e2f";
- if ($write_counts) {
- open(F2E2,">$lexical_file.counts.f2e") or die "ERROR: Can't write $lexical_file.counts.f2e";
- open(E2F2,">$lexical_file.counts.e2f") or die "ERROR: Can't write $lexical_file.counts.e2f";
- }
-
- foreach my $f (keys %WORD_TRANSLATION) {
- foreach my $e (keys %{$WORD_TRANSLATION{$f}}) {
- printf F2E "%s %s %.7f\n",$e,$f,$WORD_TRANSLATION{$f}{$e}/$TOTAL_FOREIGN{$f};
- printf E2F "%s %s %.7f\n",$f,$e,$WORD_TRANSLATION{$f}{$e}/$TOTAL_ENGLISH{$e};
- if ($write_counts) {
- printf F2E2 "%s %s %i %i\n",$e,$f,$WORD_TRANSLATION{$f}{$e},$TOTAL_FOREIGN{$f};
- printf E2F2 "%s %s %i %i\n",$f,$e,$WORD_TRANSLATION{$f}{$e},$TOTAL_ENGLISH{$e};
- }
- }
- }
- close(E2F);
- close(F2E);
- if ($write_counts) {
- close(E2F2);
- close(F2E2);
- }
- print STDERR "Saved: $lexical_file.f2e and $lexical_file.e2f\n";
}
-
END {
}
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index df9c528e0..d994fbcef 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -119,7 +119,11 @@ while(<INI>) {
print INI_OUT "2 $source_factor $t $w $new_name.bin$table_flag\n";
}
elsif ($binarizer && $phrase_table_impl == 0) {
- print INI_OUT "1 $source_factor $t $w $new_name$table_flag\n";
+ if ($binarizer =~ /processPhraseTableMin/) {
+ print INI_OUT "12 $source_factor $t $w $new_name$table_flag\n";
+ } else {
+ print INI_OUT "1 $source_factor $t $w $new_name$table_flag\n";
+ }
} else {
$new_name .= ".gz" if $opt_gzip;
print INI_OUT "$phrase_table_impl $source_factor $t $w $new_name$table_flag\n";
@@ -147,7 +151,7 @@ while(<INI>) {
$file =~ s/^.*\/+([^\/]+)/$1/g;
my $new_name = "$dir/$file";
- $new_name =~ s/\.gz//;
+ $new_name =~ s/\.gz//;
print INI_OUT "$factors $t $w $new_name\n";
push @TABLE_NEW_NAME,$new_name;
@@ -275,11 +279,16 @@ for(my $i=0;$i<=$#TABLE;$i++) {
# ... hierarchical translation model
if ($opt_hierarchical) {
my $cmd = "$binarizer $new_file $new_file.bin";
- print STDERR $cmd."\n";
- print STDERR `$cmd`;
+ print STDERR $cmd."\n";
+ print STDERR `$cmd`;
}
# ... phrase translation model
- else {
+ elsif ($binarizer =~ /processPhraseTableMin/) {
+ #compact phrase table
+ my $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
+ print STDERR $cmd."\n";
+ print STDERR `$cmd`;
+ } else {
my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
print STDERR $cmd."\n";
print STDERR `$cmd`;
@@ -289,8 +298,13 @@ for(my $i=0;$i<=$#TABLE;$i++) {
else {
my $lexbin = $binarizer;
$lexbin =~ s/PhraseTable/LexicalTable/;
- $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
- my $cmd = "$lexbin -in $new_file -out $new_file";
+ my $cmd;
+ if ($lexbin =~ /processLexicalTableMin/) {
+ $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
+ } else {
+ $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
+ $cmd = "$lexbin -in $new_file -out $new_file";
+ }
print STDERR $cmd."\n";
print STDERR `$cmd`;
}
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index c5492498c..6797b57f4 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -158,6 +158,7 @@ my $mertdir = undef; # path to new mert directory
my $mertargs = undef; # args to pass through to mert & extractor
my $mertmertargs = undef; # args to pass through to mert only
my $extractorargs = undef; # args to pass through to extractor only
+my $proargs = undef; # args to pass through to pro only
# Args to pass through to batch mira only. This flags is useful to
# change MIRA's hyperparameters such as regularization parameter C,
@@ -207,6 +208,7 @@ GetOptions(
"mertdir=s" => \$mertdir,
"mertargs=s" => \$mertargs,
"extractorargs=s" => \$extractorargs,
+ "proargs=s" => \$proargs,
"mertmertargs=s" => \$mertmertargs,
"rootdir=s" => \$SCRIPTS_ROOTDIR,
"filtercmd=s" => \$filtercmd, # allow to override the default location
@@ -376,12 +378,12 @@ my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt"); # or set t
if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
print "Could not find $pro_optimizer, installing it in $mertdir\n";
- my $megam_url = "http://www.cs.utah.edu/~hal/megam/";
+ my $megam_url = "http://www.umiacs.umd.edu/~hal/megam/";
if (&is_mac_osx()) {
die "Error: Sorry for Mac OS X users! Please get the source code of megam and compile by hand. Please see $megam_url for details.";
}
- `cd $mertdir; wget http://www.cs.utah.edu/~hal/megam/megam_i686.opt.gz;`;
+ `cd $mertdir; wget $megam_url/megam_i686.opt.gz;`;
`gunzip $pro_optimizer.gz`;
`chmod +x $pro_optimizer`;
die("ERROR: Installation of megam_i686.opt failed! Install by hand from $megam_url") unless -x $pro_optimizer;
@@ -432,6 +434,8 @@ $mert_extract_args .= " $extractorargs";
$mertmertargs = "" if !defined $mertmertargs;
+$proargs = "" unless $proargs;
+
my $mert_mert_args = "$mertargs $mertmertargs";
$mert_mert_args =~ s/\-+(binary|b)\b//;
$mert_mert_args .= " $scconfig";
@@ -904,11 +908,11 @@ while (1) {
my %sparse_weights; # sparse features
my $pro_optimizer_cmd = "$pro_optimizer $megam_default_options run$run.pro.data";
if ($___PAIRWISE_RANKED_OPTIMIZER) { # pro optimization
- $cmd = "$mert_pro_cmd $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer_cmd";
+ $cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer_cmd";
&submit_or_exec($cmd, $mert_outfile, $mert_logfile);
} elsif ($___PRO_STARTING_POINT) { # First, run pro, then mert
# run pro...
- my $pro_cmd = "$mert_pro_cmd $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer_cmd";
+ my $pro_cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer_cmd";
&submit_or_exec($pro_cmd, "run$run.pro.out", "run$run.pro.err");
# ... get results ...
($bestpoint,$devbleu) = &get_weights_from_mert("run$run.pro.out","run$run.pro.err",scalar @{$featlist->{"names"}},\%sparse_weights, \@phrase_weighting_mix_weights);
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 05287afee..5b0553581 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -34,12 +34,12 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2,
- $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
+ $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
@_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
- $_SPARSE_TRANSLATION_TABLE,
- $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $IGNORE);
+ $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_CORPUS, $_BASELINE_ALIGNMENT,
+ $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $IGNORE);
my $_CORES = 1;
my $debug = 0; # debug this script, do not delete any files in debug mode
@@ -116,7 +116,7 @@ $_HELP = 1
'source-syntax' => \$_SOURCE_SYNTAX,
'target-syntax' => \$_TARGET_SYNTAX,
'xml' => \$_XML,
- 'phrase-word-alignment' => \$_PHRASE_WORD_ALIGNMENT,
+ 'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
'config=s' => \$_CONFIG,
'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
'do-steps=s' => \$_DO_STEPS,
@@ -128,7 +128,13 @@ $_HELP = 1
'additional-ini=s' => \@_ADDITIONAL_INI,
'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE,
'sparse-translation-table' => \$_SPARSE_TRANSLATION_TABLE,
- 'cores=i' => \$_CORES
+ 'baseline-alignment-model=s{8}' => \@_BASELINE_ALIGNMENT_MODEL,
+ 'baseline-extract=s' => \$_BASELINE_EXTRACT,
+ 'baseline-corpus=s' => \$_BASELINE_CORPUS,
+ 'baseline-alignment=s' => \$_BASELINE_ALIGNMENT,
+ 'cores=i' => \$_CORES,
+ 'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
+ 'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
);
if ($_HELP) {
@@ -240,7 +246,12 @@ if ($STEPS[1] || $STEPS[2])
}
print STDERR "Using single-thread GIZA\n";
} else {
- $GIZA = "$_EXTERNAL_BINDIR/mgiza";
+ # accept either "mgiza" or "mgizapp" and either "snt2cooc.out" or "snt2cooc"
+ if (-x "$_EXTERNAL_BINDIR/mgiza") {
+ $GIZA = "$_EXTERNAL_BINDIR/mgiza";
+ } elsif (-x "$_EXTERNAL_BINDIR/mgizapp") {
+ $GIZA = "$_EXTERNAL_BINDIR/mgizapp";
+ }
if (-x "$_EXTERNAL_BINDIR/snt2cooc") {
$SNT2COOC = "$_EXTERNAL_BINDIR/snt2cooc";
} elsif (-x "$_EXTERNAL_BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $_EXTERNAL_BINDIR
@@ -373,6 +384,11 @@ my $___ALIGNMENT = "grow-diag-final";
$___ALIGNMENT = $_ALIGNMENT if $_ALIGNMENT;
my $___NOTE_ALIGNMENT_DROPS = 1;
+# baseline alignment model for incremental updating
+die "ERROR: buggy definition of baseline alignment model, should have 8 values:\n\t".join("\n\t",@_BASELINE_ALIGNMENT_MODEL)."\n"
+ unless scalar(@_BASELINE_ALIGNMENT_MODEL) == 8 || scalar(@_BASELINE_ALIGNMENT_MODEL) == 0;
+die "ERROR: use of baseline alignment model limited to HMM training (-hmm-align)\n"
+ if defined($___FINAL_ALIGNMENT_MODEL) && $___FINAL_ALIGNMENT_MODEL ne 'hmm' && scalar(@_BASELINE_ALIGNMENT_MODEL) == 8;
# model dir and alignment/extract file
my $___MODEL_DIR = $___ROOT_DIR."/model";
@@ -620,8 +636,8 @@ sub prepare {
&make_classes($corpus.".".$___F,$___VCB_F.".classes");
&make_classes($corpus.".".$___E,$___VCB_E.".classes");
- $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
- $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);
+ $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
+ $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
&numberize_txt_file($VCB_F,$corpus.".".$___F,
$VCB_E,$corpus.".".$___E,
@@ -659,8 +675,8 @@ sub prepare {
exit 0;
}
- $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
- $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);
+ $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
+ $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
&numberize_txt_file($VCB_F,$corpus.".".$___F,
$VCB_E,$corpus.".".$___E,
@@ -787,7 +803,7 @@ sub make_classes {
sub get_vocabulary {
# return unless $___LEXICAL_WEIGHTING;
- my($corpus,$vcb) = @_;
+ my($corpus,$vcb,$is_target) = @_;
print STDERR "(1.2) creating vcb file $vcb @ ".`date`;
my %WORD;
@@ -797,17 +813,37 @@ sub get_vocabulary {
foreach (split) { $WORD{$_}++; }
}
close(TXT);
-
+
+ my ($id,%VCB);
+ open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb";
+
+ # words from baseline alignment model when incrementally updating
+ if (scalar @_BASELINE_ALIGNMENT_MODEL) {
+ open(BASELINE_VCB,$_BASELINE_ALIGNMENT_MODEL[$is_target]);
+ while(<BASELINE_VCB>) {
+ chop;
+ my ($i,$word,$count) = split;
+ if (defined($WORD{$word})) {
+ $count += $WORD{$word};
+ delete($WORD{$word});
+ }
+ printf VCB "%d\t%s\t%d\n",$i,$word,$count;
+ $VCB{$word} = $i;
+ $id = $i+1;
+ }
+ close(BASELINE_VCB);
+ }
+ # not incrementally updating
+ else {
+ print VCB "1\tUNK\t0\n";
+ $id=2;
+ }
+
my @NUM;
foreach my $word (keys %WORD) {
my $vcb_with_number = sprintf("%07d %s",$WORD{$word},$word);
push @NUM,$vcb_with_number;
}
-
- my %VCB;
- open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb";
- print VCB "1\tUNK\t0\n";
- my $id=2;
foreach (reverse sort @NUM) {
my($count,$word) = split;
printf VCB "%d\t%s\t%d\n",$id,$word,$count;
@@ -986,15 +1022,30 @@ sub run_single_giza_on_parts {
close(SNT);
# run snt2cooc in parts
+ my @COOC_PART_FILE_NAME;
for(my $i=1;$i<=$___PARTS;$i++) {
&run_single_snt2cooc("$dir/part$i",$e,$f,$vcb_e,$vcb_f,"$___CORPUS_DIR/part$i/$f-$e-int-train.snt");
+ push @COOC_PART_FILE_NAME, "$dir/part$i/$f-$e.cooc";
}
+ # include baseline cooc, if baseline alignment model (incremental training)
+ if (scalar @_BASELINE_ALIGNMENT_MODEL) {
+ push @COOC_PART_FILE_NAME, $_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)];
+ }
+ &merge_cooc_files($dir,$e,$f,@COOC_PART_FILE_NAME);
+
+ # run giza
+ &run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
+}
+
+sub merge_cooc_files {
+ my ($dir,$e,$f,@COOC_PART_FILE_NAME) = @_;
# merge parts
open(COOC,">$dir/$f-$e.cooc") or die "ERROR: Can't write $dir/$f-$e.cooc";
my(@PF,@CURRENT);
- for(my $i=1;$i<=$___PARTS;$i++) {
- open($PF[$i],"$dir/part$i/$f-$e.cooc")or die "ERROR: Can't read $dir/part$i/$f-$e.cooc";
+ for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
+ print STDERR "merging cooc file $COOC_PART_FILE_NAME[$i]...\n";
+ open($PF[$i],$COOC_PART_FILE_NAME[$i]) or die "ERROR: Can't read $COOC_PART_FILE_NAME[$i]";
my $pf = $PF[$i];
$CURRENT[$i] = <$pf>;
chop($CURRENT[$i]) if $CURRENT[$i];
@@ -1002,7 +1053,7 @@ sub run_single_giza_on_parts {
while(1) {
my ($min1,$min2) = (1e20,1e20);
- for(my $i=1;$i<=$___PARTS;$i++) {
+ for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
next unless $CURRENT[$i];
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
if ($w1 < $min1 || ($w1 == $min1 && $w2 < $min2)) {
@@ -1012,7 +1063,7 @@ sub run_single_giza_on_parts {
}
last if $min1 == 1e20;
print COOC "$min1 $min2\n";
- for(my $i=1;$i<=$___PARTS;$i++) {
+ for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
next unless $CURRENT[$i];
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
if ($w1 == $min1 && $w2 == $min2) {
@@ -1022,13 +1073,10 @@ sub run_single_giza_on_parts {
}
}
}
- for(my $i=1;$i<=$___PARTS;$i++) {
+ for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
close($PF[$i]);
}
close(COOC);
-
- # run giza
- &run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
}
sub run_single_giza {
@@ -1083,6 +1131,12 @@ sub run_single_giza {
$GizaDefaultOptions{m5} = ($___FINAL_ALIGNMENT_MODEL eq '5')? 3: 0;
}
+ if (scalar(@_BASELINE_ALIGNMENT_MODEL)) {
+ $GizaDefaultOptions{oldTrPrbs} = $_BASELINE_ALIGNMENT_MODEL[4 + ($dir eq $___GIZA_F2E?2:0)];
+ $GizaDefaultOptions{oldAlPrbs} = $_BASELINE_ALIGNMENT_MODEL[5 + ($dir eq $___GIZA_F2E?2:0)];
+ $GizaDefaultOptions{step_k} = 1;
+ }
+
if ($___GIZA_OPTION) {
foreach (split(/[ ,]+/,$___GIZA_OPTION)) {
my ($option,$value) = split(/=/,$_,2);
@@ -1123,16 +1177,19 @@ sub run_single_giza {
}
sub run_single_snt2cooc {
- my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
- print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n";
- safesystem("mkdir -p $dir") or die("ERROR");
- if ($SNT2COOC eq "$_EXTERNAL_BINDIR/snt2cooc.out") {
- print "$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc\n";
- safesystem("$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc") or die("ERROR");
- } else {
- print "$SNT2COOC $dir/$f-$e.cooc $vcb_e $vcb_f $train\n";
- safesystem("$SNT2COOC $dir/$f-$e.cooc $vcb_e $vcb_f $train") or die("ERROR");
- }
+ my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
+ print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n";
+ my $suffix = (scalar @_BASELINE_ALIGNMENT_MODEL) ? ".new" : "";
+ safesystem("mkdir -p $dir") or die("ERROR");
+ if ($SNT2COOC eq "$_EXTERNAL_BINDIR/snt2cooc.out") {
+ print "$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc$suffix\n";
+ safesystem("$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc$suffix") or die("ERROR");
+ } else {
+ print "$SNT2COOC $dir/$f-$e.cooc$suffix $vcb_e $vcb_f $train\n";
+ safesystem("$SNT2COOC $dir/$f-$e.cooc$suffix $vcb_e $vcb_f $train") or die("ERROR");
+ }
+ &merge_cooc_files($dir,$e,$f,"$dir/$f-$e.cooc.new",$_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)])
+ if scalar @_BASELINE_ALIGNMENT_MODEL;
}
### (3) CREATE WORD ALIGNMENT FROM GIZA ALIGNMENTS
@@ -1200,7 +1257,11 @@ sub get_lexical_factored {
$___CORPUS.".".$___E,
$___ALIGNMENT_FILE.".".$___ALIGNMENT,
$___LEXICAL_FILE,
- $___LEXICAL_COUNTS);
+ $___LEXICAL_COUNTS,
+ $_BASELINE_CORPUS.".".$___F,
+ $_BASELINE_CORPUS.".".$___E,
+ $_BASELINE_ALIGNMENT,
+ $_INSTANCE_WEIGHTS_FILE);
}
else {
foreach my $factor (split(/\+/,$___TRANSLATION_FACTORS)) {
@@ -1218,7 +1279,11 @@ sub get_lexical_factored {
$___ALIGNMENT_STEM.".".$factor_e.".".$___E,
$___ALIGNMENT_FILE.".".$___ALIGNMENT,
$lexical_file,
- $___LEXICAL_COUNTS);
+ $___LEXICAL_COUNTS,
+ $_BASELINE_CORPUS.".".$factor_f.".".$___F,
+ $_BASELINE_CORPUS.".".$factor_e.".".$___E,
+ $_BASELINE_ALIGNMENT,
+ $_INSTANCE_WEIGHTS_FILE);
}
}
}
@@ -1326,11 +1391,12 @@ sub extract_phrase {
}
}
my $cmd;
+ my $suffix = (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) ? ".new" : "";
if ($_HIERARCHICAL)
{
my $max_length = &get_max_phrase_length($table_number);
- $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file";
+ $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file$suffix";
$cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR;
$cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
$cmd .= " --PCFG" if $_PCFG;
@@ -1347,28 +1413,43 @@ sub extract_phrase {
{
if ( $_EPPEX ) {
# eppex sets max_phrase_length itself (as the maximum phrase length for which any Lossy Counter is defined)
- $cmd = "$EPPEX $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $_EPPEX";
+ $cmd = "$EPPEX $alignment_file_e $alignment_file_f $alignment_file_a $extract_file$suffix $_EPPEX";
}
else {
my $max_length = &get_max_phrase_length($table_number);
print "MAX $max_length $reordering_flag $table_number\n";
$max_length = &get_max_phrase_length(-1) if $reordering_flag;
- $cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $max_length";
+ $cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file$suffix $max_length";
}
if ($reordering_flag) {
$cmd .= " orientation";
$cmd .= get_extract_reordering_flags();
$cmd .= " --NoTTable" if !$ttable_flag;
- $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
+ $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
$cmd .= " --GZOutput ";
+ $cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE;
+ $cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/;
map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
print STDERR "$cmd\n";
safesystem("$cmd") or die "ERROR: Phrase extraction failed (missing input files?)";
+
+ if (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) {
+ print STDERR "merging with baseline extract from $_BASELINE_EXTRACT\n";
+ safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | gzip > $extract_file.gz");
+ safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | gzip > $extract_file.inv.gz");
+ safesystem("$ZCAT $_BASELINE_EXTRACT.o.gz $extract_file$suffix.o.gz | gzip > $extract_file.o.gz")
+ if -e "$extract_file$suffix.o.gz";
+ safesystem("rm $extract_file$suffix.gz");
+ safesystem("rm $extract_file$suffix.inv.gz");
+ safesystem("rm $extract_file$suffix.o.gz")
+ if -e "$extract_file$suffix.o.gz";
+ }
+
foreach my $f (@tempfiles) {
unlink $f;
}
@@ -1471,7 +1552,7 @@ sub score_phrase_phrase_extract {
my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction.gz $inverse";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
- $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
+ $cmd .= " --NoWordAlignment" if $_OMIT_WORD_ALIGNMENT;
$cmd .= " --KneserNey" if $KNESER_NEY;
$cmd .= " --GoodTuring" if $GOOD_TURING && $inverse eq "";
$cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
@@ -1491,7 +1572,7 @@ sub score_phrase_phrase_extract {
$cmd .= " 0 ";
}
- print $cmd."\n";
+ print STDERR $cmd."\n";
safesystem($cmd) or die "ERROR: Scoring of phrases failed";
exit();
@@ -1909,8 +1990,12 @@ sub create_ini {
}
print INI "\n# language model weights\n[weight-l]\n";
my $lmweighttotal = 0.5;
+ my $lmoovweighttotal = 0.1;
foreach(1..scalar @___LM) {
printf INI "%.4f\n", $lmweighttotal / scalar @___LM;
+ if ($_LMODEL_OOV_FEATURE) {
+ printf INI "%.4f\n", $lmoovweighttotal / scalar @___LM;
+ }
}
print INI "\n\n# translation model weights\n[weight-t]\n";
@@ -1954,6 +2039,10 @@ sub create_ini {
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
}
+ if ($_LMODEL_OOV_FEATURE) {
+ print INI "\n# language model OOV feature enabled\n[lmodel-oov-feature]\n1\n\n";
+ }
+
# get addititional content for config file from switch or file
if ($_ADDITIONAL_INI) {
print INI "\n# additional settings\n\n";
diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
index 305a6ec52..c3c309bad 100755
--- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
+++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
@@ -1,6 +1,7 @@
#!/usr/bin/perl -w
use strict;
+use FindBin qw($RealBin);
use Getopt::Long "GetOptions";
my ($IN,$OUT,$MXPOST);
@@ -14,8 +15,8 @@ if (!&GetOptions('mxpost=s' => \$MXPOST) ||
my $pipeline = "perl -ne 'chop; tr/\\x20-\\x7f/\?/c; print \$_.\"\\n\";' | tee debug | ";
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
-open(TAGGER,"cat $IN | $pipeline");
-open(OUT,">$OUT");
+open(TAGGER,"$RealBin/../../tokenizer/deescape-special-chars.perl < $IN | $pipeline");
+open(OUT,"| $RealBin/../../tokenizer/escape-special-chars.perl > $OUT");
while(<TAGGER>) {
foreach my $word_pos (split) {
$word_pos =~ s/\/([^\/]+)$/_$1/;
diff --git a/scripts/training/zmert-moses.pl b/scripts/training/zmert-moses.pl
deleted file mode 100755
index ecd783fa2..000000000
--- a/scripts/training/zmert-moses.pl
+++ /dev/null
@@ -1,1121 +0,0 @@
-#!/usr/bin/perl -w
-
-# Usage:
-# zmert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
-# For other options see below or run 'zmert-moses.pl --help'
-
-# Notes:
-# <foreign> and <english> should be raw text files, one sentence per line
-# <english> can be a prefix, in which case the files are <english>0, <english>1, etc. are used
-
-# Revision history
-
-# 29 Dec 2009 Derived from mert-moses-new.pl (Kamil Kos)
-
-use FindBin qw($RealBin);
-use File::Basename;
-my $SCRIPTS_ROOTDIR = $RealBin;
-$SCRIPTS_ROOTDIR =~ s/\/training$//;
-$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
-
-# for each _d_istortion, _l_anguage _m_odel, _t_ranslation _m_odel and _w_ord penalty, there is a list
-# of [ default value, lower bound, upper bound ]-triples. In most cases, only one triple is used,
-# but the translation model has currently 5 features
-
-# defaults for initial values and ranges are:
-
-my $default_triples = {
- # these two basic models exist even if not specified, they are
- # not associated with any model file
- "w" => [ [ 0.0, -1.0, 1.0 ] ], # word penalty
-};
-
-my $additional_triples = {
- # if the more lambda parameters for the weights are needed
- # (due to additional tables) use the following values for them
- "d" => [ [ 1.0, 0.0, 2.0 ], # lexicalized reordering model
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ] ],
- "lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model
- "g" => [ [ 1.0, 0.0, 2.0 ], # generation model
- [ 1.0, 0.0, 2.0 ] ],
- "tm" => [ [ 0.3, 0.0, 0.5 ], # translation model
- [ 0.2, 0.0, 0.5 ],
- [ 0.3, 0.0, 0.5 ],
- [ 0.2, 0.0, 0.5 ],
- [ 0.0,-1.0, 1.0 ] ], # ... last weight is phrase penalty
- "lex"=> [ [ 0.1, 0.0, 0.2 ] ], # global lexical model
-};
-
-# moses.ini file uses FULL names for lambdas, while this training script internally (and on the command line)
-# uses ABBR names.
-my $ABBR_FULL_MAP = "d=weight-d lm=weight-l tm=weight-t w=weight-w g=weight-generation lex=weight-lex";
-my %ABBR2FULL = map {split/=/,$_,2} split /\s+/, $ABBR_FULL_MAP;
-my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP;
-
-# We parse moses.ini to figure out how many weights do we need to optimize.
-# For this, we must know the correspondence between options defining files
-# for models and options assigning weights to these models.
-my $TABLECONFIG_ABBR_MAP = "ttable-file=tm lmodel-file=lm distortion-file=d generation-file=g global-lexical-file=lex";
-my %TABLECONFIG2ABBR = map {split(/=/,$_,2)} split /\s+/, $TABLECONFIG_ABBR_MAP;
-
-# There are weights that do not correspond to any input file, they just increase the total number of lambdas we optimize
-#my $extra_lambdas_for_model = {
-# "w" => 1, # word penalty
-# "d" => 1, # basic distortion
-#};
-
-my $verbose = 0;
-my $___MERT_VERBOSE = 1; # verbosity of zmert (values: 0-2)
-my $___DECODER_VERBOSE = 1; # should decoder output be included? - 0:no,1:yes
-my $___SAVE_INTER = 2; # save intermediate nbest-lists
-my $usage = 0; # request for --help
-my $___WORKING_DIR = "mert-work";
-my $___DEV_F = undef; # required, input text to decode
-my $___DEV_E = undef; # required, basename of files with references
-my $___DECODER = undef; # required, pathname to the decoder executable
-my $___CONFIG = undef; # required, pathname to startup ini file
-my $___N_BEST_LIST_SIZE = 100;
-my $___MAX_MERT_ITER = 0; # do not limit the number of iterations
-my $queue_flags = "-l mem_free=0.5G -hard"; # extra parameters for parallelizer
- # the -l ws0ssmt is relevant only to JHU workshop
-my $___JOBS = undef; # if parallel, number of jobs to use (undef -> serial)
-my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder
-my $___LAMBDA = undef; # string specifying the seed weights and boundaries of all lambdas
-my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert)
-my $___FILTER_PHRASE_TABLE = 1; # filter phrase table
-my $___PREDICTABLE_SEEDS = 0;
-my $___METRIC = "BLEU 4 shortest"; # name of metric that will be used for minimum error training, followed by metric parameters (see zmert documentation)
-my $___SEMPOSBLEU_WEIGHTS = "1 1"; # weights of SemPOS and BLEU
-my $___LAMBDAS_OUT = undef; # file where final lambdas should be written
-my $___EXTRACT_SEMPOS = "none"; # how shall we get the SemPOS factor (only for SemPOS metric)
- # options: 1) 'none' - moses generates SemPOS factor in required format
- # (<word_form>|<SemPOS>)
- # 2) 'factors:<factor_index_list>' - extract factors from decoder output on positions from <factor_index_list>
- # <factor_index_list> contains indices of factors separated by comma, e.g. '0,1,4'
- # 3) 'tmt' - moses outputs only <word_form> and we need to
- # generate factors like SemPOS with TectoMT (see http://ufal.mff.cuni.cz/tectomt/)
-
-# set 1 if using with async decoder
-my $___ASYNC = 0;
-
-# Use "--norm" to select normalization in mert
-my $___NORM = "none";
-
-# set 0 if input type is text, set 1 if input type is confusion network
-my $___INPUTTYPE = 0;
-
-my $mertdir = "$SCRIPTS_ROOTDIR/../zmert/"; # path to zmert directory
-my $filtercmd = undef; # path to filter-model-given-input.pl
-my $clonecmd = "$SCRIPTS_ROOTDIR/training/clone_moses_model.pl"; # executable clone_moses_model.pl
-my $qsubwrapper = undef;
-my $moses_parallel_cmd = undef;
-my $old_sge = 0; # assume sge<6.0
-my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
- # if undef work on all features
- # (others are fixed to the starting values)
-my %active_features; # hash with features to optimize; optimize all if empty
-
-use strict;
-use Getopt::Long;
-GetOptions(
- "working-dir=s" => \$___WORKING_DIR,
- "input=s" => \$___DEV_F,
- "inputtype=i" => \$___INPUTTYPE,
- "refs=s" => \$___DEV_E,
- "decoder=s" => \$___DECODER,
- "config=s" => \$___CONFIG,
- "nbest:i" => \$___N_BEST_LIST_SIZE,
- "maxiter:i" => \$___MAX_MERT_ITER,
- "queue-flags:s" => \$queue_flags,
- "jobs=i" => \$___JOBS,
- "decoder-flags=s" => \$___DECODER_FLAGS,
- "lambdas=s" => \$___LAMBDA,
- "metric=s" => \$___METRIC,
- "semposbleu-weights:s" => \$___SEMPOSBLEU_WEIGHTS,
- "extract-sempos=s" => \$___EXTRACT_SEMPOS,
- "norm:s" => \$___NORM,
- "help" => \$usage,
- "verbose" => \$verbose,
- "mert-verbose:i" => \$___MERT_VERBOSE,
- "decoder-verbose:i" => \$___DECODER_VERBOSE,
- "mertdir:s" => \$mertdir, # allow to override the default location of zmert.jar
- "lambdas-out:s" => \$___LAMBDAS_OUT,
- "rootdir=s" => \$SCRIPTS_ROOTDIR,
- "filtercmd=s" => \$filtercmd, # allow to override the default location
- "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location
- "mosesparallelcmd=s" => \$moses_parallel_cmd, # allow to override the default location
- "old-sge" => \$old_sge, #passed to moses-parallel
- "filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # allow (disallow)filtering of phrase tables
- "predictable-seeds:s" => \$___PREDICTABLE_SEEDS, # allow (disallow) switch on/off reseeding of random restarts
- "async=i" => \$___ASYNC, #whether script to be used with async decoder
- "activate-features=s" => \$___ACTIVATE_FEATURES #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
-) or exit(1);
-
-print "Predict $___PREDICTABLE_SEEDS\n";
-
-# the 4 required parameters can be supplied on the command line directly
-# or using the --options
-if (scalar @ARGV == 4) {
- # required parameters: input_file references_basename decoder_executable
- $___DEV_F = shift;
- $___DEV_E = shift;
- $___DECODER = shift;
- $___CONFIG = shift;
-}
-
-if ($___ASYNC) {
- delete $default_triples->{"w"};
- $additional_triples->{"w"} = [ [ 0.0, -1.0, 1.0 ] ];
-}
-
-print STDERR "After default: $queue_flags\n";
-
-if ($usage || !defined $___DEV_F || !defined$___DEV_E || !defined$___DECODER || !defined $___CONFIG) {
- print STDERR "usage: zmert-moses.pl input-text references decoder-executable decoder.ini
-Options:
- --working-dir=mert-dir ... where all the files are created
- --nbest=100 ... how big nbestlist to generate
- --maxiter=N ... maximum number of zmert iterations
- --jobs=N ... set this to anything to run moses in parallel
- --mosesparallelcmd=STRING ... use a different script instead of moses-parallel
- --queue-flags=STRING ... anything you with to pass to
- qsub, eg. '-l ws06osssmt=true'
- The default is
- -l mem_free=0.5G -hard
- To reset the parameters, please use \"--queue-flags=' '\" (i.e. a space between
- the quotes).
- --decoder-flags=STRING ... extra parameters for the decoder
- --lambdas=STRING ... default values and ranges for lambdas, a complex string
- such as 'd:1,0.5-1.5 lm:1,0.5-1.5 tm:0.3,0.25-0.75;0.2,0.25-0.75;0.2,0.25-0.75;0.3,0.25-0.75;0,-0.5-0.5 w:0,-0.5-0.5'
- --allow-unknown-lambdas ... keep going even if someone supplies a new lambda
- in the lambdas option (such as 'superbmodel:1,0-1'); optimize it, too
- --lambdas-out=STRING ... file where final lambdas should be written
- --metric=STRING ... metric name for optimization with metric parameters
- such as 'BLEU 4 closest' or 'SemPOS 0 1'. Use default parameters by specifying 'BLEU' or 'SemPOS'
- --semposbleu-weights=STRING ... weights for SemPOS and BLEU in format 'N:M' where 'N' is SemPOS weight and 'M' BLEU weight
- used only with SemPOS_BLEU metric
- --extract-sempos=STRING ... none|factors:<factor_list>|tmt
- 'none' ... decoder generates all required factors for optimization metric
- 'factors:<factor_list>' ... extract factors with index in <factor_list> from decoder output
- e.g. 'factors:0,2,3' to extract first, third and fourth factor from decoder output
- 'tmt' ... use TectoMT (see http://ufal.mff.cuni.cz/tectomt) to generate required factors
- --norm ... Select normalization for zmert
- --mert-verbose=N ... verbosity of zmert [0|1|2]
- --decoder-verbose=N ... decoder verbosity [0|1] - 1=decoder output included
- --mertdir=STRING ... directory with zmert.jar
- --filtercmd=STRING ... path to filter-model-given-input.pl
- --rootdir=STRING ... where do helpers reside (if not given explicitly)
- --mertdir=STRING ... path to zmert implementation
- --scorenbestcmd=STRING ... path to score-nbest.py
- --old-sge ... passed to moses-parallel, assume Sun Grid Engine < 6.0
- --inputtype=[0|1|2] ... Handle different input types (0 for text, 1 for confusion network, 2 for lattices, default is 0)
- --no-filter-phrase-table ... disallow filtering of phrase tables
- (useful if binary phrase tables are available)
- --predictable-seeds ... provide predictable seeds to mert so that random restarts are the same on every run
- --activate-features=STRING ... comma-separated list of features to work on
- (if undef work on all features)
- # (others are fixed to the starting values)
- --verbose ... verbosity of this script
- --help ... print this help
-
-";
- exit 1;
-}
-
-# ensure we know where is tectomt, if we need it
-if( !defined $ENV{"TMT_ROOT"} && $___EXTRACT_SEMPOS =~ /tmt/) {
- die "Cannot find TMT_ROOT. Is TectoMT really initialized?";
-}
-my $TMT_ROOT = $ENV{"TMT_ROOT"};
-
-my $srunblocks = "$TMT_ROOT/tools/srunblocks_streaming/srunblocks";
-my $scenario_file = "scenario";
-my $qruncmd = "/home/bojar/diplomka/bin/qruncmd";
-my $srunblocks_cmd = "$srunblocks --errorlevel=FATAL $scenario_file czech_source_sentence factored_output";
-if (defined $___JOBS && $___JOBS > 1) {
- die "Can't run $qruncmd" if ! -x $qruncmd;
- $srunblocks_cmd = "$qruncmd --jobs=$___JOBS --join '$srunblocks_cmd'";
-}
-
-
-# update variables if input is confusion network
-if ($___INPUTTYPE == 1)
-{
- $ABBR_FULL_MAP = "$ABBR_FULL_MAP I=weight-i";
- %ABBR2FULL = map {split/=/,$_,2} split /\s+/, $ABBR_FULL_MAP;
- %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP;
-
- push @{$default_triples -> {"I"}}, [ 1.0, 0.0, 2.0 ];
- #$extra_lambdas_for_model -> {"I"} = 1; #Confusion network posterior
-}
-
-# update variables if input is lattice
-if ($___INPUTTYPE == 2)
-{
-# TODO
-}
-
-if (defined $___ACTIVATE_FEATURES)
-{
- %active_features = map {$_ => 1} split( /,/, $___ACTIVATE_FEATURES);
-}
-
-# Check validity of input parameters and set defaults if needed
-
-print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";
-
-# path of script for filtering phrase tables and running the decoder
-$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd;
-
-$qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper;
-
-$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl"
- if !defined $moses_parallel_cmd;
-
-
-
-die "Error: need to specify the zmert.jar directory" if !defined $mertdir;
-
-my $zmert_classpath = ensure_full_path("$mertdir/zmert.jar");
-die "File not found: $mertdir/zmert.jar (interpreted as $zmert_classpath)"
- if ! -e $zmert_classpath;
-
-my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd);
-die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd;
-die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd;
-die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
-die "Not executable: $___DECODER" if ! -x $___DECODER;
-
-my $input_abs = ensure_full_path($___DEV_F);
-die "File not found: $___DEV_F (interpreted as $input_abs)."
- if ! -e $input_abs;
-$___DEV_F = $input_abs;
-
-
-# Option to pass to qsubwrapper and moses-parallel
-my $pass_old_sge = $old_sge ? "-old-sge" : "";
-
-my $decoder_abs = ensure_full_path($___DECODER);
-die "File not found: $___DECODER (interpreted as $decoder_abs)."
- if ! -x $decoder_abs;
-$___DECODER = $decoder_abs;
-
-
-my $ref_abs = ensure_full_path($___DEV_E);
-# check if English dev set (reference translations) exist and store a list of all references
-my @references;
-my @references_factored;
-if (-e $ref_abs) {
- push @references, $ref_abs;
-}
-else {
- # if multiple file, get a full list of the files
- my $part = 0;
- while (-e $ref_abs.$part) {
- push @references, $ref_abs.$part;
- $part++;
- }
- die("Reference translations not found: $___DEV_E (interpreted as $ref_abs)") unless $part;
-}
-
-my $config_abs = ensure_full_path($___CONFIG);
-die "File not found: $___CONFIG (interpreted as $config_abs)."
- if ! -e $config_abs;
-$___CONFIG = $config_abs;
-
-
-
-# check validity of moses.ini and collect number of models and lambdas per model
-# need to make a copy of $extra_lambdas_for_model, scan_config spoils it
-#my %copy_of_extra_lambdas_for_model = %$extra_lambdas_for_model;
-my %used_triples = %{$default_triples};
-my ($models_used) = scan_config($___CONFIG);
-
-# Parse the lambda config string and convert it to a nice structure in the same format as $used_triples
-if (defined $___LAMBDA) {
- my %specified_triples;
- # interpreting lambdas from command line
- foreach (split(/\s+/,$___LAMBDA)) {
- my ($name,$values) = split(/:/);
- die "Malformed setting: '$_', expected name:values\n" if !defined $name || !defined $values;
- foreach my $startminmax (split/;/,$values) {
- if ($startminmax =~ /^(-?[\.\d]+),(-?[\.\d]+)-(-?[\.\d]+)$/) {
- my $start = $1;
- my $min = $2;
- my $max = $3;
- push @{$specified_triples{$name}}, [$start, $min, $max];
- }
- else {
- die "Malformed feature range definition: $name => $startminmax\n";
- }
- }
- }
- # sanity checks for specified lambda triples
- foreach my $name (keys %used_triples) {
- die "No lambdas specified for '$name', but ".($#{$used_triples{$name}}+1)." needed.\n"
- unless defined($specified_triples{$name});
- die "Number of lambdas specified for '$name' (".($#{$specified_triples{$name}}+1).") does not match number needed (".($#{$used_triples{$name}}+1).")\n"
- if (($#{$used_triples{$name}}) != ($#{$specified_triples{$name}}));
- }
- foreach my $name (keys %specified_triples) {
- die "Lambdas specified for '$name' ".(@{$specified_triples{$name}}).", but none needed.\n"
- unless defined($used_triples{$name});
- }
- %used_triples = %specified_triples;
-}
-
-# moses should use our config
-if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(generation-file) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) /
-) {
- die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files.";
-}
-
-#store current directory and create the working directory (if needed)
-my $cwd = `pawd 2>/dev/null`;
-if(!$cwd){$cwd = `pwd`;}
-chomp($cwd);
-
-safesystem("mkdir -p $___WORKING_DIR") or die "Can't mkdir $___WORKING_DIR";
-
-{
-# open local scope
-
-#chdir to the working directory
-chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR";
-
-# fixed file names
-my $mert_logfile = "zmert.log";
-
-if ($___FILTER_PHRASE_TABLE){
- # filter the phrase tables wih respect to input, use --decoder-flags
- print "filtering the phrase tables... ".`date`;
- my $cmd = "$filtercmd ./filtered $___CONFIG $___DEV_F";
- if (defined $___JOBS) {
- safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=filterphrases.out -stderr=filterphrases.err" )
- or die "Failed to submit filtering of tables to the queue (via $qsubwrapper)";
- } else {
- safesystem($cmd) or die "Failed to filter the tables.";
- }
-
- # the decoder should now use the filtered model
- $___CONFIG = "filtered/moses.ini";
-}
-else{
- # make a local clone of moses.ini
- safesystem("$clonecmd $___CONFIG");
- $___CONFIG = "moses.ini";
-}
-
-$___CONFIG = ensure_full_path($___CONFIG);
-
-my $PARAMETERS;
-$PARAMETERS = $___DECODER_FLAGS;
-
-my $nbest_file = "zmert.best$___N_BEST_LIST_SIZE.out";
-
-# Run zmert to optimize lambdas
-# We need to prepare:
-# 1) decoder launch script (decoder_cmd) - must be executable
-# 2) zmert configuration file (zmert_cfg.txt)
-# 3) parameters we want to optimize (params.txt)
-# 4) decoder configuration file (decoder_cfg_inter.txt)
-
-
-my $zmert_cfg = ensure_full_path("zmert_cfg.txt");
-my $opt_params = "params.txt"; # zmert requires path relative to launch path
-my $decoder_cfg_inter = "decoder_cfg_inter.txt"; # zmert requires path relative to launch path
-my $decoder_cmd_file = ensure_full_path("decoder_cmd");
-my $iteration_file = "iteration";
-
-my $LAMBDAS_FILE = ensure_full_path("finalWeights.txt");
-
-# prepare script that will launch moses from template
-# it will include an update script that will adjust feature weights according to
-# the last zmert iteration (they are stored in file $decoder_cfg_inter)
-
-# prepare lauch command with all parameters
-my $decoder_cmd;
-if (defined $___JOBS) {
- $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix zmert -queue-parameters '$queue_flags' -decoder-parameters '$PARAMETERS' -n-best-list '$nbest_file $___N_BEST_LIST_SIZE' -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > moses.out";
-} else {
- $decoder_cmd = "$___DECODER $PARAMETERS -config $___CONFIG -inputtype $___INPUTTYPE -n-best-list $nbest_file $___N_BEST_LIST_SIZE -i $___DEV_F > moses.out";
-}
-
-my $zmert_decoder_cmd = "$SCRIPTS_ROOTDIR/training/zmert-decoder.pl";
-
-# number of factors that a given metric requires
-my $metric_num_factors = 1;
-
-# SemPOS metric requires 2 parameters specifying position of t_lemma and sempos factor
-# e.g. for t_lemma|sempos|factor3|factor4|... the values are 0 and 1 (default setting)
-if( $___METRIC =~ /^SemPOS$/) {
- $___METRIC .= " 0 1";
- $metric_num_factors = 2;
-}
-# SemPOS_BLEU metric requires 7 parameters
-# 1) weight of SemPOS 2) weight of BLEU
-# 3) index of t_lemma for SemPOS 4) index of sempos for SemPOS
-# 5) max ngram for BLEU 6) ref length strategy for BLEU
-# 7) index of factor to compute BLEU on
-elsif( $___METRIC =~ /^SemPOS_BLEU$/) {
- $___SEMPOSBLEU_WEIGHTS =~ /^.*:.*$/ or die "--semposbleu-weights is not in format <sempos_weight>:<bleu_weight>";
- $___SEMPOSBLEU_WEIGHTS =~ s/:/ /;
- $___METRIC .= " $___SEMPOSBLEU_WEIGHTS 1 2 4 closest 0";
- $metric_num_factors = 3;
-}
-elsif( $___METRIC =~ /^BLEU$/) {
- $___METRIC .= " 4 closest";
-}
- elsif( $___METRIC =~ /^TER$/) {
- $___METRIC .= " nocase punc 20 50";
-}
-elsif( $___METRIC =~ /^TER-BLEU$/) {
- $___METRIC .= " nocase punc 20 50 4 closest";
-}
-
-if( $___EXTRACT_SEMPOS =~ /tmt/) {
- my $print_string = "";
- if( $___METRIC =~ /SemPOS_BLEU/) {
- $print_string = "Print::ForSemPOSBLEUMetric TMT_PARAM_PRINT_FOR_SEMPOS_BLEU_METRIC=m:form|t_lemma|gram/sempos TMT_PARAM_PRINT_FOR_SEMPOS_BLEU_METRIC_DESTINATION=factored_output";
- } elsif( $___METRIC =~ /SemPOS/) {
- $print_string = "Print::ForSemPOSMetric TMT_PARAM_PRINT_FOR_SEMPOS_METRIC=t_lemma|gram/sempos TMT_PARAM_PRINT_FOR_SEMPOS_METRIC_DESTINATION=factored_output";
- } else {
- die "Trying to get factors using tmt for unknown metric $___METRIC";
- }
-
- open( SCENARIO, ">$scenario_file") or die "Cannot open $scenario_file";
- print SCENARIO << "FILE_EOF";
-SCzechW_to_SCzechM::Tokenize_joining_numbers
-SCzechW_to_SCzechM::TagMorce
-# SCzechM_to_SCzechN::Czech_named_ent_SVM_recognizer
-# SCzechM_to_SCzechN::Geo_ne_recognizer
-# SCzechM_to_SCzechN::Embed_instances
-SCzechM_to_SCzechA::McD_parser_local TMT_PARAM_MCD_CZ_MODEL=pdt20_train_autTag_golden_latin2_pruned_0.02.model
-# SCzechM_to_SCzechA::McD_parser_local TMT_PARAM_MCD_CZ_MODEL=pdt20_train_autTag_golden_latin2_pruned_0.10.model
-SCzechM_to_SCzechA::Fix_atree_after_McD
-SCzechM_to_SCzechA::Fix_is_member
-SCzechA_to_SCzechT::Mark_auxiliary_nodes
-SCzechA_to_SCzechT::Build_ttree
-SCzechA_to_SCzechT::Fill_is_member
-SCzechA_to_SCzechT::Rehang_unary_coord_conj
-SCzechA_to_SCzechT::Assign_coap_functors
-SCzechA_to_SCzechT::Fix_is_member
-SCzechA_to_SCzechT::Distrib_coord_aux
-SCzechA_to_SCzechT::Mark_clause_heads
-SCzechA_to_SCzechT::Mark_relclause_heads
-SCzechA_to_SCzechT::Mark_relclause_coref
-SCzechA_to_SCzechT::Fix_tlemmas
-SCzechA_to_SCzechT::Assign_nodetype
-SCzechA_to_SCzechT::Assign_grammatemes
-SCzechA_to_SCzechT::Detect_formeme
-SCzechA_to_SCzechT::Add_PersPron
-SCzechA_to_SCzechT::Mark_reflpron_coref
-SCzechA_to_SCzechT::TBLa2t_phaseFd
-$print_string
-FILE_EOF
- close( SCENARIO);
-}
-
-my $feats_order = join( " ", keys %used_triples);
-
-open( DECODER_CMD, ">$decoder_cmd_file") or die "Cannot open $decoder_cmd_file";
- print DECODER_CMD <<"FILE_EOF";
-#!/usr/bin/perl -w
-
-use strict;
-
-my %FULL2ABBR = map {my (\$a, \$b) = split/=/,\$_,2; (\$b, \$a);} split /\\s+/, "$ABBR_FULL_MAP";
-
-open( ITERATION, "<$iteration_file") or die "Cannot open $iteration_file";
-my \$iteration = <ITERATION>;
-close( ITERATION);
-chomp( \$iteration);
-
-my \@features_order = qw( $feats_order );
-
-# extract feature weights from last zmert iteration (stored in \$decoder_cfg_inter)
-print "Updating decoder config file from file $decoder_cfg_inter\n";
-
-my \$moses_ini = "$___CONFIG";
-
-open( IN, "$decoder_cfg_inter") or die "Cannot open file $decoder_cfg_inter (reading updated lambdas)";
-FILE_EOF
-
-print DECODER_CMD <<'FILE_EOF';
-my %lambdas = ();
-my $lastName = "";
-while( my $line = <IN>) {
- chomp($line);
- my ($name, $val) = split( /\s+/, $line);
- $name =~ s/_\d+$//; # remove index of the lambda
- push( @{$lambdas{$name}}, $val);
-}
-close(IN);
-
-
-my $moses_ini_old = "$moses_ini";
-$moses_ini_old =~ s/^(.*)\/([^\/]+)$/$1\/run$iteration.$2/;
-$moses_ini_old = $moses_ini.".orig" if( $iteration == 0);
-safesystem("mv $moses_ini $moses_ini_old");
-# update moses.ini
-open( INI_OLD, "<$moses_ini_old") or die "Cannot open config file $moses_ini_old";
-open( INI, ">$moses_ini") or die "Cannot open config file $moses_ini";
-while( my $line = <INI_OLD>) {
- if( $line =~ m/^\[(weight-.+)\]$/) {
- my $name = $FULL2ABBR{$1};
- print STDERR "Updating weight: $1, $name\n";
- print INI "$line";
- foreach( @{$lambdas{$name}}) {
- print INI "$_\n";
- print STDERR "NEW: $_\tOLD:";
- $line = <INI_OLD>;
- print STDERR $line;
- }
- } else {
- print INI $line;
- }
-}
-close(INI_OLD);
-close(INI);
-
-FILE_EOF
-
-print DECODER_CMD <<"FILE_EOF";
-print "Executing: $decoder_cmd";
-safesystem("$decoder_cmd") or die "Failed to execute $decoder_cmd";
-
-# update iteration number in intermediate config file
-++\$iteration;
-safesystem("echo \$iteration > $iteration_file");
-
-# modify the nbest-list to conform the zmert required format
-# <i> ||| <candidate_translation> ||| featVal_1 featVal_2 ... featVal_m
-my \$nbest_file_orig = "$nbest_file".".orig";
-safesystem( "mv $nbest_file \$nbest_file_orig");
-open( NBEST_ORIG, "<\$nbest_file_orig") or die "Cannot open original nbest-list \$nbest_file_orig";
-open( NBEST, ">$nbest_file") or die "Cannot open modified nbest-list $nbest_file";
-
-my \$line_num = 0;
-
-FILE_EOF
-
-
-if( "$___EXTRACT_SEMPOS" =~ /factors/) {
- print DECODER_CMD <<"FILE_EOF";
-my (undef, \$args) = split( /:/, "$___EXTRACT_SEMPOS");
-my \$factor_count = $metric_num_factors;
-FILE_EOF
-print DECODER_CMD <<'FILE_EOF';
-my @indices = split( /,/, $args);
-die "Specified ".scalar @indices." factors to extract but selected metric requires $factor_count factors"
- if( @indices != $factor_count);
-while( my $line = <NBEST_ORIG>) {
- my @array = split( /\|\|\|/, $line);
- # remove feature names from the feature scores string
- $array[2] = extractScores( $array[2]);
- my @tokens = split( /\s/, $array[1]); # split sentence into words
- $array[1] = "";
- foreach my $token (@tokens) {
- next if $token eq "";
- my @factors = split( /\|/, $token);
- my $put_separator = 0;
- foreach my $index (@indices) {
- die "Cannot extract factor with index $index from '$token'" if ($index > $#factors);
- $array[1] .= '|' if ($put_separator); # separator between factors
- $array[1] .= $factors[$index];
- $put_separator = 1;
- }
- $array[1] .= " "; # space between words
- }
- print NBEST join( '|||', @array);
-}
-
-FILE_EOF
-
-} elsif( "$___EXTRACT_SEMPOS" =~ /tmt/) {
- print DECODER_CMD <<"FILE_EOF";
-# run TectoMT to analyze sentences
-print STDERR "Analyzing candidates using $srunblocks_cmd\n";
-my \$nbest_factored = "$nbest_file.factored";
-open( NBEST_FACTORED, "|$srunblocks_cmd > \$nbest_factored") or die "Cannot open pipe to command $srunblocks_cmd";
-FILE_EOF
-print DECODER_CMD <<'FILE_EOF';
-my $line_count = 0;
-my @out = ();
-while( my $line = <NBEST_ORIG>) {
- my @array = split( /\|\|\|/, $line);
- die "Nbest-list does not have required format (values separated by '|||')" if ($#array != 3);
- # remove feature names from the feature scores string
- $array[2] = extractScores( $array[2]);
- push( @out, \@array); # store line with scores for output
- # select only word forms
- my $sentence = "";
- foreach my $fact ( split /\s+/, $array[1]) {
- next if( $fact eq "");
- my @fact_array = split( /\|/, $fact);
- $sentence .= "$fact_array[0] ";
- }
- # analyze sentence via TectoMT using scenario
- print NBEST_FACTORED "$sentence\n";
- ++$line_count;
-}
-close( NBEST_ORIG);
-close( NBEST_FACTORED);
-
-open( NBEST_FACTORED, "<$nbest_factored") or die "Cannot open $nbest_factored";
-my $line_count_check = 0;
-while( my $line = <NBEST_FACTORED>) {
- chomp( $line);
- my $array_ref = shift( @out);
- $array_ref->[1] = $line;
- print NBEST join( '|||', @{$array_ref});
- ++$line_count_check;
-}
-die "Error: Sent $line_count sentences to analyze but got only $line_count_check back"
- if( $line_count != $line_count_check);
-
-FILE_EOF
-
-} elsif ($___EXTRACT_SEMPOS eq "none") {
-print DECODER_CMD <<'FILE_EOF';
-while( my $line = <NBEST_ORIG>) {
- my @array = split( /\|\|\|/, $line);
- # remove feature names from the feature scores string
- $array[2] = extractScores( $array[2]);
- print NBEST join( '|||', @array);
-}
-FILE_EOF
-} else {
- die "Unknown type of factor extraction: $___EXTRACT_SEMPOS";
-}
-
-print DECODER_CMD <<'FILE_EOF';
-close( NBEST);
-close( NBEST_ORIG);
-
-# END OF BODY
-
-sub extractScores {
- my $scores = shift;
- my (%scores_hash, $name);
- foreach my $score_or_name (split /\s+/, $scores) {
- if( $score_or_name =~ s/://) {
- $name = $score_or_name;
- } elsif ($score_or_name =~ /\d/) {
- die "Cannot guess nbest-list first feature score name" if( not defined $name);
- $scores_hash{$name} .= "$score_or_name ";
- } else {
- die "Unknown string ($score_or_name) in nbest-list feature scores section (not a feature name or score)"
- if( $score_or_name =~ /\S/);
- }
- }
- $scores = "";
- foreach $name (@features_order) {
- $scores .= $scores_hash{$name};
- }
- #print STDERR "REORDERED SCORES: $scores\n";
- return $scores;
-}
-
-sub safesystem {
- print STDERR "Executing: @_\n";
- system(@_);
- if ($? == -1) {
- print STDERR "Failed to execute: @_\n $!\n";
- exit(1);
- }
- elsif ($? & 127) {
- printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
- ($? & 127), ($? & 128) ? 'with' : 'without';
- exit(1);
- }
- else {
- my $exitcode = $? >> 8;
- print STDERR "Exit code: $exitcode\n" if $exitcode;
- return ! $exitcode;
- }
-}
-FILE_EOF
-
-close( DECODER_CMD);
-
-# make the decoder lauch script executable
-safesystem("chmod a+x $decoder_cmd_file");
-
-# analyze reference if necessary
-if( $___EXTRACT_SEMPOS =~ /tmt/) {
- my $part = 0;
- foreach my $ref (@references) {
- my $line_count = 0;
- print STDERR "Analyzing references using $srunblocks_cmd\n";
- open( REF_IN, "<$ref") or die "Cannot open $ref";
- my $ref_factored = "$ref.factored.$part";
- push( @references_factored, $ref_factored);
- open( REF_FACTORED, "|$srunblocks_cmd > $ref_factored");
- while( my $line = <REF_IN>) {
- # analyze sentence via TectoMT using scenario in file $scerario_file
- print REF_FACTORED $line;
- ++$line_count;
- }
- close( REF_IN);
- close( REF_FACTORED);
- my $line_count_check = 0;
- open( REF_FACTORED, "<$ref_factored") or die "Cannot open $ref_factored";
- ++$line_count_check while( <REF_FACTORED>);
- die "Error: Sent $line_count sentences to analyze but got $line_count_check back"
- if( $line_count != $line_count_check);
- close( REF_FACTORED);
- ++$part;
- }
- print STDERR "References analyzed\n";
-} else {
- push( @references_factored, @references);
-}
-
-my $ref_stem = $references_factored[0];
-$ref_stem =~ s/\d+$// if( $#references_factored); # get the file stem if we have more than one refs
-$ref_stem =~ s/.*\/([^\/]+)$/..\/$1/;
-
-# prepare zmert configuration file
-open( ZMERT_CFG, ">$zmert_cfg") or die "Cannot open $zmert_cfg";
-
-# FILES
-# print ZMERT_CFG "-dir\t$___PATH_FROM_LAUNCHDIR\n"; # working path (relative to the lauch path)
-# print ZMERT_CFG "-r\t$___DEV_E\n"; # file(s) containing references
-print ZMERT_CFG "-r\t$ref_stem\n"; # file(s) containing references
-print ZMERT_CFG "-rps\t".scalar(@references)."\n"; # number of references per sentence
-print ZMERT_CFG "-txtNrm\t0\n"; # we use our own text normalization
-print ZMERT_CFG "-p\t$opt_params\n"; # file containig parameter names, initial values, ranges
-print ZMERT_CFG "-fin\t$___LAMBDAS_OUT\n" if(defined $___LAMBDAS_OUT); # file where the final weight vector is written
-
-# MERT CONFIGURATION
-print ZMERT_CFG "-m\t$___METRIC\n";
-print ZMERT_CFG "-maxIt\t$___MAX_MERT_ITER\n" if( $___MAX_MERT_ITER); # maximum number of MERT iterations
-# print ZMERT_CFG "-prevIt\t$PREV_MERT_ITER\n";
-# number of iteration before considering an early exit
-# print ZMERT_CFG "-minIt\t$MIN_MERT_ITER\n";
-# number of consecutive iterations that must satisfy some early stopping
-# criterion to cause an early exit
-# print ZMERT_CFG "-stopIt\t$STOP_MIN_ITER\n";
-# early exit criterion: no weight changes by more than $LAMBDA_CHANGE;
-# default value: -1 (this criterion is never investigated)
-# print ZMERT_CFG "-stopSig\t$LAMBDA_CHANGE\n";
-# save intermediate decoder config files (1) or decoder outputs (2) or both (3) or neither (0)
-print ZMERT_CFG "-save\t$___SAVE_INTER\n";
-# print ZMERT_CFG "-ipi\t$INITS_PER_ITER\n"; # number of intermediate initial points per iteration
-# print ZMERT_CFG "-opi\t$ONCE_PER_ITER\n"; # modify a parameter only once per iteration;
-# print ZMERT_CFG "-rand\t$RAND_INIT\n"; # choose initial points randomly
-print ZMERT_CFG "-seed\t$___PREDICTABLE_SEEDS\n" if($___PREDICTABLE_SEEDS); # initialize the random number generator
-
-# DECODER SPECIFICATION
-print ZMERT_CFG "-cmd\t$decoder_cmd_file\n"; # name of file containing commands to run the decoder
-print ZMERT_CFG "-decOut\t$nbest_file\n"; # name of the n-best file produced by the decoder
-# print ZMERT_CFG "-decExit\t$DECODER_EXIT_CODE\n"; # value returned by decoder after successful exit
-print ZMERT_CFG "-dcfg\t$decoder_cfg_inter\n"; # name of intermediate decoder configuration file
-print ZMERT_CFG "-N\t$___N_BEST_LIST_SIZE\n";
-
-# OUTPUT SPECIFICATION
-print ZMERT_CFG "-v\t$___MERT_VERBOSE\n"; # zmert verbosity level (0-2)
-print ZMERT_CFG "-decV\t$___DECODER_VERBOSE\n"; # decoder output printed (1) or ignored (0)
-
-close( ZMERT_CFG);
-
-my ($name, $num, $val, $min, $max);
-# prepare file with parameters to optimize
-open( PARAMS, ">$opt_params") or die "Cannot open file $opt_params with parameters to optimize";
-my $optString;
-foreach $name (keys %used_triples) {
- $num = 0;
- foreach my $triple (@{$used_triples{$name}}) {
- ($val, $min, $max) = @$triple;
- my ($minRand, $maxRand) = ($min, $max);
- # the file should describe features to optimize in the following format:
- # "featureName ||| defValue optString minVal maxVal minRandVal maxRandVal"
- # optString can be 'Opt' or 'Fix'
- $optString = "Opt";
- if( defined $___ACTIVATE_FEATURES and not $active_features{$name."_$num"}) {
- $optString = "Fix";
- }
- print PARAMS "$name"."_$num ||| $val $optString $min $max $minRand $maxRand\n";
- ++$num;
- }
-}
-print PARAMS "normalization = $___NORM\n";
-close( PARAMS);
-
-# prepare intermediate config file from which moses.ini will be updated before each launch
-open( DEC_CFG, ">$decoder_cfg_inter") or die "Cannot open file $decoder_cfg_inter";
-foreach $name (keys %used_triples) {
- $num = 0;
- foreach my $tri (@{$used_triples{$name}}) {
- ($val, $min, $max) = @$tri;
- print DEC_CFG $name."_$num $val\n";
- ++$num;
- }
-}
-close( DEC_CFG);
-
-open( ITER, ">$iteration_file") or die "Cannot open file $iteration_file";
-print ITER "1";
-close( ITER);
-
-# launch zmert
-my $javaMaxMem = ""; # -maxMem 4000" # use at most 4000MB of memory
-my $cmd = "java -cp $zmert_classpath ZMERT $javaMaxMem $zmert_cfg";
-
-print "Zmert start at ".`date`;
-
-if ( 0 && defined $___JOBS) {
- # NOT WORKING - this branch needs to init environment variables
- safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -stderr=$mert_logfile -queue-parameter='$queue_flags'") or die "Failed to start zmert (via qsubwrapper $qsubwrapper)";
-
-} else {
- safesystem("$cmd 2> $mert_logfile") or die "Failed to run zmert";
-}
-
-print "Zmert finished at ".`date`;
-
-# RELEVANT ONLY FOR PLAYGROUND at UFAL, CHARLES UNIVESITY IN PRAGUE
-# copy optimized moses.ini and original run1.moses.ini to the working directory
-if( $___FILTER_PHRASE_TABLE) {
- my ($config_opt, $config_std, $config_base) = ($___CONFIG, $___CONFIG, "$cwd/moses.abs.ini");
- $config_std =~ s/^(.*)\/([^\/]+)$/$1\/run1.$2/;
- mergeConfigs( $config_base, $___CONFIG);
- mergeConfigs( $config_base, $config_std);
-}
-
-# chdir back to the original directory # useless, just to remind we were not there
-chdir($cwd);
-
-
-} # end of local scope
-
-sub mergeConfigs {
- my ($config_base, $config_weights) = @_;
- my $config_new = $config_weights;
- $config_new =~ s/^.*\///;
- open BASE, "<$config_base" or die "Cannot open $config_base";
- open WEIGHTS, "<$config_weights" or die "Cannot open $config_weights";
- open NEW, ">$config_new" or die "Cannot open $config_new";
- my $cont = 1;
- my ($b_line, $w_line);
- while( $cont) {
- $b_line = <BASE>;
- $w_line = <WEIGHTS>;
- $cont = (defined $b_line and defined $w_line);
- if( $b_line =~ /^\[weight-/) {
- if( $w_line !~ /^\[weight-/) { die "mergeConfigs: $config_base and $config_weights do not have the same format"; }
- print NEW $w_line;
- $b_line = <BASE>; $w_line = <WEIGHTS>;
- while( $w_line =~ /\d/) {
- print NEW $w_line;
- $b_line = <BASE>; $w_line = <WEIGHTS>;
- }
- print NEW $b_line;
- } else {
- print NEW $b_line;
- }
- }
- close BASE;
- close WEIGHTS;
- close NEW;
-}
-
-sub dump_triples {
- my $triples = shift;
-
- foreach my $name (keys %$triples) {
- foreach my $triple (@{$triples->{$name}}) {
- my ($val, $min, $max) = @$triple;
- }
- }
-}
-
-sub safesystem {
- print STDERR "Executing: @_\n";
- system(@_);
- if ($? == -1) {
- print STDERR "Failed to execute: @_\n $!\n";
- exit(1);
- }
- elsif ($? & 127) {
- printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
- ($? & 127), ($? & 128) ? 'with' : 'without';
- exit(1);
- }
- else {
- my $exitcode = $? >> 8;
- print STDERR "Exit code: $exitcode\n" if $exitcode;
- return ! $exitcode;
- }
-}
-
-sub ensure_full_path {
- my $PATH = shift;
-$PATH =~ s/\/nfsmnt//;
- return $PATH if $PATH =~ /^\//;
- my $dir = `pawd 2>/dev/null`;
- if(!$dir){$dir = `pwd`;}
- chomp($dir);
- $PATH = $dir."/".$PATH;
- $PATH =~ s/[\r\n]//g;
- $PATH =~ s/\/\.\//\//g;
- $PATH =~ s/\/+/\//g;
- my $sanity = 0;
- while($PATH =~ /\/\.\.\// && $sanity++<10) {
- $PATH =~ s/\/+/\//g;
- $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
- }
- $PATH =~ s/\/[^\/]+\/\.\.$//;
- $PATH =~ s/\/+$//;
-$PATH =~ s/\/nfsmnt//;
- return $PATH;
-}
-
-sub scan_config {
- my $ini = shift;
- my $inishortname = $ini; $inishortname =~ s/^.*\///; # for error reporting
- # we get a pre-filled counts, because some lambdas are always needed (word penalty, for instance)
- # as we walk though the ini file, we record how many extra lambdas do we need
- # and finally, we report it
-
- # in which field (counting from zero) is the filename to check?
- my %where_is_filename = (
- "ttable-file" => 4,
- "generation-file" => 3,
- "lmodel-file" => 3,
- "distortion-file" => 3,
- "global-lexical-file" => 1,
- );
- # by default, each line of each section means one lambda, but some sections
- # explicitly state a custom number of lambdas
- my %where_is_lambda_count = (
- "ttable-file" => 3,
- "generation-file" => 2,
- "distortion-file" => 2,
- );
-
- open INI, $ini or die "Can't read $ini";
- my $section = undef; # name of the section we are reading
- my $shortname = undef; # the corresponding short name
- my $nr = 0;
- my $error = 0;
- my %defined_files;
- my %defined_steps; # check the ini file for compatible mapping steps and actually defined files
- while (<INI>) {
- $nr++;
- next if /^\s*#/; # skip comments
- if (/^\[([^\]]*)\]\s*$/) {
- $section = $1;
- $shortname = $TABLECONFIG2ABBR{$section};
- next;
- }
- if (defined $section && $section eq "mapping") {
- # keep track of mapping steps used
- $defined_steps{$1}++ if /^([TG])/ || /^\d+ ([TG])/;
- }
- if (defined $section && defined $where_is_filename{$section}) {
- print "$section -> $where_is_filename{$section}\n";
- # this ini section is relevant to lambdas
- chomp;
- my @flds = split / +/;
- my $fn = $flds[$where_is_filename{$section}];
- if (defined $fn && $fn !~ /^\s+$/) {
- print "checking weight-count for $section\n";
- # this is a filename! check it
- if ($fn !~ /^\//) {
- $error = 1;
- print STDERR "$inishortname:$nr:Filename not absolute: $fn\n";
- }
- if (! -s $fn && ! -s "$fn.gz" && ! -s "$fn.binphr.idx" && ! -s "$fn.binlexr.idx" ) {
- $error = 1;
- print STDERR "$inishortname:$nr:File does not exist or empty: $fn\n";
- }
- # remember the number of files used, to know how many lambdas do we need
- die "No short name was defined for section $section!"
- if ! defined $shortname;
-
- # how many lambdas does this model need?
- # either specified explicitly, or the default, i.e. one
- my $needlambdas = defined $where_is_lambda_count{$section} ? $flds[$where_is_lambda_count{$section}] : 1;
-
- print STDERR "Config needs $needlambdas lambdas for $section (i.e. $shortname)\n" if $verbose;
- if (!defined $___LAMBDA && (!defined $additional_triples->{$shortname} || scalar(@{$additional_triples->{$shortname}}) < $needlambdas)) {
- print STDERR "$inishortname:$nr:Your model $shortname needs $needlambdas weights but we define the default ranges for only "
- .scalar(@{$additional_triples->{$shortname}})." weights. Cannot use the default, you must supply lambdas by hand.\n";
- $error = 1;
- }
- else {
- # note: table may use less parameters than the maximum number
- # of triples
- for(my $lambda=0;$lambda<$needlambdas;$lambda++) {
- my ($start, $min, $max)
- = @{${$additional_triples->{$shortname}}[$lambda]};
- push @{$used_triples{$shortname}}, [$start, $min, $max];
- }
- }
- $defined_files{$shortname}++;
- }
- }
- }
- die "$inishortname: File was empty!" if !$nr;
- close INI;
- for my $pair (qw/T=tm=translation G=g=generation/) {
- my ($tg, $shortname, $label) = split /=/, $pair;
- $defined_files{$shortname} = 0 if ! defined $defined_files{$shortname};
- $defined_steps{$tg} = 0 if ! defined $defined_steps{$tg};
-
- if ($defined_files{$shortname} != $defined_steps{$tg}) {
- print STDERR "$inishortname: You defined $defined_files{$shortname} files for $label but use $defined_steps{$tg} in [mapping]!\n";
- $error = 1;
- }
- }
-
- # distance-based distortion
- if ($___ASYNC == 1)
- {
- print STDERR "ASYNC distortion & word penalty";
- my @my_array;
- for(my $i=0 ; $i < $defined_steps{"T"} ; $i++)
- {
- push @my_array, [ 1.0, 0.0, 2.0 ];
- }
- push @{$used_triples{"d"}}, @my_array;
-
- @my_array = ();
- for(my $i=0 ; $i < $defined_steps{"T"} ; $i++)
- {
- push @my_array, [ 0.5, -1.0, 1.0 ];
- }
- push @{$used_triples{"w"}}, @my_array;
-
- # debug print
- print "distortion:";
- my $refarray=$used_triples{"d"};
- my @vector=@$refarray;
- foreach my $subarray (@vector) {
- my @toto=@$subarray;
- print @toto,"\n";
- }
- #exit 1;
- }
- else
- {
- print STDERR "SYNC distortion";
- push @{$used_triples{"d"}}, [1.0, 0.0, 2.0];
- }
-
-
- exit(1) if $error;
- return (\%defined_files);
-}