Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBarry Haddow <barry.haddow@gmail.com>2013-02-21 21:34:59 +0400
committerBarry Haddow <barry.haddow@gmail.com>2013-02-21 21:34:59 +0400
commit51ab9aa19dccefea54b45dc81a929301ba5d6ea5 (patch)
tree364018d46af083e52b0658359d535a055bdd92ac /scripts/training
parent87d7294d50d69da1833b6a78829154c444f2be6e (diff)
parent5844fb21a758a492b0847ba0939a7856a9a5cb68 (diff)
Merge remote branch 'origin/master' into phrase-weighting
Diffstat (limited to 'scripts/training')
-rw-r--r--scripts/training/LexicalTranslationModel.pm94
-rwxr-xr-xscripts/training/filter-model-given-input.pl28
-rwxr-xr-xscripts/training/mert-moses.pl12
-rwxr-xr-xscripts/training/train-model.perl175
-rwxr-xr-xscripts/training/wrappers/make-factor-en-pos.mxpost.perl5
-rwxr-xr-xscripts/training/zmert-moses.pl1121
6 files changed, 218 insertions, 1217 deletions
diff --git a/scripts/training/LexicalTranslationModel.pm b/scripts/training/LexicalTranslationModel.pm
index c0570df5c..08d161cc1 100644
--- a/scripts/training/LexicalTranslationModel.pm
+++ b/scripts/training/LexicalTranslationModel.pm
@@ -38,20 +38,59 @@ sub fix_spaces {
}
sub get_lexical {
- my ($alignment_file_f,$alignment_file_e,$alignment_file_a,$lexical_file,$write_counts) = @_;
+ my ($alignment_file_f,$alignment_file_e,$alignment_file_a,$lexical_file,$write_counts,$baseline_corpus_f,$baseline_corpus_e,$baseline_alignment, $instance_weights_file) = @_;
print STDERR "($alignment_file_f,$alignment_file_e,$lexical_file)\n";
+ print STDERR "baseline ($baseline_corpus_f,$baseline_corpus_e,$baseline_alignment)\n" if defined $baseline_alignment;
+ print STDERR "instance weights ($instance_weights_file)\n" if defined $instance_weights_file;
# my $alignment_file_a = $___ALIGNMENT_FILE.".".$___ALIGNMENT;
- my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);
if (-e "$lexical_file.f2e" && -e "$lexical_file.e2f" && (!$write_counts || -e "$lexical_file.counts.f2e" && -e "$lexical_file.counts.e2f")) {
print STDERR " reusing: $lexical_file.f2e and $lexical_file.e2f\n";
return;
}
+ my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);
+ &get_lexical_counts($alignment_file_e,$alignment_file_f,$alignment_file_a,$instance_weights_file,\%WORD_TRANSLATION,\%TOTAL_FOREIGN,\%TOTAL_ENGLISH);
+ if (defined($baseline_alignment)) {
+ &get_lexical_counts($baseline_corpus_e,$baseline_corpus_f,$baseline_alignment,undef,\%WORD_TRANSLATION,\%TOTAL_FOREIGN,\%TOTAL_ENGLISH);
+ }
+
+ open(F2E,">$lexical_file.f2e") or die "ERROR: Can't write $lexical_file.f2e";
+ open(E2F,">$lexical_file.e2f") or die "ERROR: Can't write $lexical_file.e2f";
+ if ($write_counts) {
+ open(F2E2,">$lexical_file.counts.f2e") or die "ERROR: Can't write $lexical_file.counts.f2e";
+ open(E2F2,">$lexical_file.counts.e2f") or die "ERROR: Can't write $lexical_file.counts.e2f";
+ }
+
+ foreach my $f (keys %WORD_TRANSLATION) {
+ foreach my $e (keys %{$WORD_TRANSLATION{$f}}) {
+ printf F2E "%s %s %.7f\n",$e,$f,$WORD_TRANSLATION{$f}{$e}/$TOTAL_FOREIGN{$f};
+ printf E2F "%s %s %.7f\n",$f,$e,$WORD_TRANSLATION{$f}{$e}/$TOTAL_ENGLISH{$e};
+ if ($write_counts) {
+ printf F2E2 "%s %s %i %i\n",$e,$f,$WORD_TRANSLATION{$f}{$e},$TOTAL_FOREIGN{$f};
+ printf E2F2 "%s %s %i %i\n",$f,$e,$WORD_TRANSLATION{$f}{$e},$TOTAL_ENGLISH{$e};
+ }
+ }
+ }
+ close(E2F);
+ close(F2E);
+ if ($write_counts) {
+ close(E2F2);
+ close(F2E2);
+ }
+ print STDERR "Saved: $lexical_file.f2e and $lexical_file.e2f\n";
+}
+
+sub get_lexical_counts {
+ my ($alignment_file_e,$alignment_file_f,$alignment_file_a,$instance_weights_file,$WORD_TRANSLATION,$TOTAL_FOREIGN,$TOTAL_ENGLISH) = @_;
open(E,&open_compressed($alignment_file_e)) or die "ERROR: Can't read $alignment_file_e";
open(F,&open_compressed($alignment_file_f)) or die "ERROR: Can't read $alignment_file_f";
open(A,&open_compressed($alignment_file_a)) or die "ERROR: Can't read $alignment_file_a";
+ my $W = undef;
+ if (defined($instance_weights_file) && $instance_weights_file) {
+ open($W, $instance_weights_file) or die "ERROR: Can't read $instance_weights_file";
+ }
my $alignment_id = 0;
while(my $e = <E>) {
@@ -61,7 +100,8 @@ sub get_lexical {
my $f = <F>; chomp($f); fix_spaces(\$f);
my @FOREIGN = split(/ /,$f);
my $a = <A>; chomp($a); fix_spaces(\$a);
-
+ my $iw = 1; # instance weight
+ $iw = <$W> if defined $W;
my (%FOREIGN_ALIGNED,%ENGLISH_ALIGNED);
foreach (split(/ /,$a)) {
my ($fi,$ei) = split(/\-/);
@@ -70,62 +110,36 @@ sub get_lexical {
}
else {
# local counts
- $FOREIGN_ALIGNED{$fi}++;
- $ENGLISH_ALIGNED{$ei}++;
+ $FOREIGN_ALIGNED{$fi}+=$iw;
+ $ENGLISH_ALIGNED{$ei}+=$iw;
# global counts
- $WORD_TRANSLATION{$FOREIGN[$fi]}{$ENGLISH[$ei]}++;
- $TOTAL_FOREIGN{$FOREIGN[$fi]}++;
- $TOTAL_ENGLISH{$ENGLISH[$ei]}++;
+ $$WORD_TRANSLATION{$FOREIGN[$fi]}{$ENGLISH[$ei]}+=$iw;
+ $$TOTAL_FOREIGN{$FOREIGN[$fi]}+=$iw;
+ $$TOTAL_ENGLISH{$ENGLISH[$ei]}+=$iw;
}
}
# unaligned words
for(my $ei=0;$ei<scalar(@ENGLISH);$ei++) {
next if defined($ENGLISH_ALIGNED{$ei});
- $WORD_TRANSLATION{"NULL"}{$ENGLISH[$ei]}++;
- $TOTAL_ENGLISH{$ENGLISH[$ei]}++;
- $TOTAL_FOREIGN{"NULL"}++;
+ $$WORD_TRANSLATION{"NULL"}{$ENGLISH[$ei]}+=$iw;
+ $$TOTAL_ENGLISH{$ENGLISH[$ei]}+=$iw;
+ $$TOTAL_FOREIGN{"NULL"}+=$iw;
}
for(my $fi=0;$fi<scalar(@FOREIGN);$fi++) {
next if defined($FOREIGN_ALIGNED{$fi});
- $WORD_TRANSLATION{$FOREIGN[$fi]}{"NULL"}++;
- $TOTAL_FOREIGN{$FOREIGN[$fi]}++;
- $TOTAL_ENGLISH{"NULL"}++;
+ $$WORD_TRANSLATION{$FOREIGN[$fi]}{"NULL"}+=$iw;
+ $$TOTAL_FOREIGN{$FOREIGN[$fi]}+=$iw;
+ $$TOTAL_ENGLISH{"NULL"}+=$iw;
}
}
print STDERR "\n";
close(A);
close(F);
close(E);
-
- open(F2E,">$lexical_file.f2e") or die "ERROR: Can't write $lexical_file.f2e";
- open(E2F,">$lexical_file.e2f") or die "ERROR: Can't write $lexical_file.e2f";
- if ($write_counts) {
- open(F2E2,">$lexical_file.counts.f2e") or die "ERROR: Can't write $lexical_file.counts.f2e";
- open(E2F2,">$lexical_file.counts.e2f") or die "ERROR: Can't write $lexical_file.counts.e2f";
- }
-
- foreach my $f (keys %WORD_TRANSLATION) {
- foreach my $e (keys %{$WORD_TRANSLATION{$f}}) {
- printf F2E "%s %s %.7f\n",$e,$f,$WORD_TRANSLATION{$f}{$e}/$TOTAL_FOREIGN{$f};
- printf E2F "%s %s %.7f\n",$f,$e,$WORD_TRANSLATION{$f}{$e}/$TOTAL_ENGLISH{$e};
- if ($write_counts) {
- printf F2E2 "%s %s %i %i\n",$e,$f,$WORD_TRANSLATION{$f}{$e},$TOTAL_FOREIGN{$f};
- printf E2F2 "%s %s %i %i\n",$f,$e,$WORD_TRANSLATION{$f}{$e},$TOTAL_ENGLISH{$e};
- }
- }
- }
- close(E2F);
- close(F2E);
- if ($write_counts) {
- close(E2F2);
- close(F2E2);
- }
- print STDERR "Saved: $lexical_file.f2e and $lexical_file.e2f\n";
}
-
END {
}
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index df9c528e0..d994fbcef 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -119,7 +119,11 @@ while(<INI>) {
print INI_OUT "2 $source_factor $t $w $new_name.bin$table_flag\n";
}
elsif ($binarizer && $phrase_table_impl == 0) {
- print INI_OUT "1 $source_factor $t $w $new_name$table_flag\n";
+ if ($binarizer =~ /processPhraseTableMin/) {
+ print INI_OUT "12 $source_factor $t $w $new_name$table_flag\n";
+ } else {
+ print INI_OUT "1 $source_factor $t $w $new_name$table_flag\n";
+ }
} else {
$new_name .= ".gz" if $opt_gzip;
print INI_OUT "$phrase_table_impl $source_factor $t $w $new_name$table_flag\n";
@@ -147,7 +151,7 @@ while(<INI>) {
$file =~ s/^.*\/+([^\/]+)/$1/g;
my $new_name = "$dir/$file";
- $new_name =~ s/\.gz//;
+ $new_name =~ s/\.gz//;
print INI_OUT "$factors $t $w $new_name\n";
push @TABLE_NEW_NAME,$new_name;
@@ -275,11 +279,16 @@ for(my $i=0;$i<=$#TABLE;$i++) {
# ... hierarchical translation model
if ($opt_hierarchical) {
my $cmd = "$binarizer $new_file $new_file.bin";
- print STDERR $cmd."\n";
- print STDERR `$cmd`;
+ print STDERR $cmd."\n";
+ print STDERR `$cmd`;
}
# ... phrase translation model
- else {
+ elsif ($binarizer =~ /processPhraseTableMin/) {
+ #compact phrase table
+ my $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
+ print STDERR $cmd."\n";
+ print STDERR `$cmd`;
+ } else {
my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
print STDERR $cmd."\n";
print STDERR `$cmd`;
@@ -289,8 +298,13 @@ for(my $i=0;$i<=$#TABLE;$i++) {
else {
my $lexbin = $binarizer;
$lexbin =~ s/PhraseTable/LexicalTable/;
- $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
- my $cmd = "$lexbin -in $new_file -out $new_file";
+ my $cmd;
+ if ($lexbin =~ /processLexicalTableMin/) {
+ $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
+ } else {
+ $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
+ $cmd = "$lexbin -in $new_file -out $new_file";
+ }
print STDERR $cmd."\n";
print STDERR `$cmd`;
}
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index c5492498c..6797b57f4 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -158,6 +158,7 @@ my $mertdir = undef; # path to new mert directory
my $mertargs = undef; # args to pass through to mert & extractor
my $mertmertargs = undef; # args to pass through to mert only
my $extractorargs = undef; # args to pass through to extractor only
+my $proargs = undef; # args to pass through to pro only
# Args to pass through to batch mira only. This flags is useful to
# change MIRA's hyperparameters such as regularization parameter C,
@@ -207,6 +208,7 @@ GetOptions(
"mertdir=s" => \$mertdir,
"mertargs=s" => \$mertargs,
"extractorargs=s" => \$extractorargs,
+ "proargs=s" => \$proargs,
"mertmertargs=s" => \$mertmertargs,
"rootdir=s" => \$SCRIPTS_ROOTDIR,
"filtercmd=s" => \$filtercmd, # allow to override the default location
@@ -376,12 +378,12 @@ my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt"); # or set t
if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
print "Could not find $pro_optimizer, installing it in $mertdir\n";
- my $megam_url = "http://www.cs.utah.edu/~hal/megam/";
+ my $megam_url = "http://www.umiacs.umd.edu/~hal/megam/";
if (&is_mac_osx()) {
die "Error: Sorry for Mac OS X users! Please get the source code of megam and compile by hand. Please see $megam_url for details.";
}
- `cd $mertdir; wget http://www.cs.utah.edu/~hal/megam/megam_i686.opt.gz;`;
+ `cd $mertdir; wget $megam_url/megam_i686.opt.gz;`;
`gunzip $pro_optimizer.gz`;
`chmod +x $pro_optimizer`;
die("ERROR: Installation of megam_i686.opt failed! Install by hand from $megam_url") unless -x $pro_optimizer;
@@ -432,6 +434,8 @@ $mert_extract_args .= " $extractorargs";
$mertmertargs = "" if !defined $mertmertargs;
+$proargs = "" unless $proargs;
+
my $mert_mert_args = "$mertargs $mertmertargs";
$mert_mert_args =~ s/\-+(binary|b)\b//;
$mert_mert_args .= " $scconfig";
@@ -904,11 +908,11 @@ while (1) {
my %sparse_weights; # sparse features
my $pro_optimizer_cmd = "$pro_optimizer $megam_default_options run$run.pro.data";
if ($___PAIRWISE_RANKED_OPTIMIZER) { # pro optimization
- $cmd = "$mert_pro_cmd $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer_cmd";
+ $cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer_cmd";
&submit_or_exec($cmd, $mert_outfile, $mert_logfile);
} elsif ($___PRO_STARTING_POINT) { # First, run pro, then mert
# run pro...
- my $pro_cmd = "$mert_pro_cmd $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer_cmd";
+ my $pro_cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer_cmd";
&submit_or_exec($pro_cmd, "run$run.pro.out", "run$run.pro.err");
# ... get results ...
($bestpoint,$devbleu) = &get_weights_from_mert("run$run.pro.out","run$run.pro.err",scalar @{$featlist->{"names"}},\%sparse_weights, \@phrase_weighting_mix_weights);
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 05287afee..5b0553581 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -34,12 +34,12 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG,
$_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2,
- $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
+ $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
@_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
- $_SPARSE_TRANSLATION_TABLE,
- $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $IGNORE);
+ $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_CORPUS, $_BASELINE_ALIGNMENT,
+ $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $IGNORE);
my $_CORES = 1;
my $debug = 0; # debug this script, do not delete any files in debug mode
@@ -116,7 +116,7 @@ $_HELP = 1
'source-syntax' => \$_SOURCE_SYNTAX,
'target-syntax' => \$_TARGET_SYNTAX,
'xml' => \$_XML,
- 'phrase-word-alignment' => \$_PHRASE_WORD_ALIGNMENT,
+ 'no-word-alignment' => \$_OMIT_WORD_ALIGNMENT,
'config=s' => \$_CONFIG,
'max-lexical-reordering' => \$_MAX_LEXICAL_REORDERING,
'do-steps=s' => \$_DO_STEPS,
@@ -128,7 +128,13 @@ $_HELP = 1
'additional-ini=s' => \@_ADDITIONAL_INI,
'additional-ini-file=s' => \$_ADDITIONAL_INI_FILE,
'sparse-translation-table' => \$_SPARSE_TRANSLATION_TABLE,
- 'cores=i' => \$_CORES
+ 'baseline-alignment-model=s{8}' => \@_BASELINE_ALIGNMENT_MODEL,
+ 'baseline-extract=s' => \$_BASELINE_EXTRACT,
+ 'baseline-corpus=s' => \$_BASELINE_CORPUS,
+ 'baseline-alignment=s' => \$_BASELINE_ALIGNMENT,
+ 'cores=i' => \$_CORES,
+ 'instance-weights-file=s' => \$_INSTANCE_WEIGHTS_FILE,
+ 'lmodel-oov-feature' => \$_LMODEL_OOV_FEATURE,
);
if ($_HELP) {
@@ -240,7 +246,12 @@ if ($STEPS[1] || $STEPS[2])
}
print STDERR "Using single-thread GIZA\n";
} else {
- $GIZA = "$_EXTERNAL_BINDIR/mgiza";
+ # accept either "mgiza" or "mgizapp" and either "snt2cooc.out" or "snt2cooc"
+ if (-x "$_EXTERNAL_BINDIR/mgiza") {
+ $GIZA = "$_EXTERNAL_BINDIR/mgiza";
+ } elsif (-x "$_EXTERNAL_BINDIR/mgizapp") {
+ $GIZA = "$_EXTERNAL_BINDIR/mgizapp";
+ }
if (-x "$_EXTERNAL_BINDIR/snt2cooc") {
$SNT2COOC = "$_EXTERNAL_BINDIR/snt2cooc";
} elsif (-x "$_EXTERNAL_BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $_EXTERNAL_BINDIR
@@ -373,6 +384,11 @@ my $___ALIGNMENT = "grow-diag-final";
$___ALIGNMENT = $_ALIGNMENT if $_ALIGNMENT;
my $___NOTE_ALIGNMENT_DROPS = 1;
+# baseline alignment model for incremental updating
+die "ERROR: buggy definition of baseline alignment model, should have 8 values:\n\t".join("\n\t",@_BASELINE_ALIGNMENT_MODEL)."\n"
+ unless scalar(@_BASELINE_ALIGNMENT_MODEL) == 8 || scalar(@_BASELINE_ALIGNMENT_MODEL) == 0;
+die "ERROR: use of baseline alignment model limited to HMM training (-hmm-align)\n"
+ if defined($___FINAL_ALIGNMENT_MODEL) && $___FINAL_ALIGNMENT_MODEL ne 'hmm' && scalar(@_BASELINE_ALIGNMENT_MODEL) == 8;
# model dir and alignment/extract file
my $___MODEL_DIR = $___ROOT_DIR."/model";
@@ -620,8 +636,8 @@ sub prepare {
&make_classes($corpus.".".$___F,$___VCB_F.".classes");
&make_classes($corpus.".".$___E,$___VCB_E.".classes");
- $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
- $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);
+ $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
+ $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
&numberize_txt_file($VCB_F,$corpus.".".$___F,
$VCB_E,$corpus.".".$___E,
@@ -659,8 +675,8 @@ sub prepare {
exit 0;
}
- $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
- $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);
+ $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F,0);
+ $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E,1);
&numberize_txt_file($VCB_F,$corpus.".".$___F,
$VCB_E,$corpus.".".$___E,
@@ -787,7 +803,7 @@ sub make_classes {
sub get_vocabulary {
# return unless $___LEXICAL_WEIGHTING;
- my($corpus,$vcb) = @_;
+ my($corpus,$vcb,$is_target) = @_;
print STDERR "(1.2) creating vcb file $vcb @ ".`date`;
my %WORD;
@@ -797,17 +813,37 @@ sub get_vocabulary {
foreach (split) { $WORD{$_}++; }
}
close(TXT);
-
+
+ my ($id,%VCB);
+ open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb";
+
+ # words from baseline alignment model when incrementally updating
+ if (scalar @_BASELINE_ALIGNMENT_MODEL) {
+ open(BASELINE_VCB,$_BASELINE_ALIGNMENT_MODEL[$is_target]);
+ while(<BASELINE_VCB>) {
+ chop;
+ my ($i,$word,$count) = split;
+ if (defined($WORD{$word})) {
+ $count += $WORD{$word};
+ delete($WORD{$word});
+ }
+ printf VCB "%d\t%s\t%d\n",$i,$word,$count;
+ $VCB{$word} = $i;
+ $id = $i+1;
+ }
+ close(BASELINE_VCB);
+ }
+ # not incrementally updating
+ else {
+ print VCB "1\tUNK\t0\n";
+ $id=2;
+ }
+
my @NUM;
foreach my $word (keys %WORD) {
my $vcb_with_number = sprintf("%07d %s",$WORD{$word},$word);
push @NUM,$vcb_with_number;
}
-
- my %VCB;
- open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb";
- print VCB "1\tUNK\t0\n";
- my $id=2;
foreach (reverse sort @NUM) {
my($count,$word) = split;
printf VCB "%d\t%s\t%d\n",$id,$word,$count;
@@ -986,15 +1022,30 @@ sub run_single_giza_on_parts {
close(SNT);
# run snt2cooc in parts
+ my @COOC_PART_FILE_NAME;
for(my $i=1;$i<=$___PARTS;$i++) {
&run_single_snt2cooc("$dir/part$i",$e,$f,$vcb_e,$vcb_f,"$___CORPUS_DIR/part$i/$f-$e-int-train.snt");
+ push @COOC_PART_FILE_NAME, "$dir/part$i/$f-$e.cooc";
}
+ # include baseline cooc, if baseline alignment model (incremental training)
+ if (scalar @_BASELINE_ALIGNMENT_MODEL) {
+ push @COOC_PART_FILE_NAME, $_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)];
+ }
+ &merge_cooc_files($dir,$e,$f,@COOC_PART_FILE_NAME);
+
+ # run giza
+ &run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
+}
+
+sub merge_cooc_files {
+ my ($dir,$e,$f,@COOC_PART_FILE_NAME) = @_;
# merge parts
open(COOC,">$dir/$f-$e.cooc") or die "ERROR: Can't write $dir/$f-$e.cooc";
my(@PF,@CURRENT);
- for(my $i=1;$i<=$___PARTS;$i++) {
- open($PF[$i],"$dir/part$i/$f-$e.cooc")or die "ERROR: Can't read $dir/part$i/$f-$e.cooc";
+ for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
+ print STDERR "merging cooc file $COOC_PART_FILE_NAME[$i]...\n";
+ open($PF[$i],$COOC_PART_FILE_NAME[$i]) or die "ERROR: Can't read $COOC_PART_FILE_NAME[$i]";
my $pf = $PF[$i];
$CURRENT[$i] = <$pf>;
chop($CURRENT[$i]) if $CURRENT[$i];
@@ -1002,7 +1053,7 @@ sub run_single_giza_on_parts {
while(1) {
my ($min1,$min2) = (1e20,1e20);
- for(my $i=1;$i<=$___PARTS;$i++) {
+ for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
next unless $CURRENT[$i];
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
if ($w1 < $min1 || ($w1 == $min1 && $w2 < $min2)) {
@@ -1012,7 +1063,7 @@ sub run_single_giza_on_parts {
}
last if $min1 == 1e20;
print COOC "$min1 $min2\n";
- for(my $i=1;$i<=$___PARTS;$i++) {
+ for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
next unless $CURRENT[$i];
my ($w1,$w2) = split(/ /,$CURRENT[$i]);
if ($w1 == $min1 && $w2 == $min2) {
@@ -1022,13 +1073,10 @@ sub run_single_giza_on_parts {
}
}
}
- for(my $i=1;$i<=$___PARTS;$i++) {
+ for(my $i=0;$i<scalar(@COOC_PART_FILE_NAME);$i++) {
close($PF[$i]);
}
close(COOC);
-
- # run giza
- &run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
}
sub run_single_giza {
@@ -1083,6 +1131,12 @@ sub run_single_giza {
$GizaDefaultOptions{m5} = ($___FINAL_ALIGNMENT_MODEL eq '5')? 3: 0;
}
+ if (scalar(@_BASELINE_ALIGNMENT_MODEL)) {
+ $GizaDefaultOptions{oldTrPrbs} = $_BASELINE_ALIGNMENT_MODEL[4 + ($dir eq $___GIZA_F2E?2:0)];
+ $GizaDefaultOptions{oldAlPrbs} = $_BASELINE_ALIGNMENT_MODEL[5 + ($dir eq $___GIZA_F2E?2:0)];
+ $GizaDefaultOptions{step_k} = 1;
+ }
+
if ($___GIZA_OPTION) {
foreach (split(/[ ,]+/,$___GIZA_OPTION)) {
my ($option,$value) = split(/=/,$_,2);
@@ -1123,16 +1177,19 @@ sub run_single_giza {
}
sub run_single_snt2cooc {
- my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
- print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n";
- safesystem("mkdir -p $dir") or die("ERROR");
- if ($SNT2COOC eq "$_EXTERNAL_BINDIR/snt2cooc.out") {
- print "$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc\n";
- safesystem("$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc") or die("ERROR");
- } else {
- print "$SNT2COOC $dir/$f-$e.cooc $vcb_e $vcb_f $train\n";
- safesystem("$SNT2COOC $dir/$f-$e.cooc $vcb_e $vcb_f $train") or die("ERROR");
- }
+ my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
+ print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n";
+ my $suffix = (scalar @_BASELINE_ALIGNMENT_MODEL) ? ".new" : "";
+ safesystem("mkdir -p $dir") or die("ERROR");
+ if ($SNT2COOC eq "$_EXTERNAL_BINDIR/snt2cooc.out") {
+ print "$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc$suffix\n";
+ safesystem("$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc$suffix") or die("ERROR");
+ } else {
+ print "$SNT2COOC $dir/$f-$e.cooc$suffix $vcb_e $vcb_f $train\n";
+ safesystem("$SNT2COOC $dir/$f-$e.cooc$suffix $vcb_e $vcb_f $train") or die("ERROR");
+ }
+ &merge_cooc_files($dir,$e,$f,"$dir/$f-$e.cooc.new",$_BASELINE_ALIGNMENT_MODEL[2 + ($dir eq $___GIZA_F2E?1:0)])
+ if scalar @_BASELINE_ALIGNMENT_MODEL;
}
### (3) CREATE WORD ALIGNMENT FROM GIZA ALIGNMENTS
@@ -1200,7 +1257,11 @@ sub get_lexical_factored {
$___CORPUS.".".$___E,
$___ALIGNMENT_FILE.".".$___ALIGNMENT,
$___LEXICAL_FILE,
- $___LEXICAL_COUNTS);
+ $___LEXICAL_COUNTS,
+ $_BASELINE_CORPUS.".".$___F,
+ $_BASELINE_CORPUS.".".$___E,
+ $_BASELINE_ALIGNMENT,
+ $_INSTANCE_WEIGHTS_FILE);
}
else {
foreach my $factor (split(/\+/,$___TRANSLATION_FACTORS)) {
@@ -1218,7 +1279,11 @@ sub get_lexical_factored {
$___ALIGNMENT_STEM.".".$factor_e.".".$___E,
$___ALIGNMENT_FILE.".".$___ALIGNMENT,
$lexical_file,
- $___LEXICAL_COUNTS);
+ $___LEXICAL_COUNTS,
+ $_BASELINE_CORPUS.".".$factor_f.".".$___F,
+ $_BASELINE_CORPUS.".".$factor_e.".".$___E,
+ $_BASELINE_ALIGNMENT,
+ $_INSTANCE_WEIGHTS_FILE);
}
}
}
@@ -1326,11 +1391,12 @@ sub extract_phrase {
}
}
my $cmd;
+ my $suffix = (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) ? ".new" : "";
if ($_HIERARCHICAL)
{
my $max_length = &get_max_phrase_length($table_number);
- $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file";
+ $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file$suffix";
$cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR;
$cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
$cmd .= " --PCFG" if $_PCFG;
@@ -1347,28 +1413,43 @@ sub extract_phrase {
{
if ( $_EPPEX ) {
# eppex sets max_phrase_length itself (as the maximum phrase length for which any Lossy Counter is defined)
- $cmd = "$EPPEX $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $_EPPEX";
+ $cmd = "$EPPEX $alignment_file_e $alignment_file_f $alignment_file_a $extract_file$suffix $_EPPEX";
}
else {
my $max_length = &get_max_phrase_length($table_number);
print "MAX $max_length $reordering_flag $table_number\n";
$max_length = &get_max_phrase_length(-1) if $reordering_flag;
- $cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $max_length";
+ $cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file$suffix $max_length";
}
if ($reordering_flag) {
$cmd .= " orientation";
$cmd .= get_extract_reordering_flags();
$cmd .= " --NoTTable" if !$ttable_flag;
- $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
+ $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
$cmd .= " --GZOutput ";
+ $cmd .= " --InstanceWeights $_INSTANCE_WEIGHTS_FILE " if defined $_INSTANCE_WEIGHTS_FILE;
+ $cmd .= " --BaselineExtract $_BASELINE_EXTRACT" if defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT =~ /extract-parallel.perl/;
map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
print STDERR "$cmd\n";
safesystem("$cmd") or die "ERROR: Phrase extraction failed (missing input files?)";
+
+ if (defined($_BASELINE_EXTRACT) && $PHRASE_EXTRACT !~ /extract-parallel.perl/) {
+ print STDERR "merging with baseline extract from $_BASELINE_EXTRACT\n";
+ safesystem("$ZCAT $_BASELINE_EXTRACT.gz $extract_file$suffix.gz | gzip > $extract_file.gz");
+ safesystem("$ZCAT $_BASELINE_EXTRACT.inv.gz $extract_file$suffix.inv.gz | gzip > $extract_file.inv.gz");
+ safesystem("$ZCAT $_BASELINE_EXTRACT.o.gz $extract_file$suffix.o.gz | gzip > $extract_file.o.gz")
+ if -e "$extract_file$suffix.o.gz";
+ safesystem("rm $extract_file$suffix.gz");
+ safesystem("rm $extract_file$suffix.inv.gz");
+ safesystem("rm $extract_file$suffix.o.gz")
+ if -e "$extract_file$suffix.o.gz";
+ }
+
foreach my $f (@tempfiles) {
unlink $f;
}
@@ -1471,7 +1552,7 @@ sub score_phrase_phrase_extract {
my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction.gz $inverse";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
- $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
+ $cmd .= " --NoWordAlignment" if $_OMIT_WORD_ALIGNMENT;
$cmd .= " --KneserNey" if $KNESER_NEY;
$cmd .= " --GoodTuring" if $GOOD_TURING && $inverse eq "";
$cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
@@ -1491,7 +1572,7 @@ sub score_phrase_phrase_extract {
$cmd .= " 0 ";
}
- print $cmd."\n";
+ print STDERR $cmd."\n";
safesystem($cmd) or die "ERROR: Scoring of phrases failed";
exit();
@@ -1909,8 +1990,12 @@ sub create_ini {
}
print INI "\n# language model weights\n[weight-l]\n";
my $lmweighttotal = 0.5;
+ my $lmoovweighttotal = 0.1;
foreach(1..scalar @___LM) {
printf INI "%.4f\n", $lmweighttotal / scalar @___LM;
+ if ($_LMODEL_OOV_FEATURE) {
+ printf INI "%.4f\n", $lmoovweighttotal / scalar @___LM;
+ }
}
print INI "\n\n# translation model weights\n[weight-t]\n";
@@ -1954,6 +2039,10 @@ sub create_ini {
print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
}
+ if ($_LMODEL_OOV_FEATURE) {
+ print INI "\n# language model OOV feature enabled\n[lmodel-oov-feature]\n1\n\n";
+ }
+
# get addititional content for config file from switch or file
if ($_ADDITIONAL_INI) {
print INI "\n# additional settings\n\n";
diff --git a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
index 305a6ec52..c3c309bad 100755
--- a/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
+++ b/scripts/training/wrappers/make-factor-en-pos.mxpost.perl
@@ -1,6 +1,7 @@
#!/usr/bin/perl -w
use strict;
+use FindBin qw($RealBin);
use Getopt::Long "GetOptions";
my ($IN,$OUT,$MXPOST);
@@ -14,8 +15,8 @@ if (!&GetOptions('mxpost=s' => \$MXPOST) ||
my $pipeline = "perl -ne 'chop; tr/\\x20-\\x7f/\?/c; print \$_.\"\\n\";' | tee debug | ";
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project |";
-open(TAGGER,"cat $IN | $pipeline");
-open(OUT,">$OUT");
+open(TAGGER,"$RealBin/../../tokenizer/deescape-special-chars.perl < $IN | $pipeline");
+open(OUT,"| $RealBin/../../tokenizer/escape-special-chars.perl > $OUT");
while(<TAGGER>) {
foreach my $word_pos (split) {
$word_pos =~ s/\/([^\/]+)$/_$1/;
diff --git a/scripts/training/zmert-moses.pl b/scripts/training/zmert-moses.pl
deleted file mode 100755
index ecd783fa2..000000000
--- a/scripts/training/zmert-moses.pl
+++ /dev/null
@@ -1,1121 +0,0 @@
-#!/usr/bin/perl -w
-
-# Usage:
-# zmert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
-# For other options see below or run 'zmert-moses.pl --help'
-
-# Notes:
-# <foreign> and <english> should be raw text files, one sentence per line
-# <english> can be a prefix, in which case the files are <english>0, <english>1, etc. are used
-
-# Revision history
-
-# 29 Dec 2009 Derived from mert-moses-new.pl (Kamil Kos)
-
-use FindBin qw($RealBin);
-use File::Basename;
-my $SCRIPTS_ROOTDIR = $RealBin;
-$SCRIPTS_ROOTDIR =~ s/\/training$//;
-$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
-
-# for each _d_istortion, _l_anguage _m_odel, _t_ranslation _m_odel and _w_ord penalty, there is a list
-# of [ default value, lower bound, upper bound ]-triples. In most cases, only one triple is used,
-# but the translation model has currently 5 features
-
-# defaults for initial values and ranges are:
-
-my $default_triples = {
- # these two basic models exist even if not specified, they are
- # not associated with any model file
- "w" => [ [ 0.0, -1.0, 1.0 ] ], # word penalty
-};
-
-my $additional_triples = {
- # if the more lambda parameters for the weights are needed
- # (due to additional tables) use the following values for them
- "d" => [ [ 1.0, 0.0, 2.0 ], # lexicalized reordering model
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ],
- [ 1.0, 0.0, 2.0 ] ],
- "lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model
- "g" => [ [ 1.0, 0.0, 2.0 ], # generation model
- [ 1.0, 0.0, 2.0 ] ],
- "tm" => [ [ 0.3, 0.0, 0.5 ], # translation model
- [ 0.2, 0.0, 0.5 ],
- [ 0.3, 0.0, 0.5 ],
- [ 0.2, 0.0, 0.5 ],
- [ 0.0,-1.0, 1.0 ] ], # ... last weight is phrase penalty
- "lex"=> [ [ 0.1, 0.0, 0.2 ] ], # global lexical model
-};
-
-# moses.ini file uses FULL names for lambdas, while this training script internally (and on the command line)
-# uses ABBR names.
-my $ABBR_FULL_MAP = "d=weight-d lm=weight-l tm=weight-t w=weight-w g=weight-generation lex=weight-lex";
-my %ABBR2FULL = map {split/=/,$_,2} split /\s+/, $ABBR_FULL_MAP;
-my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP;
-
-# We parse moses.ini to figure out how many weights do we need to optimize.
-# For this, we must know the correspondence between options defining files
-# for models and options assigning weights to these models.
-my $TABLECONFIG_ABBR_MAP = "ttable-file=tm lmodel-file=lm distortion-file=d generation-file=g global-lexical-file=lex";
-my %TABLECONFIG2ABBR = map {split(/=/,$_,2)} split /\s+/, $TABLECONFIG_ABBR_MAP;
-
-# There are weights that do not correspond to any input file, they just increase the total number of lambdas we optimize
-#my $extra_lambdas_for_model = {
-# "w" => 1, # word penalty
-# "d" => 1, # basic distortion
-#};
-
-my $verbose = 0;
-my $___MERT_VERBOSE = 1; # verbosity of zmert (values: 0-2)
-my $___DECODER_VERBOSE = 1; # should decoder output be included? - 0:no,1:yes
-my $___SAVE_INTER = 2; # save intermediate nbest-lists
-my $usage = 0; # request for --help
-my $___WORKING_DIR = "mert-work";
-my $___DEV_F = undef; # required, input text to decode
-my $___DEV_E = undef; # required, basename of files with references
-my $___DECODER = undef; # required, pathname to the decoder executable
-my $___CONFIG = undef; # required, pathname to startup ini file
-my $___N_BEST_LIST_SIZE = 100;
-my $___MAX_MERT_ITER = 0; # do not limit the number of iterations
-my $queue_flags = "-l mem_free=0.5G -hard"; # extra parameters for parallelizer
- # the -l ws0ssmt is relevant only to JHU workshop
-my $___JOBS = undef; # if parallel, number of jobs to use (undef -> serial)
-my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder
-my $___LAMBDA = undef; # string specifying the seed weights and boundaries of all lambdas
-my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert)
-my $___FILTER_PHRASE_TABLE = 1; # filter phrase table
-my $___PREDICTABLE_SEEDS = 0;
-my $___METRIC = "BLEU 4 shortest"; # name of metric that will be used for minimum error training, followed by metric parameters (see zmert documentation)
-my $___SEMPOSBLEU_WEIGHTS = "1 1"; # weights of SemPOS and BLEU
-my $___LAMBDAS_OUT = undef; # file where final lambdas should be written
-my $___EXTRACT_SEMPOS = "none"; # how shall we get the SemPOS factor (only for SemPOS metric)
- # options: 1) 'none' - moses generates SemPOS factor in required format
- # (<word_form>|<SemPOS>)
- # 2) 'factors:<factor_index_list>' - extract factors from decoder output on positions from <factor_index_list>
- # <factor_index_list> contains indices of factors separated by comma, e.g. '0,1,4'
- # 3) 'tmt' - moses outputs only <word_form> and we need to
- # generate factors like SemPOS with TectoMT (see http://ufal.mff.cuni.cz/tectomt/)
-
-# set 1 if using with async decoder
-my $___ASYNC = 0;
-
-# Use "--norm" to select normalization in mert
-my $___NORM = "none";
-
-# set 0 if input type is text, set 1 if input type is confusion network
-my $___INPUTTYPE = 0;
-
-my $mertdir = "$SCRIPTS_ROOTDIR/../zmert/"; # path to zmert directory
-my $filtercmd = undef; # path to filter-model-given-input.pl
-my $clonecmd = "$SCRIPTS_ROOTDIR/training/clone_moses_model.pl"; # executable clone_moses_model.pl
-my $qsubwrapper = undef;
-my $moses_parallel_cmd = undef;
-my $old_sge = 0; # assume sge<6.0
-my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on
- # if undef work on all features
- # (others are fixed to the starting values)
-my %active_features; # hash with features to optimize; optimize all if empty
-
-use strict;
-use Getopt::Long;
-GetOptions(
- "working-dir=s" => \$___WORKING_DIR,
- "input=s" => \$___DEV_F,
- "inputtype=i" => \$___INPUTTYPE,
- "refs=s" => \$___DEV_E,
- "decoder=s" => \$___DECODER,
- "config=s" => \$___CONFIG,
- "nbest:i" => \$___N_BEST_LIST_SIZE,
- "maxiter:i" => \$___MAX_MERT_ITER,
- "queue-flags:s" => \$queue_flags,
- "jobs=i" => \$___JOBS,
- "decoder-flags=s" => \$___DECODER_FLAGS,
- "lambdas=s" => \$___LAMBDA,
- "metric=s" => \$___METRIC,
- "semposbleu-weights:s" => \$___SEMPOSBLEU_WEIGHTS,
- "extract-sempos=s" => \$___EXTRACT_SEMPOS,
- "norm:s" => \$___NORM,
- "help" => \$usage,
- "verbose" => \$verbose,
- "mert-verbose:i" => \$___MERT_VERBOSE,
- "decoder-verbose:i" => \$___DECODER_VERBOSE,
- "mertdir:s" => \$mertdir, # allow to override the default location of zmert.jar
- "lambdas-out:s" => \$___LAMBDAS_OUT,
- "rootdir=s" => \$SCRIPTS_ROOTDIR,
- "filtercmd=s" => \$filtercmd, # allow to override the default location
- "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location
- "mosesparallelcmd=s" => \$moses_parallel_cmd, # allow to override the default location
- "old-sge" => \$old_sge, #passed to moses-parallel
- "filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # allow (disallow)filtering of phrase tables
- "predictable-seeds:s" => \$___PREDICTABLE_SEEDS, # allow (disallow) switch on/off reseeding of random restarts
- "async=i" => \$___ASYNC, #whether script to be used with async decoder
- "activate-features=s" => \$___ACTIVATE_FEATURES #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values)
-) or exit(1);
-
-print "Predict $___PREDICTABLE_SEEDS\n";
-
-# the 4 required parameters can be supplied on the command line directly
-# or using the --options
-if (scalar @ARGV == 4) {
- # required parameters: input_file references_basename decoder_executable
- $___DEV_F = shift;
- $___DEV_E = shift;
- $___DECODER = shift;
- $___CONFIG = shift;
-}
-
-if ($___ASYNC) {
- delete $default_triples->{"w"};
- $additional_triples->{"w"} = [ [ 0.0, -1.0, 1.0 ] ];
-}
-
-print STDERR "After default: $queue_flags\n";
-
-if ($usage || !defined $___DEV_F || !defined$___DEV_E || !defined$___DECODER || !defined $___CONFIG) {
- print STDERR "usage: zmert-moses.pl input-text references decoder-executable decoder.ini
-Options:
- --working-dir=mert-dir ... where all the files are created
- --nbest=100 ... how big nbestlist to generate
- --maxiter=N ... maximum number of zmert iterations
- --jobs=N ... set this to anything to run moses in parallel
- --mosesparallelcmd=STRING ... use a different script instead of moses-parallel
- --queue-flags=STRING ... anything you with to pass to
- qsub, eg. '-l ws06osssmt=true'
- The default is
- -l mem_free=0.5G -hard
- To reset the parameters, please use \"--queue-flags=' '\" (i.e. a space between
- the quotes).
- --decoder-flags=STRING ... extra parameters for the decoder
- --lambdas=STRING ... default values and ranges for lambdas, a complex string
- such as 'd:1,0.5-1.5 lm:1,0.5-1.5 tm:0.3,0.25-0.75;0.2,0.25-0.75;0.2,0.25-0.75;0.3,0.25-0.75;0,-0.5-0.5 w:0,-0.5-0.5'
- --allow-unknown-lambdas ... keep going even if someone supplies a new lambda
- in the lambdas option (such as 'superbmodel:1,0-1'); optimize it, too
- --lambdas-out=STRING ... file where final lambdas should be written
- --metric=STRING ... metric name for optimization with metric parameters
- such as 'BLEU 4 closest' or 'SemPOS 0 1'. Use default parameters by specifying 'BLEU' or 'SemPOS'
- --semposbleu-weights=STRING ... weights for SemPOS and BLEU in format 'N:M' where 'N' is SemPOS weight and 'M' BLEU weight
- used only with SemPOS_BLEU metric
- --extract-sempos=STRING ... none|factors:<factor_list>|tmt
- 'none' ... decoder generates all required factors for optimization metric
- 'factors:<factor_list>' ... extract factors with index in <factor_list> from decoder output
- e.g. 'factors:0,2,3' to extract first, third and fourth factor from decoder output
- 'tmt' ... use TectoMT (see http://ufal.mff.cuni.cz/tectomt) to generate required factors
- --norm ... Select normalization for zmert
- --mert-verbose=N ... verbosity of zmert [0|1|2]
- --decoder-verbose=N ... decoder verbosity [0|1] - 1=decoder output included
- --mertdir=STRING ... directory with zmert.jar
- --filtercmd=STRING ... path to filter-model-given-input.pl
- --rootdir=STRING ... where do helpers reside (if not given explicitly)
- --mertdir=STRING ... path to zmert implementation
- --scorenbestcmd=STRING ... path to score-nbest.py
- --old-sge ... passed to moses-parallel, assume Sun Grid Engine < 6.0
- --inputtype=[0|1|2] ... Handle different input types (0 for text, 1 for confusion network, 2 for lattices, default is 0)
- --no-filter-phrase-table ... disallow filtering of phrase tables
- (useful if binary phrase tables are available)
- --predictable-seeds ... provide predictable seeds to mert so that random restarts are the same on every run
- --activate-features=STRING ... comma-separated list of features to work on
- (if undef work on all features)
- # (others are fixed to the starting values)
- --verbose ... verbosity of this script
- --help ... print this help
-
-";
- exit 1;
-}
-
-# ensure we know where is tectomt, if we need it
-if( !defined $ENV{"TMT_ROOT"} && $___EXTRACT_SEMPOS =~ /tmt/) {
- die "Cannot find TMT_ROOT. Is TectoMT really initialized?";
-}
-my $TMT_ROOT = $ENV{"TMT_ROOT"};
-
-my $srunblocks = "$TMT_ROOT/tools/srunblocks_streaming/srunblocks";
-my $scenario_file = "scenario";
-my $qruncmd = "/home/bojar/diplomka/bin/qruncmd";
-my $srunblocks_cmd = "$srunblocks --errorlevel=FATAL $scenario_file czech_source_sentence factored_output";
-if (defined $___JOBS && $___JOBS > 1) {
- die "Can't run $qruncmd" if ! -x $qruncmd;
- $srunblocks_cmd = "$qruncmd --jobs=$___JOBS --join '$srunblocks_cmd'";
-}
-
-
-# update variables if input is confusion network
-if ($___INPUTTYPE == 1)
-{
- $ABBR_FULL_MAP = "$ABBR_FULL_MAP I=weight-i";
- %ABBR2FULL = map {split/=/,$_,2} split /\s+/, $ABBR_FULL_MAP;
- %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} split /\s+/, $ABBR_FULL_MAP;
-
- push @{$default_triples -> {"I"}}, [ 1.0, 0.0, 2.0 ];
- #$extra_lambdas_for_model -> {"I"} = 1; #Confusion network posterior
-}
-
-# update variables if input is lattice
-if ($___INPUTTYPE == 2)
-{
-# TODO
-}
-
-if (defined $___ACTIVATE_FEATURES)
-{
- %active_features = map {$_ => 1} split( /,/, $___ACTIVATE_FEATURES);
-}
-
-# Check validity of input parameters and set defaults if needed
-
-print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";
-
-# path of script for filtering phrase tables and running the decoder
-$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd;
-
-$qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper;
-
-$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl"
- if !defined $moses_parallel_cmd;
-
-
-
-die "Error: need to specify the zmert.jar directory" if !defined $mertdir;
-
-my $zmert_classpath = ensure_full_path("$mertdir/zmert.jar");
-die "File not found: $mertdir/zmert.jar (interpreted as $zmert_classpath)"
- if ! -e $zmert_classpath;
-
-my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd);
-die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd;
-die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd;
-die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
-die "Not executable: $___DECODER" if ! -x $___DECODER;
-
-my $input_abs = ensure_full_path($___DEV_F);
-die "File not found: $___DEV_F (interpreted as $input_abs)."
- if ! -e $input_abs;
-$___DEV_F = $input_abs;
-
-
-# Option to pass to qsubwrapper and moses-parallel
-my $pass_old_sge = $old_sge ? "-old-sge" : "";
-
-my $decoder_abs = ensure_full_path($___DECODER);
-die "File not found: $___DECODER (interpreted as $decoder_abs)."
- if ! -x $decoder_abs;
-$___DECODER = $decoder_abs;
-
-
-my $ref_abs = ensure_full_path($___DEV_E);
-# check if English dev set (reference translations) exist and store a list of all references
-my @references;
-my @references_factored;
-if (-e $ref_abs) {
- push @references, $ref_abs;
-}
-else {
- # if multiple file, get a full list of the files
- my $part = 0;
- while (-e $ref_abs.$part) {
- push @references, $ref_abs.$part;
- $part++;
- }
- die("Reference translations not found: $___DEV_E (interpreted as $ref_abs)") unless $part;
-}
-
-my $config_abs = ensure_full_path($___CONFIG);
-die "File not found: $___CONFIG (interpreted as $config_abs)."
- if ! -e $config_abs;
-$___CONFIG = $config_abs;
-
-
-
-# check validity of moses.ini and collect number of models and lambdas per model
-# need to make a copy of $extra_lambdas_for_model, scan_config spoils it
-#my %copy_of_extra_lambdas_for_model = %$extra_lambdas_for_model;
-my %used_triples = %{$default_triples};
-my ($models_used) = scan_config($___CONFIG);
-
-# Parse the lambda config string and convert it to a nice structure in the same format as $used_triples
-if (defined $___LAMBDA) {
- my %specified_triples;
- # interpreting lambdas from command line
- foreach (split(/\s+/,$___LAMBDA)) {
- my ($name,$values) = split(/:/);
- die "Malformed setting: '$_', expected name:values\n" if !defined $name || !defined $values;
- foreach my $startminmax (split/;/,$values) {
- if ($startminmax =~ /^(-?[\.\d]+),(-?[\.\d]+)-(-?[\.\d]+)$/) {
- my $start = $1;
- my $min = $2;
- my $max = $3;
- push @{$specified_triples{$name}}, [$start, $min, $max];
- }
- else {
- die "Malformed feature range definition: $name => $startminmax\n";
- }
- }
- }
- # sanity checks for specified lambda triples
- foreach my $name (keys %used_triples) {
- die "No lambdas specified for '$name', but ".($#{$used_triples{$name}}+1)." needed.\n"
- unless defined($specified_triples{$name});
- die "Number of lambdas specified for '$name' (".($#{$specified_triples{$name}}+1).") does not match number needed (".($#{$used_triples{$name}}+1).")\n"
- if (($#{$used_triples{$name}}) != ($#{$specified_triples{$name}}));
- }
- foreach my $name (keys %specified_triples) {
- die "Lambdas specified for '$name' ".(@{$specified_triples{$name}}).", but none needed.\n"
- unless defined($used_triples{$name});
- }
- %used_triples = %specified_triples;
-}
-
-# moses should use our config
-if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(generation-file) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) /
-|| $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) /
-) {
- die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files.";
-}
-
-#store current directory and create the working directory (if needed)
-my $cwd = `pawd 2>/dev/null`;
-if(!$cwd){$cwd = `pwd`;}
-chomp($cwd);
-
-safesystem("mkdir -p $___WORKING_DIR") or die "Can't mkdir $___WORKING_DIR";
-
-{
-# open local scope
-
-#chdir to the working directory
-chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR";
-
-# fixed file names
-my $mert_logfile = "zmert.log";
-
-if ($___FILTER_PHRASE_TABLE){
- # filter the phrase tables wih respect to input, use --decoder-flags
- print "filtering the phrase tables... ".`date`;
- my $cmd = "$filtercmd ./filtered $___CONFIG $___DEV_F";
- if (defined $___JOBS) {
- safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=filterphrases.out -stderr=filterphrases.err" )
- or die "Failed to submit filtering of tables to the queue (via $qsubwrapper)";
- } else {
- safesystem($cmd) or die "Failed to filter the tables.";
- }
-
- # the decoder should now use the filtered model
- $___CONFIG = "filtered/moses.ini";
-}
-else{
- # make a local clone of moses.ini
- safesystem("$clonecmd $___CONFIG");
- $___CONFIG = "moses.ini";
-}
-
-$___CONFIG = ensure_full_path($___CONFIG);
-
-my $PARAMETERS;
-$PARAMETERS = $___DECODER_FLAGS;
-
-my $nbest_file = "zmert.best$___N_BEST_LIST_SIZE.out";
-
-# Run zmert to optimize lambdas
-# We need to prepare:
-# 1) decoder launch script (decoder_cmd) - must be executable
-# 2) zmert configuration file (zmert_cfg.txt)
-# 3) parameters we want to optimize (params.txt)
-# 4) decoder configuration file (decoder_cfg_inter.txt)
-
-
-my $zmert_cfg = ensure_full_path("zmert_cfg.txt");
-my $opt_params = "params.txt"; # zmert requires path relative to launch path
-my $decoder_cfg_inter = "decoder_cfg_inter.txt"; # zmert requires path relative to launch path
-my $decoder_cmd_file = ensure_full_path("decoder_cmd");
-my $iteration_file = "iteration";
-
-my $LAMBDAS_FILE = ensure_full_path("finalWeights.txt");
-
-# prepare script that will launch moses from template
-# it will include an update script that will adjust feature weights according to
-# the last zmert iteration (they are stored in file $decoder_cfg_inter)
-
-# prepare lauch command with all parameters
-my $decoder_cmd;
-if (defined $___JOBS) {
- $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix zmert -queue-parameters '$queue_flags' -decoder-parameters '$PARAMETERS' -n-best-list '$nbest_file $___N_BEST_LIST_SIZE' -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > moses.out";
-} else {
- $decoder_cmd = "$___DECODER $PARAMETERS -config $___CONFIG -inputtype $___INPUTTYPE -n-best-list $nbest_file $___N_BEST_LIST_SIZE -i $___DEV_F > moses.out";
-}
-
-my $zmert_decoder_cmd = "$SCRIPTS_ROOTDIR/training/zmert-decoder.pl";
-
-# number of factors that a given metric requires
-my $metric_num_factors = 1;
-
-# SemPOS metric requires 2 parameters specifying position of t_lemma and sempos factor
-# e.g. for t_lemma|sempos|factor3|factor4|... the values are 0 and 1 (default setting)
-if( $___METRIC =~ /^SemPOS$/) {
- $___METRIC .= " 0 1";
- $metric_num_factors = 2;
-}
-# SemPOS_BLEU metric requires 7 parameters
-# 1) weight of SemPOS 2) weight of BLEU
-# 3) index of t_lemma for SemPOS 4) index of sempos for SemPOS
-# 5) max ngram for BLEU 6) ref length strategy for BLEU
-# 7) index of factor to compute BLEU on
-elsif( $___METRIC =~ /^SemPOS_BLEU$/) {
- $___SEMPOSBLEU_WEIGHTS =~ /^.*:.*$/ or die "--semposbleu-weights is not in format <sempos_weight>:<bleu_weight>";
- $___SEMPOSBLEU_WEIGHTS =~ s/:/ /;
- $___METRIC .= " $___SEMPOSBLEU_WEIGHTS 1 2 4 closest 0";
- $metric_num_factors = 3;
-}
-elsif( $___METRIC =~ /^BLEU$/) {
- $___METRIC .= " 4 closest";
-}
- elsif( $___METRIC =~ /^TER$/) {
- $___METRIC .= " nocase punc 20 50";
-}
-elsif( $___METRIC =~ /^TER-BLEU$/) {
- $___METRIC .= " nocase punc 20 50 4 closest";
-}
-
-if( $___EXTRACT_SEMPOS =~ /tmt/) {
- my $print_string = "";
- if( $___METRIC =~ /SemPOS_BLEU/) {
- $print_string = "Print::ForSemPOSBLEUMetric TMT_PARAM_PRINT_FOR_SEMPOS_BLEU_METRIC=m:form|t_lemma|gram/sempos TMT_PARAM_PRINT_FOR_SEMPOS_BLEU_METRIC_DESTINATION=factored_output";
- } elsif( $___METRIC =~ /SemPOS/) {
- $print_string = "Print::ForSemPOSMetric TMT_PARAM_PRINT_FOR_SEMPOS_METRIC=t_lemma|gram/sempos TMT_PARAM_PRINT_FOR_SEMPOS_METRIC_DESTINATION=factored_output";
- } else {
- die "Trying to get factors using tmt for unknown metric $___METRIC";
- }
-
- open( SCENARIO, ">$scenario_file") or die "Cannot open $scenario_file";
- print SCENARIO << "FILE_EOF";
-SCzechW_to_SCzechM::Tokenize_joining_numbers
-SCzechW_to_SCzechM::TagMorce
-# SCzechM_to_SCzechN::Czech_named_ent_SVM_recognizer
-# SCzechM_to_SCzechN::Geo_ne_recognizer
-# SCzechM_to_SCzechN::Embed_instances
-SCzechM_to_SCzechA::McD_parser_local TMT_PARAM_MCD_CZ_MODEL=pdt20_train_autTag_golden_latin2_pruned_0.02.model
-# SCzechM_to_SCzechA::McD_parser_local TMT_PARAM_MCD_CZ_MODEL=pdt20_train_autTag_golden_latin2_pruned_0.10.model
-SCzechM_to_SCzechA::Fix_atree_after_McD
-SCzechM_to_SCzechA::Fix_is_member
-SCzechA_to_SCzechT::Mark_auxiliary_nodes
-SCzechA_to_SCzechT::Build_ttree
-SCzechA_to_SCzechT::Fill_is_member
-SCzechA_to_SCzechT::Rehang_unary_coord_conj
-SCzechA_to_SCzechT::Assign_coap_functors
-SCzechA_to_SCzechT::Fix_is_member
-SCzechA_to_SCzechT::Distrib_coord_aux
-SCzechA_to_SCzechT::Mark_clause_heads
-SCzechA_to_SCzechT::Mark_relclause_heads
-SCzechA_to_SCzechT::Mark_relclause_coref
-SCzechA_to_SCzechT::Fix_tlemmas
-SCzechA_to_SCzechT::Assign_nodetype
-SCzechA_to_SCzechT::Assign_grammatemes
-SCzechA_to_SCzechT::Detect_formeme
-SCzechA_to_SCzechT::Add_PersPron
-SCzechA_to_SCzechT::Mark_reflpron_coref
-SCzechA_to_SCzechT::TBLa2t_phaseFd
-$print_string
-FILE_EOF
- close( SCENARIO);
-}
-
-my $feats_order = join( " ", keys %used_triples);
-
-open( DECODER_CMD, ">$decoder_cmd_file") or die "Cannot open $decoder_cmd_file";
- print DECODER_CMD <<"FILE_EOF";
-#!/usr/bin/perl -w
-
-use strict;
-
-my %FULL2ABBR = map {my (\$a, \$b) = split/=/,\$_,2; (\$b, \$a);} split /\\s+/, "$ABBR_FULL_MAP";
-
-open( ITERATION, "<$iteration_file") or die "Cannot open $iteration_file";
-my \$iteration = <ITERATION>;
-close( ITERATION);
-chomp( \$iteration);
-
-my \@features_order = qw( $feats_order );
-
-# extract feature weights from last zmert iteration (stored in \$decoder_cfg_inter)
-print "Updating decoder config file from file $decoder_cfg_inter\n";
-
-my \$moses_ini = "$___CONFIG";
-
-open( IN, "$decoder_cfg_inter") or die "Cannot open file $decoder_cfg_inter (reading updated lambdas)";
-FILE_EOF
-
-print DECODER_CMD <<'FILE_EOF';
-my %lambdas = ();
-my $lastName = "";
-while( my $line = <IN>) {
- chomp($line);
- my ($name, $val) = split( /\s+/, $line);
- $name =~ s/_\d+$//; # remove index of the lambda
- push( @{$lambdas{$name}}, $val);
-}
-close(IN);
-
-
-my $moses_ini_old = "$moses_ini";
-$moses_ini_old =~ s/^(.*)\/([^\/]+)$/$1\/run$iteration.$2/;
-$moses_ini_old = $moses_ini.".orig" if( $iteration == 0);
-safesystem("mv $moses_ini $moses_ini_old");
-# update moses.ini
-open( INI_OLD, "<$moses_ini_old") or die "Cannot open config file $moses_ini_old";
-open( INI, ">$moses_ini") or die "Cannot open config file $moses_ini";
-while( my $line = <INI_OLD>) {
- if( $line =~ m/^\[(weight-.+)\]$/) {
- my $name = $FULL2ABBR{$1};
- print STDERR "Updating weight: $1, $name\n";
- print INI "$line";
- foreach( @{$lambdas{$name}}) {
- print INI "$_\n";
- print STDERR "NEW: $_\tOLD:";
- $line = <INI_OLD>;
- print STDERR $line;
- }
- } else {
- print INI $line;
- }
-}
-close(INI_OLD);
-close(INI);
-
-FILE_EOF
-
-print DECODER_CMD <<"FILE_EOF";
-print "Executing: $decoder_cmd";
-safesystem("$decoder_cmd") or die "Failed to execute $decoder_cmd";
-
-# update iteration number in intermediate config file
-++\$iteration;
-safesystem("echo \$iteration > $iteration_file");
-
-# modify the nbest-list to conform the zmert required format
-# <i> ||| <candidate_translation> ||| featVal_1 featVal_2 ... featVal_m
-my \$nbest_file_orig = "$nbest_file".".orig";
-safesystem( "mv $nbest_file \$nbest_file_orig");
-open( NBEST_ORIG, "<\$nbest_file_orig") or die "Cannot open original nbest-list \$nbest_file_orig";
-open( NBEST, ">$nbest_file") or die "Cannot open modified nbest-list $nbest_file";
-
-my \$line_num = 0;
-
-FILE_EOF
-
-
-if( "$___EXTRACT_SEMPOS" =~ /factors/) {
- print DECODER_CMD <<"FILE_EOF";
-my (undef, \$args) = split( /:/, "$___EXTRACT_SEMPOS");
-my \$factor_count = $metric_num_factors;
-FILE_EOF
-print DECODER_CMD <<'FILE_EOF';
-my @indices = split( /,/, $args);
-die "Specified ".scalar @indices." factors to extract but selected metric requires $factor_count factors"
- if( @indices != $factor_count);
-while( my $line = <NBEST_ORIG>) {
- my @array = split( /\|\|\|/, $line);
- # remove feature names from the feature scores string
- $array[2] = extractScores( $array[2]);
- my @tokens = split( /\s/, $array[1]); # split sentence into words
- $array[1] = "";
- foreach my $token (@tokens) {
- next if $token eq "";
- my @factors = split( /\|/, $token);
- my $put_separator = 0;
- foreach my $index (@indices) {
- die "Cannot extract factor with index $index from '$token'" if ($index > $#factors);
- $array[1] .= '|' if ($put_separator); # separator between factors
- $array[1] .= $factors[$index];
- $put_separator = 1;
- }
- $array[1] .= " "; # space between words
- }
- print NBEST join( '|||', @array);
-}
-
-FILE_EOF
-
-} elsif( "$___EXTRACT_SEMPOS" =~ /tmt/) {
- print DECODER_CMD <<"FILE_EOF";
-# run TectoMT to analyze sentences
-print STDERR "Analyzing candidates using $srunblocks_cmd\n";
-my \$nbest_factored = "$nbest_file.factored";
-open( NBEST_FACTORED, "|$srunblocks_cmd > \$nbest_factored") or die "Cannot open pipe to command $srunblocks_cmd";
-FILE_EOF
-print DECODER_CMD <<'FILE_EOF';
-my $line_count = 0;
-my @out = ();
-while( my $line = <NBEST_ORIG>) {
- my @array = split( /\|\|\|/, $line);
- die "Nbest-list does not have required format (values separated by '|||')" if ($#array != 3);
- # remove feature names from the feature scores string
- $array[2] = extractScores( $array[2]);
- push( @out, \@array); # store line with scores for output
- # select only word forms
- my $sentence = "";
- foreach my $fact ( split /\s+/, $array[1]) {
- next if( $fact eq "");
- my @fact_array = split( /\|/, $fact);
- $sentence .= "$fact_array[0] ";
- }
- # analyze sentence via TectoMT using scenario
- print NBEST_FACTORED "$sentence\n";
- ++$line_count;
-}
-close( NBEST_ORIG);
-close( NBEST_FACTORED);
-
-open( NBEST_FACTORED, "<$nbest_factored") or die "Cannot open $nbest_factored";
-my $line_count_check = 0;
-while( my $line = <NBEST_FACTORED>) {
- chomp( $line);
- my $array_ref = shift( @out);
- $array_ref->[1] = $line;
- print NBEST join( '|||', @{$array_ref});
- ++$line_count_check;
-}
-die "Error: Sent $line_count sentences to analyze but got only $line_count_check back"
- if( $line_count != $line_count_check);
-
-FILE_EOF
-
-} elsif ($___EXTRACT_SEMPOS eq "none") {
-print DECODER_CMD <<'FILE_EOF';
-while( my $line = <NBEST_ORIG>) {
- my @array = split( /\|\|\|/, $line);
- # remove feature names from the feature scores string
- $array[2] = extractScores( $array[2]);
- print NBEST join( '|||', @array);
-}
-FILE_EOF
-} else {
- die "Unknown type of factor extraction: $___EXTRACT_SEMPOS";
-}
-
-print DECODER_CMD <<'FILE_EOF';
-close( NBEST);
-close( NBEST_ORIG);
-
-# END OF BODY
-
-sub extractScores {
- my $scores = shift;
- my (%scores_hash, $name);
- foreach my $score_or_name (split /\s+/, $scores) {
- if( $score_or_name =~ s/://) {
- $name = $score_or_name;
- } elsif ($score_or_name =~ /\d/) {
- die "Cannot guess nbest-list first feature score name" if( not defined $name);
- $scores_hash{$name} .= "$score_or_name ";
- } else {
- die "Unknown string ($score_or_name) in nbest-list feature scores section (not a feature name or score)"
- if( $score_or_name =~ /\S/);
- }
- }
- $scores = "";
- foreach $name (@features_order) {
- $scores .= $scores_hash{$name};
- }
- #print STDERR "REORDERED SCORES: $scores\n";
- return $scores;
-}
-
-sub safesystem {
- print STDERR "Executing: @_\n";
- system(@_);
- if ($? == -1) {
- print STDERR "Failed to execute: @_\n $!\n";
- exit(1);
- }
- elsif ($? & 127) {
- printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
- ($? & 127), ($? & 128) ? 'with' : 'without';
- exit(1);
- }
- else {
- my $exitcode = $? >> 8;
- print STDERR "Exit code: $exitcode\n" if $exitcode;
- return ! $exitcode;
- }
-}
-FILE_EOF
-
-close( DECODER_CMD);
-
-# make the decoder lauch script executable
-safesystem("chmod a+x $decoder_cmd_file");
-
-# analyze reference if necessary
-if( $___EXTRACT_SEMPOS =~ /tmt/) {
- my $part = 0;
- foreach my $ref (@references) {
- my $line_count = 0;
- print STDERR "Analyzing references using $srunblocks_cmd\n";
- open( REF_IN, "<$ref") or die "Cannot open $ref";
- my $ref_factored = "$ref.factored.$part";
- push( @references_factored, $ref_factored);
- open( REF_FACTORED, "|$srunblocks_cmd > $ref_factored");
- while( my $line = <REF_IN>) {
- # analyze sentence via TectoMT using scenario in file $scerario_file
- print REF_FACTORED $line;
- ++$line_count;
- }
- close( REF_IN);
- close( REF_FACTORED);
- my $line_count_check = 0;
- open( REF_FACTORED, "<$ref_factored") or die "Cannot open $ref_factored";
- ++$line_count_check while( <REF_FACTORED>);
- die "Error: Sent $line_count sentences to analyze but got $line_count_check back"
- if( $line_count != $line_count_check);
- close( REF_FACTORED);
- ++$part;
- }
- print STDERR "References analyzed\n";
-} else {
- push( @references_factored, @references);
-}
-
-my $ref_stem = $references_factored[0];
-$ref_stem =~ s/\d+$// if( $#references_factored); # get the file stem if we have more than one refs
-$ref_stem =~ s/.*\/([^\/]+)$/..\/$1/;
-
-# prepare zmert configuration file
-open( ZMERT_CFG, ">$zmert_cfg") or die "Cannot open $zmert_cfg";
-
-# FILES
-# print ZMERT_CFG "-dir\t$___PATH_FROM_LAUNCHDIR\n"; # working path (relative to the lauch path)
-# print ZMERT_CFG "-r\t$___DEV_E\n"; # file(s) containing references
-print ZMERT_CFG "-r\t$ref_stem\n"; # file(s) containing references
-print ZMERT_CFG "-rps\t".scalar(@references)."\n"; # number of references per sentence
-print ZMERT_CFG "-txtNrm\t0\n"; # we use our own text normalization
-print ZMERT_CFG "-p\t$opt_params\n"; # file containig parameter names, initial values, ranges
-print ZMERT_CFG "-fin\t$___LAMBDAS_OUT\n" if(defined $___LAMBDAS_OUT); # file where the final weight vector is written
-
-# MERT CONFIGURATION
-print ZMERT_CFG "-m\t$___METRIC\n";
-print ZMERT_CFG "-maxIt\t$___MAX_MERT_ITER\n" if( $___MAX_MERT_ITER); # maximum number of MERT iterations
-# print ZMERT_CFG "-prevIt\t$PREV_MERT_ITER\n";
-# number of iteration before considering an early exit
-# print ZMERT_CFG "-minIt\t$MIN_MERT_ITER\n";
-# number of consecutive iterations that must satisfy some early stopping
-# criterion to cause an early exit
-# print ZMERT_CFG "-stopIt\t$STOP_MIN_ITER\n";
-# early exit criterion: no weight changes by more than $LAMBDA_CHANGE;
-# default value: -1 (this criterion is never investigated)
-# print ZMERT_CFG "-stopSig\t$LAMBDA_CHANGE\n";
-# save intermediate decoder config files (1) or decoder outputs (2) or both (3) or neither (0)
-print ZMERT_CFG "-save\t$___SAVE_INTER\n";
-# print ZMERT_CFG "-ipi\t$INITS_PER_ITER\n"; # number of intermediate initial points per iteration
-# print ZMERT_CFG "-opi\t$ONCE_PER_ITER\n"; # modify a parameter only once per iteration;
-# print ZMERT_CFG "-rand\t$RAND_INIT\n"; # choose initial points randomly
-print ZMERT_CFG "-seed\t$___PREDICTABLE_SEEDS\n" if($___PREDICTABLE_SEEDS); # initialize the random number generator
-
-# DECODER SPECIFICATION
-print ZMERT_CFG "-cmd\t$decoder_cmd_file\n"; # name of file containing commands to run the decoder
-print ZMERT_CFG "-decOut\t$nbest_file\n"; # name of the n-best file produced by the decoder
-# print ZMERT_CFG "-decExit\t$DECODER_EXIT_CODE\n"; # value returned by decoder after successful exit
-print ZMERT_CFG "-dcfg\t$decoder_cfg_inter\n"; # name of intermediate decoder configuration file
-print ZMERT_CFG "-N\t$___N_BEST_LIST_SIZE\n";
-
-# OUTPUT SPECIFICATION
-print ZMERT_CFG "-v\t$___MERT_VERBOSE\n"; # zmert verbosity level (0-2)
-print ZMERT_CFG "-decV\t$___DECODER_VERBOSE\n"; # decoder output printed (1) or ignored (0)
-
-close( ZMERT_CFG);
-
-my ($name, $num, $val, $min, $max);
-# prepare file with parameters to optimize
-open( PARAMS, ">$opt_params") or die "Cannot open file $opt_params with parameters to optimize";
-my $optString;
-foreach $name (keys %used_triples) {
- $num = 0;
- foreach my $triple (@{$used_triples{$name}}) {
- ($val, $min, $max) = @$triple;
- my ($minRand, $maxRand) = ($min, $max);
- # the file should describe features to optimize in the following format:
- # "featureName ||| defValue optString minVal maxVal minRandVal maxRandVal"
- # optString can be 'Opt' or 'Fix'
- $optString = "Opt";
- if( defined $___ACTIVATE_FEATURES and not $active_features{$name."_$num"}) {
- $optString = "Fix";
- }
- print PARAMS "$name"."_$num ||| $val $optString $min $max $minRand $maxRand\n";
- ++$num;
- }
-}
-print PARAMS "normalization = $___NORM\n";
-close( PARAMS);
-
-# prepare intermediate config file from which moses.ini will be updated before each launch
-open( DEC_CFG, ">$decoder_cfg_inter") or die "Cannot open file $decoder_cfg_inter";
-foreach $name (keys %used_triples) {
- $num = 0;
- foreach my $tri (@{$used_triples{$name}}) {
- ($val, $min, $max) = @$tri;
- print DEC_CFG $name."_$num $val\n";
- ++$num;
- }
-}
-close( DEC_CFG);
-
-open( ITER, ">$iteration_file") or die "Cannot open file $iteration_file";
-print ITER "1";
-close( ITER);
-
-# launch zmert
-my $javaMaxMem = ""; # -maxMem 4000" # use at most 4000MB of memory
-my $cmd = "java -cp $zmert_classpath ZMERT $javaMaxMem $zmert_cfg";
-
-print "Zmert start at ".`date`;
-
-if ( 0 && defined $___JOBS) {
- # NOT WORKING - this branch needs to init environment variables
- safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -stderr=$mert_logfile -queue-parameter='$queue_flags'") or die "Failed to start zmert (via qsubwrapper $qsubwrapper)";
-
-} else {
- safesystem("$cmd 2> $mert_logfile") or die "Failed to run zmert";
-}
-
-print "Zmert finished at ".`date`;
-
-# RELEVANT ONLY FOR PLAYGROUND at UFAL, CHARLES UNIVESITY IN PRAGUE
-# copy optimized moses.ini and original run1.moses.ini to the working directory
-if( $___FILTER_PHRASE_TABLE) {
- my ($config_opt, $config_std, $config_base) = ($___CONFIG, $___CONFIG, "$cwd/moses.abs.ini");
- $config_std =~ s/^(.*)\/([^\/]+)$/$1\/run1.$2/;
- mergeConfigs( $config_base, $___CONFIG);
- mergeConfigs( $config_base, $config_std);
-}
-
-# chdir back to the original directory # useless, just to remind we were not there
-chdir($cwd);
-
-
-} # end of local scope
-
-sub mergeConfigs {
- my ($config_base, $config_weights) = @_;
- my $config_new = $config_weights;
- $config_new =~ s/^.*\///;
- open BASE, "<$config_base" or die "Cannot open $config_base";
- open WEIGHTS, "<$config_weights" or die "Cannot open $config_weights";
- open NEW, ">$config_new" or die "Cannot open $config_new";
- my $cont = 1;
- my ($b_line, $w_line);
- while( $cont) {
- $b_line = <BASE>;
- $w_line = <WEIGHTS>;
- $cont = (defined $b_line and defined $w_line);
- if( $b_line =~ /^\[weight-/) {
- if( $w_line !~ /^\[weight-/) { die "mergeConfigs: $config_base and $config_weights do not have the same format"; }
- print NEW $w_line;
- $b_line = <BASE>; $w_line = <WEIGHTS>;
- while( $w_line =~ /\d/) {
- print NEW $w_line;
- $b_line = <BASE>; $w_line = <WEIGHTS>;
- }
- print NEW $b_line;
- } else {
- print NEW $b_line;
- }
- }
- close BASE;
- close WEIGHTS;
- close NEW;
-}
-
-sub dump_triples {
- my $triples = shift;
-
- foreach my $name (keys %$triples) {
- foreach my $triple (@{$triples->{$name}}) {
- my ($val, $min, $max) = @$triple;
- }
- }
-}
-
-sub safesystem {
- print STDERR "Executing: @_\n";
- system(@_);
- if ($? == -1) {
- print STDERR "Failed to execute: @_\n $!\n";
- exit(1);
- }
- elsif ($? & 127) {
- printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
- ($? & 127), ($? & 128) ? 'with' : 'without';
- exit(1);
- }
- else {
- my $exitcode = $? >> 8;
- print STDERR "Exit code: $exitcode\n" if $exitcode;
- return ! $exitcode;
- }
-}
-
-sub ensure_full_path {
- my $PATH = shift;
-$PATH =~ s/\/nfsmnt//;
- return $PATH if $PATH =~ /^\//;
- my $dir = `pawd 2>/dev/null`;
- if(!$dir){$dir = `pwd`;}
- chomp($dir);
- $PATH = $dir."/".$PATH;
- $PATH =~ s/[\r\n]//g;
- $PATH =~ s/\/\.\//\//g;
- $PATH =~ s/\/+/\//g;
- my $sanity = 0;
- while($PATH =~ /\/\.\.\// && $sanity++<10) {
- $PATH =~ s/\/+/\//g;
- $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
- }
- $PATH =~ s/\/[^\/]+\/\.\.$//;
- $PATH =~ s/\/+$//;
-$PATH =~ s/\/nfsmnt//;
- return $PATH;
-}
-
-sub scan_config {
- my $ini = shift;
- my $inishortname = $ini; $inishortname =~ s/^.*\///; # for error reporting
- # we get a pre-filled counts, because some lambdas are always needed (word penalty, for instance)
- # as we walk though the ini file, we record how many extra lambdas do we need
- # and finally, we report it
-
- # in which field (counting from zero) is the filename to check?
- my %where_is_filename = (
- "ttable-file" => 4,
- "generation-file" => 3,
- "lmodel-file" => 3,
- "distortion-file" => 3,
- "global-lexical-file" => 1,
- );
- # by default, each line of each section means one lambda, but some sections
- # explicitly state a custom number of lambdas
- my %where_is_lambda_count = (
- "ttable-file" => 3,
- "generation-file" => 2,
- "distortion-file" => 2,
- );
-
- open INI, $ini or die "Can't read $ini";
- my $section = undef; # name of the section we are reading
- my $shortname = undef; # the corresponding short name
- my $nr = 0;
- my $error = 0;
- my %defined_files;
- my %defined_steps; # check the ini file for compatible mapping steps and actually defined files
- while (<INI>) {
- $nr++;
- next if /^\s*#/; # skip comments
- if (/^\[([^\]]*)\]\s*$/) {
- $section = $1;
- $shortname = $TABLECONFIG2ABBR{$section};
- next;
- }
- if (defined $section && $section eq "mapping") {
- # keep track of mapping steps used
- $defined_steps{$1}++ if /^([TG])/ || /^\d+ ([TG])/;
- }
- if (defined $section && defined $where_is_filename{$section}) {
- print "$section -> $where_is_filename{$section}\n";
- # this ini section is relevant to lambdas
- chomp;
- my @flds = split / +/;
- my $fn = $flds[$where_is_filename{$section}];
- if (defined $fn && $fn !~ /^\s+$/) {
- print "checking weight-count for $section\n";
- # this is a filename! check it
- if ($fn !~ /^\//) {
- $error = 1;
- print STDERR "$inishortname:$nr:Filename not absolute: $fn\n";
- }
- if (! -s $fn && ! -s "$fn.gz" && ! -s "$fn.binphr.idx" && ! -s "$fn.binlexr.idx" ) {
- $error = 1;
- print STDERR "$inishortname:$nr:File does not exist or empty: $fn\n";
- }
- # remember the number of files used, to know how many lambdas do we need
- die "No short name was defined for section $section!"
- if ! defined $shortname;
-
- # how many lambdas does this model need?
- # either specified explicitly, or the default, i.e. one
- my $needlambdas = defined $where_is_lambda_count{$section} ? $flds[$where_is_lambda_count{$section}] : 1;
-
- print STDERR "Config needs $needlambdas lambdas for $section (i.e. $shortname)\n" if $verbose;
- if (!defined $___LAMBDA && (!defined $additional_triples->{$shortname} || scalar(@{$additional_triples->{$shortname}}) < $needlambdas)) {
- print STDERR "$inishortname:$nr:Your model $shortname needs $needlambdas weights but we define the default ranges for only "
- .scalar(@{$additional_triples->{$shortname}})." weights. Cannot use the default, you must supply lambdas by hand.\n";
- $error = 1;
- }
- else {
- # note: table may use less parameters than the maximum number
- # of triples
- for(my $lambda=0;$lambda<$needlambdas;$lambda++) {
- my ($start, $min, $max)
- = @{${$additional_triples->{$shortname}}[$lambda]};
- push @{$used_triples{$shortname}}, [$start, $min, $max];
- }
- }
- $defined_files{$shortname}++;
- }
- }
- }
- die "$inishortname: File was empty!" if !$nr;
- close INI;
- for my $pair (qw/T=tm=translation G=g=generation/) {
- my ($tg, $shortname, $label) = split /=/, $pair;
- $defined_files{$shortname} = 0 if ! defined $defined_files{$shortname};
- $defined_steps{$tg} = 0 if ! defined $defined_steps{$tg};
-
- if ($defined_files{$shortname} != $defined_steps{$tg}) {
- print STDERR "$inishortname: You defined $defined_files{$shortname} files for $label but use $defined_steps{$tg} in [mapping]!\n";
- $error = 1;
- }
- }
-
- # distance-based distortion
- if ($___ASYNC == 1)
- {
- print STDERR "ASYNC distortion & word penalty";
- my @my_array;
- for(my $i=0 ; $i < $defined_steps{"T"} ; $i++)
- {
- push @my_array, [ 1.0, 0.0, 2.0 ];
- }
- push @{$used_triples{"d"}}, @my_array;
-
- @my_array = ();
- for(my $i=0 ; $i < $defined_steps{"T"} ; $i++)
- {
- push @my_array, [ 0.5, -1.0, 1.0 ];
- }
- push @{$used_triples{"w"}}, @my_array;
-
- # debug print
- print "distortion:";
- my $refarray=$used_triples{"d"};
- my @vector=@$refarray;
- foreach my $subarray (@vector) {
- my @toto=@$subarray;
- print @toto,"\n";
- }
- #exit 1;
- }
- else
- {
- print STDERR "SYNC distortion";
- push @{$used_triples{"d"}}, [1.0, 0.0, 2.0];
- }
-
-
- exit(1) if $error;
- return (\%defined_files);
-}