From 41a184943720ddf85ac83339ecffa6db15ed8efb Mon Sep 17 00:00:00 2001
From: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>
Date: Wed, 7 Sep 2011 16:37:33 +0000
Subject: support for sparse feature functions (mert support only when using
 PRO)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4184 1f5c12ca-751b-0410-a591-d2e778427230
---
 scripts/training/mert-moses.pl                    | 136 +++++++++++---------
 scripts/training/phrase-extract/consolidate.cpp   | 150 +++++++++++++++++++---
 scripts/training/phrase-extract/extract-rules.cpp |   2 +-
 scripts/training/phrase-extract/extract.cpp       |   2 +-
 scripts/training/phrase-extract/tables-core.cpp   |   4 +
 scripts/training/train-model.perl                 |  58 +++++++--
 6 files changed, 258 insertions(+), 94 deletions(-)

(limited to 'scripts')
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 15e1d0d00..9a37cc137 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -11,6 +11,7 @@
 # Excerpts from revision history
 
 # Sept 2011   multi-threaded mert (Barry Haddow)
+# 3 Aug 2011  Added random directions, historic best, pairwise ranked (PK)
 # Jul 2011    simplifications (Ondrej Bojar)
 #             -- rely on moses' -show-weights instead of parsing moses.ini 
 #                ... so moses is also run once *before* mert starts, checking
@@ -287,8 +288,6 @@ $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper
 $moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl"
   if !defined $moses_parallel_cmd;
 
-
-
 if (!defined $mertdir) {
   $mertdir = "$SCRIPTS_ROOTDIR/../mert";
   print STDERR "Assuming --mertdir=$mertdir\n";
@@ -357,13 +356,11 @@ die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_par
 die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper;
 die "Not executable: $___DECODER" if ! -x $___DECODER;
 
-
 my $input_abs = ensure_full_path($___DEV_F);
 die "File not found: $___DEV_F (interpreted as $input_abs)."
   if ! -e $input_abs;
 $___DEV_F = $input_abs;
 
-
 # Option to pass to qsubwrapper and moses-parallel
 my $pass_old_sge = $old_sge ? "-old-sge" : "";
 
@@ -372,7 +369,6 @@ die "File not executable: $___DECODER (interpreted as $decoder_abs)."
   if ! -x $decoder_abs;
 $___DECODER = $decoder_abs;
 
-
 my $ref_abs = ensure_full_path($___DEV_E);
 # check if English dev set (reference translations) exist and store a list of all references
 my @references;
@@ -409,9 +405,6 @@ if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) /
 # normalize initial LAMBDAs, too
 my $need_to_normalize = 1;
 
-
-
-
 #store current directory and create the working directory (if needed)
 my $cwd = `pawd 2>/dev/null`; 
 if(!$cwd){$cwd = `pwd`;}
@@ -431,17 +424,16 @@ my $mert_logfile = "mert.log";
 my $weights_in_file = "init.opt";
 my $weights_out_file = "weights.txt";
 
-
 # set start run
 my $start_run = 1;
 my $bestpoint = undef;
 my $devbleu = undef;
+my $sparse_weights_file = undef;
 
 my $prev_feature_file = undef;
 my $prev_score_file = undef;
 my $prev_init_file = undef;
 
-
 if ($___FILTER_PHRASE_TABLE) {
   my $outdir = "filtered";
   if (-e "$outdir/moses.ini") {
@@ -471,7 +463,6 @@ else{
   $___CONFIG_ORIG = $___CONFIG;
 }
 
-
 # we run moses to check validity of moses.ini and to obtain all the feature
 # names
 my $featlist = get_featlist_from_moses($___CONFIG);
@@ -579,28 +570,19 @@ if ($continue) {
     print STDERR "All needed data are available\n";
 
     print STDERR "Loading information from last step ($step)\n";
-    open(IN,"run$step.$mert_logfile") or die "Can't open run$step.$mert_logfile";
-    while (<IN>) {
-      if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
-	$bestpoint = $1;
-	$devbleu = $2;
-	last;
-      }
-    }
-    close IN;
+    my %dummy; # sparse features
+    ($bestpoint,$devbleu) = &get_weights_from_mert("run$step.$mert_outfile","run$step.$mert_logfile",scalar @{$featlist->{"names"}},\%dummy);
     die "Failed to parse mert.log, missed Best point there."
       if !defined $bestpoint || !defined $devbleu;
     print "($step) BEST at $step $bestpoint => $devbleu at ".`date`;
-    
     my @newweights = split /\s+/, $bestpoint;
     
     # Sanity check: order of lambdas must match
     sanity_check_order_of_lambdas($featlist,
       "gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");
-    
+
     # update my cache of lambda values
     $featlist->{"values"} = \@newweights;
-    
   }
   else{
     print STDERR "No previous data are needed\n";
@@ -630,10 +612,10 @@ while(1) {
   print "run $run start at ".`date`;
 
   # In case something dies later, we might wish to have a copy
-  create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"));
+  create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"),$sparse_weights_file);
 
 
-  # skip if the user wanted
+  # skip running the decoder if the user wanted
   if (!$skip_decoder) {
       print "($run) run decoder to produce n-best lists\n";
       $nbest_file = run_decoder($featlist, $run, $need_to_normalize);
@@ -648,8 +630,6 @@ while(1) {
       $need_to_normalize = 0;
   }
 
-
-
   # extract score statistics and features from the nbest lists
   print STDERR "Scoring the nbestlist.\n";
 
@@ -740,7 +720,7 @@ while(1) {
     if ! -s $weights_out_file;
 
 
- # backup copies
+  # backup copies
   safesystem ("\\cp -f extract.err run$run.extract.err") or die;
   safesystem ("\\cp -f extract.out run$run.extract.out") or die;
   if ($___PAIRWISE_RANKED_OPTIMIZER) { safesystem ("\\cp -f pro.data run$run.pro.data") or die; }
@@ -751,34 +731,10 @@ while(1) {
 
   print "run $run end at ".`date`;
 
-  $bestpoint = undef;
-  $devbleu = undef;
-  if ($___PAIRWISE_RANKED_OPTIMIZER) {
-    open(IN,"run$run.$mert_outfile") or die "Can't open run$run.$mert_outfile";
-    my (@WEIGHT,$sum);
-    foreach (@CURR) { push @WEIGHT, 0; }
-    while(<IN>) {
-      if (/^F(\d+) ([\-\.\de]+)/) {
-	$WEIGHT[$1] = $2;
-	$sum += abs($2);
-      }
-    }
-    $devbleu = "unknown";
-    foreach (@WEIGHT) { $_ /= $sum; }
-    $bestpoint = join(" ",@WEIGHT);
-    close IN;
-  }
-  else {
-    open(IN,"run$run.$mert_logfile") or die "Can't open run$run.$mert_logfile";
-    while (<IN>) {
-      if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
-        $bestpoint = $1;
-        $devbleu = $2;
-        last;
-      }
-    }
-    close IN;
-  }
+  my %sparse_weights; # sparse features
+  ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.$mert_outfile","run$run.$mert_logfile",scalar @{$featlist->{"names"}},\%sparse_weights);
+
+
   die "Failed to parse mert.log, missed Best point there."
     if !defined $bestpoint || !defined $devbleu;
   print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`;
@@ -788,6 +744,15 @@ while(1) {
 
   $featlist->{"values"} = \@newweights;
 
+  if (scalar keys %sparse_weights) { # picked up by create_config() in the next iteration
+    $sparse_weights_file = "run".($run+1).".sparse-weights";
+    open(my $sparse_fh,">",$sparse_weights_file) or die "Can't write $sparse_weights_file: $!";
+    foreach my $feature (keys %sparse_weights) {
+      print $sparse_fh "$feature $sparse_weights{$feature}\n";
+    }
+    close($sparse_fh) or die "Can't close $sparse_weights_file: $!";
+  }
+
   ## additional stopping criterion: weights have not changed
   my $shouldstop = 1;
   for(my $i=0; $i<@CURR; $i++) {
@@ -864,6 +829,43 @@ chdir($cwd);
 
 } # end of local scope
 
+# Read the result of the last optimizer run and return ($bestpoint,$devbleu).
+# In pairwise ranked mode, sparse feature weights go into %$sparse_weights.
+sub get_weights_from_mert {
+  my ($outfile,$logfile,$weight_count,$sparse_weights) = @_;
+  my ($bestpoint,$devbleu);
+  if ($___PAIRWISE_RANKED_OPTIMIZER) {
+    open(my $in,"<",$outfile) or die "Can't open $outfile";
+    my @WEIGHT = (0) x $weight_count; # dense weights, indexed by the N of "FN"
+    my $sum = 0;
+    while(<$in>) {
+      if (/^F(\d+) ([\-\+\.\de]+)/) { # regular features ('+' allows 1e+05)
+        $WEIGHT[$1] = $2;
+        $sum += abs($2);
+      }
+      elsif(/^(.+_.+) ([\-\+\.\de]+)/) { # sparse features: name contains '_'
+        $$sparse_weights{$1} = $2;
+      }
+    }
+    close $in;
+    die "No non-zero dense weights found in $outfile" if $sum == 0; # would divide by zero
+    $devbleu = "unknown"; # no dev score is read in this mode
+    foreach (@WEIGHT) { $_ /= $sum; } # normalize to sum of absolute values 1
+    $bestpoint = join(" ",@WEIGHT);
+  }
+  else {
+    open(my $in,"<",$logfile) or die "Can't open $logfile";
+    while (<$in>) {
+      if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) {
+        ($bestpoint,$devbleu) = ($1,$2);
+        last;
+      }
+    }
+    close $in;
+  }
+  return ($bestpoint,$devbleu);
+}
+
 sub run_decoder {
     my ($featlist, $run, $need_to_normalize) = @_;
     my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out";
@@ -984,6 +986,7 @@ sub get_featlist_from_moses {
     $nr++;
     chomp;
     my ($longname, $feature, $value) = split / /;
+    next if $value eq "sparse";
     push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n"
       if $value !~ /^[+-]?[0-9.e]+$/;
     push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n"
@@ -1015,14 +1018,20 @@ sub get_order_of_scores_from_nbestlist {
 
   my @order = ();
   my $label = undef;
+  my $sparse = 0; # we ignore sparse features here
   foreach my $tok (split /\s+/, $scores) {
-    if ($tok =~ /^([a-z][0-9a-z]*):/i) {
+    if ($tok =~ /.+_.+:/) {
+      $sparse = 1;
+    } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) {
       $label = $1;
     } elsif ($tok =~ /^-?[-0-9.e]+$/) {
-      # a score found, remember it
-      die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
-        if !defined $label;
-      push @order, $label;
+      if (!$sparse) {
+        # a score found, remember it
+        die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!"
+          if !defined $label;
+        push @order, $label;
+      }
+      $sparse = 0;
     } else {
       die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'";
     }
@@ -1037,6 +1046,7 @@ sub create_config {
     my $featlist = shift; # the lambdas we should write
     my $iteration = shift;  # just for verbosity
     my $bleu_achieved = shift; # just for verbosity
+    my $sparse_weights_file = shift; # only defined when optimizing sparse features
 
     my %P; # the hash of all parameters we wish to override
 
@@ -1076,6 +1086,10 @@ sub create_config {
       push @{$P{$name}}, $val;
     }
 
+    if (defined($sparse_weights_file)) {
+      push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file;
+    }
+
     # create new moses.ini decoder config file by cloning and overriding the original one
     open(INI,$infn) or die "Can't read $infn";
     delete($P{"config"}); # never output 
diff --git a/scripts/training/phrase-extract/consolidate.cpp b/scripts/training/phrase-extract/consolidate.cpp
index 53a141221..8d31a1d27 100644
--- a/scripts/training/phrase-extract/consolidate.cpp
+++ b/scripts/training/phrase-extract/consolidate.cpp
@@ -36,10 +36,15 @@ using namespace std;
 bool hierarchicalFlag = false;
 bool onlyDirectFlag = false;
 bool phraseCountFlag = true;
+bool lowCountFlag = false;
+bool goodTuringFlag = false;
+bool kneserNeyFlag = false;
 bool logProbFlag = false;
-char line[LINE_MAX_LENGTH];
+inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
 
-void processFiles( char*, char*, char* );
+char line[LINE_MAX_LENGTH];
+void processFiles( char*, char*, char*, char* );
+void loadCountOfCounts( char* );
 bool getLine( istream &fileP, vector< string > &item );
 vector< string > splitLine();
 
@@ -55,6 +60,7 @@ int main(int argc, char* argv[])
   char* &fileNameDirect = argv[1];
   char* &fileNameIndirect = argv[2];
   char* &fileNameConsolidated = argv[3];
+  char* fileNameCountOfCounts = 0; // only set by --GoodTuring / --KneserNey
 
   for(int i=4; i<argc; i++) {
     if (strcmp(argv[i],"--Hierarchical") == 0) {
@@ -66,6 +72,25 @@ int main(int argc, char* argv[])
     } else if (strcmp(argv[i],"--NoPhraseCount") == 0) {
       phraseCountFlag = false;
       cerr << "not including the phrase count feature\n";
+    } else if (strcmp(argv[i],"--GoodTuring") == 0) {
+      goodTuringFlag = true;
+      if (i+1==argc) { 
+        cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
+        exit(1);
+      }
+      fileNameCountOfCounts = argv[++i];
+      cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
+    } else if (strcmp(argv[i],"--KneserNey") == 0) {
+      kneserNeyFlag = true;
+      if (i+1==argc) { 
+        cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
+        exit(1);
+      }
+      fileNameCountOfCounts = argv[++i];
+      cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
+    } else if (strcmp(argv[i],"--LowCountFeature") == 0) {
+      lowCountFlag = true;
+      cerr << "including the low count feature\n";
     } else if (strcmp(argv[i],"--LogProb") == 0) {
       logProbFlag = true;
       cerr << "using log-probabilities\n";
@@ -75,11 +100,61 @@ int main(int argc, char* argv[])
     }
   }
 
-  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated );
+  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
+}
+
+vector< float > countOfCounts;
+vector< float > goodTuringDiscount;
+float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
+// Load the count-of-counts file (total count first, then one value per line) and derive discounts.
+void loadCountOfCounts( char* fileNameCountOfCounts )
+{
+  Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts);
+  if (fileCountOfCounts.fail()) {
+    cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl;
+    exit(1);
+  }
+  istream &fileP = fileCountOfCounts;
+  countOfCounts.push_back(0.0); // placeholder, so that values from the file start at index 1
+  while(1) {
+    if (fileP.eof()) break;
+    SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (fileP.eof()) break;
+    if (totalCount < 0)
+      totalCount = atof(line); // total number of distinct phrase pairs
+    else
+      countOfCounts.push_back( atof(line) );
+  }
+  fileCountOfCounts.Close();
+  // compute Good Turing discounts
+  if (goodTuringFlag) {
+    goodTuringDiscount.push_back(0.01); // floor value
+    for( size_t i=1; i<countOfCounts.size()-1; i++ ) {
+      goodTuringDiscount.push_back(((float)i+1)/(float)i*((countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)));
+      if (goodTuringDiscount[i]>1) goodTuringDiscount[i] = 1; // never inflate a count
+      if (goodTuringDiscount[i]<goodTuringDiscount[i-1]) goodTuringDiscount[i] = goodTuringDiscount[i-1]; // keep monotone
+    }
+  }
+  if (!kneserNeyFlag) return; // skip when Kneser Ney discounting was not requested
+  if (countOfCounts.size() < 5) { // countOfCounts[1..4] are read below
+    cerr << "ERROR: count of counts file " << fileNameCountOfCounts << " has too few entries for Kneser Ney discounting" << endl;
+    exit(1);
+  }
+  // compute Kneser Ney co-efficients [Chen&Goodman, 1998]
+  float Y = countOfCounts[1] / (countOfCounts[1] + 2*countOfCounts[2]);
+  kneserNey_D1 = 1 - 2*Y * countOfCounts[2] / countOfCounts[1];
+  kneserNey_D2 = 2 - 3*Y * countOfCounts[3] / countOfCounts[2];
+  kneserNey_D3 = 3 - 4*Y * countOfCounts[4] / countOfCounts[3];
+  if (kneserNey_D1 > 0.9) kneserNey_D1 = 0.9; // sanity constraints
+  if (kneserNey_D2 > 1.9) kneserNey_D2 = 1.9;
+  if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
 }
 
-void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated )
+void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
 {
+  if (goodTuringFlag || kneserNeyFlag)
+    loadCountOfCounts( fileNameCountOfCounts );
+
   // open input files
   Moses::InputFileStream fileDirect(fileNameDirect);
   Moses::InputFileStream fileIndirect(fileNameIndirect);
@@ -134,29 +209,67 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     // output hierarchical phrase pair (with separated labels)
     fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
 
-    // probs
-    fileConsolidated << " ||| ";
+    // SCORES ...
+    fileConsolidated << " |||";
+    vector<string> directCounts = tokenize(itemDirect[4].c_str());
+    vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
+    float countF = atof(directCounts[0].c_str());
+    float countE = atof(indirectCounts[0].c_str());
+    float countEF = atof(indirectCounts[1].c_str());
+    float n1_F, n1_E;
+    if (kneserNeyFlag) {
+      n1_F = atof(directCounts[2].c_str());
+      n1_E = atof(indirectCounts[2].c_str());
+    }
+
+    // Good Turing discounting
+    float adjustedCountEF = countEF;
+    if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
+      adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
+    float adjustedCountEF_indirect = adjustedCountEF;
+
+    // Kneser Ney discounting [Foster et al, 2006]
+    if (kneserNeyFlag) {
+      // discount depends on the joint count: D1 below 2, D2 below 3, D3 otherwise
+      float D = kneserNey_D3;
+      if (countEF < 2) D = kneserNey_D1;
+      else if (countEF < 3) D = kneserNey_D2; // 'else' so that D1 is not overwritten
+      if (D > countEF) D = countEF - 0.01; // sanity constraint
+      float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
+      float alpha_F = D * n1_F / countF; // available mass
+      adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
+
+      // for indirect
+      float p_b_F = n1_F / totalCount; // phrase prob of the F side based on distinct
+      float alpha_E = D * n1_E / countE; // available mass
+      adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
+    }
+
+    // prob indirect
     if (!onlyDirectFlag) {
-      fileConsolidated << itemIndirect[2];    // prob indirect
+      fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
+      fileConsolidated << " " << itemIndirect[2];
     }
-    fileConsolidated << " " << itemDirect[2]; // prob direct
+
+    // prob direct
+    fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
+    fileConsolidated << " " << itemDirect[2];
+
+    // phrase count feature
     if (phraseCountFlag) {
-      fileConsolidated << " " << (logProbFlag ? 1 : 2.718); // phrase count feature
+      fileConsolidated << " " << maybeLogProb(2.718);
+    }
+
+    // low count feature
+    if (lowCountFlag) {
+      fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
     }
 
     // alignment
     fileConsolidated << " ||| " << itemDirect[3];
 
     // counts, for debugging
-    vector<string> directCounts = tokenize(itemDirect[4].c_str());
-    vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
-    fileConsolidated << "||| " << indirectCounts[0] << " " << directCounts[0];
-    // output rule count if present in either file
-    if (directCounts.size() > 1) {
-      fileConsolidated << " " << directCounts[1];
-    } else if (indirectCounts.size() > 1) {
-      fileConsolidated << " " << indirectCounts[1];
-    }
+    fileConsolidated << "||| " << countE << " " << countF; // << " " << countEF;
 
     fileConsolidated << endl;
   }
@@ -165,6 +278,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
   fileConsolidated.close();
 }
 
+
 bool getLine( istream &fileP, vector< string > &item )
 {
   if (fileP.eof())
diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp
index 15367f900..0b56f0c29 100644
--- a/scripts/training/phrase-extract/extract-rules.cpp
+++ b/scripts/training/phrase-extract/extract-rules.cpp
@@ -45,7 +45,7 @@
 #include "tables-core.h"
 #include "XmlTree.h"
 
-#define LINE_MAX_LENGTH 60000
+#define LINE_MAX_LENGTH 500000
 
 using namespace std;
 
diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp
index 5a63b6345..0848723df 100644
--- a/scripts/training/phrase-extract/extract.cpp
+++ b/scripts/training/phrase-extract/extract.cpp
@@ -24,7 +24,7 @@
 
 using namespace std;
 
-#define LINE_MAX_LENGTH 60000
+#define LINE_MAX_LENGTH 500000
 
 // HPhraseVertex represents a point in the alignment matrix
 typedef pair <int, int> HPhraseVertex;
diff --git a/scripts/training/phrase-extract/tables-core.cpp b/scripts/training/phrase-extract/tables-core.cpp
index c8911f6a9..399026930 100644
--- a/scripts/training/phrase-extract/tables-core.cpp
+++ b/scripts/training/phrase-extract/tables-core.cpp
@@ -29,6 +29,10 @@ vector<string> tokenize( const char* input )
   return token;
 }
 
+bool isNonTerminal( const WORD &symbol ) { // true iff symbol starts with '[' and ends with ']'
+  return symbol.size() > 0 && symbol[0] == '[' && symbol[symbol.size()-1] == ']';
+}
+
 WORD_ID Vocabulary::storeIfNew( const WORD& word )
 {
   map<WORD, WORD_ID>::iterator i = lookup.find( word );
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index d754aa885..ec0ecf79d 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -34,9 +34,10 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
    $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
    $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
    $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
+   $_ADDITIONAL_INI,
    $_DICTIONARY, $_EPPEX);
 
-my $debug = 0; # debug this script, do not delete any files in debug mode
+my $debug = 0; # set to 1 to debug this script: no files are deleted in debug mode
 
 # the following line is set installation time by 'make release'.  BEWARE!
 my $BINDIR="/home/pkoehn/statmt/bin";
@@ -109,7 +110,7 @@ $_HELP = 1
 		       'memscore:s' => \$_MEMSCORE,
 		       'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
 		       'dictionary=s' => \$_DICTIONARY,
-		       'eppex:s' => \$_EPPEX,
+           'eppex:s' => \$_EPPEX, 'additional-ini=s' => \$_ADDITIONAL_INI
                );
 
 if ($_HELP) {
@@ -1372,11 +1373,28 @@ sub score_phrase {
 sub score_phrase_phrase_extract {
     my ($ttable_file,$lexical_file,$extract_file) = @_;
 
+    # remove consolidation options
     my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
     my $PHRASE_COUNT = (!defined($_SCORE_OPTIONS) || $_SCORE_OPTIONS !~ /NoPhraseCount/);
-    my $CORE_SCORE_OPTIONS = defined($_SCORE_OPTIONS) ? $_SCORE_OPTIONS : "";
-    $CORE_SCORE_OPTIONS =~ s/\-+OnlyDirect//i;
-    $CORE_SCORE_OPTIONS =~ s/\-+NoPhraseCount//i;
+    my $LOW_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/);
+    my $UNALIGNED_COUNT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/);
+    my ($UNALIGNED_FW_COUNT,$UNALIGNED_FW_F,$UNALIGNED_FW_E);
+    if (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty +(\S+) +(\S+)/) {
+      $UNALIGNED_FW_COUNT = 1;
+      $UNALIGNED_FW_F = $1;
+      $UNALIGNED_FW_E = $2;
+    }
+    my $GOOD_TURING = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /GoodTuring/);
+    my $KNESER_NEY = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /KneserNey/);
+    my $LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LogProb/);
+    my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/);
+    my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/);
+    my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
+    my $CORE_SCORE_OPTIONS = "";
+    $CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
+    $CORE_SCORE_OPTIONS .= " --NegLogProb" if $NEG_LOG_PROB;
+    $CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
+
     my $substep = 1;
     for my $direction ("f2e","e2f") {
 	next if $___CONTINUE && -e "$ttable_file.half.$direction";
@@ -1405,6 +1423,11 @@ sub score_phrase_phrase_extract {
         my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse";
         $cmd .= " --Hierarchical" if $_HIERARCHICAL;
         $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
+        $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
+        $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING && $inverse eq "";
+        $cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
+        $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
+        $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
         $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
         print $cmd."\n";
         safesystem($cmd) or die "ERROR: Scoring of phrases failed";	    
@@ -1423,8 +1446,13 @@ sub score_phrase_phrase_extract {
     return if $___CONTINUE && -e "$ttable_file.gz";
     my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file";
     $cmd .= " --Hierarchical" if $_HIERARCHICAL;
+    $cmd .= " --LogProb" if $LOG_PROB;
+    $cmd .= " --NegLogProb" if $NEG_LOG_PROB;
     $cmd .= " --OnlyDirect" if $ONLY_DIRECT;
     $cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
+    $cmd .= " --LowCountFeature" if $LOW_COUNT;
+    $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING;
+    $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
     safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
     if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
     if (! $___DONT_ZIP) {
@@ -1681,6 +1709,13 @@ sub create_ini {
 [ttable-file]\n";
    my $num_of_ttables = 0;
    my @SPECIFIED_TABLE = @_PHRASE_TABLE;
+   my $basic_weight_count = 4; # both directions, lex and phrase
+   $basic_weight_count-=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
+   $basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedPenalty/; # word ins/del
+   $basic_weight_count+=2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /UnalignedFunctionWordPenalty/;
+   $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
+   $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
+   $basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
      $num_of_ttables++;
      my $ff = $f;
@@ -1688,10 +1723,6 @@ sub create_ini {
      my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").($___NOT_FACTORED ? "" : ".$f").".gz";
      $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
      my $phrase_table_impl = ($_HIERARCHICAL ? 6 : 0);
-     my $basic_weight_count = 4; # both directions, lex and phrase
-     $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
-     $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
-     $basic_weight_count++ unless defined($_SCORE_OPTIONS) &&  $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
      print INI "$phrase_table_impl $ff $basic_weight_count $file\n";
    }
    if ($_GLUE_GRAMMAR) {
@@ -1783,10 +1814,6 @@ sub create_ini {
 
   print INI "\n\n# translation model weights\n[weight-t]\n";
   foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
-     my $basic_weight_count = 4; # both directions, lex and phrase
-     $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
-     $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/;
-     $basic_weight_count++ unless defined($_SCORE_OPTIONS) &&  $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
      for(1..$basic_weight_count) {
        printf INI "%.2f\n", 1/$basic_weight_count;
      }
@@ -1826,6 +1853,11 @@ sub create_ini {
     print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
   }
 
+  if ($_ADDITIONAL_INI) {
+    print INI "\n# additional settings\n\n";
+    foreach (split(/<br>/i,$_ADDITIONAL_INI)) { print INI $_."\n"; }
+  }
+
   close(INI);
 }
 
-- 
cgit v1.2.3