Add support for Chris Dyer's fast-align; fix a bug in the sparse word translations feature; add threshold pruning to the filter

author: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-01 22:20:05 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-01 22:20:05 +0400
commit: cd8915647b74cd60cb259c2ec8ba5230970389f5 (patch)
tree: 6c4357ecb255b3d480a9c729fe20c096512921cb /scripts/training
parent: 8a1e944bb428a0af9f6c82c26e5633361ce4052c (diff)
2 files changed, 37 insertions, 5 deletions
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index d994fbcef..6323096be 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -36,6 +36,7 @@ my $ZCAT = "gzip -cd";
 # get optional parameters
 my $opt_hierarchical = 0;
 my $binarizer = undef;
+my $min_score = undef;
 my $opt_min_non_initial_rule_count = undef;
 my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
 
@@ -43,6 +44,7 @@ GetOptions(
     "gzip!" => \$opt_gzip,
     "Hierarchical" => \$opt_hierarchical,
     "Binarizer=s" => \$binarizer,
+    "MinScore=s" => \$min_score,
     "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count
 ) or exit(1);
 
@@ -52,11 +54,20 @@ my $config = shift;
 my $input = shift;
 
 if (!defined $dir || !defined $config || !defined $input) {
-  print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical]\n";
+  print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*]\n";
   exit 1;
 }
 $dir = ensure_full_path($dir);
 
+# decode min-score definitions ("id:threshold[,id:threshold]*") into %MIN_SCORE
+my %MIN_SCORE;
+if ($min_score) {
+  foreach (split(/ *, */,$min_score)) {
+    my ($id,$score) = split(/ *: */);
+    $MIN_SCORE{$id} = $score;
+    print STDERR "score $id must be at least $score\n";
+  }
+}
 # buggy directory in place?
 if (-d $dir && ! -e "$dir/info") {
     print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n";
@@ -262,6 +273,18 @@ for(my $i=0;$i<=$#TABLE;$i++) {
             my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
             $foreign =~ s/ $//;
             if (defined($PHRASE_USED{$factors}{$foreign})) {
+                # apply min_score thresholds: skip entries where any listed score is below its threshold
+                if ($min_score) {
+                   my @ITEM = split(/ *\|\|\| */,$rest);
+                   if(scalar (@ITEM)>2) { # do not filter reordering table
+                     my @SCORE = split(/ /,$ITEM[1]);
+                     my $okay = 1;
+                     foreach my $id (keys %MIN_SCORE) {
+                       $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id};
+                     }
+                     next unless $okay;
+                   }
+                }
                 print FILE_OUT $entry;
                 $used++;
             }
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index e4292007e..680495602 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -1468,21 +1468,23 @@ sub score_phrase_factored {
 	&score_phrase($file,$___LEXICAL_FILE,$___EXTRACT_FILE);
     }
     else {
+	my $table_id = 0;
 	foreach my $factor (split(/\+/,$___TRANSLATION_FACTORS)) {
 	    print STDERR "(6) [$factor] score phrases @ ".`date`;
 	    my ($factor_f,$factor_e) = split(/\-/,$factor);
 	    my $file = "$___MODEL_DIR/".($_HIERARCHICAL?"rule-table":"phrase-table").".$factor";
 	    $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
-	    &score_phrase($file,$___LEXICAL_FILE.".".$factor,$___EXTRACT_FILE.".".$factor);
+	    &score_phrase($file,$___LEXICAL_FILE.".".$factor,$___EXTRACT_FILE.".".$factor,$table_id);
+	    $table_id++;
 	}
     }
 }
 
 sub score_phrase {
-    my ($ttable_file,$lexical_file,$extract_file) = @_;
+    my ($ttable_file,$lexical_file,$extract_file,$table_id) = @_;
 
     if ($___PHRASE_SCORER eq "phrase-extract") {
-        &score_phrase_phrase_extract($ttable_file,$lexical_file,$extract_file);
+        &score_phrase_phrase_extract($ttable_file,$lexical_file,$extract_file,$table_id);
     } elsif ($___PHRASE_SCORER eq "memscore") {
         &score_phrase_memscore($ttable_file,$lexical_file,$extract_file);
     } else {
@@ -1491,7 +1493,7 @@ sub score_phrase {
 }
 
 sub score_phrase_phrase_extract {
-    my ($ttable_file,$lexical_file,$extract_file) = @_;
+    my ($ttable_file,$lexical_file,$extract_file,$table_id) = @_;
 
     # distinguish between score and consolidation options
     my $ONLY_DIRECT = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/);
@@ -1502,6 +1504,13 @@ sub score_phrase_phrase_extract {
     $COUNT_BIN = $1 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /\-CountBinFeature ([\s\d]*\d)/;
     $DOMAIN = $1 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /(\-+[a-z]*Domain[a-z]+ .+)/i;
     $DOMAIN =~ s/ \-.+//g;
+    if ($DOMAIN =~ /^(.+) table ([\d\,]+) *$/) {
+      my ($main_spec,$specified_tables) = ($1,$2);
+      $DOMAIN = "--IgnoreSentenceId";
+      foreach my $specified_table_id (split(/,/,$specified_tables)) {
+	$DOMAIN = $main_spec if $specified_table_id == $table_id;
+      }
+    }
     my $SINGLETON = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /Singleton/);
     my $CROSSEDNONTERM = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /CrossedNonTerm/);
author	phikoehn <pkoehn@inf.ed.ac.uk>	2013-05-01 22:20:05 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2013-05-01 22:20:05 +0400
commit	cd8915647b74cd60cb259c2ec8ba5230970389f5 (patch)
tree	6c4357ecb255b3d480a9c729fe20c096512921cb /scripts/training
parent	8a1e944bb428a0af9f6c82c26e5633361ce4052c (diff)