Enable skipping of filtering in EMS

Use 'binarize-all = path-to-binarize-model.perl'
author: Barry Haddow <barry.haddow@gmail.com> 2013-05-02 18:15:52 +0400
committer: Barry Haddow <barry.haddow@gmail.com> 2013-05-02 18:15:52 +0400
commit: 5eebb9538ef25824504214852acc38bc4a93fb2e (patch)
tree: a41e99097d7f5aa64c3d18c6119c9a580264aa81 /scripts/training
parent: 5638aa6a32ce0e6b0f68c0f6d073e40adf19124d (diff)
2 files changed, 159 insertions, 78 deletions
diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl
new file mode 100755
index 000000000..15ad23ac4
--- /dev/null
+++ b/scripts/training/binarize-model.perl
@@ -0,0 +1,62 @@
+#!/usr/bin/perl -w
+
+# Binarize a Moses model: run filter-model-given-input.pl with -nofilter and
+# -Binarizer, writing the tables to <output-config>.tables, then symlink
+# output-config to <output-config>.tables/moses.ini.
+
+use strict;
+use File::Basename qw(dirname);
+use Getopt::Long "GetOptions";
+use FindBin qw($RealBin);
+
+$ENV{"LC_ALL"} = "C";
+my $SCRIPTS_ROOTDIR = $RealBin;
+if ($SCRIPTS_ROOTDIR eq '') {
+    $SCRIPTS_ROOTDIR = dirname(__FILE__);  # fallback; dirname comes from File::Basename
+}
+$SCRIPTS_ROOTDIR =~ s/\/training$//;  # strip a trailing /training component
+
+my ($binarizer, $input_config, $output_config);
+my $opt_hierarchical = 0;
+$binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTable";  # default; override with -Binarizer
+GetOptions(
+  "Hierarchical" => \$opt_hierarchical,
+  "Binarizer=s" => \$binarizer
+) or exit(1);
+
+$input_config = shift;
+$output_config = shift;
+
+if (!defined $input_config || !defined $output_config) {
+  print STDERR "usage: binarize-model.perl input-config output-config [-Hierarchical] [-Binarizer binarizer]\n";
+  exit 1;
+}
+
+my $hierarchical = "";
+$hierarchical = "-Hierarchical" if $opt_hierarchical;
+my $targetdir = "$output_config.tables";
+
+safesystem("$RealBin/filter-model-given-input.pl  $targetdir $input_config /dev/null $hierarchical -nofilter -Binarizer $binarizer") || die "binarising failed";
+safesystem("rm -f $output_config; ln -s $targetdir/moses.ini $output_config") || die "failed to link new ini file";
+
+# Echo a command to STDERR and run it with system(). Returns true on exit status
+# 0; exits the script if the command cannot start or is killed by a signal.
+#FIXME: Why isn't this in a module?
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+      print STDERR "Failed to execute: @_\n  $!\n";
+      exit(1);
+  }
+  if ($? & 127) {
+      printf STDERR "Execution of: %s\n  died with signal %d, %s coredump\n",
+          "@_", ($? & 127), ($? & 128) ? 'with' : 'without';  # command passed as %s: it may contain '%'
+      exit(1);
+  }
+  my $exitcode = $? >> 8;
+  print STDERR "Exit code: $exitcode\n" if $exitcode;
+  return ! $exitcode;
+}
+
+
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index d994fbcef..2452ca40c 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -38,9 +38,11 @@ my $opt_hierarchical = 0;
 my $binarizer = undef;
 my $opt_min_non_initial_rule_count = undef;
 my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
+my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice
 
 GetOptions(
     "gzip!" => \$opt_gzip,
+    "filter!" => \$opt_filter,
     "Hierarchical" => \$opt_hierarchical,
     "Binarizer=s" => \$binarizer,
     "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count
@@ -165,51 +167,53 @@ close(INI);
 close(INI_OUT);
 
 my %TMP_INPUT_FILENAME;
+my %PHRASE_USED;
 
-if ($opt_hierarchical)
-{
-    # Write a separate, temporary input file for each combination of source
-    # factors
-    foreach my $key (keys %CONSIDER_FACTORS) {
-        my $filename = "$dir/input-$key";
-        open(FILEHANDLE,">$filename") or die "Can't open $filename for writing";
-        $TMP_INPUT_FILENAME{$key} = $filename;
-        my @FACTOR = split(/,/, $key);
-        open(PIPE,"$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |");
-        while (my $line = <PIPE>) {
-            print FILEHANDLE $line
-        }
-        close(FILEHANDLE);
-    }
-}
+if ($opt_filter) {
+  if ($opt_hierarchical)
+  {
+      # Write a separate, temporary input file for each combination of source
+      # factors
+      foreach my $key (keys %CONSIDER_FACTORS) {
+          my $filename = "$dir/input-$key";
+          open(FILEHANDLE,">$filename") or die "Can't open $filename for writing";
+          $TMP_INPUT_FILENAME{$key} = $filename;
+          my @FACTOR = split(/,/, $key);
+          open(PIPE,"$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |");
+          while (my $line = <PIPE>) {
+              print FILEHANDLE $line
+          }
+          close(FILEHANDLE);
+      }
+  }
 
-my %PHRASE_USED;
-if (!$opt_hierarchical) {
-    # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
-    open(INPUT,mk_open_string($input)) or die "Can't read $input";
-    while(my $line = <INPUT>) {
-        chomp($line);
-        my @WORD = split(/ +/,$line);
-        for(my $i=0;$i<=$#WORD;$i++) {
-            for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
-                foreach (keys %CONSIDER_FACTORS) {
-                    my @FACTOR = split(/,/);
-                    my $phrase = "";
-                    for(my $k=$i;$k<=$i+$j;$k++) {
-                        my @WORD_FACTOR = split(/\|/,$WORD[$k]);
-                        for(my $f=0;$f<=$#FACTOR;$f++) {
-                            $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
-                        }
-                        chop($phrase);
-                        $phrase .= " ";
-                    }
-                    chop($phrase);
-                    $PHRASE_USED{$_}{$phrase}++;
-                }
-            }
-        }
-    }
-    close(INPUT);
+  if (!$opt_hierarchical) {
+      # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
+      open(INPUT,mk_open_string($input)) or die "Can't read $input";
+      while(my $line = <INPUT>) {
+          chomp($line);
+          my @WORD = split(/ +/,$line);
+          for(my $i=0;$i<=$#WORD;$i++) {
+              for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
+                  foreach (keys %CONSIDER_FACTORS) {
+                      my @FACTOR = split(/,/);
+                      my $phrase = "";
+                      for(my $k=$i;$k<=$i+$j;$k++) {
+                          my @WORD_FACTOR = split(/\|/,$WORD[$k]);
+                          for(my $f=0;$f<=$#FACTOR;$f++) {
+                              $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
+                          }
+                          chop($phrase);
+                          $phrase .= " ";
+                      }
+                      chop($phrase);
+                      $PHRASE_USED{$_}{$phrase}++;
+                  }
+              }
+          }
+      }
+      close(INPUT);
+  }
 }
 
 sub mk_open_string {
@@ -235,41 +239,57 @@ for(my $i=0;$i<=$#TABLE;$i++) {
     my $factors = $TABLE_FACTORS[$i];
     my $new_file = $TABLE_NEW_NAME[$i];
     print STDERR "filtering $file -> $new_file...\n";
+    my $cat_or_zcat = "cat"; # How to open filtered corpus
+    if (!$opt_filter) {
+      # check if original file was gzipped
+      if ($file !~ /\.gz$/ && -e "$file.gz") {
+        $file .= ".gz";
+      }
+      if ($file =~ /\.gz$/) {
+        $cat_or_zcat = $ZCAT;
+      }
+      safesystem("ln -s $file $new_file");
+    } else {
 
-    my $openstring = mk_open_string($file);
+      my $openstring = mk_open_string($file);
 
-    my $new_openstring;
-    if ($new_file =~ /\.gz$/) {
-      $new_openstring = "| gzip -c > $new_file";
-    } else {
-      $new_openstring = ">$new_file";
-    }
+      my $new_openstring;
+      if ($new_file =~ /\.gz$/) {
+        $new_openstring = "| gzip -c > $new_file";
+      } else {
+        $new_openstring = ">$new_file";
+      }
 
-    open(FILE_OUT,$new_openstring) or die "Can't write to $new_openstring";
+      #FIXME: Shouldn't need to make a text version of the filtered table if we're going
+      # to binarise it.
 
-    if ($opt_hierarchical) {
-        my $tmp_input = $TMP_INPUT_FILENAME{$factors};
-        my $options = "";
-        $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
-        open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |");
-        while (my $line = <PIPE>) {
-            print FILE_OUT $line
-        }
-        close(FILEHANDLE);
-    } else {
-        open(FILE,$openstring) or die "Can't open '$openstring'";
-        while(my $entry = <FILE>) {
-            my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
-            $foreign =~ s/ $//;
-            if (defined($PHRASE_USED{$factors}{$foreign})) {
-                print FILE_OUT $entry;
-                $used++;
-            }
-            $total++;
-        }
-        close(FILE);
-        die "No phrases found in $file!" if $total == 0;
-        printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
+      open(FILE_OUT,$new_openstring) or die "Can't write to $new_openstring";
+
+      if ($opt_hierarchical) {  # filter via the external filter-rule-table.py script
+          my $tmp_input = $TMP_INPUT_FILENAME{$factors};
+          my $options = "";
+          $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
+          open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |");
+          while (my $line = <PIPE>) {
+              print FILE_OUT $line
+          }
+          close(PIPE);  # close the handle that was actually read from above
+      } else {  # keep only entries whose source side is recorded in %PHRASE_USED
+          open(FILE,$openstring) or die "Can't open '$openstring'";
+          while(my $entry = <FILE>) {
+              my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
+              $foreign =~ s/ $//;
+              if (defined($PHRASE_USED{$factors}{$foreign})) {
+                  print FILE_OUT $entry;
+                  $used++;
+              }
+              $total++;
+          }
+          close(FILE);
+          die "No phrases found in $file!" if $total == 0;
+          printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
+      }
+      close(FILE_OUT);
     }
 
     if(defined($binarizer)) {
@@ -285,11 +305,11 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         # ... phrase translation model
         elsif ($binarizer =~ /processPhraseTableMin/) {
           #compact phrase table
-          my $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
+          my $cmd = "$cat_or_zcat $new_file | LC_ALL=C sort -T $dir > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
           print STDERR $cmd."\n";
           print STDERR `$cmd`;
         } else { 
-          my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
+          my $cmd = "$cat_or_zcat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
           print STDERR $cmd."\n";
           print STDERR `$cmd`;
         }
@@ -300,7 +320,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
         $lexbin =~ s/PhraseTable/LexicalTable/;
         my $cmd;
         if ($lexbin =~ /processLexicalTableMin/) {
-          $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted;  $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
+          $cmd = "$cat_or_zcat $new_file | LC_ALL=C sort -T $dir > $new_file.sorted;  $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
         } else {
           $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
           $cmd = "$lexbin -in $new_file -out $new_file";
@@ -310,7 +330,6 @@ for(my $i=0;$i<=$#TABLE;$i++) {
       }
     }
 
-    close(FILE_OUT);
 }
 
 if ($opt_hierarchical)
author	Barry Haddow <barry.haddow@gmail.com>	2013-05-02 18:15:52 +0400
committer	Barry Haddow <barry.haddow@gmail.com>	2013-05-02 18:15:52 +0400
commit	5eebb9538ef25824504214852acc38bc4a93fb2e (patch)
tree	a41e99097d7f5aa64c3d18c6119c9a580264aa81 /scripts/training
parent	5638aa6a32ce0e6b0f68c0f6d073e40adf19124d (diff)