support for Chris Dyer's fast-align; bug fix with sparse word translations feature; threshold pruning in filter

author: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-01 22:20:05 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2013-05-01 22:20:05 +0400
commit: cd8915647b74cd60cb259c2ec8ba5230970389f5 (patch)
tree: 6c4357ecb255b3d480a9c729fe20c096512921cb /scripts/ems
parent: 8a1e944bb428a0af9f6c82c26e5633361ce4052c (diff)
5 files changed, 181 insertions, 9 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 769fc0ebf..fd50a3cd7 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -400,6 +400,29 @@ mml-filter-before-wa
 	rerun-on-change: mml-filter-corpora mml-before-wa
 	default-name: training/corpus-mml
 	template: $moses-script-dir/ems/support/mml-filter.perl -in IN -out OUT -score IN1 -domain IN2 -input-extension $input-extension -output-extension $output-extension $mml-before-wa
+prepare-data-fast-align
+	in: corpus-mml-prefilter=OR=corpus
+	out: prepared-data-fast-align
+	default-name: prepared
+fast-align
+	in: prepared-data-fast-align
+	out: fast-alignment
+	rerun-on-change: fast-align-settings
+	template: $external-bin-dir/fast_align -i IN $fast-align-settings > OUT
+	default-name: fast-align
+fast-align-inverse
+	in: prepared-data-fast-align
+	out: fast-alignment-inverse
+	rerun-on-change: fast-align-settings
+	template: $external-bin-dir/fast_align -i IN -r $fast-align-settings > OUT
+	default-name: fast-align-inverse
+symmetrize-fast-align
+	in: fast-alignment fast-alignment-inverse corpus-mml-prefilter=OR=corpus
+	out: word-alignment
+	ignore-unless: fast-align-settings
+	rerun-on-change: alignment-symmetrization-method
+	template: $moses-script-dir/ems/support/symmetrize-fast-align.perl IN IN1 IN2.$input-extension IN2.$output-extension OUT $alignment-symmetrization-method $moses-src-dir/bin/symal
+	default-name: model/aligned
 prepare-data
 	in: corpus-mml-prefilter=OR=corpus
 	out: prepared-data
@@ -441,7 +464,7 @@ process-berkeley
 symmetrize-giza
 	in: giza-alignment giza-alignment-inverse
 	out: word-alignment
-	ignore-if: use-berkeley
+	ignore-if: use-berkeley fast-align-settings
 	rerun-on-change: alignment-symmetrization-method training-options script
 	default-name: model/aligned
 	error: skip=<[1-9]
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index ff619b0a3..d55913a3c 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -940,6 +940,9 @@ sub define_step {
         elsif ($DO_STEP[$i] eq 'TRAINING:prepare-data') {
             &define_training_prepare_data($i);
         }
+        elsif ($DO_STEP[$i] eq 'TRAINING:prepare-data-fast-align') {
+            &define_training_prepare_data_fast_align($i);
+        }
         elsif ($DO_STEP[$i] eq 'TRAINING:run-giza') {
             &define_training_run_giza($i);
         }
@@ -1842,6 +1845,25 @@ sub write_selectBestMiraWeights {
     system("chmod u+x $script_filename");
 }
 
+# Define the step that runs prepare-fast-align.perl on both corpus sides and redirects its output to the step's output file.
+sub define_training_prepare_data_fast_align {
+    my ($step_id) = @_;
+    my ($out_file, $corpus_stem) = &get_output_and_input($step_id);
+    my $script_dir = &check_and_get("GENERAL:moses-script-dir");
+    my $src_ext = &check_backoff_and_get("TRAINING:input-extension");
+    my $tgt_ext = &check_backoff_and_get("TRAINING:output-extension");
+
+    # an alignment factor definition is only passed on when TRAINING:input-factors is set
+    my $factor_arg = "";
+    if (&backoff_and_get("TRAINING:input-factors")) {
+      my %SRC_FACTOR_ID = &get_factor_id("input");
+      my %TGT_FACTOR_ID = &get_factor_id("output");
+      $factor_arg = &encode_factor_definition("alignment-factors",\%SRC_FACTOR_ID,\%TGT_FACTOR_ID);
+    }
+    my @CMD = ("$script_dir/ems/support/prepare-fast-align.perl", "$corpus_stem.$src_ext", "$corpus_stem.$tgt_ext", $factor_arg, ">", $out_file);
+    &create_step($step_id, join(" ", @CMD));
+}
+
 sub define_training_prepare_data {
     my ($step_id) = @_;
 
@@ -1999,13 +2021,17 @@ sub define_training_build_ttable {
+# Turn the TRAINING:domain-features setting into the matching
+# "-score-options ..." option string for the given domains.
 sub define_domain_feature_score_option {
     my ($domains) = @_;
     my $spec = &backoff_and_get("TRAINING:domain-features");
-    my $method;
+    my ($method,$restricted_to_table) = ("","");
     $method = "Indicator" if $spec =~ /indicator/;
     $method = "Ratio" if $spec =~ /ratio/;
     $method = "Subset" if $spec =~ /subset/;
+    $restricted_to_table = $1 if $spec =~ /( table \S+)/;
-    die("ERROR: faulty TRAINING:domain-features spec (no method): $spec\n") unless defined($method);
+    # $method is initialized to "" and therefore always defined: test for emptiness instead
+    die("ERROR: faulty TRAINING:domain-features spec (no method): $spec\n") if $method eq "";
     if ($spec =~ /sparse/) {
-      return "-sparse-translation-table -score-options '--SparseDomain$method $domains' -additional-ini '<br>[report-sparse-features]<br>stm<br><br>' ";
+      return "-sparse-translation-table -score-options '--SparseDomain$method $domains$restricted_to_table' -additional-ini '<br>[report-sparse-features]<br>stm<br><br>' ";
     }
     else {
       return "-score-options '--Domain$method $domains' ";
diff --git a/scripts/ems/support/build-sparse-lexical-features.perl b/scripts/ems/support/build-sparse-lexical-features.perl
index ab8627d20..6d383c936 100755
--- a/scripts/ems/support/build-sparse-lexical-features.perl
+++ b/scripts/ems/support/build-sparse-lexical-features.perl
@@ -15,36 +15,49 @@ my %ALREADY;
 
 foreach my $feature_spec (split(/,\s*/,$specification)) {
   my @SPEC = split(/\s+/,$feature_spec);
+
+  my $factor = ($SPEC[0] eq 'word-translation') ? "0-0" : "0";
+  $factor = $1 if $feature_spec =~ / factor ([\d\-]+)/; 
+
   if ($SPEC[0] eq 'target-word-insertion') {
     if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
       my $file = &create_top_words($output_extension, $SPEC[2]);
-      $ini .= "[target-word-insertion-feature]\n0 $file\n\n";
-      $report .= "twi\n";
+      $ini .= "[target-word-insertion-feature]\n$factor $file\n\n";
+    }
+    elsif ($SPEC[1] eq 'all') {
+      $ini .= "[target-word-insertion-feature]\n$factor\n\n";
     }
     else {
       die("ERROR: Unknown parameter specification in '$feature_spec'\n");
     }
+    $report .= "twi\n";
   }
   elsif ($SPEC[0] eq 'source-word-deletion') {
     if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
       my $file = &create_top_words($input_extension, $SPEC[2]);
-      $ini .= "[source-word-deletion-feature]\n0 $file\n\n";
-      $report .= "swd\n";
+      $ini .= "[source-word-deletion-feature]\n$factor $file\n\n";
+    }
+    elsif ($SPEC[1] eq 'all') {
+      $ini .= "[source-word-deletion-feature]\n$factor\n\n";
     }
     else {
       die("ERROR: Unknown parameter specification in '$feature_spec'\n");
     }
+    $report .= "swd\n";
   }
   elsif ($SPEC[0] eq 'word-translation') {
     if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/ && $SPEC[3] =~ /^\d+$/) {
       my $file_in  = &create_top_words($input_extension,  $SPEC[2]);
       my $file_out = &create_top_words($output_extension, $SPEC[3]);
-      $ini .= "[word-translation-feature]\n0-0 0 0 0 $file_in $file_out\n\n";
-      $report .= "wt\n";
+      $ini .= "[word-translation-feature]\n0-0 1 0 0 0 0 $file_in $file_out\n\n";
+    }
+    elsif ($SPEC[1] eq 'all') {
+      $ini .= "[word-translation-feature]\n$factor 1 0 0\n\n";
     }
     else {
       die("ERROR: Unknown parameter specification in '$feature_spec'\n");
     }
+    $report .= "wt\n";
   }
   elsif ($SPEC[0] eq 'phrase-length') {
     $ini .= "[phrase-length-feature]\ntrue\n\n";
diff --git a/scripts/ems/support/prepare-fast-align.perl b/scripts/ems/support/prepare-fast-align.perl
new file mode 100755
index 000000000..4bedbe92b
--- /dev/null
+++ b/scripts/ems/support/prepare-fast-align.perl
@@ -0,0 +1,67 @@
+#!/usr/bin/perl -w
+
+# Merge a sentence-aligned parallel corpus into one stream with a
+# "source ||| target" line per sentence pair, written to STDOUT.
+#
+# Usage: prepare-fast-align.perl source-file target-file [alignment-factors]
+#
+# alignment-factors has the form SRC-TGT, where SRC and TGT are comma-separated
+# factor indices (e.g. "0-0" or "0,1-0"). If given, every "|"-separated factored
+# word is reduced to the listed factors; otherwise lines are copied unchanged.
+
+use strict;
+
+my ($source_file,$target_file,$alignment_factors) = @ARGV;
+die("ERROR: syntax is prepare-fast-align.perl source-file target-file [alignment-factors]\n") unless defined($target_file);
+
+# initialize data structures for factors
+# (an empty factor specification is treated like a missing one)
+my $use_factors = defined($alignment_factors) && $alignment_factors ne "";
+my (@SOURCE_FACTOR,@TARGET_FACTOR);
+if ($use_factors) {
+  my ($source,$target) = split(/\-/,$alignment_factors);
+  die("ERROR: alignment factors must have the form SRC-TGT, got '$alignment_factors'\n") unless defined($source) && defined($target);
+  @SOURCE_FACTOR = split(/,/,$source);
+  @TARGET_FACTOR = split(/,/,$target);
+}
+
+# loop through corpus file
+open(my $SOURCE,"<",$source_file) || die("ERROR: can't open source corpus file '$source_file': $!\n");
+open(my $TARGET,"<",$target_file) || die("ERROR: can't open target corpus file '$target_file': $!\n");
+while(my $source = <$SOURCE>) {
+  my $target = <$TARGET>;
+  die("ERROR: target corpus file '$target_file' has fewer lines than source corpus file '$source_file'\n") unless defined($target);
+  # chomp, not chop: the last line of a file may lack the trailing newline
+  chomp($source);
+  chomp($target);
+
+  # no factors
+  if (!$use_factors) {
+    print "$source ||| $target\n";
+    next;
+  }
+
+  # split(' ',...) skips leading whitespace, so no empty pseudo-word is produced
+  foreach (split(' ',$source)) {
+    my @SOURCE_WORD = split(/\|/);
+    for(my $i=0; $i<scalar(@SOURCE_FACTOR); $i++) {
+      print "|" if $i;
+      print "$SOURCE_WORD[$SOURCE_FACTOR[$i]]";
+    }
+    print " ";
+  }
+  print "|||";
+  foreach (split(' ',$target)) {
+    print " ";
+    my @TARGET_WORD = split(/\|/);
+    for(my $i=0; $i<scalar(@TARGET_FACTOR); $i++) {
+      print "|" if $i;
+      print "$TARGET_WORD[$TARGET_FACTOR[$i]]";
+    }
+  }
+  print "\n";
+}
+die("ERROR: target corpus file '$target_file' has more lines than source corpus file '$source_file'\n") unless eof($TARGET);
+close($TARGET);
+close($SOURCE);
+
diff --git a/scripts/ems/support/symmetrize-fast-align.perl b/scripts/ems/support/symmetrize-fast-align.perl
new file mode 100755
index 000000000..245f2e245
--- /dev/null
+++ b/scripts/ems/support/symmetrize-fast-align.perl
@@ -0,0 +1,74 @@
+#!/usr/bin/perl -w
+
+# Symmetrize two fast_align word alignments (direct and inverse) with symal.
+#
+# The alignment files are read in parallel with the source and target corpus.
+# Each sentence pair is written as a three-line record ("1", then one line per
+# alignment direction) to a pipe into symal, whose output is redirected to
+# <out-stem>.<symmetrization-method>.
+
+use strict;
+
+die("ERROR: syntax is symmetrize-fast-align.perl direct-alignment inverse-alignment source-file target-file out-stem symmetrization-method symal\n") unless scalar(@ARGV) == 7;
+
+my ($direct_file,$inverse_file,$source_file,$target_file,$out_stem,$symmetrization_method,$symal) = @ARGV;
+
+# symal options (from train-model.perl)
+my ($__symal_a,$__symal_d,$__symal_f,$__symal_b) = ("","no","no","no");
+$__symal_a = "union" if $symmetrization_method eq 'union';
+$__symal_a = "intersect" if $symmetrization_method=~ /intersect/;
+$__symal_a = "grow" if $symmetrization_method=~ /grow/;
+$__symal_a = "srctotgt" if $symmetrization_method=~ /srctotgt/;
+$__symal_a = "tgttosrc" if $symmetrization_method=~ /tgttosrc/;
+$__symal_d = "yes" if $symmetrization_method=~ /diag/;
+$__symal_f = "yes" if $symmetrization_method=~ /final/;
+$__symal_b = "yes" if $symmetrization_method=~ /final-and/;
+my $symal_options = "-alignment=\"$__symal_a\" -diagonal=\"$__symal_d\" -final=\"$__symal_f\" -both=\"$__symal_b\"";
+
+# open files
+open(my $DIRECT,"<",$direct_file)   || die("ERROR: can't open direct alignment file '$direct_file': $!");
+open(my $INVERSE,"<",$inverse_file) || die("ERROR: can't open inverse alignment file '$inverse_file': $!");
+open(my $SOURCE,"<",$source_file)   || die("ERROR: can't open source corpus file '$source_file': $!");
+open(my $TARGET,"<",$target_file)   || die("ERROR: can't open target corpus file '$target_file': $!");
+open(my $OUT,"|-","$symal $symal_options > $out_stem.$symmetrization_method") || die("ERROR: can't start symal command '$symal': $!");
+
+# loop through sentence pairs and bi-directional alignments
+while(my $direct = <$DIRECT>) {
+  my $inverse = <$INVERSE>;
+  my $source = <$SOURCE>;
+  my $target = <$TARGET>;
+  die("ERROR: inverse alignment or corpus files have fewer lines than direct alignment file '$direct_file'\n") unless defined($inverse) && defined($source) && defined($target);
+
+  print {$OUT} "1\n";
+  &convert($target,$direct,0);
+  &convert($source,$inverse,1);
+}
+close($TARGET);
+close($SOURCE);
+close($INVERSE);
+close($DIRECT);
+# closing the pipe waits for symal; close() fails if symal exited with an error
+close($OUT) || die("ERROR: symal command '$symal' failed".($! ? ": $!" : " with exit status ".($? >> 8))."\n");
+
+# Write one line of a symal record for $text: "<word count> <words> #", then for
+# every word 1 + the index it is paired with in $alignment, or 0 if it has none.
+# $alignment holds "i-j" points; $text words are indexed by j, or by i if $is_inverse.
+sub convert {
+  my ($text,$alignment,$is_inverse) = @_;
+  chomp($text);
+  chomp($alignment);
+  $text =~ s/^\s+//;
+  $text =~ s/\s+$//;
+  my @TEXT = split(' ',$text);
+  print {$OUT} scalar(@TEXT)." ".$text." #";
+  my %ALIGNMENT;
+  foreach (split(' ',$alignment)) {
+    my ($first,$second) = split(/\-/,$_);
+    my ($position,$aligned_to) = $is_inverse ? ($first,$second) : ($second,$first);
+    $ALIGNMENT{$position} = $aligned_to+1;
+  }
+  for(my $i=0;$i<@TEXT;$i++) {
+    print {$OUT} " ".(defined($ALIGNMENT{$i}) ? $ALIGNMENT{$i} : 0);
+  }
+  print {$OUT} "\n";
+}
author	phikoehn <pkoehn@inf.ed.ac.uk>	2013-05-01 22:20:05 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2013-05-01 22:20:05 +0400
commit	cd8915647b74cd60cb259c2ec8ba5230970389f5 (patch)
tree	6c4357ecb255b3d480a9c729fe20c096512921cb /scripts/ems
parent	8a1e944bb428a0af9f6c82c26e5633361ce4052c (diff)