Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Barry Haddow <barry.haddow@gmail.com> 2013-05-02 18:15:52 +0400
committer: Barry Haddow <barry.haddow@gmail.com> 2013-05-02 18:15:52 +0400
commit: 5eebb9538ef25824504214852acc38bc4a93fb2e (patch)
tree: a41e99097d7f5aa64c3d18c6119c9a580264aa81 /scripts/training
parent: 5638aa6a32ce0e6b0f68c0f6d073e40adf19124d (diff)
Enable skipping of filtering in EMS
Use 'binarize-all = path-to-binarize-model.perl'
Diffstat (limited to 'scripts/training')
-rwxr-xr-x scripts/training/binarize-model.perl 62
-rwxr-xr-x scripts/training/filter-model-given-input.pl 175
2 files changed, 159 insertions, 78 deletions
diff --git a/scripts/training/binarize-model.perl b/scripts/training/binarize-model.perl
new file mode 100755
index 000000000..15ad23ac4
--- /dev/null
+++ b/scripts/training/binarize-model.perl
@@ -0,0 +1,62 @@
+#!/usr/bin/perl -w
+
+#
+# Binarize a Moses model
+#
+
+use strict;
+
+use Getopt::Long "GetOptions";
+use FindBin qw($RealBin);
+
+$ENV{"LC_ALL"} = "C";    # inherited by the child processes started below
+my $SCRIPTS_ROOTDIR = $RealBin;
+if ($SCRIPTS_ROOTDIR eq '') {
+    # dirname() lives in File::Basename, which is not imported above; load it on demand.
+    require File::Basename;
+    $SCRIPTS_ROOTDIR = File::Basename::dirname(__FILE__);
+}
+$SCRIPTS_ROOTDIR =~ s/\/training$//;
+
+# Options; the default binarizer path is resolved relative to the scripts root.
+my $opt_hierarchical = 0;
+my $binarizer = "$SCRIPTS_ROOTDIR/../bin/processPhraseTable";
+GetOptions(
+    "Hierarchical" => \$opt_hierarchical,
+    "Binarizer=s" => \$binarizer
+) or exit(1);
+
+# Positional arguments: the ini file to read and the ini file to create.
+my ($input_config, $output_config) = @ARGV;
+if (!defined $input_config || !defined $output_config) {
+    print STDERR "usage: binarize-model.perl input-config output-config [-Hierarchical] [-Binarizer binarizer]\n";
+    exit 1;
+}
+# Binarise via the filter script's -nofilter mode (output directory: $targetdir), then link
+# $output_config to the moses.ini inside it. List-form calls avoid shell word-splitting.
+my $targetdir = "$output_config.tables";
+my @hierarchical = $opt_hierarchical ? ("-Hierarchical") : ();
+safesystem("$RealBin/filter-model-given-input.pl", $targetdir, $input_config, "/dev/null", @hierarchical, "-nofilter", "-Binarizer", $binarizer) || die "binarising failed";
+safesystem("rm", "-f", $output_config) && safesystem("ln", "-s", "$targetdir/moses.ini", $output_config) || die "failed to link new ini file";
+
+#FIXME: Why isn't this in a module?
+# Echo a command to STDERR and run it with system(). Returns true on exit status 0,
+# false on a non-zero status; exits with 1 if it cannot start or dies from a signal.
+sub safesystem {
+    print STDERR "Executing: @_\n";
+    system(@_);
+    if ($? == -1) {
+        print STDERR "Failed to execute: @_\n $!\n";
+        exit(1);
+    }
+    if (my $signal = $? & 127) {
+        # The command goes in as a %s argument so a '%' inside it is not read as a format directive.
+        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n", "@_", $signal, ($? & 128) ? 'with' : 'without';
+        exit(1);
+    }
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+}
+
+
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl
index d994fbcef..2452ca40c 100755
--- a/scripts/training/filter-model-given-input.pl
+++ b/scripts/training/filter-model-given-input.pl
@@ -38,9 +38,11 @@ my $opt_hierarchical = 0;
my $binarizer = undef;
my $opt_min_non_initial_rule_count = undef;
my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats)
+my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice
GetOptions(
"gzip!" => \$opt_gzip,
+ "filter!" => \$opt_filter,
"Hierarchical" => \$opt_hierarchical,
"Binarizer=s" => \$binarizer,
"MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count
@@ -165,51 +167,53 @@ close(INI);
close(INI_OUT);
my %TMP_INPUT_FILENAME;
+my %PHRASE_USED;
-if ($opt_hierarchical)
-{
- # Write a separate, temporary input file for each combination of source
- # factors
- foreach my $key (keys %CONSIDER_FACTORS) {
- my $filename = "$dir/input-$key";
- open(FILEHANDLE,">$filename") or die "Can't open $filename for writing";
- $TMP_INPUT_FILENAME{$key} = $filename;
- my @FACTOR = split(/,/, $key);
- open(PIPE,"$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |");
- while (my $line = <PIPE>) {
- print FILEHANDLE $line
- }
- close(FILEHANDLE);
- }
-}
+if ($opt_filter) {
+ if ($opt_hierarchical)
+ {
+ # Write a separate, temporary input file for each combination of source
+ # factors
+ foreach my $key (keys %CONSIDER_FACTORS) {
+ my $filename = "$dir/input-$key";
+ open(FILEHANDLE,">$filename") or die "Can't open $filename for writing";
+ $TMP_INPUT_FILENAME{$key} = $filename;
+ my @FACTOR = split(/,/, $key);
+ open(PIPE,"$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |");
+ while (my $line = <PIPE>) {
+ print FILEHANDLE $line
+ }
+ close(FILEHANDLE);
+ }
+ }
-my %PHRASE_USED;
-if (!$opt_hierarchical) {
- # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
- open(INPUT,mk_open_string($input)) or die "Can't read $input";
- while(my $line = <INPUT>) {
- chomp($line);
- my @WORD = split(/ +/,$line);
- for(my $i=0;$i<=$#WORD;$i++) {
- for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
- foreach (keys %CONSIDER_FACTORS) {
- my @FACTOR = split(/,/);
- my $phrase = "";
- for(my $k=$i;$k<=$i+$j;$k++) {
- my @WORD_FACTOR = split(/\|/,$WORD[$k]);
- for(my $f=0;$f<=$#FACTOR;$f++) {
- $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
- }
- chop($phrase);
- $phrase .= " ";
- }
- chop($phrase);
- $PHRASE_USED{$_}{$phrase}++;
- }
- }
- }
- }
- close(INPUT);
+ if (!$opt_hierarchical) {
+ # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
+ open(INPUT,mk_open_string($input)) or die "Can't read $input";
+ while(my $line = <INPUT>) {
+ chomp($line);
+ my @WORD = split(/ +/,$line);
+ for(my $i=0;$i<=$#WORD;$i++) {
+ for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
+ foreach (keys %CONSIDER_FACTORS) {
+ my @FACTOR = split(/,/);
+ my $phrase = "";
+ for(my $k=$i;$k<=$i+$j;$k++) {
+ my @WORD_FACTOR = split(/\|/,$WORD[$k]);
+ for(my $f=0;$f<=$#FACTOR;$f++) {
+ $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
+ }
+ chop($phrase);
+ $phrase .= " ";
+ }
+ chop($phrase);
+ $PHRASE_USED{$_}{$phrase}++;
+ }
+ }
+ }
+ }
+ close(INPUT);
+ }
}
sub mk_open_string {
@@ -235,41 +239,57 @@ for(my $i=0;$i<=$#TABLE;$i++) {
my $factors = $TABLE_FACTORS[$i];
my $new_file = $TABLE_NEW_NAME[$i];
print STDERR "filtering $file -> $new_file...\n";
+ my $cat_or_zcat = "cat"; # How to open filtered corpus
+ if (!$opt_filter) {
+ # check if original file was gzipped
+ if ($file !~ /\.gz$/ && -e "$file.gz") {
+ $file .= ".gz";
+ }
+ if ($file =~ /\.gz$/) {
+ $cat_or_zcat = $ZCAT;
+ }
+ safesystem("ln -s $file $new_file");
+ } else {
- my $openstring = mk_open_string($file);
+ my $openstring = mk_open_string($file);
- my $new_openstring;
- if ($new_file =~ /\.gz$/) {
- $new_openstring = "| gzip -c > $new_file";
- } else {
- $new_openstring = ">$new_file";
- }
+ my $new_openstring;
+ if ($new_file =~ /\.gz$/) {
+ $new_openstring = "| gzip -c > $new_file";
+ } else {
+ $new_openstring = ">$new_file";
+ }
- open(FILE_OUT,$new_openstring) or die "Can't write to $new_openstring";
+ #FIXME: Shouldn't need to make a text version of the filtered table if we're going
+ # to binarise it.
- if ($opt_hierarchical) {
- my $tmp_input = $TMP_INPUT_FILENAME{$factors};
- my $options = "";
- $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
- open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |");
- while (my $line = <PIPE>) {
- print FILE_OUT $line
- }
- close(FILEHANDLE);
- } else {
- open(FILE,$openstring) or die "Can't open '$openstring'";
- while(my $entry = <FILE>) {
- my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
- $foreign =~ s/ $//;
- if (defined($PHRASE_USED{$factors}{$foreign})) {
- print FILE_OUT $entry;
- $used++;
- }
- $total++;
- }
- close(FILE);
- die "No phrases found in $file!" if $total == 0;
- printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
+ open(FILE_OUT,$new_openstring) or die "Can't write to $new_openstring";
+
+ if ($opt_hierarchical) {
+ my $tmp_input = $TMP_INPUT_FILENAME{$factors};
+ my $options = "";
+ $options .= "--min-non-initial-rule-count=$opt_min_non_initial_rule_count" if defined($opt_min_non_initial_rule_count);
+ open(PIPE,"$openstring $SCRIPTS_ROOTDIR/training/filter-rule-table.py $options $tmp_input |");
+ while (my $line = <PIPE>) {
+ print FILE_OUT $line
+ }
+ close(FILEHANDLE);
+ } else {
+ open(FILE,$openstring) or die "Can't open '$openstring'";
+ while(my $entry = <FILE>) {
+ my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
+ $foreign =~ s/ $//;
+ if (defined($PHRASE_USED{$factors}{$foreign})) {
+ print FILE_OUT $entry;
+ $used++;
+ }
+ $total++;
+ }
+ close(FILE);
+ die "No phrases found in $file!" if $total == 0;
+ printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
+ }
+ close(FILE_OUT);
}
if(defined($binarizer)) {
@@ -285,11 +305,11 @@ for(my $i=0;$i<=$#TABLE;$i++) {
# ... phrase translation model
elsif ($binarizer =~ /processPhraseTableMin/) {
#compact phrase table
- my $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
+ my $cmd = "$cat_or_zcat $new_file | LC_ALL=C sort -T $dir > $new_file.sorted; $binarizer -in $new_file.sorted -out $new_file -nscores $TABLE_WEIGHTS[$i]; rm $new_file.sorted";
print STDERR $cmd."\n";
print STDERR `$cmd`;
} else {
- my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
+ my $cmd = "$cat_or_zcat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
print STDERR $cmd."\n";
print STDERR `$cmd`;
}
@@ -300,7 +320,7 @@ for(my $i=0;$i<=$#TABLE;$i++) {
$lexbin =~ s/PhraseTable/LexicalTable/;
my $cmd;
if ($lexbin =~ /processLexicalTableMin/) {
- $cmd = "LC_ALL=C sort -T $dir $new_file > $new_file.sorted; $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
+ $cmd = "$cat_or_zcat $new_file | LC_ALL=C sort -T $dir > $new_file.sorted; $lexbin -in $new_file.sorted -out $new_file; rm $new_file.sorted";
} else {
$lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options
$cmd = "$lexbin -in $new_file -out $new_file";
@@ -310,7 +330,6 @@ for(my $i=0;$i<=$#TABLE;$i++) {
}
}
- close(FILE_OUT);
}
if ($opt_hierarchical)