diff options
Diffstat (limited to 'scripts/training/filter-model-given-input.pl')
-rwxr-xr-x | scripts/training/filter-model-given-input.pl | 788 |
1 files changed, 435 insertions, 353 deletions
diff --git a/scripts/training/filter-model-given-input.pl b/scripts/training/filter-model-given-input.pl index 13ca6910e..a16aeac4a 100755 --- a/scripts/training/filter-model-given-input.pl +++ b/scripts/training/filter-model-given-input.pl @@ -18,11 +18,12 @@ use FindBin qw($RealBin); use Getopt::Long; my $SCRIPTS_ROOTDIR; -if (defined($ENV{"SCRIPTS_ROOTDIR"})) { +if ( defined( $ENV{"SCRIPTS_ROOTDIR"} ) ) { $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"}; -} else { +} +else { $SCRIPTS_ROOTDIR = $RealBin; - if ($SCRIPTS_ROOTDIR eq '') { + if ( $SCRIPTS_ROOTDIR eq '' ) { $SCRIPTS_ROOTDIR = dirname(__FILE__); } $SCRIPTS_ROOTDIR =~ s/\/training$//; @@ -39,78 +40,93 @@ my $ZCAT = "gzip -cd"; # sometimes you just have to do the right thing without asking my $sort_option = ""; -if (`echo 'youcandoit' | sort --compress-program gzip 2>/dev/null` =~ /youcandoit/) { - $sort_option = "--compress-program gzip "; +if ( `echo 'youcandoit' | sort --compress-program gzip 2>/dev/null` =~ + /youcandoit/ ) +{ + $sort_option = "--compress-program gzip "; } # get optional parameters my $opt_hierarchical = 0; -my $binarizer = undef; -my $threads = 1; # Default is single-thread, i.e. $threads=1 -my $syntax_filter_cmd = "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; -my $min_score = undef; +my $binarizer = undef; +my $threads = 1; # Default is single-thread, i.e. $threads=1 +my $syntax_filter_cmd = + "$SCRIPTS_ROOTDIR/../bin/filter-rule-table hierarchical"; +my $min_score = undef; my $opt_min_non_initial_rule_count = undef; -my $opt_gzip = 1; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats) -my $opt_filter = 1; # enables skipping of filtering - useful for conf net or lattice -my $opt_strip_xml = 1; # disabling XML stripping is required for STSG models where the input is a tree or forest +my $opt_gzip = 1 + ; # gzip output files (so far only phrase-based ttable until someone tests remaining models and formats) +my $opt_filter = + 1; # enables skipping of filtering - useful for conf net or lattice +my $opt_strip_xml = 1 + ; # disabling XML stripping is required for STSG models where the input is a tree or forest my $tempdir = undef; GetOptions( - "gzip!" => \$opt_gzip, - "filter!" => \$opt_filter, - "Hierarchical" => \$opt_hierarchical, - "Binarizer=s" => \$binarizer, - "StripXml!" => \$opt_strip_xml, - "SyntaxFilterCmd=s" => \$syntax_filter_cmd, - "tempdir=s" => \$tempdir, - "MinScore=s" => \$min_score, - "threads=i" => \$threads, - "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED + "gzip!" => \$opt_gzip, + "filter!" => \$opt_filter, + "Hierarchical" => \$opt_hierarchical, + "Binarizer=s" => \$binarizer, + "StripXml!" => \$opt_strip_xml, + "SyntaxFilterCmd=s" => \$syntax_filter_cmd, + "tempdir=s" => \$tempdir, + "MinScore=s" => \$min_score, + "threads=i" => \$threads, + "MinNonInitialRuleCount=i" => \$opt_min_non_initial_rule_count, # DEPRECATED ) or exit(1); # get command line parameters -my $dir = shift; +my $dir = shift; my $config = shift; -my $input = shift; +my $input = shift; -if (!defined $dir || !defined $config || !defined $input) { - print STDERR "usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd] [-threads num]\n"; - exit 1; +if ( !defined $dir || !defined $config || !defined $input ) { + print STDERR +"usage: filter-model-given-input.pl targetdir moses.ini input.text [-Binarizer binarizer] [-Hierarchical] [-MinScore id:threshold[,id:threshold]*] [-SyntaxFilterCmd cmd] [-threads num]\n"; + exit 1; } $dir = ensure_full_path($dir); # Warn if deprecated -MinNonInitialRuleCount option is used -if (defined($opt_min_non_initial_rule_count)) { - print STDERR "WARNING: -MinNonInitialRuleCount is deprecated; use score's -MinCountHierarchical option or set -SyntaxFilterCmd to \"$SCRIPTS_ROOTDIR/training/filter-rule-table.py --min-non-initial-rule=$opt_min_non_initial_rule_count\"\n"; +if ( defined($opt_min_non_initial_rule_count) ) { + print STDERR +"WARNING: -MinNonInitialRuleCount is deprecated; use score's -MinCountHierarchical option or set -SyntaxFilterCmd to \"$SCRIPTS_ROOTDIR/training/filter-rule-table.py --min-non-initial-rule=$opt_min_non_initial_rule_count\"\n"; } -$tempdir = $dir if !defined $tempdir; # use the working directory as temp by def. +$tempdir = $dir + if !defined $tempdir; # use the working directory as temp by def. # decode min-score definitions my %MIN_SCORE; if ($min_score) { - foreach (split(/ *, */,$min_score)) { - my ($id,$score) = split(/ *: */); - $MIN_SCORE{$id} = $score; - print STDERR "score $id must be at least $score\n"; - } + foreach ( split( / *, */, $min_score ) ) { + my ( $id, $score ) = split(/ *: */); + $MIN_SCORE{$id} = $score; + print STDERR "score $id must be at least $score\n"; + } } + # buggy directory in place? -if (-d $dir && ! -e "$dir/info") { - print STDERR "The directory $dir already exists. Please delete $dir and rerun!\n"; +if ( -d $dir && !-e "$dir/info" ) { + print STDERR + "The directory $dir already exists. Please delete $dir and rerun!\n"; exit(1); } # already filtered? check if it can be re-used -if (-d $dir) { +if ( -d $dir ) { my @INFO = `cat $dir/info`; chop(@INFO); - if($INFO[0] ne $config - || ($INFO[1] ne $input && - $INFO[1].".tagged" ne $input)) { - print STDERR "WARNING: directory exists but does not match parameters:\n"; - print STDERR " ($INFO[0] ne $config || $INFO[1] ne $input)\n"; - exit 1; + if ( + $INFO[0] ne $config + || ( $INFO[1] ne $input + && $INFO[1] . ".tagged" ne $input ) + ) + { + print STDERR + "WARNING: directory exists but does not match parameters:\n"; + print STDERR " ($INFO[0] ne $config || $INFO[1] ne $input)\n"; + exit 1; } print STDERR "The filtered model was ready in $dir, not doing anything.\n"; exit 0; @@ -129,191 +145,221 @@ if ($opt_strip_xml) { } # get tables to be filtered (and modify config file) -my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS,%KNOWN_TTABLE,@TABLE_WEIGHTS,%TABLE_NUMBER); +my ( @TABLE, @TABLE_FACTORS, @TABLE_NEW_NAME, %CONSIDER_FACTORS, %KNOWN_TTABLE, + @TABLE_WEIGHTS, %TABLE_NUMBER ); my %new_name_used = (); -open(INI_OUT,">$dir/moses.ini") or die "Can't write $dir/moses.ini"; -open(INI,$config) or die "Can't read $config"; -while(my $line = <INI>) { - chomp($line); - my @toks = split(/ /, $line); - if ($line =~ /PhraseDictionaryMemory / - || $line =~ /PhraseDictionaryBinary / - || $line =~ /PhraseDictionaryOnDisk / - || $line =~ /PhraseDictionarySCFG / - || $line =~ /RuleTable / - ) { - print STDERR "pt:$line\n"; - - my ($phrase_table_impl,$source_factor,$t,$w,$file,$table_flag,$skip); # = ($1,$2,$3,$4,$5,$6,$7); - $table_flag = ""; - $phrase_table_impl = $toks[0]; - $skip = 0; - - for (my $i = 1; $i < scalar(@toks); ++$i) { - my @args = split(/=/, $toks[$i]); - chomp($args[0]); - chomp($args[1]); - - if ($args[0] eq "num-features") { - $w = $args[1]; - } - elsif ($args[0] eq "input-factor") { - $source_factor = $args[1]; - } - elsif ($args[0] eq "output-factor") { - $t = $args[1]; - } - elsif ($args[0] eq "path") { - $file = $args[1]; - } - elsif ($args[0] eq "filterable" && $args[1] eq "false") { - $skip = 1; - } - } #for (my $i = 1; $i < scalar(@toks); ++$i) { - - if (($phrase_table_impl ne "PhraseDictionaryMemory" && $phrase_table_impl ne "PhraseDictionarySCFG" && $phrase_table_impl ne "RuleTable") || $file =~ /glue-grammar/ || $skip) { - # Only Memory ("0") and NewFormat ("6") can be filtered. - print INI_OUT "$line\n"; - next; - } +open( INI_OUT, ">$dir/moses.ini" ) or die "Can't write $dir/moses.ini"; +open( INI, $config ) or die "Can't read $config"; +while ( my $line = <INI> ) { + chomp($line); + my @toks = split( / /, $line ); + if ( $line =~ /PhraseDictionaryMemory / + || $line =~ /PhraseDictionaryBinary / + || $line =~ /PhraseDictionaryOnDisk / + || $line =~ /PhraseDictionarySCFG / + || $line =~ /RuleTable / ) + { + print STDERR "pt:$line\n"; + + my ( $phrase_table_impl, $source_factor, $t, $w, $file, $table_flag, + $skip ); # = ($1,$2,$3,$4,$5,$6,$7); + $table_flag = ""; + $phrase_table_impl = $toks[0]; + $skip = 0; + + for ( my $i = 1 ; $i < scalar(@toks) ; ++$i ) { + my @args = split( /=/, $toks[$i] ); + chomp( $args[0] ); + chomp( $args[1] ); + + if ( $args[0] eq "num-features" ) { + $w = $args[1]; + } + elsif ( $args[0] eq "input-factor" ) { + $source_factor = $args[1]; + } + elsif ( $args[0] eq "output-factor" ) { + $t = $args[1]; + } + elsif ( $args[0] eq "path" ) { + $file = $args[1]; + } + elsif ( $args[0] eq "filterable" && $args[1] eq "false" ) { + $skip = 1; + } + } #for (my $i = 1; $i < scalar(@toks); ++$i) { + + if ( + ( + $phrase_table_impl ne "PhraseDictionaryMemory" + && $phrase_table_impl ne "PhraseDictionarySCFG" + && $phrase_table_impl ne "RuleTable" + ) + || $file =~ /glue-grammar/ + || $skip + ) + { + # Only Memory ("0") and NewFormat ("6") can be filtered. + print INI_OUT "$line\n"; + next; + } - push @TABLE, $file; - push @TABLE_WEIGHTS,$w; - $KNOWN_TTABLE{$#TABLE}++; - - my $new_name = "$dir/phrase-table.$source_factor-$t.".(++$TABLE_NUMBER{"$source_factor-$t"}); - my $cnt = 1; - $cnt ++ while (defined $new_name_used{"$new_name.$cnt"}); - $new_name .= ".$cnt"; - $new_name_used{$new_name} = 1; - if ($binarizer && $phrase_table_impl eq "PhraseDictionarySCFG") { - $phrase_table_impl = "PhraseDictionaryOnDisk"; - @toks = set_value(\@toks, "path", "$new_name.bin$table_flag"); - } - elsif ($binarizer && $phrase_table_impl eq "PhraseDictionaryMemory") { - if ($binarizer =~ /processPhraseTableMin/) { - $phrase_table_impl = "PhraseDictionaryCompact"; - @toks = set_value(\@toks, "path", "$new_name$table_flag"); - } - elsif ($binarizer =~ /CreateOnDiskPt/) { - $phrase_table_impl = "PhraseDictionaryOnDisk"; - @toks = set_value(\@toks, "path", "$new_name.bin$table_flag"); - } - else { - $phrase_table_impl = "PhraseDictionaryBinary"; - @toks = set_value(\@toks, "path", "$new_name$table_flag"); - } - } - else { - $new_name .= ".gz" if $opt_gzip; - @toks = set_value(\@toks, "path", "$new_name$table_flag"); - } + push @TABLE, $file; + push @TABLE_WEIGHTS, $w; + $KNOWN_TTABLE{$#TABLE}++; + + my $new_name = "$dir/phrase-table.$source_factor-$t." + . ( ++$TABLE_NUMBER{"$source_factor-$t"} ); + my $cnt = 1; + $cnt++ while ( defined $new_name_used{"$new_name.$cnt"} ); + $new_name .= ".$cnt"; + $new_name_used{$new_name} = 1; + if ( $binarizer && $phrase_table_impl eq "PhraseDictionarySCFG" ) { + $phrase_table_impl = "PhraseDictionaryOnDisk"; + @toks = set_value( \@toks, "path", "$new_name.bin$table_flag" ); + } + elsif ( $binarizer && $phrase_table_impl eq "PhraseDictionaryMemory" ) { + if ( $binarizer =~ /processPhraseTableMin/ ) { + $phrase_table_impl = "PhraseDictionaryCompact"; + @toks = set_value( \@toks, "path", "$new_name$table_flag" ); + } + elsif ( $binarizer =~ /CreateOnDiskPt/ ) { + $phrase_table_impl = "PhraseDictionaryOnDisk"; + @toks = set_value( \@toks, "path", "$new_name.bin$table_flag" ); + } + elsif ( $binarizer =~ /CreateProbingPT2/ ) { + $phrase_table_impl = "ProbingPT"; + @toks = set_value( \@toks, "path", "$new_name.probing$table_flag" ); + } + else { + $phrase_table_impl = "PhraseDictionaryBinary"; + @toks = set_value( \@toks, "path", "$new_name$table_flag" ); + } + } + else { + $new_name .= ".gz" if $opt_gzip; + @toks = set_value( \@toks, "path", "$new_name$table_flag" ); + } - $toks[0] = $phrase_table_impl; + $toks[0] = $phrase_table_impl; - print INI_OUT join_array(\@toks)."\n"; + print INI_OUT join_array( \@toks ) . "\n"; - push @TABLE_NEW_NAME,$new_name; + push @TABLE_NEW_NAME, $new_name; - $CONSIDER_FACTORS{$source_factor} = 1; - print STDERR "Considering factor $source_factor\n"; - push @TABLE_FACTORS, $source_factor; + $CONSIDER_FACTORS{$source_factor} = 1; + print STDERR "Considering factor $source_factor\n"; + push @TABLE_FACTORS, $source_factor; - } #if (/PhraseModel /) { - elsif ($line =~ /LexicalReordering /) { - print STDERR "ro:$line\n"; - my ($source_factor, $t, $w, $file); # = ($1,$2,$3,$4); + } #if (/PhraseModel /) { + elsif ( $line =~ /LexicalReordering / ) { + print STDERR "ro:$line\n"; + my ( $source_factor, $t, $w, $file ); # = ($1,$2,$3,$4); + my $dest_factor; - for (my $i = 1; $i < scalar(@toks); ++$i) { - my @args = split(/=/, $toks[$i]); - chomp($args[0]); - chomp($args[1]); + for ( my $i = 1 ; $i < scalar(@toks) ; ++$i ) { + my @args = split( /=/, $toks[$i] ); + chomp( $args[0] ); + chomp( $args[1] ); - if ($args[0] eq "num-features") { - $w = $args[1]; - } - elsif ($args[0] eq "input-factor") { - $source_factor = $args[1]; - } - elsif ($args[0] eq "output-factor") { - #$t = chomp($args[1]); - } - elsif ($args[0] eq "type") { - $t = $args[1]; - } - elsif ($args[0] eq "path") { - $file = $args[1]; - } + if ( $args[0] eq "num-features" ) { + $w = $args[1]; + } + elsif ( $args[0] eq "input-factor" ) { + $source_factor = $args[1]; + } + elsif ( $args[0] eq "output-factor" ) { + + #$t = chomp($args[1]); + $dest_factor = $args[1]; + } + elsif ( $args[0] eq "type" ) { + $t = $args[1]; + } + elsif ( $args[0] eq "path" ) { + $file = $args[1]; + } - } # for (my $i = 1; $i < scalar(@toks); ++$i) { + } # for (my $i = 1; $i < scalar(@toks); ++$i) { - push @TABLE, $file; - push @TABLE_WEIGHTS,$w; + push @TABLE, $file; + push @TABLE_WEIGHTS, $w; - $file =~ s/^.*\/+([^\/]+)/$1/g; - my $new_name = "$dir/$file"; - $new_name =~ s/\.gz//; + $file =~ s/^.*\/+([^\/]+)/$1/g; + my $new_name = "$dir/$file"; + $new_name =~ s/\.gz//; - #print INI_OUT "$source_factor $t $w $new_name\n"; - @toks = set_value(\@toks, "path", "$new_name"); - print INI_OUT join_array(\@toks)."\n"; +# avoid name collisions for multiple reordering tables; using phrase-table numbering scheme (except for TABLE_NUMBER) + $new_name .= ".$source_factor-$dest_factor"; + my $cnt = 1; + $cnt++ while ( defined $new_name_used{"$new_name.$cnt"} ); + $new_name .= ".$cnt"; + $new_name_used{$new_name} = 1; - push @TABLE_NEW_NAME,$new_name; + #print INI_OUT "$source_factor $t $w $new_name\n"; + @toks = set_value( \@toks, "path", "$new_name" ); + print INI_OUT join_array( \@toks ) . "\n"; - $CONSIDER_FACTORS{$source_factor} = 1; - print STDERR "Considering factor $source_factor\n"; - push @TABLE_FACTORS,$source_factor; + push @TABLE_NEW_NAME, $new_name; + $CONSIDER_FACTORS{$source_factor} = 1; + print STDERR "Considering factor $source_factor\n"; + push @TABLE_FACTORS, $source_factor; - } #elsif (/LexicalReordering /) { - else { - print INI_OUT "$line\n"; - } -} # while(<INI>) { + } #elsif (/LexicalReordering /) { + else { + print INI_OUT "$line\n"; + } +} # while(<INI>) { close(INI); close(INI_OUT); my %TMP_INPUT_FILENAME; if ($opt_hierarchical) { - if (!$opt_strip_xml) { - print STDERR "WARNING: source factor reduction is disabled due to use of -noStripXML option\n"; - } else { - # Write a separate, temporary input file for each combination of source - # factors - foreach my $key (keys %CONSIDER_FACTORS) { - my $filename = "$dir/input-$key"; - open(FILEHANDLE,">$filename") or die "Can't open $filename for writing"; - $TMP_INPUT_FILENAME{$key} = $filename; - my @FACTOR = split(/,/, $key); - my $cmd = "$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |"; - print STDERR "Executing: $cmd\n"; - open(PIPE,$cmd); - while (my $line = <PIPE>) { - print FILEHANDLE $line - } - close(FILEHANDLE); + if ( !$opt_strip_xml ) { + print STDERR +"WARNING: source factor reduction is disabled due to use of -noStripXML option\n"; + } + else { + # Write a separate, temporary input file for each combination of source + # factors + foreach my $key ( keys %CONSIDER_FACTORS ) { + my $filename = "$dir/input-$key"; + open( FILEHANDLE, ">$filename" ) + or die "Can't open $filename for writing"; + $TMP_INPUT_FILENAME{$key} = $filename; + my @FACTOR = split( /,/, $key ); + my $cmd = + "$SCRIPTS_ROOTDIR/training/reduce_combine.pl $input @FACTOR |"; + print STDERR "Executing: $cmd\n"; + open( PIPE, $cmd ); + while ( my $line = <PIPE> ) { + print FILEHANDLE $line; + } + close(FILEHANDLE); + } } - } } my %PHRASE_USED; -if ($opt_filter && !$opt_hierarchical) { +if ( $opt_filter && !$opt_hierarchical ) { + # get the phrase pairs appearing in the input text, up to the $MAX_LENGTH - open(INPUT,mk_open_string($input)) or die "Can't read $input"; - while(my $line = <INPUT>) { + open( INPUT, mk_open_string($input) ) or die "Can't read $input"; + while ( my $line = <INPUT> ) { chomp($line); - my @WORD = split(/ +/,$line); - for(my $i=0;$i<=$#WORD;$i++) { - for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) { - foreach (keys %CONSIDER_FACTORS) { + my @WORD = split( / +/, $line ); + for ( my $i = 0 ; $i <= $#WORD ; $i++ ) { + for ( my $j = 0 ; $j < $MAX_LENGTH && $j + $i <= $#WORD ; $j++ ) { + foreach ( keys %CONSIDER_FACTORS ) { my @FACTOR = split(/,/); my $phrase = ""; - for(my $k=$i;$k<=$i+$j;$k++) { - my @WORD_FACTOR = split(/\|/,$WORD[$k]); - for(my $f=0;$f<=$#FACTOR;$f++) { - $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|"; + for ( my $k = $i ; $k <= $i + $j ; $k++ ) { + my @WORD_FACTOR = split( /\|/, $WORD[$k] ); + for ( my $f = 0 ; $f <= $#FACTOR ; $f++ ) { + $phrase .= $WORD_FACTOR[ $FACTOR[$f] ] . "|"; } chop($phrase); $phrase .= " "; @@ -329,183 +375,220 @@ if ($opt_filter && !$opt_hierarchical) { # filter files print STDERR "Filtering files...\n"; -for(my $i=0;$i<=$#TABLE;$i++) { - my ($used,$total) = (0,0); - my $file = $TABLE[$i]; - my $factors = $TABLE_FACTORS[$i]; +for ( my $i = 0 ; $i <= $#TABLE ; $i++ ) { + my ( $used, $total ) = ( 0, 0 ); + my $file = $TABLE[$i]; + my $factors = $TABLE_FACTORS[$i]; my $new_file = $TABLE_NEW_NAME[$i]; print STDERR "filtering $file -> $new_file...\n"; - my $mid_file = $new_file; # used when both filtering and binarizing - if (!$opt_filter) { - # check if original file was gzipped - if ($file !~ /\.gz$/ && -e "$file.gz") { - $file .= ".gz"; - } - $mid_file .= ".gz" if $file =~ /\.gz$/; - $cmd = "ln -s $file $mid_file"; - safesystem($cmd) or die "Failed to make symlink"; - } else { - - $mid_file .= ".gz" - if $mid_file !~ /\.gz/ - && $binarizer && $binarizer =~ /processPhraseTable/; - - my $openstring = mk_open_string($file); - - my $mid_openstring; - if ($mid_file =~ /\.gz$/) { + my $mid_file = $new_file; # used when both filtering and binarizing + + $mid_file .= ".gz" + if $mid_file !~ /\.gz/ + && $binarizer + && $binarizer =~ /processPhraseTable/; + + my $openstring = mk_open_string($file); + + my $mid_openstring; + if ( $mid_file =~ /\.gz$/ ) { $mid_openstring = "| gzip -c > $mid_file"; - } else { + } + else { $mid_openstring = ">$mid_file"; - } - - - open(FILE_OUT,$mid_openstring) or die "Can't write to $mid_openstring"; - - if ($opt_hierarchical) { - my $input_file = $opt_strip_xml ? $TMP_INPUT_FILENAME{$factors} : $input; - $cmd = "$openstring $syntax_filter_cmd $input_file |"; - print STDERR "Executing: $cmd\n"; - open(PIPE,$cmd); - while (my $line = <PIPE>) { - print FILE_OUT $line - } - close(FILEHANDLE); - } else { - open(FILE,$openstring) or die "Can't open '$openstring'"; - while(my $entry = <FILE>) { - my ($foreign,$rest) = split(/ \|\|\| /,$entry,2); - $foreign =~ s/ $//; - if (defined($PHRASE_USED{$factors}{$foreign})) { - # handle min_score thresholds - if ($min_score) { - my @ITEM = split(/ *\|\|\| */,$rest); - if(scalar (@ITEM)>2) { # do not filter reordering table - my @SCORE = split(/ /,$ITEM[1]); - my $okay = 1; - foreach my $id (keys %MIN_SCORE) { - $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id}; - } - next unless $okay; - } - } - print FILE_OUT $entry; - $used++; - } - $total++; - } - close(FILE); - die "No phrases found in $file!" if $total == 0; - printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%'; - } - - close(FILE_OUT); + } + + if ( !$opt_filter ) { + # not filtering + if ( defined($min_score) and $KNOWN_TTABLE{$i} ) { + + # Threshold pruning + $cmd = +"$openstring $RealBin/threshold-filter.perl $min_score $mid_openstring"; + safesystem($cmd) or die "Threshold pruning of phrase table failed"; + } + else { + # If we are not filtering, or threshold pruning a phrase table, then + # we can just sym-link it. + # check if original file was gzipped + if ( $file !~ /\.gz$/ && -e "$file.gz" ) { + $file .= ".gz"; + } + $cmd = "ln -s $file $mid_file"; + safesystem($cmd) or die "Failed to make symlink"; + } } + else { + + open( FILE_OUT, $mid_openstring ) + or die "Can't write to $mid_openstring"; - my $catcmd = ($mid_file =~ /\.gz$/ ? "$ZCAT" : "cat"); - if(defined($binarizer)) { - print STDERR "binarizing...\n"; - # translation model - if ($KNOWN_TTABLE{$i}) { - # ... hierarchical translation model if ($opt_hierarchical) { - my $cmd = "$binarizer $mid_file $new_file.bin"; - safesystem($cmd) or die "Can't binarize"; + my $input_file = + $opt_strip_xml ? $TMP_INPUT_FILENAME{$factors} : $input; + $cmd = "$openstring $syntax_filter_cmd $input_file |"; + print STDERR "Executing: $cmd\n"; + open( PIPE, $cmd ); + while ( my $line = <PIPE> ) { + print FILE_OUT $line; + } + close(FILEHANDLE); } - # ... phrase translation model - elsif ($binarizer =~ /processPhraseTableMin/) { - #compact phrase table - my $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $binarizer -in $mid_file.sorted.gz -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted.gz"; - safesystem($cmd) or die "Can't binarize"; - } elsif ($binarizer =~ /CreateOnDiskPt/) { - my $cmd = "$binarizer $mid_file $new_file.bin"; - safesystem($cmd) or die "Can't binarize"; - } else { - my $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file"; - safesystem($cmd) or die "Can't binarize"; + else { + open( FILE, $openstring ) or die "Can't open '$openstring'"; + while ( my $entry = <FILE> ) { + my ( $foreign, $rest ) = split( / \|\|\| /, $entry, 2 ); + $foreign =~ s/ $//; + if ( defined( $PHRASE_USED{$factors}{$foreign} ) ) { + + # handle min_score thresholds + if ($min_score) { + my @ITEM = split( / *\|\|\| */, $rest ); + if ( scalar(@ITEM) > 2 ) + { # do not filter reordering table + my @SCORE = split( / /, $ITEM[1] ); + my $okay = 1; + foreach my $id ( keys %MIN_SCORE ) { + $okay = 0 if $SCORE[$id] < $MIN_SCORE{$id}; + } + next unless $okay; + } + } + print FILE_OUT $entry; + $used++; + } + $total++; + } + close(FILE); + die "No phrases found in $file!" if $total == 0; + printf STDERR +"$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n", + ( 100 * $used / $total ), '%'; } - } - # reordering model - else { - my $lexbin; - $lexbin = $binarizer; - if ($binarizer =~ /CreateOnDiskPt/) { - $lexbin =~ s/CreateOnDiskPt/processLexicalTable/; + + close(FILE_OUT); + + } + + my $catcmd = ( $mid_file =~ /\.gz$/ ? "$ZCAT" : "cat" ); + if ( defined($binarizer) ) { + print STDERR "binarizing...\n"; + + # translation model + if ( $KNOWN_TTABLE{$i} ) { + if ( $binarizer =~ /processPhraseTableMin/ ) { + + #compact phrase table + my $cmd = +"$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $binarizer -in $mid_file.sorted.gz -out $new_file -nscores $TABLE_WEIGHTS[$i] -threads $threads && rm $mid_file.sorted.gz"; + safesystem($cmd) or die "Can't binarize"; + } + elsif ( $binarizer =~ /CreateOnDiskPt/ ) { + my $cmd = "$binarizer $mid_file $new_file.bin"; + safesystem($cmd) or die "Can't binarize"; + } + elsif ( $binarizer =~ /CreateProbingPT2/ ) { + my $cmd = "$binarizer --input-pt $mid_file --output-dir $new_file.probing"; + if ($opt_hierarchical) { + $cmd .= " --scfg"; + } + safesystem($cmd) or die "Can't binarize"; + } + else { + my $cmd = +"$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file"; + safesystem($cmd) or die "Can't binarize"; + } } - $lexbin =~ s/PhraseTable/LexicalTable/; - my $cmd; - if ($lexbin =~ /processLexicalTableMin/) { - $cmd = "$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $lexbin -in $mid_file.sorted.gz -out $new_file -threads $threads && rm $mid_file.sorted.gz"; - } else { - $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options - $cmd = "$lexbin -in $mid_file -out $new_file"; + # reordering model + else { + my $lexbin; + $lexbin = $binarizer; + if ( $binarizer =~ /CreateOnDiskPt/ ) { + $lexbin =~ s/CreateOnDiskPt/processLexicalTable/; + } + elsif ( $binarizer =~ /CreateProbingPT2/ ) { + $lexbin =~ s/CreateProbingPT2/processLexicalTableMin/; + } + + $lexbin =~ s/PhraseTable/LexicalTable/; + my $cmd; + if ( $lexbin =~ /processLexicalTableMin/ ) { + $cmd = +"$catcmd $mid_file | LC_ALL=C sort $sort_option -T $tempdir | gzip - > $mid_file.sorted.gz && $lexbin -in $mid_file.sorted.gz -out $new_file -threads $threads && rm $mid_file.sorted.gz"; + } + else { + $lexbin =~ s/^\s*(\S+)\s.+/$1/; # no options + $cmd = "$lexbin -in $mid_file -out $new_file"; + } + safesystem($cmd) or die "Can't binarize"; } - safesystem($cmd) or die "Can't binarize"; - } } } # Remove any temporary input files unlink values %TMP_INPUT_FILENAME; -open(INFO,">$dir/info"); +open( INFO, ">$dir/info" ); print INFO "$config\n$input\n"; close(INFO); - print "To run the decoder, please call: moses -f $dir/moses.ini -i $input\n"; # functions sub mk_open_string { - my $file = shift; - my $openstring; - if ($file !~ /\.gz$/ && -e "$file.gz") { - $openstring = "$ZCAT $file.gz |"; - } elsif ($file =~ /\.gz$/) { - $openstring = "$ZCAT $file |"; - } elsif ($opt_hierarchical) { - $openstring = "cat $file |"; - } else { - $openstring = "< $file"; - } - return $openstring; + my $file = shift; + my $openstring; + if ( $file !~ /\.gz$/ && -e "$file.gz" ) { + $openstring = "$ZCAT $file.gz |"; + } + elsif ( $file =~ /\.gz$/ ) { + $openstring = "$ZCAT $file |"; + } + elsif ($opt_hierarchical) { + $openstring = "cat $file |"; + } + else { + $openstring = "< $file"; + } + return $openstring; } - sub safesystem { - print STDERR "Executing: @_\n"; - system("bash", "-c", @_); - if ($? == -1) { - print STDERR "Failed to execute: @_\n $!\n"; - exit(1); - } - elsif ($? & 127) { - printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", - ($? & 127), ($? & 128) ? 'with' : 'without'; - exit(1); - } - else { - my $exitcode = $? >> 8; - print STDERR "Exit code: $exitcode\n" if $exitcode; - return ! $exitcode; - } + print STDERR "Executing: @_\n"; + system( "bash", "-c", @_ ); + if ( $? == -1 ) { + print STDERR "Failed to execute: @_\n $!\n"; + exit(1); + } + elsif ( $? & 127 ) { + printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", + ( $? & 127 ), ( $? & 128 ) ? 'with' : 'without'; + exit(1); + } + else { + my $exitcode = $? >> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return !$exitcode; + } } sub ensure_full_path { my $PATH = shift; return $PATH if $PATH =~ /^\//; my $dir = `pawd 2>/dev/null`; - if (!$dir) {$dir = `pwd`;} + if ( !$dir ) { $dir = `pwd`; } chomp $dir; - $PATH = $dir."/".$PATH; + $PATH = $dir . "/" . $PATH; $PATH =~ s/[\r\n]//g; $PATH =~ s/\/\.\//\//g; $PATH =~ s/\/+/\//g; my $sanity = 0; - while($PATH =~ /\/\.\.\// && $sanity++<10) { + + while ( $PATH =~ /\/\.\.\// && $sanity++ < 10 ) { $PATH =~ s/\/+/\//g; $PATH =~ s/\/[^\/]+\/\.\.\//\//g; } @@ -515,32 +598,31 @@ sub ensure_full_path { } sub join_array { - my @outside = @{$_[0]}; + my @outside = @{ $_[0] }; - my $ret = ""; - for (my $i = 0; $i < scalar(@outside); ++$i) { - my $tok = $outside[$i]; - $ret .= "$tok "; - } + my $ret = ""; + for ( my $i = 0 ; $i < scalar(@outside) ; ++$i ) { + my $tok = $outside[$i]; + $ret .= "$tok "; + } - return $ret; + return $ret; } sub set_value { - my @arr = @{$_[0]}; - my $keySought = $_[1]; - my $newValue = $_[2]; + my @arr = @{ $_[0] }; + my $keySought = $_[1]; + my $newValue = $_[2]; - for (my $i = 1; $i < scalar(@arr); ++$i) { - my @inside = split(/=/, $arr[$i]); + for ( my $i = 1 ; $i < scalar(@arr) ; ++$i ) { + my @inside = split( /=/, $arr[$i] ); - my $key = $inside[0]; - if ($key eq $keySought) { - $arr[$i] = "$key=$newValue"; - return @arr; - } - } - return @arr; + my $key = $inside[0]; + if ( $key eq $keySought ) { + $arr[$i] = "$key=$newValue"; + return @arr; + } + } + return @arr; } - |