#!/usr/bin/perl -w

# Provenance: scripts/analysis/smtgui/filter-phrase-table.pl, added to the
# mosesdecoder trunk (svn r430, commit 54ab89deabf98663a2970565044169014892ecfe)
# by eherbst on Mon, 31 Jul 2006 22:14:08 +0000.
# Commit subject: "seems this script does not have the same functionality as
# Ondrej's, and his are meant for training and this for analysis".

# by Philipp Koehn, de-augmented by Evan Herbst
# filter a phrase table for a specific input corpus
# arguments: phrasetable_filename input_filename factor_index (0...)
# outputs to phrasetable_filename.short
#
# similar function to filter-model-given-input.pl, but only operates
# on the phrase table and doesn't require that any subdirectories exist

use strict;
use warnings;

# Longest source phrase (in words) that is looked up in the phrase table.
my $MAX_LENGTH = 10;

my ($file, $input, $source_factor) = @ARGV;
die "usage: $0 phrasetable_filename input_filename factor_index\n"
    unless defined $source_factor;

my $new_file = "$file.short";

my $phrase_used = collect_input_phrases($input, $source_factor, $MAX_LENGTH);
my ($used, $total) = filter_phrase_table($file, $new_file, $phrase_used);

# An empty phrase table must not abort the run with a division by zero.
my $percent = $total ? 100 * $used / $total : 0;
printf STDERR "%d of %d phrases pairs used (%.2f%%) - note: max length %d\n",
    $used, $total, $percent, $MAX_LENGTH;

# Collect every source phrase of up to $max_length words that occurs in the
# input text, restricted to the requested factors.
#   $input_file  - path of the input corpus, one sentence per line
#   $factor_spec - comma-separated factor indices (e.g. "0" or "0,1")
#   $max_length  - maximum phrase length in words
# Returns a hash reference mapping each phrase to its occurrence count.
# Dies if the input file is missing or cannot be opened.
sub collect_input_phrases {
    my ($input_file, $factor_spec, $max_length) = @_;

    die("could not find input file $input_file") unless -e $input_file;
    open(my $in, '<', $input_file)
        or die("could not open input file $input_file: $!");

    my @factors = split(/,/, $factor_spec);
    my %phrase_used;

    while (my $line = <$in>) {
        # chomp, not chop: a final line without a newline must keep its
        # last character.
        chomp($line);

        # Reduce every word to the selected factors once per line. Empty
        # tokens (from leading spaces) are dropped, and a factor the word
        # does not have is treated as the empty string.
        my @tokens;
        foreach my $word (grep { length } split(/ +/, $line)) {
            my @word_factor = split(/\|/, $word);
            push @tokens,
                join('|',
                     map { defined $word_factor[$_] ? $word_factor[$_] : '' }
                     @factors);
        }

        # Record all phrases starting at each position, growing the phrase
        # one word at a time instead of rebuilding it for every length.
        for my $start (0 .. $#tokens) {
            my $last = $start + $max_length - 1;
            $last = $#tokens if $last > $#tokens;

            my $phrase;
            for my $end ($start .. $last) {
                $phrase = defined $phrase ? "$phrase $tokens[$end]"
                                          : $tokens[$end];
                $phrase_used{$phrase}++;
            }
        }
    }
    close($in);

    return \%phrase_used;
}

# Open the phrase table for reading. A name ending in .gz, or a table that
# only exists as "$table_file.gz", is read through zcat.
# Returns a read filehandle; dies if no table is found or it cannot be opened.
sub open_phrase_table {
    my ($table_file) = @_;
    my $fh;

    # List-form pipe opens run zcat directly, so the file name is never
    # interpreted by a shell.
    if (-e $table_file && $table_file =~ /\.gz$/) {
        open($fh, '-|', 'zcat', $table_file)
            or die("could not run zcat on $table_file: $!");
    }
    elsif (!-e $table_file && -e "$table_file.gz") {
        open($fh, '-|', 'zcat', "$table_file.gz")
            or die("could not run zcat on $table_file.gz: $!");
    }
    elsif (-e $table_file) {
        open($fh, '<', $table_file)
            or die("could not open model file $table_file: $!");
    }
    else {
        die("could not find model file $table_file");
    }

    return $fh;
}

# Copy to $out_file every entry of the phrase table whose source side
# (the text before the first " ||| ") is a key of %$phrase_used.
# Returns (number of entries kept, number of entries read).
sub filter_phrase_table {
    my ($table_file, $out_file, $phrase_used) = @_;

    print STDERR "filtering $table_file -> $out_file...\n";

    my $table_fh = open_phrase_table($table_file);
    open(my $out_fh, '>', $out_file)
        or die("could not write $out_file: $!");

    my ($kept, $seen) = (0, 0);
    while (my $entry = <$table_fh>) {
        my ($foreign) = split(/ \|\|\| /, $entry, 2);
        $foreign =~ s/ $//;
        if (exists $phrase_used->{$foreign}) {
            print {$out_fh} $entry;
            $kept++;
        }
        $seen++;
    }

    # A failing close on the pipe means zcat exited abnormally.
    close($table_fh)
        or warn "warning: reading $table_file did not finish cleanly\n";
    # Buffered write errors only surface when the output handle is closed.
    close($out_fh)
        or die("could not finish writing $out_file: $!");

    return ($kept, $seen);
}