Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoreherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>2006-08-01 02:14:08 +0400
committereherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>2006-08-01 02:14:08 +0400
commit54ab89deabf98663a2970565044169014892ecfe (patch)
tree3c9ed774c9a173044a45ebfb703c577cb3a672cd /scripts/analysis/smtgui
parent6f80f8c12a27fe7afd8f52ed32bd8d2af3f12b3d (diff)
seems this script does not have the same functionality as Ondrej's, and his are meant for training and this for analysis
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@430 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/analysis/smtgui')
-rw-r--r--scripts/analysis/smtgui/filter-phrase-table.pl83
1 files changed, 83 insertions, 0 deletions
diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl
new file mode 100644
index 000000000..a7e998794
--- /dev/null
+++ b/scripts/analysis/smtgui/filter-phrase-table.pl
@@ -0,0 +1,83 @@
+#!/usr/bin/perl -w
+
+#by Philipp Koehn, de-augmented by Evan Herbst
+#filter a phrase table for a specific input corpus
+#arguments: phrasetable_filename input_filename factor_index (0...)
+#outputs to phrasetable_filename.short
+
+#similar function to filter-model-given-input.pl, but only operates
+#on the phrase table and doesn't require that any subdirectories exist
+
+use strict;
+
+my $MAX_LENGTH = 10;
+
+my ($file, $input, $source_factor) = @ARGV;
+my $dir = ".";
+
+ # get tables to be filtered (and modify config file)
+ my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS);
+ push @TABLE,$file;
+
+ my $new_name = "$file.short";#"$dir/phrase-table.$source_factor";
+ push @TABLE_NEW_NAME,$new_name;
+
+ $CONSIDER_FACTORS{$source_factor} = 1;
+ push @TABLE_FACTORS,$source_factor;
+
+ # get the phrase pairs appearing in the input text
+ my %PHRASE_USED;
+ die("could not find input file $input") unless -e $input;
+ open(INPUT,$input);
+ while(my $line = <INPUT>) {
+ chop($line);
+ my @WORD = split(/ +/,$line);
+ for(my $i=0;$i<=$#WORD;$i++) {
+ for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
+ foreach (keys %CONSIDER_FACTORS) {
+ my @FACTOR = split(/,/);
+ my $phrase = "";
+ for(my $k=$i;$k<=$i+$j;$k++) {
+ my @WORD_FACTOR = split(/\|/,$WORD[$k]);
+ for(my $f=0;$f<=$#FACTOR;$f++) {
+ $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
+ }
+ chop($phrase);
+ $phrase .= " ";
+ }
+ chop($phrase);
+ $PHRASE_USED{$_}{$phrase}++;
+ }
+ }
+ }
+ }
+ close(INPUT);
+
+ # filter files
+ for(my $i=0;$i<=$#TABLE;$i++) {
+ my ($used,$total) = (0,0);
+ my $file = $TABLE[$i];
+ my $factors = $TABLE_FACTORS[$i];
+ my $new_file = $TABLE_NEW_NAME[$i];
+ print STDERR "filtering $file -> $new_file...\n";
+
+ if (-e $file && $file =~ /\.gz$/) { open(FILE,"zcat $file |"); }
+ elsif (! -e $file && -e "$file.gz") { open(FILE,"zcat $file.gz|"); }
+ elsif (-e $file) { open(FILE,$file); }
+ else { die("could not find model file $file"); }
+
+ open(FILE_OUT,">$new_file");
+
+ while(my $entry = <FILE>) {
+ my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
+ $foreign =~ s/ $//;
+ if (defined($PHRASE_USED{$factors}{$foreign})) {
+ print FILE_OUT $entry;
+ $used++;
+ }
+ $total++;
+ }
+ close(FILE);
+ close(FILE_OUT);
+ printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
+ }