From 54ab89deabf98663a2970565044169014892ecfe Mon Sep 17 00:00:00 2001
From: eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>
Date: Mon, 31 Jul 2006 22:14:08 +0000
Subject: seems this script does not have the same functionality as Ondrej's,
 and his are meant for training and this for analysis

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@430 1f5c12ca-751b-0410-a591-d2e778427230
---
 scripts/analysis/smtgui/filter-phrase-table.pl | 83 ++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 scripts/analysis/smtgui/filter-phrase-table.pl
diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl
new file mode 100644
index 000000000..a7e998794
--- /dev/null
+++ b/scripts/analysis/smtgui/filter-phrase-table.pl
@@ -0,0 +1,83 @@
+#!/usr/bin/perl -w
+
+#by Philipp Koehn, de-augmented by Evan Herbst
+#filter a phrase table for a specific input corpus
+#arguments: phrasetable_filename input_filename factor_index (0...)
+#outputs to phrasetable_filename.short
+
+#similar function to filter-model-given-input.pl, but only operates
+#on the phrase table and doesn't require that any subdirectories exist
+
+use strict;
+
+# longest source phrase (in words) that is looked up in the phrase table
+my $MAX_LENGTH = 10;
+
+# all three arguments are required; stop with a usage line instead of
+# running on with undefined values
+die("usage: $0 phrasetable_filename input_filename factor_index\n") unless @ARGV >= 3;
+my ($file, $input, $source_factor) = @ARGV;
+
+# parallel lists, one entry per table to filter (here exactly one)
+my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS);
+push @TABLE,$file;
+push @TABLE_NEW_NAME,"$file.short";
+$CONSIDER_FACTORS{$source_factor} = 1;
+push @TABLE_FACTORS,$source_factor;
+
+    # Collect every source phrase of up to $MAX_LENGTH words that occurs in
+    # the input text: $PHRASE_USED{factor spec}{phrase} = occurrence count.
+    my %PHRASE_USED;
+    die("could not find input file $input") unless -e $input;
+    open(INPUT,'<',$input) or die("could not open input file $input: $!");
+    while(my $line = <INPUT>) {
+	# chomp (not chop) so an unterminated last line keeps its final character
+	chomp($line);
+	# drop the empty first field that split returns for a leading blank
+	my @WORD = grep { length } split(/ +/,$line);
+	for(my $i=0;$i<=$#WORD;$i++) {
+	    for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
+		foreach my $factors (keys %CONSIDER_FACTORS) {
+		    # a spec is a comma-separated list of factor indices
+		    my @FACTOR = split(/,/,$factors);
+		    # words $i..$i+$j, each reduced to the selected factors
+		    # joined by "|", with single blanks between the words
+		    my $phrase = join(" ", map {
+			my @WORD_FACTOR = split(/\|/,$_);
+			join("|",@WORD_FACTOR[@FACTOR]);
+		    } @WORD[$i..$i+$j]);
+		    $PHRASE_USED{$factors}{$phrase}++;
+		}
+	    }
+	}
+    }
+    close(INPUT);
+
+    # filter files: keep only entries whose source phrase occurs in the input
+    for(my $i=0;$i<=$#TABLE;$i++) {
+	my ($used,$total) = (0,0);
+	my ($file,$factors,$new_file) = ($TABLE[$i],$TABLE_FACTORS[$i],$TABLE_NEW_NAME[$i]);
+	print STDERR "filtering $file -> $new_file...\n";
+	# read *.gz (or "$file.gz" when only that exists) through zcat; the
+	# list-form pipe open passes the name to zcat without a shell
+	my ($mode,@source) = (-e $file && $file =~ /\.gz$/) ? ('-|','zcat',$file)
+	    : (! -e $file && -e "$file.gz") ? ('-|','zcat',"$file.gz")
+	    : (-e $file) ? ('<',$file)
+	    : die("could not find model file $file");
+	open(FILE,$mode,@source) or die("could not open model file $file: $!");
+	open(FILE_OUT,'>',$new_file) or die("could not write $new_file: $!");
+	while(my $entry = <FILE>) {
+	    # the source phrase is the text before the first " ||| "
+	    my ($foreign) = split(/ \|\|\| /,$entry,2);
+	    $foreign =~ s/ $//;
+	    if (defined($PHRASE_USED{$factors}{$foreign})) {
+		print FILE_OUT $entry;
+		$used++;
+	    }
+	    $total++;
+	}
+	close(FILE);
+	close(FILE_OUT) or die("could not finish writing $new_file: $!");
+	# an empty table must not divide by zero; report 0.00% instead
+	printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",($total ? 100*$used/$total : 0),'%';
+    }
-- 
cgit v1.2.3