Remove filter-phrase-table.pl: I believe Ondrej has a script with the same functionality; will investigate.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@428 1f5c12ca-751b-0410-a591-d2e778427230
author: eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-01 02:07:34 +0400
committer: eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-01 02:07:34 +0400
commit: 3b46c17ace935653275697beaafeea393c59ce76 (patch)
tree: 954986d459c8379084cf21a6746da7e0791087d5 /scripts/analysis
parent: 5cce8336c076371dfa30c5cd37c29bebb56a558a (diff)
1 files changed, 0 insertions, 79 deletions
diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl
deleted file mode 100644
index 85f325935..000000000
--- a/scripts/analysis/smtgui/filter-phrase-table.pl
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/perl -w
-
-#filter phrase tables
-#arguments: phrasetable_filename input_filename factor_index (0...)
-#outputs to phrasetable_filename.short
-
-use strict;
-
-my $MAX_LENGTH = 10;
-
-my ($file, $input, $source_factor) = @ARGV;
-my $dir = ".";
-
-    # get tables to be filtered (and modify config file)
-    my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS);
-		push @TABLE,$file;
-
-		my $new_name = "$file.short";#"$dir/phrase-table.$source_factor";
-		push @TABLE_NEW_NAME,$new_name;
-
-		$CONSIDER_FACTORS{$source_factor} = 1;
-		push @TABLE_FACTORS,$source_factor;
-
-    # get the phrase pairs appearing in the input text
-    my %PHRASE_USED;
-    die("could not find input file $input") unless -e $input;
-    open(INPUT,$input);
-    while(my $line = <INPUT>) {
-	chop($line);
-	my @WORD = split(/ +/,$line);
-	for(my $i=0;$i<=$#WORD;$i++) {
-	    for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
-		foreach (keys %CONSIDER_FACTORS) {
-		    my @FACTOR = split(/,/);
-		    my $phrase = "";
-		    for(my $k=$i;$k<=$i+$j;$k++) {
-			my @WORD_FACTOR = split(/\|/,$WORD[$k]);
-			for(my $f=0;$f<=$#FACTOR;$f++) {
-			    $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
-			}
-			chop($phrase);
-			$phrase .= " ";
-		    }
-		    chop($phrase);
-		    $PHRASE_USED{$_}{$phrase}++;
-		}
-	    }
-	}
-    }
-    close(INPUT);
-
-    # filter files
-    for(my $i=0;$i<=$#TABLE;$i++) {
-	my ($used,$total) = (0,0);
-	my $file = $TABLE[$i];
-	my $factors = $TABLE_FACTORS[$i];
-	my $new_file = $TABLE_NEW_NAME[$i];
-	print STDERR "filtering $file -> $new_file...\n";
-
-        if (-e $file && $file =~ /\.gz$/) { open(FILE,"zcat $file |"); }
-        elsif (! -e $file && -e "$file.gz") { open(FILE,"zcat $file.gz|"); }
-        elsif (-e $file) { open(FILE,$file); }
-	else { die("could not find model file $file");  }
-
-	open(FILE_OUT,">$new_file");
-
-	while(my $entry = <FILE>) {
-	    my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
-	    $foreign =~ s/ $//;
-	    if (defined($PHRASE_USED{$factors}{$foreign})) {
-		print FILE_OUT $entry;
-		$used++;
-	    }
-	    $total++;
-	}
-	close(FILE);
-	close(FILE_OUT);
-	printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
-    }
author	eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-01 02:07:34 +0400
committer	eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230>	2006-08-01 02:07:34 +0400
commit	3b46c17ace935653275697beaafeea393c59ce76 (patch)
tree	954986d459c8379084cf21a6746da7e0791087d5 /scripts/analysis
parent	5cce8336c076371dfa30c5cd37c29bebb56a558a (diff)