#!/usr/bin/perl -w
# scripts/analysis/smtgui/filter-phrase-table.pl

#by Philipp Koehn, de-augmented by Evan Herbst
#filter a phrase table for a specific input corpus
#arguments: phrasetable_filename input_filename factor_index (0...)
#outputs to phrasetable_filename.short

#similar function to filter-model-given-input.pl, but only operates
#on the phrase table and doesn't require that any subdirectories exist

use strict;

# Longest source phrase, in words, that is collected from the input text.
# Longer spans of the input are never looked up in the phrase table.
my $MAX_LENGTH = 10;

# Command line: phrase table path, input corpus path, source factor spec.
# The factor spec is split on commas when the input phrases are built, so it
# may be a single index or a comma-separated list of indices.
my ($file, $input, $source_factor) = @ARGV;
die("usage: $0 phrasetable_filename input_filename factor_index\n")
    unless defined($file) && defined($input) && defined($source_factor);
my $dir = ".";    # not referenced by the rest of the script as shown

# Tables to be filtered.  @TABLE, @TABLE_FACTORS and @TABLE_NEW_NAME are
# parallel arrays: entry i of each describes the same table.  Exactly one
# table (the one named on the command line) is registered here.
my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS);
push @TABLE,$file;

# The filtered copy is written next to the original table.
my $new_name = "$file.short";
push @TABLE_NEW_NAME,$new_name;

# Factor specs for which phrases have to be collected from the input.
$CONSIDER_FACTORS{$source_factor} = 1;
push @TABLE_FACTORS,$source_factor;

    # Collect the phrases appearing in the input text.
    #
    # Result: $PHRASE_USED{factor spec}{phrase} = number of occurrences.
    # A phrase is a run of 1..$MAX_LENGTH consecutive input words joined by
    # single spaces; each word is first reduced to the factors named in the
    # factor spec, joined by "|".
    my %PHRASE_USED;
    die("could not find input file $input") unless -e $input;
    open(my $input_fh, '<', $input)
        or die("could not open input file $input: $!");
    while (my $line = <$input_fh>) {
        # chomp, not chop: the last line of the file may lack a newline, and
        # chop would then delete the final character of the last word.
        chomp($line);

        # A line starting with spaces makes split return an empty first
        # field; drop it so that no phrase starts with an empty word.
        my @WORD = grep { $_ ne "" } split(/ +/, $line);
        next unless @WORD;

        foreach my $factor_spec (keys %CONSIDER_FACTORS) {
            my @FACTOR = split(/,/, $factor_spec);

            # Reduce every word to the requested factors once per line
            # instead of once per (start, length) pair.
            my @TOKEN = map {
                my @WORD_FACTOR = split(/\|/, $_);
                join("|", @WORD_FACTOR[@FACTOR]);
            } @WORD;

            for my $start (0 .. $#TOKEN) {
                # Last word index of the longest phrase starting at $start.
                my $max_end = $start + $MAX_LENGTH - 1;
                $max_end = $#TOKEN if $max_end > $#TOKEN;

                for my $end ($start .. $max_end) {
                    my $phrase = join(" ", @TOKEN[$start .. $end]);
                    $PHRASE_USED{$factor_spec}{$phrase}++;
                }
            }
        }
    }
    close($input_fh);

    # Filter every registered table: copy to the new file only those entries
    # whose source side (the text before the first " ||| ") was collected in
    # %PHRASE_USED for the table's factor spec, then report on STDERR how
    # many entries were kept.
    for my $i (0 .. $#TABLE) {
        my ($used,$total) = (0,0);
        my $table_file = $TABLE[$i];
        my $factors = $TABLE_FACTORS[$i];
        my $new_file = $TABLE_NEW_NAME[$i];
        print STDERR "filtering $table_file -> $new_file...\n";

        # Open the table, decompressing through zcat when it is gzipped or
        # when only a ".gz" sibling exists.  The list form of the pipe open
        # keeps the file name away from the shell.
        my $table_fh;
        if (-e $table_file && $table_file =~ /\.gz$/) {
            open($table_fh, '-|', 'zcat', $table_file)
                or die("could not run zcat on $table_file: $!");
        }
        elsif (! -e $table_file && -e "$table_file.gz") {
            open($table_fh, '-|', 'zcat', "$table_file.gz")
                or die("could not run zcat on $table_file.gz: $!");
        }
        elsif (-e $table_file) {
            open($table_fh, '<', $table_file)
                or die("could not open model file $table_file: $!");
        }
        else { die("could not find model file $table_file");  }

        open(my $out_fh, '>', $new_file)
            or die("could not write $new_file: $!");

        while (my $entry = <$table_fh>) {
            my ($foreign) = split(/ \|\|\| /,$entry,2);
            $foreign =~ s/ $//;
            if (defined($PHRASE_USED{$factors}{$foreign})) {
                print {$out_fh} $entry;
                $used++;
            }
            $total++;
        }

        # For the zcat pipe, close also reports a failed decompression.
        close($table_fh)
            or die("error while reading $table_file: " . ($! || "exit status $?"));
        # Buffered write errors only surface when the handle is closed.
        close($out_fh)
            or die("error while writing $new_file: $!");

        # An empty table has no meaningful ratio; report 0% instead of
        # dying on a division by zero.
        my $percent = $total ? 100*$used/$total : 0;
        printf STDERR "%d of %d phrases pairs used (%.2f%%) - note: max length %s\n",
            $used, $total, $percent, $MAX_LENGTH;
    }