1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
#!/usr/bin/perl -w
#by Philipp Koehn, de-augmented by Evan Herbst
#filter a phrase table for a specific input corpus
#arguments: phrasetable_filename input_filename factor_index (0-based; comma-separated list for several factors)
#outputs to phrasetable_filename.short
#similar function to filter-model-given-input.pl, but only operates
#on the phrase table and doesn't require that any subdirectories exist
use strict;
# Longest phrase, counted in tokens, that is collected from the input text.
my $MAX_LENGTH = 10;

# Command line: phrase table to filter, input text, source factor index(es).
my ($file, $input, $source_factor) = @ARGV;
# All three arguments are required; stop with a usage line rather than
# carrying undefined values into the file names and hash keys built below.
die("usage: $0 phrasetable_filename input_filename factor_index\n")
    unless defined($file) && defined($input) && defined($source_factor);
my $dir = "."; # appears only in the commented-out table name below

# Tables to be filtered. @TABLE, @TABLE_FACTORS and @TABLE_NEW_NAME are
# parallel arrays: entry i of each holds the path, the source factor
# specification and the output path of the same table. Exactly one table
# (the one named on the command line) is registered here.
my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS);
push @TABLE,$file;
my $new_name = "$file.short";#"$dir/phrase-table.$source_factor";
push @TABLE_NEW_NAME,$new_name;
# Factor specifications for which phrases are collected from the input text.
$CONSIDER_FACTORS{$source_factor} = 1;
push @TABLE_FACTORS,$source_factor;
# get the phrase pairs appearing in the input text
#
# %PHRASE_USED{factor spec}{phrase} counts every contiguous token sequence of
# up to $MAX_LENGTH tokens found in the input file. A factor spec is a key of
# %CONSIDER_FACTORS: a comma-separated list of factor indices. Each input
# token is reduced to those factors, joined with "|", and the tokens of a
# phrase are joined with single spaces.
my %PHRASE_USED;
die("could not find input file $input") unless -e $input;
open(my $input_fh, '<', $input)
    or die("could not open input file $input: $!");
while (my $line = <$input_fh>) {
    # chomp, not chop: a final line without a newline must keep its last
    # character.
    chomp($line);
    my @WORD = split(/ +/, $line);
    foreach my $factors (keys %CONSIDER_FACTORS) {
        my @FACTOR = split(/,/, $factors);

        # Reduce every token to the requested factors once per line instead
        # of once per (start, length) pair.
        my @token = map {
            my @WORD_FACTOR = split(/\|/, $_);
            join("|", @WORD_FACTOR[@FACTOR]);
        } @WORD;

        # Record every phrase starting at $start that spans at most
        # $MAX_LENGTH tokens and does not run past the end of the line.
        for my $start (0 .. $#token) {
            my $last = $start + $MAX_LENGTH - 1;
            $last = $#token if $last > $#token;
            for my $end ($start .. $last) {
                $PHRASE_USED{$factors}{ join(" ", @token[$start .. $end]) }++;
            }
        }
    }
}
close($input_fh);
# filter files
#
# For every registered table, copy the entries whose source side (the text
# before the first " ||| ") was seen in the input text to the table's new
# file, then report on STDERR how many entries were kept.
for my $i (0 .. $#TABLE) {
    my ($used,$total) = (0,0);
    my $file = $TABLE[$i];
    my $factors = $TABLE_FACTORS[$i];
    my $new_file = $TABLE_NEW_NAME[$i];
    print STDERR "filtering $file -> $new_file...\n";

    # Pick the reader: a *.gz table, or a table that only exists with a .gz
    # suffix, is read through zcat. The list form of the pipe open runs zcat
    # directly, so the file name never passes through a shell.
    my $table_fh;
    if (-e $file && $file =~ /\.gz$/) {
        open($table_fh, '-|', 'zcat', $file)
            or die("could not run zcat on $file: $!");
    }
    elsif (! -e $file && -e "$file.gz") {
        open($table_fh, '-|', 'zcat', "$file.gz")
            or die("could not run zcat on $file.gz: $!");
    }
    elsif (-e $file) {
        open($table_fh, '<', $file)
            or die("could not open model file $file: $!");
    }
    else {
        die("could not find model file $file");
    }
    open(my $out_fh, '>', $new_file)
        or die("could not write to $new_file: $!");

    while (my $entry = <$table_fh>) {
        # Only the source side is needed for the lookup; the rest of the
        # entry is copied through unchanged.
        my ($foreign) = split(/ \|\|\| /, $entry, 2);
        $foreign =~ s/ $//;
        if (defined($PHRASE_USED{$factors}{$foreign})) {
            print $out_fh $entry;
            $used++;
        }
        $total++;
    }

    # For a pipe, close() also reports a failed zcat; warn rather than die so
    # the statistics below are still printed.
    close($table_fh)
        or warn("problem while reading $file: " . ($! || "exit status $?") . "\n");
    # Buffered write errors only surface when the output handle is closed.
    close($out_fh)
        or die("could not finish writing $new_file: $!");

    # An empty table has no meaningful percentage; report 0 instead of
    # dying with a division by zero.
    my $percent = $total ? 100*$used/$total : 0;
    printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",$percent,'%';
}
|