Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-01 02:07:34 +0400
committer eherbst <eherbst@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-01 02:07:34 +0400
commit 3b46c17ace935653275697beaafeea393c59ce76 (patch)
tree 954986d459c8379084cf21a6746da7e0791087d5 /scripts/analysis
parent 5cce8336c076371dfa30c5cd37c29bebb56a558a (diff)
believe Ondrej has a script w/same functionality; will investigate
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@428 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/analysis')
-rw-r--r-- scripts/analysis/smtgui/filter-phrase-table.pl 79
1 files changed, 0 insertions, 79 deletions
diff --git a/scripts/analysis/smtgui/filter-phrase-table.pl b/scripts/analysis/smtgui/filter-phrase-table.pl
deleted file mode 100644
index 85f325935..000000000
--- a/scripts/analysis/smtgui/filter-phrase-table.pl
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/usr/bin/perl -w
-
-#filter phrase tables
-#arguments: phrasetable_filename input_filename factor_index (0...)
-#outputs to phrasetable_filename.short
-
-use strict;
-
-my $MAX_LENGTH = 10;
-
-my ($file, $input, $source_factor) = @ARGV;
-my $dir = ".";
-
- # get tables to be filtered (and modify config file)
- my (@TABLE,@TABLE_FACTORS,@TABLE_NEW_NAME,%CONSIDER_FACTORS);
- push @TABLE,$file;
-
- my $new_name = "$file.short";#"$dir/phrase-table.$source_factor";
- push @TABLE_NEW_NAME,$new_name;
-
- $CONSIDER_FACTORS{$source_factor} = 1;
- push @TABLE_FACTORS,$source_factor;
-
- # get the phrase pairs appearing in the input text
- my %PHRASE_USED;
- die("could not find input file $input") unless -e $input;
- open(INPUT,$input);
- while(my $line = <INPUT>) {
- chop($line);
- my @WORD = split(/ +/,$line);
- for(my $i=0;$i<=$#WORD;$i++) {
- for(my $j=0;$j<$MAX_LENGTH && $j+$i<=$#WORD;$j++) {
- foreach (keys %CONSIDER_FACTORS) {
- my @FACTOR = split(/,/);
- my $phrase = "";
- for(my $k=$i;$k<=$i+$j;$k++) {
- my @WORD_FACTOR = split(/\|/,$WORD[$k]);
- for(my $f=0;$f<=$#FACTOR;$f++) {
- $phrase .= $WORD_FACTOR[$FACTOR[$f]]."|";
- }
- chop($phrase);
- $phrase .= " ";
- }
- chop($phrase);
- $PHRASE_USED{$_}{$phrase}++;
- }
- }
- }
- }
- close(INPUT);
-
- # filter files
- for(my $i=0;$i<=$#TABLE;$i++) {
- my ($used,$total) = (0,0);
- my $file = $TABLE[$i];
- my $factors = $TABLE_FACTORS[$i];
- my $new_file = $TABLE_NEW_NAME[$i];
- print STDERR "filtering $file -> $new_file...\n";
-
- if (-e $file && $file =~ /\.gz$/) { open(FILE,"zcat $file |"); }
- elsif (! -e $file && -e "$file.gz") { open(FILE,"zcat $file.gz|"); }
- elsif (-e $file) { open(FILE,$file); }
- else { die("could not find model file $file"); }
-
- open(FILE_OUT,">$new_file");
-
- while(my $entry = <FILE>) {
- my ($foreign,$rest) = split(/ \|\|\| /,$entry,2);
- $foreign =~ s/ $//;
- if (defined($PHRASE_USED{$factors}{$foreign})) {
- print FILE_OUT $entry;
- $used++;
- }
- $total++;
- }
- close(FILE);
- close(FILE_OUT);
- printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';
- }