Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'experimental/dual-model/MGIZA/scripts/giza2bal.pl')
-rwxr-xr-xexperimental/dual-model/MGIZA/scripts/giza2bal.pl112
1 files changed, 112 insertions, 0 deletions
diff --git a/experimental/dual-model/MGIZA/scripts/giza2bal.pl b/experimental/dual-model/MGIZA/scripts/giza2bal.pl
new file mode 100755
index 0000000..fb134c0
--- /dev/null
+++ b/experimental/dual-model/MGIZA/scripts/giza2bal.pl
@@ -0,0 +1,112 @@
+#! /usr/bin/perl
+
+# $Id: giza2bal.pl 1562 2008-02-19 20:48:14Z redpony $
+#Converts direct and inverted alignments into a more compact
+#bi-alignment format. It optionally reads the counting file
+#produced by giza containing the frequency of each traning sentence.
+
+#Copyright Marcello Federico, November 2004
+
+($cnt,$dir,$inv)=();
+
+while ($w=shift @ARGV){
+ $dir=shift(@ARGV),next if $w eq "-d";
+ $inv=shift(@ARGV),next if $w eq "-i";
+ $cnt=shift(@ARGV),next if $w eq "-c";
+}
+
+my $lc = 0;
+
+if (!$dir || !inv){
+ print "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n";
+ print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
+ exit(0);
+}
+
+$|=1;
+
+open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n";
+open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $dir\n";
+
+if ($cnt){
+open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $dir\n";
+}
+
+
+sub ReadBiAlign{
+ local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_;
+ local($dummy,$n);
+
+ chop($c=<$fd0>); ## count
+ $dummy=<$fd0>; ## header
+ $dummy=<$fd0>; ## header
+ $c=1 if !$c;
+
+ $dummy=<$fd1>; ## header
+ chop($s1=<$fd1>);
+ chop($t1=<$fd1>);
+
+ $dummy=<$fd2>; ## header
+ chop($s2=<$fd2>);
+ chop($t2=<$fd2>);
+
+ @a=@b=();
+ $lc++;
+
+ #get target statistics
+ $n=1;
+ $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
+ while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
+ grep($a[$_]=$n,split(/\s+/,$2));
+ $n++;
+ }
+
+ $m=1;
+ $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
+ while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
+ grep($b[$_]=$m,split(/\s+/,$2));
+ $m++;
+ }
+
+ $M=split(/\s+/,$s1);
+ $N=split(/\s+/,$s2);
+
+ if ($m != ($M+1) || $n != ($N+1)) {
+ print STDERR "Sentence mismatch error! Line #$lc\n";
+ $s1 = "ALIGN_ERR";
+ $s2 = "ALIGN_ERR";
+ @a=(); @b=();
+ for ($j=1;$j<2;$j++){ $a[$j]=1; }
+ for ($i=1;$i<2;$i++){ $b[$i]=1; }
+ return 1;
+ }
+
+ for ($j=1;$j<$m;$j++){
+ $a[$j]=0 if !$a[$j];
+ }
+
+ for ($i=1;$i<$n;$i++){
+ $b[$i]=0 if !$b[$i];
+ }
+
+
+ return 1;
+}
+
+$skip=0;
+$ccc=0;
+while(!eof(DIR)){
+
+ if (ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c))
+ {
+ $ccc++;
+ print "$c\n";
+ print $#a," $src \# @a[1..$#a]\n";
+ print $#b," $tgt \# @b[1..$#b]\n";
+ }
+ else{
+ print "\n";
+ print STDERR "." if !(++$skip % 1000);
+ }
+};
+print STDERR "skip=<$skip> counts=<$ccc>\n";