diff options
Diffstat (limited to 'experimental/dual-model/MGIZA/scripts/giza2bal.pl')
-rwxr-xr-x | experimental/dual-model/MGIZA/scripts/giza2bal.pl | 112 |
1 files changed, 112 insertions, 0 deletions
#! /usr/bin/perl

# $Id: giza2bal.pl 1562 2008-02-19 20:48:14Z redpony $
# Converts direct and inverted alignments into a more compact
# bi-alignment format. It optionally reads the counting file
# produced by giza containing the frequency of each training sentence.
#
# Output, per sentence (three lines on STDOUT):
#   <count>
#   <last index of a> <sentence from -d file> # a[1] a[2] ...
#   <last index of b> <sentence from -i file> # b[1] b[2] ...
# where a[p] is the 1-based index of the word (in the alignment line of the
# -d file) whose position list contains p, or 0 if no word lists p; b is
# built the same way from the -i file.

# Copyright Marcello Federico, November 2004

use strict;
use warnings;

my ($cnt, $dir, $inv);

# Unknown arguments are silently ignored, as before.  Testing definedness
# (rather than truth) keeps an argument such as "0" from ending the loop.
while (defined(my $w = shift @ARGV)) {
    if    ($w eq "-d") { $dir = shift @ARGV; }
    elsif ($w eq "-i") { $inv = shift @ARGV; }
    elsif ($w eq "-c") { $cnt = shift @ARGV; }
}

# Number of sentence records read so far; used in mismatch diagnostics.
my $lc = 0;

# Both alignment files are mandatory.  The usage text goes to STDERR with a
# non-zero status so that it can never be mistaken for bi-alignment output.
if (!$dir || !$inv) {
    print STDERR "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n";
    print STDERR "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
    exit(1);
}

$| = 1;

# Opens $source for reading and returns the filehandle.  $source is first
# tried as a plain file; if that fails it is run as a command and its
# standard output is read through a pipe.  Dies naming $source on failure.
sub open_input {
    my ($source) = @_;
    my $fh;
    open($fh, '<', $source)
        or open($fh, '-|', $source)
        or die "cannot open $source\n";
    return $fh;
}

# Reads one line from $fh and returns it without its trailing newline.
# Returns the empty string at end of file.
sub read_line {
    my ($fh) = @_;
    my $line = <$fh>;
    return '' unless defined $line;
    chomp $line;
    return $line;
}

# Returns the number of fields obtained by splitting $text on whitespace.
sub count_fields {
    my ($text) = @_;
    my @fields = split(/\s+/, $text);
    return scalar @fields;
}

# Parses an alignment line of the form
#   NULL ({ 3 }) word1 ({ 1 2 }) word2 ({ }) ...
# Returns (\@align, $words): $align[p] is the 1-based index of the word whose
# position list contains p, and $words is the number of words parsed (the
# NULL entry is discarded and not counted).
sub parse_alignment {
    my ($line) = @_;
    my @align;
    my $words = 0;

    $line =~ s/NULL \(\{((\s+\d+)*)\s+\}\)//;
    while ($line =~ s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//) {
        $words++;
        # split ' ' skips the leading blank of the captured position list.
        $align[$_] = $words for split(' ', $2);
    }
    return (\@align, $words);
}

# Reads one sentence record: three lines from each alignment handle and,
# when $cnt_fh is defined, three lines from the count handle.
#
# Returns a hash reference with keys:
#   count - sentence count (1 when absent, empty or zero)
#   src   - sentence line read from $dir_fh, or "ALIGN_ERR"
#   tgt   - sentence line read from $inv_fh, or "ALIGN_ERR"
#   a, b  - array references holding the alignment vectors (index 0 unused)
# A record is always returned; on a length mismatch a diagnostic is written
# to STDERR and a placeholder "ALIGN_ERR" record is produced instead.
sub ReadBiAlign {
    my ($cnt_fh, $dir_fh, $inv_fh) = @_;

    my $count;
    if (defined $cnt_fh) {
        $count = read_line($cnt_fh);    ## count
        read_line($cnt_fh);             ## header
        read_line($cnt_fh);             ## header
    }
    $count = 1 if !$count;

    read_line($dir_fh);                 ## header
    my $s1 = read_line($dir_fh);
    my $t1 = read_line($dir_fh);

    read_line($inv_fh);                 ## header
    my $s2 = read_line($inv_fh);
    my $t2 = read_line($inv_fh);

    $lc++;

    my ($a, $t1_words) = parse_alignment($t1);
    my ($b, $t2_words) = parse_alignment($t2);

    # Each file's sentence line must have as many words as the other file's
    # alignment line; otherwise the two files are out of step.
    if ($t2_words != count_fields($s1) || $t1_words != count_fields($s2)) {
        print STDERR "Sentence mismatch error! Line #$lc\n";
        return {
            count => $count,
            src   => "ALIGN_ERR",
            tgt   => "ALIGN_ERR",
            a     => [undef, 1],
            b     => [undef, 1],
        };
    }

    # Positions that no word points at are marked with 0.
    $a->[$_] ||= 0 for 1 .. $t2_words;
    $b->[$_] ||= 0 for 1 .. $t1_words;

    return { count => $count, src => $s1, tgt => $s2, a => $a, b => $b };
}

# Formats one output line: "<last index> <sentence> # v[1] v[2] ...\n".
sub format_side {
    my ($sentence, $align) = @_;
    my $last = $#{$align};
    my @values = map { defined $_ ? $_ : '' } @{$align}[1 .. $last];
    return "$last $sentence # " . join(' ', @values) . "\n";
}

my $dir_fh = open_input($dir);
my $inv_fh = open_input($inv);
my $cnt_fh = $cnt ? open_input($cnt) : undef;

my $skip = 0;
my $ccc  = 0;
while (!eof($dir_fh)) {
    # ReadBiAlign currently always yields a record, so the else branch is
    # only a safeguard; it is kept so the summary line stays meaningful.
    if (my $rec = ReadBiAlign($cnt_fh, $dir_fh, $inv_fh)) {
        $ccc++;
        print "$rec->{count}\n";
        print format_side($rec->{src}, $rec->{a});
        print format_side($rec->{tgt}, $rec->{b});
    }
    else {
        print "\n";
        print STDERR "." if !(++$skip % 1000);
    }
}
print STDERR "skip=<$skip> counts=<$ccc>\n";