move giza2bal.pl back into script/training

author: Hieu Hoang <fishandfrolick@gmail.com> 2012-05-31 21:04:29 +0400
committer: Hieu Hoang <fishandfrolick@gmail.com> 2012-05-31 21:04:29 +0400
commit: 48faedb3bdfe1507d19cf2b76fbc81f3be2ff244 (patch)
tree: d2c06136634eb81ae7ec057996c114252dee8ee6 /scripts
parent: a5ca652a766ddb687891adac8e7ef252fa2f430d (diff)
2 files changed, 113 insertions, 1 deletions
diff --git a/scripts/training/giza2bal.pl b/scripts/training/giza2bal.pl
new file mode 100755
index 000000000..553ff2b3e
--- /dev/null
+++ b/scripts/training/giza2bal.pl
@@ -0,0 +1,112 @@
+#! /usr/bin/perl
+
+# $Id$
+#Converts direct and inverted alignments into a more compact 
+#bi-alignment format. It optionally reads the counting file 
+#produced by giza containing the frequency of each training sentence.
+
+#Copyright Marcello Federico, November 2004
+
+($cnt,$dir,$inv)=();
+# Parse command-line switches; each of -d/-i/-c consumes the argument that follows it.
+while ($w=shift @ARGV){
+  $dir=shift(@ARGV),next  if $w eq "-d";
+  $inv=shift(@ARGV),next  if $w eq "-i";
+  $cnt=shift(@ARGV),next  if $w eq "-c";
+} 
+# Number of sentence pairs read so far; ReadBiAlign increments it and reports it on mismatches.
+my $lc = 0;
+# Both alignment inputs are mandatory; only the count file may be omitted.
+if (!$dir || !$inv){
+ print  "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n"; 
+ print  "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
+ exit(0);
+}
+# Autoflush the currently selected output handle (STDOUT unless re-selected).
+$|=1;
+# Each input is tried as a plain file first and, failing that, as a command whose output is read.
+open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n";
+open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $inv\n";
+
+if ($cnt){
+open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $cnt\n";
+}
+
+
+sub ReadBiAlign{ # Load the next sentence pair and its two alignments into the caller's variables.
+    local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_;
+    local($dummy,$n);
+    # $fd0/$fd1/$fd2 name the count, direct and inverse handles; the globs receive the results.
+    chomp($c=<$fd0>); ## count; chomp (not chop) so a final line without newline keeps its last digit
+    $dummy=<$fd0>; ## header
+    $dummy=<$fd0>; ## header
+    $c=1 if !$c; ## a missing, empty or zero count defaults to 1
+
+    $dummy=<$fd1>; ## header
+    chomp($s1=<$fd1>);
+    chomp($t1=<$fd1>);
+
+    $dummy=<$fd2>; ## header
+    chomp($s2=<$fd2>);
+    chomp($t2=<$fd2>);
+
+    @a=@b=();
+    $lc++;
+
+    #get target statistics: $a[p]=k when the k-th remaining token of $t1 lists position p
+    $n=1;
+    $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
+    while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
+        $a[$_]=$n for split(/\s+/,$2);
+        $n++;
+    }
+    #same for the inverse alignment line, filling @b
+    $m=1;
+    $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
+    while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
+        $b[$_]=$m for split(/\s+/,$2);
+        $m++;
+    }
+
+    $M=split(/\s+/,$s1);
+    $N=split(/\s+/,$s2);
+    #the tokens parsed from $t2 must match the fields of $s1 in number, and $t1 those of $s2
+    if ($m != ($M+1) || $n != ($N+1)) {
+      print STDERR "Sentence mismatch error! Line #$lc\n";
+      $s1 = "ALIGN_ERR";
+      $s2 = "ALIGN_ERR";
+      @a=(); @b=();
+      $a[1]=1; #single placeholder entry, so $#a and $#b are both 1
+      $b[1]=1;
+      return 1;
+    }
+    #positions that no token claimed are set to 0
+    for ($j=1;$j<$m;$j++){
+        $a[$j]=0 if !$a[$j];
+    }
+
+    for ($i=1;$i<$n;$i++){
+        $b[$i]=0 if !$b[$i];
+    }
+
+    #always true: a mismatched pair is returned as ALIGN_ERR rather than signalled as a failure
+    return 1;
+}
+
+# Main loop: one output record per sentence pair until the direct alignment input is exhausted.
+($skip,$ccc)=(0,0); # pairs skipped / pairs written
+until (eof(DIR)){
+    # ReadBiAlign stores its results in $c, $src, $tgt, @a and @b via the globs passed here.
+    if (!ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c)){
+        print "\n";
+        $skip++;
+        print STDERR "." unless $skip % 1000;
+        next;
+    }
+    $ccc++;
+    # One record: the count line, then per side "<last index> <sentence> # <entries 1..last>".
+    print "$c\n";
+    print $#a," $src \# @a[1..$#a]\n";
+    print $#b," $tgt \# @b[1..$#b]\n";
+};
+print STDERR "skip=<$skip> counts=<$ccc>\n";
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index b0cb26275..d609200aa 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -266,7 +266,7 @@ my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/../bin/lexical-reordering-score";
 my $MEMSCORE = "$SCRIPTS_ROOTDIR/../bin/memscore";
 my $EPPEX = "$SCRIPTS_ROOTDIR/../bin/eppex";
 my $SYMAL = "$SCRIPTS_ROOTDIR/../bin/symal";
-my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/symal/giza2bal.pl";
+my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/giza2bal.pl";
 
 my $PHRASE_SCORE = "$SCRIPTS_ROOTDIR/../bin/score";
 $PHRASE_SCORE = "$SCRIPTS_ROOTDIR/generic/score-parallel.perl $_CORES \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_SCORE";
author	Hieu Hoang <fishandfrolick@gmail.com>	2012-05-31 21:04:29 +0400
committer	Hieu Hoang <fishandfrolick@gmail.com>	2012-05-31 21:04:29 +0400
commit	48faedb3bdfe1507d19cf2b76fbc81f3be2ff244 (patch)
tree	d2c06136634eb81ae7ec057996c114252dee8ee6 /scripts
parent	a5ca652a766ddb687891adac8e7ef252fa2f430d (diff)