Welcome to mirror list, hosted at ThFree Co, Russian Federation.

giza2bal.pl « scripts « mgizapp - github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: fb134c0eb6648bf8f5296760a80de3fad265b1bf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#! /usr/bin/perl

# $Id: giza2bal.pl 1562 2008-02-19 20:48:14Z redpony $
#Converts direct and inverted alignments into a more compact
#bi-alignment format. It optionally reads the count file
#produced by giza, which contains the frequency of each training sentence.

#Copyright Marcello Federico, November 2004

# Command-line options:
#   -d <file|command>   direct alignment input   (required)
#   -i <file|command>   inverted alignment input (required)
#   -c <file|command>   sentence count input     (optional)
# Any other argument is ignored.
my ($cnt,$dir,$inv);

# Test with defined() so that an argument that is false as a string
# (e.g. "0") does not terminate option parsing early.
while (defined(my $w=shift @ARGV)){
  if    ($w eq "-d"){ $dir=shift(@ARGV); }
  elsif ($w eq "-i"){ $inv=shift(@ARGV); }
  elsif ($w eq "-c"){ $cnt=shift(@ARGV); }
}

# Number of sentence pairs read so far; ReadBiAlign increments it and
# reports it in its mismatch message.
my $lc = 0;

# Both alignment inputs are mandatory. The usage text goes to STDERR with a
# non-zero status so that it neither pollutes the data written to STDOUT nor
# looks like a successful run.
if (!$dir || !$inv){
 print STDERR "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n";
 print STDERR "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
 exit(1);
}

# Unbuffered STDOUT.
$|=1;

# Each input is first tried as a plain file and then as a shell command
# whose output is read through a pipe. The two-argument form of open is kept
# on purpose: it is what lets an argument be either a file name or a command.
open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir: $!\n";
open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $inv: $!\n";

if ($cnt){
open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $cnt: $!\n";
}


# ReadBiAlign(CNT_FH, DIR_FH, INV_FH, *src, *tgt, *a, *b, *c)
#
# Reads the next sentence pair from the count, direct and inverted streams
# and stores the result in the caller's variables, which are passed as
# typeglobs:
#   $src, $tgt  sentence line of the direct / inverted record
#   @a          1-based vector: $a[$p] is the index of the word on the direct
#               alignment line whose ({ ... }) list contains position $p,
#               or 0 when no word lists it
#   @b          the same, built from the inverted record
#   $c          count read from the count stream (1 when absent, empty or 0)
#
# Lines consumed per call:
#   count stream:      one count line, then two lines that are discarded
#   alignment streams: one header line, one sentence line and one alignment
#                      line of the form "NULL ({ ... }) word ({ i j }) ..."
#
# If the number of words on one alignment line does not match the number of
# words in the other stream's sentence, a message is written to STDERR and
# the pair is replaced by the placeholder "ALIGN_ERR" aligned to itself.
#
# Always returns 1.
sub ReadBiAlign{
    local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_;
    my ($header,$t1,$t2);

    # chomp, not chop: a final line without a trailing newline must keep
    # its last character (chop would turn a count of "3" into "").
    chomp($c=<$fd0>); ## count
    $header=<$fd0>;   ## header
    $header=<$fd0>;   ## header
    $c=1 if !$c;

    $header=<$fd1>;   ## header
    chomp($s1=<$fd1>);
    chomp($t1=<$fd1>);

    $header=<$fd2>;   ## header
    chomp($s2=<$fd2>);
    chomp($t2=<$fd2>);

    @a=@b=();
    $lc++;

    # Direct record: drop the NULL entry, then consume one "word ({ ... })"
    # entry at a time; $n ends up as (number of entries + 1).
    my $n=1;
    $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
    while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
        # split ' ' skips the leading blank of the captured list, so no
        # spurious element 0 is written.
        foreach my $pos (split(' ',$2)){ $a[$pos]=$n; }
        $n++;
    }

    # Inverted record, same procedure; $m ends up as (number of entries + 1).
    my $m=1;
    $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
    while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
        foreach my $pos (split(' ',$2)){ $b[$pos]=$m; }
        $m++;
    }

    # Word counts of the two sentence lines.
    my @words1=split(/\s+/,$s1);
    my @words2=split(/\s+/,$s2);
    my $M=@words1;
    my $N=@words2;

    # Each alignment line must carry exactly one entry per word of the
    # other stream's sentence.
    if ($m != ($M+1) || $n != ($N+1)) {
      print STDERR "Sentence mismatch error! Line #$lc\n";
      $s1 = "ALIGN_ERR";
      $s2 = "ALIGN_ERR";
      @a=(); @b=();
      $a[1]=1;
      $b[1]=1;
      return 1;
    }

    # Positions that no entry referred to are marked as unaligned (0).
    $a[$_] ||= 0 for 1..$m-1;
    $b[$_] ||= 0 for 1..$n-1;

    return 1;
}

# Main loop: convert one sentence pair per iteration until the direct
# alignment stream is exhausted, then report totals on STDERR.
#
# Output per converted pair (three lines on STDOUT):
#   <count>
#   <last index of @a> <src sentence> # <a[1]> <a[2]> ...
#   <last index of @b> <tgt sentence> # <b[1]> <b[2]> ...
$skip = 0;   # pairs for which ReadBiAlign returned false
$ccc  = 0;   # pairs written out

until (eof(DIR)) {

    unless (ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c)) {
        # Nothing usable was read: emit an empty line in its place and
        # print a progress dot on STDERR for every 1000 skipped pairs.
        print "\n";
        $skip++;
        print STDERR "." unless $skip % 1000;
        next;
    }

    $ccc++;
    print "$c\n";
    print $#a, " $src # @a[1..$#a]\n";
    print $#b, " $tgt # @b[1..$#b]\n";
}

print STDERR "skip=<$skip> counts=<$ccc>\n";