1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
|
#!/usr/bin/env perl
# $Id$
use warnings;
use strict;
use Getopt::Long "GetOptions";
# Option values; GetOptions fills these in from the command line.
my $_CORPUS;
my $_OUTPUT = "generation";
my $_GENERATION_FACTORS;

GetOptions(
    'corpus=s'             => \$_CORPUS,
    'output=s'             => \$_OUTPUT,
    'generation-factors=s' => \$_GENERATION_FACTORS,
) or die "specify options";

# Both --corpus and --generation-factors are mandatory.
if (!$_CORPUS) {
    die "Please use --corpus to specify the factored input corpus\n";
}
die "Please use --generation-factors to set generation factors\n"
    unless defined $_GENERATION_FACTORS;

# A defined-but-false value (empty string or "0") falls back to "0-0".
my $___GENERATION_FACTORS = $_GENERATION_FACTORS ? $_GENERATION_FACTORS : "0-0";

# Accepted shape: one or more "+"-separated mappings, each being two
# comma-separated lists of factor indices joined by "-".
unless ($___GENERATION_FACTORS =~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/) {
    die("format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n");
}

# Announce the output naming scheme, build every table, then finish.
print "output=$_OUTPUT.<factor-map>\n";
get_generation_factored();
print "Done\n";
exit 0;
# Build one generation table for each mapping in the "+"-separated
# generation-factor specification ($___GENERATION_FACTORS).
#
# Each mapping has the form "<source factors>-<target factors>"; the whole
# mapping string plus its two halves are handed to get_generation().
# Progress is logged to STDERR together with the output of `date`.
sub get_generation_factored {
    print STDERR "(8) learn generation model @ ".`date`;
    my @mappings = split(/\+/, $___GENERATION_FACTORS);
    for my $mapping (@mappings) {
        my ($source_factors, $target_factors) = split(/\-/, $mapping);
        get_generation($mapping, $source_factors, $target_factors);
    }
}
# Build the generation table for one factor mapping and leave it, gzipped,
# in "$_OUTPUT.$factor.gz".
#
# Arguments:
#   $factor          - the mapping string (e.g. "0-1"); used as the suffix
#                      of the output file name
#   $factor_e_source - comma-separated indices of the source factors
#   $factor_e        - comma-separated indices of the target factors
#
# Every whitespace-separated token of the corpus file ($_CORPUS) is split on
# "|" into factors.  The selected source factors and the selected target
# factors are each re-joined with "|" and the (source, target) pair is
# counted.  One line is written per distinct pair:
#
#   source target count/total(source) count/total(target)
#
# A token with fewer factors than an index asks for contributes an empty
# string for that factor (Perl emits an "uninitialized" warning).
#
# Dies if the corpus cannot be read, the table cannot be written, or the
# rm/gzip commands do not succeed.
sub get_generation {
    my ($factor, $factor_e_source, $factor_e) = @_;
    print STDERR "(8) [$factor] generate generation table @ ".`date`;

    # The index lists are the same for every token, so split them only once.
    my @source_idx = split(/,/, $factor_e_source);
    my @target_idx = split(/,/, $factor_e);

    my (%GENERATION, %GENERATION_TOTAL_SOURCE, %GENERATION_TOTAL_TARGET);

    # Three-argument open: the corpus path is never interpreted as a mode.
    open(my $corpus_fh, '<', $_CORPUS) or die "Can't read $_CORPUS: $!";
    while (my $line = <$corpus_fh>) {
        chomp $line;
        foreach my $token (split(' ', $line)) {
            my @FACTOR = split(/\|/, $token);
            my $source = join("|", @FACTOR[@source_idx]);
            my $target = join("|", @FACTOR[@target_idx]);
            $GENERATION{$source}{$target}++;
            $GENERATION_TOTAL_SOURCE{$source}++;
            $GENERATION_TOTAL_TARGET{$target}++;
        }
    }
    close($corpus_fh);

    my $table = "$_OUTPUT.$factor";
    open(my $gen_fh, '>', $table) or die "Can't write $table: $!";
    foreach my $source (keys %GENERATION) {
        foreach my $target (keys %{$GENERATION{$source}}) {
            my $count = $GENERATION{$source}{$target};
            printf {$gen_fh} "%s %s %.7f %.7f\n", $source, $target,
                $count / $GENERATION_TOTAL_SOURCE{$source},
                $count / $GENERATION_TOTAL_TARGET{$target};
        }
    }
    # Buffered write errors only surface at close; check before compressing.
    close($gen_fh) or die "Can't finish writing $table: $!";

    # List form keeps the file names away from the shell.
    safesystem("rm", "-f", "$table.gz") or die "Can't remove $table.gz\n";
    safesystem("gzip", $table) or die "Can't gzip $table\n";
}
# Run an external command with system() and report the outcome on STDERR.
#
# Arguments are passed to system() unchanged, so both the single-string form
# and the list form (program, args...) are accepted.
#
# Returns a true value when the command exited with status 0, and a false
# value when it exited with a non-zero status or was terminated by a signal.
# If the command could not be started at all, the whole script exits with
# status 1.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    # Capture the wait status once so later calls cannot clobber it.
    my $status = $?;

    if ($status == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }

    if ($status & 127) {
        # The command text is passed as data (%s), not as part of the format,
        # so a "%" inside it cannot corrupt the message.
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($status & 127), ($status & 128) ? 'with' : 'without';
        # Report failure explicitly; otherwise printf's true return value
        # would be the sub's result and callers would see success.
        return 0;
    }

    my $exitcode = $status >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
}
|