Welcome to mirror list, hosted at ThFree Co, Russian Federation.

build-generation-table.perl « training « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: fb59f4acc2264c6e5f4027532f395f3199fcc5e2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env perl 

# $Id$
use warnings;
use strict;
use Getopt::Long "GetOptions";

# Command-line settings; GetOptions fills these in below.
my $_CORPUS;
my $_GENERATION_FACTORS;
my $_OUTPUT = "generation";

GetOptions(
    'corpus=s'             => \$_CORPUS,
    'output=s'             => \$_OUTPUT,
    'generation-factors=s' => \$_GENERATION_FACTORS,
) or die "specify options";

# Both --corpus and --generation-factors are mandatory.
if (!$_CORPUS) {
    die "Please use --corpus to specify the factored input corpus\n";
}
die "Please use --generation-factors to set generation factors\n"
    unless defined $_GENERATION_FACTORS;

# A defined-but-false value (empty string or "0") falls back to "0-0".
my $___GENERATION_FACTORS = $_GENERATION_FACTORS ? $_GENERATION_FACTORS : "0-0";

# The specification is one or more "source-target" mappings joined by "+",
# where each side is a comma-separated list of factor indices.
if ($___GENERATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/) {
    die("format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n");
}

print "output=$_OUTPUT.<factor-map>\n";

get_generation_factored();

print "Done\n";
exit 0;

# Build one generation table for every mapping in $___GENERATION_FACTORS.
# The specification is split on "+" into mappings; each mapping is split on
# "-" into its source and target factor lists, and all three strings are
# handed to get_generation.
sub get_generation_factored {
    my $started_at = `date`;
    print STDERR "(8) learn generation model @ ", $started_at;

    for my $mapping (split(/\+/, $___GENERATION_FACTORS)) {
        my ($source_factors, $target_factors) = split(/\-/, $mapping);
        get_generation($mapping, $source_factors, $target_factors);
    }
}


# Build the generation table for one factor mapping and leave it, gzipped,
# in "$_OUTPUT.$factor.gz".
#
# Arguments:
#   $factor          - the mapping as written in the specification
#                      (e.g. "0-1"); used as the output file name suffix
#   $factor_e_source - comma-separated indices of the source factors
#   $factor_e        - comma-separated indices of the target factors
#
# Every whitespace-separated token of the corpus file ($_CORPUS) is split on
# "|" into factors. The selected source factors and the selected target
# factors are each re-joined with "|", and their co-occurrences are counted.
# Each output line has the form
#   source target count(source,target)/count(source) count(source,target)/count(target)
#
# Dies if the corpus cannot be read, if the table cannot be written, or if
# removing a stale archive or compressing the new table fails.
sub get_generation {
    my ($factor, $factor_e_source, $factor_e) = @_;

    print STDERR "(8) [$factor] generate generation table @ ".`date`;

    # The factor index lists are the same for every token, so split them
    # once here instead of once per token.
    my @source_index = split(/,/, $factor_e_source);
    my @target_index = split(/,/, $factor_e);

    my (%GENERATION, %GENERATION_TOTAL_SOURCE, %GENERATION_TOTAL_TARGET);

    # Three-argument open on a lexical handle: the corpus name is always
    # treated as a plain file name, never as a mode or a command.
    open(my $corpus_fh, '<', $_CORPUS) or die "Can't read $_CORPUS: $!";
    while (my $line = <$corpus_fh>) {
        chomp $line;
        foreach my $token (split(' ', $line)) {
            my @FACTOR = split(/\|/, $token);

            my $source = join("|", @FACTOR[@source_index]);
            my $target = join("|", @FACTOR[@target_index]);

            $GENERATION{$source}{$target}++;
            $GENERATION_TOTAL_SOURCE{$source}++;
            $GENERATION_TOTAL_TARGET{$target}++;
        }
    }
    close($corpus_fh);

    my $table = "$_OUTPUT.$factor";
    open(my $table_fh, '>', $table) or die "Can't write $table: $!";
    # Sorted keys make the table reproducible from run to run; the set of
    # lines written is the same as with unsorted iteration.
    foreach my $source (sort keys %GENERATION) {
        my $count_for = $GENERATION{$source};
        foreach my $target (sort keys %{$count_for}) {
            printf {$table_fh} "%s %s %.7f %.7f\n", $source, $target,
                $count_for->{$target} / $GENERATION_TOTAL_SOURCE{$source},
                $count_for->{$target} / $GENERATION_TOTAL_TARGET{$target};
        }
    }
    # Buffered write errors (e.g. a full disk) only surface at close.
    close($table_fh) or die "Can't write $table: $!";

    # List form keeps the shell out of it, so an output path containing
    # spaces or shell metacharacters is passed through as a single argument.
    safesystem("rm", "-f", "$table.gz") or die;
    safesystem("gzip", $table) or die;
}

# Run an external command with system() and report the outcome on STDERR.
#
# The arguments are passed to system() unchanged, so callers may give either
# a single command string or a program followed by its argument list.
#
# Returns true if the command exited with status 0, and false if it exited
# with a non-zero status or was terminated by a signal. If the command could
# not be started at all, the whole script exits with status 1.
sub safesystem {
  print STDERR "Executing: @_\n";
  system(@_);
  my $status = $?;
  if ($status == -1) {
      print STDERR "Failed to execute: @_\n  $!\n";
      exit(1);
  }
  elsif ($status & 127) {
      # The command text goes in as a %s argument, not into the format
      # string, so a "%" inside the command cannot corrupt the message.
      printf STDERR "Execution of: %s\n  died with signal %d, %s coredump\n",
          "@_", ($status & 127), ($status & 128) ? 'with' : 'without';
      # Report failure explicitly. Falling off the end here would return
      # printf's true result, and "safesystem(...) or die" would carry on.
      return 0;
  }
  else {
    my $exitcode = $status >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
  }
}