#!/usr/bin/perl

# $Id$
#
# Builds generation tables from a factored corpus.
#
# Each whitespace-separated token of the corpus is expected to be a
# "|"-separated list of factors.  For every generation step given via
# --generation-factors (e.g. "0-1" or "0-1+0,1-1,2") the script counts how
# often each source-factor combination co-occurs with each target-factor
# combination and writes one gzip-compressed table per step to
# "<output>.<step>.gz".
#
# Options:
#   --corpus=FILE              factored input corpus (required)
#   --output=PREFIX            output file prefix (default: "generation")
#   --generation-factors=SPEC  generation steps, "+"-separated (required)

use strict;
use warnings;
use Getopt::Long "GetOptions";

my $_CORPUS;
my $_OUTPUT = "generation";
my $_GENERATION_FACTORS;

GetOptions('corpus=s'             => \$_CORPUS,
           'output=s'             => \$_OUTPUT,
           'generation-factors=s' => \$_GENERATION_FACTORS)
    or die "specify options";

die "Please use --corpus to specify the factored input corpus\n"
    unless $_CORPUS;
die "Please use --generation-factors to set generation factors\n"
    unless defined $_GENERATION_FACTORS;

# An empty (or "0") specification falls back to the identity step "0-0".
my $___GENERATION_FACTORS = $_GENERATION_FACTORS || "0-0";

# Anchored with \A/\z so that a stray trailing newline cannot leak into the
# factor list and, from there, into the output file name.
die("format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n")
    if $___GENERATION_FACTORS !~ /\A\d+(?:,\d+)*-\d+(?:,\d+)*(?:\+\d+(?:,\d+)*-\d+(?:,\d+)*)*\z/;

print "output=$_OUTPUT.\n";

get_generation_factored();

print "Done\n";
exit 0;

# Runs get_generation() once for every "+"-separated step of the
# generation-factor specification.
sub get_generation_factored {
    print STDERR "(8) learn generation model @ ".`date`;
    foreach my $factor (split(/\+/, $___GENERATION_FACTORS)) {
        my ($factor_e_source, $factor_e) = split(/-/, $factor);
        get_generation($factor, $factor_e_source, $factor_e);
    }
    return;
}

# Builds and writes the generation table for a single step.
#
#   $factor          - the step as written by the user (e.g. "0,1-2"); used
#                      as the suffix of the output file name
#   $factor_e_source - comma-separated indices of the source factors
#   $factor_e        - comma-separated indices of the target factors
#
# Writes "<output>.<factor>" with one line per (source, target) pair:
#   source target count/total(source) count/total(target)
# and then compresses it with gzip.  Dies on any I/O or gzip failure.
sub get_generation {
    my ($factor, $factor_e_source, $factor_e) = @_;
    print STDERR "(8) [$factor] generate generation table @ ".`date`;

    # The index lists do not change per token, so split them only once.
    my @source_indices = split(/,/, $factor_e_source);
    my @target_indices = split(/,/, $factor_e);

    my (%GENERATION, %GENERATION_TOTAL_SOURCE, %GENERATION_TOTAL_TARGET);

    open(my $corpus_fh, '<', $_CORPUS)
        or die "Can't read $_CORPUS: $!";
    # Read the corpus line by line.  (The loop previously had an empty
    # condition, which Perl treats as always true, so no input was ever
    # read and the loop never terminated.)
    while (my $line = <$corpus_fh>) {
        chomp $line;
        foreach my $token (split(' ', $line)) {
            my @FACTOR = split(/\|/, $token);

            # A token with fewer factors than requested contributes an
            # empty string for each missing factor.
            my $source = join('|', map { defined $FACTOR[$_] ? $FACTOR[$_] : '' }
                                   @source_indices);
            my $target = join('|', map { defined $FACTOR[$_] ? $FACTOR[$_] : '' }
                                   @target_indices);

            $GENERATION{$source}{$target}++;
            $GENERATION_TOTAL_SOURCE{$source}++;
            $GENERATION_TOTAL_TARGET{$target}++;
        }
    }
    close($corpus_fh);

    my $table_path = "$_OUTPUT.$factor";
    open(my $gen_fh, '>', $table_path)
        or die "Can't write $table_path: $!";
    # Keys are sorted so that repeated runs produce identical tables.
    foreach my $source (sort keys %GENERATION) {
        foreach my $target (sort keys %{ $GENERATION{$source} }) {
            my $count = $GENERATION{$source}{$target};
            printf $gen_fh ("%s %s %.7f %.7f\n", $source, $target,
                            $count / $GENERATION_TOTAL_SOURCE{$source},
                            $count / $GENERATION_TOTAL_TARGET{$target});
        }
    }
    # Buffered write errors only surface at close time.
    close($gen_fh) or die "Can't write $table_path: $!";

    # List-form commands bypass the shell, so unusual characters in the
    # output prefix cannot be interpreted as shell syntax.
    safesystem("rm", "-f", "$table_path.gz") or die;
    safesystem("gzip", $table_path) or die;
    return;
}

# Runs an external command (same argument conventions as system()) and
# logs it to STDERR.
#
# Returns true if the command exited with status 0 and false if it exited
# with a non-zero status or was killed by a signal.  Exits the script with
# status 1 if the command could not be started at all.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    if ($? == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }
    if ($? & 127) {
        # The command text is passed as an argument rather than being
        # interpolated into the format, so a "%" in it is printed verbatim.
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
        # Report failure explicitly; falling off the end here would return
        # printf's true value and make the caller believe all went well.
        return 0;
    }
    my $exitcode = $? >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
}