Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/training/combine_factors.pl')
-rwxr-xr-xscripts/training/combine_factors.pl74
1 files changed, 74 insertions, 0 deletions
diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl
new file mode 100755
index 000000000..d8aa7297e
--- /dev/null
+++ b/scripts/training/combine_factors.pl
@@ -0,0 +1,74 @@
+#!/usr/bin/perl
+
+# $Id: combine_factors.pl 1307 2007-03-14 22:22:36Z hieuhoang1972 $
+# given a list of files, combines them to a single corpus (sent to stdout)
+
+use strict;
+use warnings;
+use Getopt::Long;
+use IO::File;
+use File::Basename;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+my @addfactors = @ARGV;
+die "usage: combine_factors.pl corpusfile1 corpusfile2 ..."
+ if 0 == scalar @addfactors;
+
+my @streams = map {
+ my $fn = $_;
+ my $opn = ($fn =~ /\.gz$/ ? "zcat $fn |" : "$fn");
+ my $stream = new IO::File;
+ $stream->open($opn) or die "Can't open '$opn'";
+ binmode($stream, ":utf8");
+ $stream;
+} @addfactors;
+
+my $nr=0;
+my $firststream = shift @streams;
+shift @addfactors; # just to keep the lengths sync'ed
+$_ = readline($firststream);
+while (defined $_) {
+ $nr++;
+ print STDERR "." if $nr % 10000 == 0;
+ print STDERR "($nr)" if $nr % 100000 == 0;
+ chomp;
+ my @intokens = split / /;
+ # load lines of corresponding streams and ensure equal number of words
+ my @lines_of_extratoks;
+ foreach my $factor (0..$#streams) {
+ my $line = readline($streams[$factor]);
+ die "Additional factor file $addfactors[$factor] contains too few sentences!"
+ if !defined $line;
+ chomp($line);
+ my @toks = split / /, $line;
+ die "Incompatible number of words in factor $factor on line $nr."
+ if $#toks != $#intokens;
+ $lines_of_extratoks[$factor] = \@toks;
+ }
+
+ # for every token, print the factors in the order as user wished
+ for(my $i=0; $i<=$#intokens; $i++) {
+ my $token = $intokens[$i];
+ my @outtoken = ();
+ push @outtoken, $token; # add the first one
+ # print STDERR "Token: $token\n";
+ foreach my $factor (0..$#streams) {
+ my $f = $lines_of_extratoks[$factor]->[$i];
+ die "Missed factor value for word $i+1 on line $nr in $addfactors[$factor]"
+ if !defined $f || $f eq "";
+ push @outtoken, $f;
+ }
+ print " " if $i != 0;
+ print join("|", @outtoken);
+ }
+ print "\n";
+ $_ = readline($firststream);
+}
+close $firststream;
+print STDERR "Done.\n";
+
+
+