#!/usr/bin/perl
# $Id: combine_factors.pl 1307 2007-03-14 22:22:36Z hieuhoang1972 $
# given a list of files, combines them to a single corpus (sent to stdout)
use strict;
use warnings;
use Getopt::Long;
use IO::File;
use File::Basename;
# Treat the console streams as UTF-8 so that factored tokens pass through
# without being re-encoded.
binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Every command-line argument names one corpus file. The first file supplies
# the leading factor of each token; each further file supplies one more.
my @addfactors = @ARGV;
die "usage: combine_factors.pl corpusfile1 corpusfile2 ..."
  if 0 == scalar @addfactors;

# Open one UTF-8 read handle per corpus file, in command-line order.
#   *.gz  -> decompressed on the fly through zcat
#   -     -> standard input (what the former two-argument open did)
#   other -> opened literally as a file
# File names are never handed to a shell and never parsed for open-mode
# characters, so names containing spaces, '|', '>' etc. are safe.
my @streams = map {
    my $fn = $_;
    my $stream;
    if ($fn =~ /\.gz$/) {
        # List-form pipe open: $fn reaches zcat as a single argument.
        open($stream, '-|', 'zcat', $fn)
          or die "Can't run zcat on '$fn': $!";
    }
    elsif ($fn eq '-') {
        $stream = \*STDIN;
    }
    else {
        open($stream, '<', $fn)
          or die "Can't open '$fn': $!";
    }
    binmode($stream, ":utf8");
    $stream;
} @addfactors;
# Merge the corpora line by line. For every sentence of the first file, read
# the corresponding sentence from each additional file, check that the token
# counts agree, and print each token as "factor0|factor1|..." with tokens
# separated by single spaces.
my $nr = 0;
my $firststream = shift @streams;
shift @addfactors; # just to keep the lengths sync'ed

while (defined(my $first_line = readline($firststream))) {
    $nr++;
    # Progress indicator on STDERR: a dot every 10k lines, a count every 100k.
    print STDERR "." if $nr % 10000 == 0;
    print STDERR "($nr)" if $nr % 100000 == 0;
    chomp($first_line);
    my @intokens = split / /, $first_line;

    # load lines of corresponding streams and ensure equal number of words
    my @lines_of_extratoks;
    foreach my $factor (0 .. $#streams) {
        my $line = readline($streams[$factor]);
        die "Additional factor file $addfactors[$factor] contains too few sentences!"
          if !defined $line;
        chomp($line);
        my @toks = split / /, $line;
        die "Incompatible number of words in factor $factor on line $nr."
          if $#toks != $#intokens;
        $lines_of_extratoks[$factor] = \@toks;
    }

    # for every token, collect the factors in the order the user wished;
    # the whole line is assembled first so that a failure below never leaves
    # a half-written sentence on STDOUT
    my @outwords;
    foreach my $i (0 .. $#intokens) {
        my @outtoken = ($intokens[$i]); # the first factor comes from file 1
        foreach my $factor (0 .. $#streams) {
            my $f = $lines_of_extratoks[$factor]->[$i];
            # Report the word position 1-based, matching the line counter.
            die "Missed factor value for word " . ($i + 1)
              . " on line $nr in $addfactors[$factor]"
              if !defined $f || $f eq "";
            push @outtoken, $f;
        }
        push @outwords, join("|", @outtoken);
    }
    print join(" ", @outwords), "\n";
}
close $firststream;

# Counterpart of the "too few sentences" check above: an additional file that
# still has data left is almost certainly misaligned with the first corpus.
# Only warn, because the combined output has already been written.
foreach my $factor (0 .. $#streams) {
    warn "Additional factor file $addfactors[$factor] contains more sentences"
       . " than the first corpus file ($nr lines); the surplus was ignored.\n"
      if defined readline($streams[$factor]);
    # Return value deliberately ignored: closing a zcat pipe before EOF
    # reports the child's SIGPIPE as a failure.
    close $streams[$factor];
}
print STDERR "Done.\n";