blob: 683ef1ed78765447d1a55d002e7a92d992906a19 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
#!/usr/bin/env perl
use strict;
# Create domain file from corpora
# (helper for domain adatpation)
# Creates a file with domain names and end line numbers for different domains
# within the cleaned training corpus. This file is used by various domain
# adaptation methods.
my ($extension,@SUBCORPORA) = @ARGV;
my $line_count = 0;
my %UNIQUE_NAME;
my $number = 1;
foreach (@SUBCORPORA) {
# get number of lines
if (!-e "$_.$extension" && -e "$_.$extension.gz") {
$line_count += `zcat $_.$extension.gz | wc -l`;
}
elsif (-e "$_.$extension") {
$line_count += `wc -l < $_.$extension`;
}
else {
die("ERROR: could not open sub corpus file $_.$extension\n");
}
# construct name
my $name = $number++; # default: cardinal number
while(defined($UNIQUE_NAME{$name})) { $name = $number++; } # slightly paranoid
if (/\/([^\.\/]+)\.[^\/]+$/ && !defined($UNIQUE_NAME{$1})) { # reconstruct corpus name
$name = $1;
$UNIQUE_NAME{$1}++;
}
print "$line_count $name\n";
}
|