1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
#!/usr/bin/env perl
use strict;
# handle switches
#
# Required: -tree-tagger INSTALL_DIR, -l LANGUAGE, and two positional
# arguments (input file, output file).
# Optional: -basic (truncate tags at the first ':'), -stem (emit lemmas
# instead of tags).
# On any parsing problem or missing required value, the usage line is printed
# and the script exits with status 1.
use Getopt::Long "GetOptions";
my ($IN,$OUT,$TREE_TAGGER,$BASIC,$STEM,$LANGUAGE);

my $options_ok = GetOptions('tree-tagger=s' => \$TREE_TAGGER,
                            'basic'         => \$BASIC,
                            'stem'          => \$STEM,
                            'l=s'           => \$LANGUAGE);

# GetOptions has removed the switches it recognised from @ARGV; what remains
# are the positional file names.
$IN  = shift @ARGV;
$OUT = shift @ARGV;

# Test for "defined and non-empty" rather than plain truthiness so that a
# file that happens to be named "0" is still accepted.
my $have_files = defined($IN)  && length($IN)
              && defined($OUT) && length($OUT);

if (!$options_ok || !$have_files || !defined($TREE_TAGGER) || !defined($LANGUAGE)) {
    # Usage errors are diagnostics, so they go to STDERR.
    print STDERR "syntax: make-pos.tree-tagger.perl -tree-tagger INSTALL_DIR -l LANGUAGE IN_FILE OUT_FILE [-basic] [-stem]\n";
    exit(1);
}
# define the model file for the given language
#
# Lookup table from the language code given with -l to the base name of the
# tagger parameter file expected under INSTALL_DIR/lib.
my %PARAMETER_FILE_FOR = (
    en => "english",
    fr => "french-utf8",
    es => "spanish",
    de => "german-utf8",
    it => "italian-utf8",
    nl => "dutch",
    bg => "bulgarian-utf8",
    el => "greek",
);
die("Unknown language '$LANGUAGE'") unless exists $PARAMETER_FILE_FOR{$LANGUAGE};

# Full path of the parameter file handed to the tagger binary.
my $MODEL = "$TREE_TAGGER/lib/$PARAMETER_FILE_FOR{$LANGUAGE}.par";
# define encoding conversion into Latin1 or Greek if required
#
# $CONV is either empty or a shell filter (ending in "|") that is spliced into
# the tagger pipeline to re-encode the UTF-8 input:
#   - Greek ("el")                          -> iso-8859-7
#   - parameter files without "utf8" in
#     their name (except Bulgarian)         -> iso-8859-1
#   - everything else                       -> no conversion
my $CONV = "";
#$CONV = "iconv --unicode-subst=X -f utf8 -t iso-8859-1|"

# Only the parameter file's own name says whether the model is UTF-8. $MODEL
# is a full path that starts with the install directory, so matching /utf8/
# against the whole path would wrongly skip the conversion whenever the
# install directory itself contains "utf8".
(my $model_file = $MODEL) =~ s{.*/}{};

my $target_charset;
if ($LANGUAGE eq "el") {
    $target_charset = "iso-8859-7";
}
elsif ($model_file !~ /utf8/ && $LANGUAGE ne "bg") {
    $target_charset = "iso-8859-1";
}

$CONV = "perl -ne 'use Encode; print encode(\"$target_charset\", decode(\"utf8\", \$_));' |"
    if defined $target_charset;
# pipe in data into tagger, process its output
#
# Shell pipeline:
#   cat IN | [optional re-encoding filter in $CONV]
#          | perl one-liner: one token per line, plus an end-of-sentence
#            marker token after every input line
#          | tree-tagger -token -lemma -sgml MODEL
# Each tagger output line is split on whitespace into (word, tag, stem).
# The marker token is turned back into a newline, so the output file has one
# line per input line, holding either the tags or (with -stem) the stems,
# separated by single spaces.
#
# NOTE(review): when $CONV re-encodes the input, the tagger output is written
# unchanged here, so stems are presumably left in the tagger's encoding
# rather than UTF-8 -- confirm that downstream consumers expect this.

# Quote a string for safe use as a single word in the shell command, so that
# paths containing spaces or shell metacharacters do not break the pipeline.
my $shell_quote = sub {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
};

my $tagger_command =
    "cat ".$shell_quote->($IN)." | $CONV".
    "perl -ne 'foreach(split){print \$_.\"\n\";}print \"eND_oF_SeNTeNCe\n\";'|".
    $shell_quote->("$TREE_TAGGER/bin/tree-tagger")." -token -lemma -sgml ".$shell_quote->($MODEL);

open(my $tagger, "-|", $tagger_command)
    or die("Cannot start tagger pipeline: $!");
open(my $out, ">", $OUT)
    or die("Cannot open '$OUT' for writing: $!");

my $first = 1;    # true while no token has been written on the current line
while (my $line = <$tagger>) {
    my ($word,$tag,$stem) = split(' ', $line);

    # Lines with fewer than three fields leave some of these undefined;
    # treat the missing fields as empty strings, which is what printing an
    # undefined value produced before.
    # NOTE(review): such lines yield an empty output token -- confirm whether
    # a placeholder would be more appropriate.
    $word = "" unless defined $word;
    $tag  = "" unless defined $tag;
    $stem = "" unless defined $stem;

    if ($word eq "eND_oF_SeNTeNCe") {
        print $out "\n";
        $first = 1;
        next;
    }

    print $out " " unless $first;
    if ($STEM) {
        # Fall back to the surface form when the tagger reports "<unknown>",
        # and keep only the first alternative of an "a|b" stem.
        $stem = $word if $stem eq "<unknown>";
        $stem =~ s/\|.+//;
        print $out $stem;
    }
    else {
        # With -basic, drop everything from the first ':' onwards.
        $tag =~ s/\:.+// if $BASIC;
        print $out $tag;
    }
    $first = 0;
}

# Buffered write errors only surface at close, so check it.
close($out)
    or die("Error writing '$OUT': $!");

# For a piped open, close() returns false if the command exited non-zero;
# $! is 0 in that case and $? holds the wait status.
close($tagger)
    or die($! ? "Error closing tagger pipeline: $!"
              : "Tagger pipeline exited with status ".($? >> 8));
|