Welcome to mirror list, hosted at ThFree Co, Russian Federation.

make-factor-de-pos.perl « wrappers « training « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 585323bd4d563bae8a8da119b0820e4b4888dbe5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

use Encode;
use FindBin qw($RealBin);

# Produce a part-of-speech factor file from tokenized text.
#
# Usage: make-factor-de-pos.perl <in> <out> <tmpdir>
#
#   in     - input file; it is passed through deescape-special-chars.perl
#   out    - output file; receives one line of space-separated POS tags per
#            line of tagger output, passed through escape-special-chars.perl
#   tmpdir - working directory (created if missing) for the intermediate
#            files tok.<pid> and lopar.<pid>; they are not removed afterwards
#
# Dies if an argument is missing, if a helper command fails, if a file cannot
# be opened, or if a tagger token is not of the form word_pos_lemma.
my ($in,$out,$tmpdir) = @ARGV;
die("Usage: $0 <in> <out> <tmpdir>\n") unless defined $tmpdir;

my $tokenizer_dir = "$RealBin/../../tokenizer";
my $tok_file      = "$tmpdir/tok.$$";
my $lopar_file    = "$tmpdir/lopar.$$";

# List-form system() bypasses the shell, so $tmpdir needs no quoting here.
system('mkdir', '-p', '--', $tmpdir) == 0
    or die("ERROR: could not create directory '$tmpdir'\n");

# Step 1: undo Moses escaping and convert the text to Latin-1 for the tagger.
my $prepare_cmd = shell_quote("$tokenizer_dir/deescape-special-chars.perl")
    . ' < ' . shell_quote($in)
    . ' | /home/pkoehn/statmt/bin/unicode2latin1.perl > '
    . shell_quote($tok_file);
system($prepare_cmd) == 0
    or die("ERROR: command failed (status $?): $prepare_cmd\n");

# Step 2: run the tagger wrapper. qx// is kept (instead of system) so that
# anything the wrapper prints on stdout stays swallowed, as it always was.
my $tagger_cmd = '/home/pkoehn/statmt/bin/run-lopar-tagger.perl '
    . shell_quote($tok_file) . ' ' . shell_quote($lopar_file);
qx{$tagger_cmd};
die("ERROR: command failed (status $?): $tagger_cmd\n") if $? != 0;

# Step 3: keep only the POS tag of every token and re-escape the result.
open(my $lopar_fh, '<', $lopar_file)
    or die("ERROR: could not open tagger output '$lopar_file': $!\n");
my $escape_cmd = shell_quote("$tokenizer_dir/escape-special-chars.perl")
    . ' > ' . shell_quote($out);
open(my $out_fh, '|-', $escape_cmd)
    or die("ERROR: could not start '$escape_cmd': $!\n");

while (my $line = <$lopar_fh>) {
    chomp $line;
    my @tags;
    # split ' ' ignores leading, trailing and repeated whitespace, so no
    # separate normalisation of the line is required.
    foreach my $token (split ' ', $line) {
        # Expected token shape: word_pos_lemma
        $token =~ /^(.+)_([^_]+)_(.+)$/
            or die("ERROR: choked on token '$token'");
        my $pos = $2;
        # Drop everything from the first '.' on (presumably morphological
        # detail appended to the coarse tag -- confirm against tagger docs).
        $pos =~ s/\..+//;
        # The tagger side works in Latin-1; the factor file is UTF-8.
        push @tags, encode('utf8', decode('iso-8859-1', $pos));
    }
    print {$out_fh} join(' ', @tags), "\n";
}

close($lopar_fh);
# For a pipe, close() also reports a non-zero exit status of the child
# (in that case $! is 0 and the status is in $?).
close($out_fh)
    or die("ERROR: '$escape_cmd' failed: " . ($! ? $! : "exit status $?") . "\n");

# Wrap a string in single quotes so that /bin/sh treats it as one literal word.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}