Welcome to mirror list, hosted at ThFree Co, Russian Federation.

postprocess-lopar.perl « training « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
blob: 05a56a3b5ab63be386095a7e92b7d9bbfe450acc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id$

use warnings;
use strict;

use utf8;

# Split tagger output read from STDIN into one file per factor.
#
# Usage: postprocess-lopar.perl OUT_PREFIX < tagged-input
#
# Each input line is a whitespace-separated sequence of tokens of the form
# surface_tag_lemma.  For every input line, one line is written to each of:
#   OUT_PREFIX.factored  "surface|morph|lemma" triples
#   OUT_PREFIX.morph     the tag without its first dot-separated component,
#                        or '-' when nothing is left
#   OUT_PREFIX.lemma     lemmas
#   OUT_PREFIX.words     lowercased surface forms
#   OUT_PREFIX.pos       the first dot-separated component of the tag
# A progress counter (every 1000 lines), unparsable tokens and a final
# summary are printed to STDOUT.
my $out = shift @ARGV or die "Please specify the output file path (will be appended with .lemma .morph and .words and .factored";

my $wc = 0;    # tokens seen, including the ones that could not be parsed
my $uc = 0;    # tokens whose lemma was '<unknown>'
my $lc = 0;    # input lines read

# Three-argument open with lexical handles: the path can never be
# misinterpreted as part of the open mode.
open my $factored_fh, '>', "$out.factored" or die "Couldn't open joined: $!";
open my $morph_fh,    '>', "$out.morph"    or die "Couldn't open morph: $!";
open my $lemma_fh,    '>', "$out.lemma"    or die "Couldn't open lemma: $!";
open my $words_fh,    '>', "$out.words"    or die "Couldn't open surface: $!";
open my $pos_fh,      '>', "$out.pos"      or die "Couldn't open pos: $!";

while (my $line = <STDIN>) {
    chomp $line;
    $lc++;
    if ($lc % 1000 == 0) { print "$lc\n"; }

    # Per-line accumulators: factored triples, lemmas, morph tags,
    # surface forms and POS tags.
    my (@js, @ls, @ms, @ss, @ps);

    foreach my $w (split /\s+/, $line) {
        $wc++;

        # The first group is greedy, so the surface form itself may
        # contain underscores.
        my ($surface, $morph, $lemma);
        if ($w =~ /^(.+)_([^_]+)_(.+)$/) {
            ($surface, $morph, $lemma) = ($1, $2, $3);
        } else {
            print "can't parse: $w\n";
            next;
        }

        # Placeholder lemmas are replaced by the surface form as it
        # appeared in the input (this happens before lowercasing).
        if ($lemma eq '<NUM>' || $lemma eq '<ORD>') {
            $lemma = $surface;
        }

        # Lowercase ASCII and Latin-1 letters.  The Latin-1 range skips
        # 0xD7 (multiplication sign), which would otherwise be turned into
        # 0xF7 (division sign).
        # NOTE(review): STDIN has no decoding layer, so this operates on raw
        # bytes; that is only right for Latin-1 input -- confirm the
        # encoding the tagger emits.
        $surface =~ tr/A-Z/a-z/;
        $surface =~ tr/\xC0-\xD6\xD8-\xDE/\xE0-\xF6\xF8-\xFE/;

        if ($lemma eq '<unknown>') {
            # No lemma available: fall back to the lowercased surface form
            # and guess the tag from the word's suffix.
            $uc++;
            $lemma = $surface;
            if ($surface =~ /(?:ung|schaft|eit)en$/) {
                $lemma =~ s/en$//;
                $morph = 'NN.Fem.Cas.Pl';
            } elsif ($surface =~ /eit|schaft|ung/) {
                # NOTE(review): these match anywhere in the word, not only
                # at its end; possibly meant to be anchored -- confirm.
                $morph = 'NN.Fem.Cas.Sg';
            } elsif ($surface =~ /ismus$/) {
                # This was a pattern match ('=~') whose result was thrown
                # away, so the tag was never actually set.
                $morph = 'NN.Masc.Cas.Sg';
            }
        } else {
            # Ambiguous lemmas are given as alternatives separated by '|';
            # keep only the first one.  Stripping from the first '|' on
            # never yields undef, even for a lemma that is just '|'.
            $lemma =~ s/\|.*//s;
        }

        # The first dot-separated component of the tag is the POS; the
        # remainder is the morphological description.
        my ($pos, @xs) = split /\./, $morph;
        $morph = join '.', @xs;
        if ($morph eq '') {
            $morph = '-';
        }

        push @js, "$surface|$morph|$lemma";
        push @ls, $lemma;
        push @ms, $morph;
        push @ss, $surface;
        push @ps, $pos;
    }

    print {$factored_fh} join(' ', @js) . "\n";
    print {$morph_fh}    join(' ', @ms) . "\n";
    print {$lemma_fh}    join(' ', @ls) . "\n";
    print {$words_fh}    join(' ', @ss) . "\n";
    print {$pos_fh}      join(' ', @ps) . "\n";
}

# Buffered write errors only surface at close, so every output handle is
# closed and checked.
foreach my $fh ($factored_fh, $morph_fh, $lemma_fh, $words_fh, $pos_fh) {
    close $fh or die "Couldn't close output file: $!";
}

# Empty input would otherwise die with a division by zero.
my $ratio = $wc ? $uc / $wc : 0;
print "word count: $wc\nunknown lemmas: $uc\nratio: " . $ratio . "\n";