Welcome to mirror list, hosted at ThFree Co, Russian Federation.

chunk_to_features.pl « train-sets « test - github.com/moses-smt/vowpal_wabbit.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 24ba80394ab87c6d7cfe4fa1b718833d7fbef4e4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/perl -w
use strict;

# Label dictionary: maps each label string to its numeric id. $cdictNum is the
# next unused id; it always stays one above the largest id loaded so far.
my %cdict = (); my $cdictNum = 1;

# Every command-line argument is treated as a dictionary file and consumed
# here, so the later `<>` read falls through to STDIN. Each dictionary line is
# "label id" separated by whitespace (the same shape this script writes to
# STDERR when it meets a label it has not seen before).
while (my $cdictFile = shift) {
    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a pipe command.
    open(my $dict_fh, '<', $cdictFile)
        or die "cannot open label dictionary '$cdictFile': $!";
    while (my $line = <$dict_fh>) {
        chomp $line;
        my ($c, $num) = split ' ', $line;
        # Skip blank or incomplete lines instead of storing an undef key/id.
        next unless defined $num;
        $cdict{$c} = $num;
        if ($num+1 > $cdictNum) { $cdictNum = $num + 1; }
    }
    close($dict_fh)
        or die "cannot close label dictionary '$cdictFile': $!";
}

# Buffers for the sentence currently being read. The three arrays are kept in
# step: entry i holds the first column, the second column and the numeric
# label id of the sentence's i-th input line.
my @w = (); my @t = (); my @c = ();

# Input lines carry three whitespace-separated columns (presumably word, tag
# and chunk label -- confirm against the data). A blank line ends a sentence.
while (my $line = <>) {
    chomp $line;
    if ($line =~ /^[\s]*$/) { dumpit(); print "\n"; @w = (); @t = (); @c = (); next; }

    my ($w,$t,$c) = split ' ', $line;
    #if ($c =~ /-NP/) { push @c, "1"; } else { push @c, "-1"; }
    if (not exists $cdict{$c}) {
        # First time this label is seen: give it the next free id and report
        # the new mapping on STDERR.
        $cdict{$c} = $cdictNum;
        $cdictNum++;
        print STDERR "$c\t$cdict{$c}\n";
    }

    push @c, $cdict{$c};
    push @t, $t;
    push @w, $w;
}

# Input that does not end with a blank line still has a sentence buffered at
# EOF; emit it the same way as every other sentence so it is not lost.
if (@c) { dumpit(); print "\n"; }

# dumpit()
#
# Prints one line per token of the buffered sentence (file-level @c, plus the
# arrays computef reads). Each line is the token's label id, " |", and then
# the space-separated feature names collected by computef for the token itself
# and its neighbours at relative offsets -2 .. +2.
#
# Feature names are written in sorted order. Perl's hash order changes from
# run to run, so printing `keys` directly would make the output differ between
# runs on identical input.
sub dumpit {
    for my $n (0 .. $#c) {
        my %f = ();
        for my $m (-2 .. 2) {
            # The suffix ('_-2' .. '_2') keeps features from different window
            # positions apart.
            computef(\%f, '_'.$m, $n+$m);
        }
        print $c[$n] . ' |';
        foreach my $key (sort keys %f) {
            # ':' and '|' are replaced by placeholders, presumably because they
            # are delimiters in the consumer's input format -- confirm.
            (my $name = $key) =~ s/:/-COL-/g;
            $name =~ s/\|/-PIP-/g;
            print ' ' . $name;
        }
        print "\n";
    }
}

# computef(\%features, $suffix, $index)
#
# Adds the features for the token at position $index of the buffered sentence
# (file-level @w) to the hash referenced by the first argument. Every feature
# is stored as a key of the form "<name><suffix>=<value>" with value 1.
#
#   $f   hash reference that receives the features
#   $s0  string appended to every feature name (the caller passes the
#        relative window offset, e.g. '_-1')
#   $i   absolute token index; may lie outside the sentence
#
# Returns nothing useful; the result is the mutation of %$f.
sub computef {
    my ($f, $s0, $i) = @_;

    # Positions before or after the sentence yield a single boundary marker.
    if ($i <   0) { $f->{"w".$s0."=<s>" } = 1; return; }
    if ($i >= @c) { $f->{"w".$s0."=</s>"} = 1; return; }

    my $w = $w[$i];
    my $l = lc($w);

    # Surface form and lower-cased form. The tag column (@t) is not emitted;
    # the feature line for it was commented out.
    $f->{"w".$s0."=".$w} = 1;
    $f->{"l".$s0."=".$l} = 1;

    # Character-class shape: each run of upper-case letters becomes 'A', each
    # run of lower-case letters 'a', each run of digits '0', and each run of
    # anything else except '.' becomes '#'.
    my $c = $w;
    $c =~ s/[A-Z]+/A/g;
    $c =~ s/[a-z]+/a/g;
    $c =~ s/[0-9]+/0/g;
    $c =~ s/[^\.Aa0]+/\#/g;
    $f->{"c".$s0."=".$c} = 1;
    # Same shape, combined with whether this token is the first of the sentence.
    $f->{"c".$s0."=".$c."_fw=".(($i==0) ? "y" : "n")} = 1;

    # Prefixes and suffixes of length 1..3 of the lower-cased token.
    my $N = length($l);
    for my $k (1 .. 3) {
        $f->{"pre".$k.$s0."=".substr($l, 0, $k)} = 1;

        # Clamp the start at 0: a negative offset would be counted from the
        # end of the string, so a token shorter than $k would lose its leading
        # characters (e.g. "of" would give the 3-suffix "f" instead of "of").
        my $start = $N > $k ? $N - $k : 0;
        $f->{"suf".$k.$s0."=".substr($l, $start)} = 1;
    }
}