Welcome to mirror list, hosted at ThFree Co, Russian Federation.

ph_numbers.perl « generic « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: ea56927accc4f57956367b944d356559897c94b9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env perl 

package ph_numbers;

# Script to recognize and replace numbers in Moses training corpora
# and decoder input
#
# (c) 2013 TAUS

use warnings;
use strict;

# Modulino entry point: run() is invoked only when this file is executed
# directly. caller() is true when the file is loaded via require/use, so
# in that case the subs are just defined and no input is read.
run() unless caller();
# Provides getopts(). "use" takes effect at compile time, so the function is
# available to run() even though this line follows the call above.
use Getopt::Std;

# Debug switch taken from the DEBUG environment variable (defaults to 0).
# NOTE(review): nothing in the visible code reads $debug, and this assignment
# only executes after run() has returned -- confirm whether it is still needed.
my $debug = $ENV{DEBUG} || 0;

# Command-line driver: reads lines from the files named in @ARGV (or STDIN),
# marks the numbers in each line with mark_numbers() and prints the result.
#
# Options:
#   -c         corpus mode: replace each number by the bare symbol
#   -l         legacy mode: emit the old <ne translation="NUMBER"> markup
#   -m symbol  placeholder symbol to use (default '@num@')
#   -s, -t     source/target locale; accepted for compatibility but not
#              used by this script
#   -h         print usage and exit
#
# An invalid option prints the usage text to STDERR and exits with status 1,
# so the usage message never ends up in the output stream unnoticed.
sub run {
    my %opts;
    my $usage = "Usage: perl $0 [-s source_locale][-t target_locale][-c][-h][-l][-m symbol] < in > out\n";

    if (!getopts('s:t:cm:hl', \%opts)) {
        print STDERR $usage;
        exit 1;
    }
    if ($opts{h}) {
        print $usage;
        exit;
    }

    my $numberSymbol = $opts{m} || '@num@';

    # getopts() has already removed the options from @ARGV, so <> reads the
    # remaining file arguments, or STDIN when there are none.
    while (my $line = <>) {
        chomp $line;
        print mark_numbers($line, $opts{c}, $opts{l}, $numberSymbol), "\n";
    }
}

# Replace every number found by recognize() in $input with a placeholder.
#
# Arguments (positional):
#   $input        the line to process
#   $corpusMode   true: substitute the bare $numberSymbol
#   $legacyMode   true (and not corpus mode): emit
#                 <ne translation="NUMBER">SYMBOL</ne>
#   $numberSymbol placeholder text; defaults to '@num@' when false/omitted
#
# When neither mode is set the number is wrapped as
#   <ne translation="SYMBOL" entity="NUMBER">SYMBOL</ne>
#
# Returns the rewritten line; text outside the recognized spans is copied
# through unchanged.
sub mark_numbers {
    my ($input, $corpusMode, $legacyMode, $numberSymbol) = @_;
    $numberSymbol ||= '@num@';

    my @pieces;
    my $cursor = 0;    # offset of the first character not yet copied
    for my $span (@{ recognize($input) }) {
        my ($from, $to) = @{$span};

        # Copy the untouched text between the previous number and this one.
        push @pieces, substr($input, $cursor, $from - $cursor)
            if $cursor < $from;

        my $number = substr($input, $from, $to - $from);
        push @pieces,
              $corpusMode ? $numberSymbol
            : $legacyMode ? "<ne translation=\"$number\">$numberSymbol</ne>"
            :               "<ne translation=\"$numberSymbol\" entity=\"$number\">$numberSymbol</ne>";

        $cursor = $to;
    }

    # Whatever follows the last number (or the whole line if none was found).
    push @pieces, substr($input, $cursor);
    return join('', @pieces);
}

# Locate number expressions in $input.
#
# Returns a reference to an array of [start, end] character offsets, in order
# of appearance; "end" is exclusive (the offset just past the last character).
#
# A candidate is an optionally signed digit group that may contain '.', ',',
# '+', '-', 'e' and 'E', together with any following digit groups separated
# only by whitespace, so "1 000 000" yields a single span.  A candidate is
# kept only if ALL characters of the word(s) it touches belong to it: it must
# be preceded by the start of the string or a space, and followed by a space,
# the end of the string, or a single final newline.
sub recognize {
    my $input = shift;
    #print STDERR "input=$input\n";

    my $length = length($input);
    my @recognized = ();
    while ($input =~ /\G(.*?)(\s*)([+\-]?\p{Digit}*[\.,]?\p{Digit}+[\p{Digit}\.,+\-eE]*)/g) {
        my $start = $-[3];
        my $end   = $+[3];

        # Absorb further whitespace-separated digit groups.  /c keeps pos()
        # where it is when this fails, so the outer search resumes right
        # after the span instead of restarting from the beginning.
        while ($input =~ /\G(\s+)(\p{Digit}+[\p{Digit}\.,+\-eE]*)/gc) {
            $end = $+[2];
        }

        # Left boundary: first word, or the previous character is a space.
        my $starts_word = $start == 0
            || substr($input, $start - 1, 1) eq " ";

        # Right boundary.  $end is exclusive, so a number that finishes the
        # string has $end == $length (not $length - 1).  A lone trailing
        # newline is tolerated so unchomped lines behave the same way.
        my $ends_word = $end == $length
            || substr($input, $end, 1) eq " "
            || ($end == $length - 1 && substr($input, $end, 1) eq "\n");

        #print STDERR "start=$start end=$end len=$length\n";
        if ($starts_word && $ends_word) {
            push @recognized, [$start, $end];
        }
    }
    return \@recognized;
}

1;