Welcome to mirror list, hosted at ThFree Co, Russian Federation.

syntax-hyphen-splitting.perl « wrappers « training « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 2c830f6b6b302cb7c3cd7d80c2cc2f8a9fec6f94 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env perl 

use strict;
use Getopt::Long "GetOptions";

# Command-line switches (all off by default):
#   --binarize    group the pieces of a split token under extra nested
#                 <tree> nodes whose label starts with "@"
#   --mark-split  prefix the label of the node whose token was split with
#                 "HYP-" (or "SLASH-" when --slash is given)
#   --slash       split at "/" instead of "-"
my $MARK_HYP = 0;
my $BINARIZE = 0;
my $SLASH = 0;

# GetOptions returns false on an unknown or malformed option; stop with a
# usage line instead of a bare "Died".
GetOptions(
  'binarize'   => \$BINARIZE,
  'mark-split' => \$MARK_HYP,
  'slash'      => \$SLASH,
) or die "Usage: $0 [--binarize] [--mark-split] [--slash] < IN > OUT\n";

# Separator character at which tokens are split.
my $punc = $SLASH ? "/" : "-";

# Main filter: read one line at a time from STDIN, split leaf tokens at the
# separator ($punc) wherever it stands between two alphanumeric characters,
# and print the rewritten line to STDOUT.
#
# Tokens are whitespace-separated. A token that starts with "<" or ends with
# ">" is treated as markup and copied through unchanged. For any other token
# that contains a split point, the label="..." value of the token emitted
# just before it is reused as the label of the pieces; the separator itself
# becomes "@-@" (or "@/@") under the label HYP (or SLASH). Without options,
#
#   <tree label="NN"> well-known </tree>
# becomes
#   <tree label="NN"> <tree label="NN"> well </tree> <tree label="HYP"> @-@ </tree> <tree label="NN"> known </tree> </tree>
#
# With --mark-split the label of the preceding (enclosing) tag is prefixed
# with HYP-/SLASH-. With --binarize the N pieces are grouped left to right
# under N-2 additional nodes labelled "@" + optional mark prefix + label.
#
# A token with a split point but no usable label="..." immediately before it
# is copied through unsplit, since there is no label to give its pieces.
#
# NOTE(review): no encoding layer is set on STDIN/STDOUT in this block, so
# unless one is configured elsewhere (e.g. PERL_UNICODE or -C) \p{IsAlnum}
# is matched against raw bytes; a multi-byte UTF-8 letter directly before
# the separator may then go unrecognised. Confirm the expected encoding.

# Lookbehind/lookahead keep the neighbouring characters out of the match, so
# back-to-back split points such as "1-2-3" are all found.
my $split_point = qr/(?<=[\p{IsAlnum}])\Q$punc\E(?=[\p{IsAlnum}])/;
my $split_token = "\@$punc\@";
my $mark        = $SLASH ? "SLASH-" : "HYP-";
my $punc_pos    = $SLASH ? "SLASH" : "HYP";

while (my $line = <STDIN>) {
  # chomp only strips a trailing newline; a final line without one keeps
  # its last character.
  chomp $line;
  my @OUT = ();
  foreach my $token (split ' ', $line) {
    # Markup, and words without a split point, pass straight through.
    if ($token =~ /^</ || $token =~ />$/ || $token !~ $split_point) {
      push @OUT, $token;
      next;
    }

    # Label of the node that holds this word, taken from the previous token.
    my ($pos) = @OUT ? ($OUT[-1] =~ /label=\"([^\"]+)\"/) : ();
    unless (defined $pos) {
      push @OUT, $token;
      next;
    }

    # Flag the enclosing node as one whose word was split.
    $OUT[-1] =~ s/label=\"/label=\"$mark/ if $MARK_HYP;

    (my $spaced = $token) =~ s/$split_point/ $split_token /g;
    my @WORD = split ' ', $spaced;

    if ($BINARIZE) {
      # One extra opening tag per piece beyond the second; each is closed
      # in the loop below just before the piece that follows its contents.
      my $inner = "<tree label=\"\@" . ($MARK_HYP ? $mark : "") . "$pos\">";
      push @OUT, ($inner) x (@WORD - 2);
    }

    for my $i (0 .. $#WORD) {
      push @OUT, "</tree>" if $BINARIZE && $i >= 2;
      my $label = ($WORD[$i] eq $split_token) ? $punc_pos : $pos;
      push @OUT, "<tree label=\"$label\"> $WORD[$i] </tree>";
    }
  }
  print join(" ", @OUT) . "\n";
}