Welcome to mirror list, hosted at ThFree Co, Russian Federation.

mosesxml2berkeleyparsed.perl « wrappers « training « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: e929658ff3565b91e4d34bc214224dc58ca43d36 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env perl

use warnings;
use strict;

#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
#( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) )

# Convert one line of Moses XML tree markup (<tree label="X"> ... </tree>)
# into a bracketed tree string and return it.
# A blank input line (a parse failure) is returned as a bare newline.
sub moses_xml_to_berkeley {
  my ($tree) = @_;

  # blank line = parse failure; hand back a blank line
  return "\n" if $tree =~ /^$/;

  # Parentheses: literal ones have to become bracket tokens first,
  # because the tag rewrite below introduces real parentheses.
  $tree =~ s/\(/\-LRB\-/g;               # tokens
  $tree =~ s/\)/\-RRB\-/g;
  $tree =~ s/\"LRB\"/\"\-LRB\-\"/g;      # labels
  $tree =~ s/\"RRB\"/\"\-RRB\-\"/g;

  # Main step: opening tags turn into "(LABEL", closing tags (with any
  # spaces in front of them) into ")", and a leading TOP label is dropped.
  $tree =~ s/<tree label=\"([^\"]+)\">/\($1/g;
  $tree =~ s/ *<\/tree>/\)/g;
  $tree =~ s/^\(TOP/\(/;

  # Undo the escaping. "&amp;" goes last so that an escaped entity such
  # as "&amp;lt;" ends up as "&lt;" and is not unescaped a second time.
  $tree =~ s/\&bar;/\|/g;                # factor separator
  $tree =~ s/\&lt;/\</g;                 # xml
  $tree =~ s/\&gt;/\>/g;                 # xml
  $tree =~ s/\&bra;/\[/g;                # syntax non-terminal (legacy)
  $tree =~ s/\&ket;/\]/g;                # syntax non-terminal (legacy)
  $tree =~ s/\&quot;/\"/g;               # xml
  $tree =~ s/\&apos;/\'/g;               # xml
  $tree =~ s/\&#91;/\[/g;                # syntax non-terminal
  $tree =~ s/\&#93;/\]/g;                # syntax non-terminal
  $tree =~ s/\&amp;/\&/g;                # escape escape

  # Whitespace cleanup: squeeze runs of spaces, strip a trailing space,
  # then put one space in front of the closing parenthesis at line end.
  $tree =~ s/ +/ /g;
  $tree =~ s/ $//g;
  $tree =~ s/\)$/ \)/g;

  return $tree;
}

# Filter: read trees from STDIN one line at a time, write the converted
# line to STDOUT.
while (defined(my $input = <STDIN>)) {
  print moses_xml_to_berkeley($input);
}