Welcome to mirror list, hosted at ThFree Co, Russian Federation.

mosesxml2berkeleyparsed.perl « wrappers « training « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 02bc7b88ebf4ab04803ae1565a78ae0c7e3dff41 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
#( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) )

# Filter: read Moses XML tree markup from STDIN, one sentence per line, and
# write the corresponding bracketed parse to STDOUT, one sentence per line.
SENTENCE:
while (defined(my $sentence = <STDIN>)) {
  # An empty input line marks a parse failure; echo it as an empty line.
  if ($sentence =~ /^$/) {
    print "\n";
    next SENTENCE;
  }

  my $bracketed = xml_tree_to_bracketed($sentence);
  print $bracketed;
}

# Rewrite a single line of Moses XML tree markup as a bracketed parse.
# Takes the raw input line (trailing newline, if any, included) and returns
# the converted line. Performs no I/O.
sub xml_tree_to_bracketed {
  my ($text) = @_;

  # Parentheses: every literal "(" / ")" becomes -LRB- / -RRB- (tokens),
  # and the quoted labels "LRB" / "RRB" gain their surrounding hyphens.
  # This runs before any tree bracket is introduced below.
  $text =~ s/\(/\-LRB\-/g;
  $text =~ s/\)/\-RRB\-/g;
  $text =~ s/\"LRB\"/\"\-LRB\-\"/g;
  $text =~ s/\"RRB\"/\"\-RRB\-\"/g;

  # Tree markup: an opening <tree label="X"> turns into "(X", a closing
  # </tree> (with any spaces in front of it) into ")", and a leading TOP
  # label is dropped so the line starts with a bare "(".
  $text =~ s/<tree label=\"([^\"]+)\">/\($1/g;
  $text =~ s/ *<\/tree>/\)/g;
  $text =~ s/^\(TOP/\(/;

  # Undo entity escaping. "&amp;" is handled last so that an escaped
  # ampersand cannot combine with following text into another entity.
  $text =~ s/\&bar;/\|/g;    # factor separator
  $text =~ s/\&lt;/\</g;     # xml
  $text =~ s/\&gt;/\>/g;     # xml
  $text =~ s/\&bra;/\[/g;    # syntax non-terminal (legacy)
  $text =~ s/\&ket;/\]/g;    # syntax non-terminal (legacy)
  $text =~ s/\&quot;/\"/g;   # xml
  $text =~ s/\&apos;/\'/g;   # xml
  $text =~ s/\&#91;/\[/g;    # syntax non-terminal
  $text =~ s/\&#93;/\]/g;    # syntax non-terminal
  $text =~ s/\&amp;/\&/g;    # escape of the escape character itself

  # Whitespace: squeeze runs of spaces, remove a space at the end of the
  # line, then put one space in front of the line-final ")".
  $text =~ s/ +/ /g;
  $text =~ s/ $//g;
  $text =~ s/\)$/ \)/g;

  return $text;
}