blob: e929658ff3565b91e4d34bc214224dc58ca43d36 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
#!/usr/bin/env perl
use warnings;
use strict;
#( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
#( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed) (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the) (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and)) (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again)) (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a) (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope)) (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a) (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) )
# Convert one line of Moses XML tree markup into a Berkeley-parser-style
# bracketed tree. For example
#   <tree label="TOP"> <tree label="NN"> R&amp;D </tree> </tree>
# becomes
#   ( (NN R&D) )
# Takes the raw input line (including its line terminator, if any) and
# returns the converted line.
sub _moses_xml_to_berkeley {
    my ($line) = @_;

    # An empty line stands for a parse failure and is emitted as an empty line.
    return "\n" if $line =~ /^$/;

    # Parentheses: literal ones in the input are tokens and must be
    # protected *before* the tree tags are turned into brackets.
    $line =~ s/\(/-LRB-/g;             # tokens
    $line =~ s/\)/-RRB-/g;
    $line =~ s/"LRB"/"-LRB-"/g;        # labels
    $line =~ s/"RRB"/"-RRB-"/g;

    # Main conversion: opening tag -> "(LABEL", closing tag -> ")".
    $line =~ s/<tree label="([^"]+)">/($1/g;
    $line =~ s/ *<\/tree>/)/g;
    $line =~ s/^\(TOP/(/;              # the root bracket carries no label

    # De-escape. "&amp;" must be handled last so that an escaped escape
    # such as "&amp;lt;" is decoded exactly once (to "&lt;", not "<").
    $line =~ s/&bar;/|/g;              # factor separator
    $line =~ s/&lt;/</g;               # xml
    $line =~ s/&gt;/>/g;               # xml
    $line =~ s/&bra;/[/g;              # syntax non-terminal (legacy)
    $line =~ s/&ket;/]/g;              # syntax non-terminal (legacy)
    $line =~ s/&quot;/"/g;             # xml
    $line =~ s/&apos;/'/g;             # xml
    $line =~ s/&#91;/[/g;              # syntax non-terminal
    $line =~ s/&#93;/]/g;              # syntax non-terminal
    $line =~ s/&amp;/&/g;              # escape escape

    # Cleanup: collapse runs of spaces, drop a trailing space, and put a
    # space in front of the final closing bracket ("... ) )").
    $line =~ s/ +/ /g;
    $line =~ s/ $//g;
    $line =~ s/\)$/ )/g;

    return $line;
}

# Filter: read Moses XML trees from STDIN, one per line, and write the
# bracketed form of each to STDOUT.
while (my $line = <STDIN>) {
    print _moses_xml_to_berkeley($line);
}
|