Welcome to mirror list, hosted at ThFree Co, Russian Federation.

mosesxml2brackets.py « wrappers « training « scripts - github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 6b90aa25666f8eb8252802d57b4f5d94bf5d9941 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

"""Convert trees in moses XML format to PTB-style bracketed format."""

from __future__ import print_function, unicode_literals
import sys
import codecs

from lxml import etree as ET


def escape(word):
    """Return *word* with Moses-special characters replaced by XML entities.

    The factor separator ``|``, the syntax non-terminal brackets ``[`` and
    ``]``, and both quote characters are rewritten as entity references so
    that they cannot be confused with markup in the bracketed output.

    The replacement targets must be the literal entity strings (an ampersand,
    the entity name or number, and a semicolon).  If they are ever rendered
    back into the characters they stand for, the replacements become no-ops
    and the quote line stops being valid Python.
    """
    replacements = (
        # Factor separator:
        ('|', '&#124;'),
        # Syntax non-terminal:
        ('[', '&#91;'),
        # Syntax non-terminal:
        (']', '&#93;'),
        ('\'', '&apos;'),
        ('"', '&quot;'),
    )
    # None of the entity strings contains a character that a later pair
    # replaces, so applying the pairs one after another is safe.
    for char, entity in replacements:
        word = word.replace(char, entity)

    return word


def make_brackets(xml):
    """Render the element *xml* and its subtree as a bracketed string.

    Each node becomes `` [LABEL ...]`` (note the leading space), where LABEL
    is the node's ``label`` attribute.  A node whose text is non-blank is
    treated as a leaf: its stripped, escaped text follows the label and its
    children are not visited.  Any other node is followed by the renderings
    of its children in document order.
    """
    opening = ' [' + xml.get('label')

    text = xml.text
    token = text.strip() if text else ''

    # Leaf: the node carries a word of its own.
    if token:
        return opening + ' ' + escape(token) + ']'

    # Internal node: concatenate the bracketed children.
    return opening + ''.join(make_brackets(node) for node in xml) + ']'


if __name__ == '__main__':

    # On Python 2 the standard streams are wrapped so that they read and
    # write UTF-8 text.
    if sys.version_info < (3, 0):
        utf8_writer = codecs.getwriter('UTF-8')
        utf8_reader = codecs.getreader('UTF-8')
        sys.stderr = utf8_writer(sys.stderr)
        sys.stdout = utf8_writer(sys.stdout)
        sys.stdin = utf8_reader(sys.stdin)

    # One XML tree per input line; a line that is exactly a newline is
    # copied through unchanged so output lines stay aligned with input lines.
    for line in sys.stdin:
        if line != '\n':
            tree = ET.fromstring(line)
            line = make_brackets(tree).strip() + '\n'
        sys.stdout.write(line)