1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
#!/usr/bin/env python
# Author : Qin Gao
# Date : Dec 31, 2007
#
# This file is part of mgiza++. Its use is licensed under the GNU General
# Public License version 2 or, at your option, any later version.
"""Combine multiple alignment files into a single one.
The files are produced by MGIZA; every entry carries a sentence ID, and each
file is ordered by sentence ID internally.
"""
from __future__ import unicode_literals
import sys
import re
import codecs
import io
import os
def normalize_path(path):
    """Return `path` as an absolute path using the native separator.

    Backslashes and forward slashes are both rewritten to `os.sep`, a
    leading "~" is expanded with `os.path.expanduser`, and the result is
    made absolute with `os.path.abspath`.
    """
    # Rewrite either flavour of separator to the platform's own.
    for separator in ('\\', '/'):
        path = path.replace(separator, os.sep)
    return os.path.abspath(os.path.expanduser(path))
# A sentence ID is a run of decimal digits enclosed in parentheses, e.g. "(12)".
ID_PATTERN = re.compile(r"\((\d+)\)")


def extract_id(line):
    """Extract a sentence ID from `line`.

    The ID is the first run of decimal digits enclosed in parentheses,
    for example the ``12`` in ``"... (12) ..."``.

    Returns the ID as an `int`.
    Raises `ValueError` if `line` contains no parenthesised number.
    """
    match = ID_PATTERN.search(line)
    if match is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        raise ValueError(
            "No sentence ID of the form '(N)' found in line: %r" % (line,))
    return int(match.group(1))
def _read_entry(handle):
    """Read the next three-line entry from `handle`.

    Returns an `(id, lines)` pair, where `lines` is the tuple of the three
    lines read and `id` is extracted from the first of them.  Returns `None`
    when any of the three lines is empty, i.e. the file is exhausted or ends
    with an incomplete entry.
    """
    lines = (handle.readline(), handle.readline(), handle.readline())
    if not all(lines):
        return None
    return extract_id(lines[0]), lines


def main():
    """Merge the alignment files named on the command line into stdout.

    Every command-line argument is a path to a file made of three-line
    entries whose first line carries a sentence ID.  Entries are written to
    standard output in increasing ID order, starting from 1.  A summary line
    is written to standard error when the merge completes.

    Exits with status 1 when no file name is given, when an entry's ID is
    lower than the ID currently expected ("DUPLICATED ENTRY"), or when no
    file provides the expected ID while input remains ("MISSING ENTRy").
    """
    if sys.version_info < (3, 0, 0):
        sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
        sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
        sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
    else:
        sys.stdout = open(
            sys.stdout.fileno(), mode='w', encoding='utf8', buffering=1)

    # NOTE: the message asks for at least 2 files, but a single file is
    # accepted; that long-standing behaviour is kept.
    if len(sys.argv) < 2:
        sys.stderr.write("Provide me the file names (at least 2)\n")
        sys.exit(1)

    sent_id = 0
    files = []
    try:
        for arg in sys.argv[1:]:
            files.append(io.open(normalize_path(arg), "r", encoding="UTF-8"))

        # pending[i] is the next unwritten entry of files[i], or None once
        # that file is exhausted.
        pending = [_read_entry(handle) for handle in files]

        cont = True
        while cont:
            sent_id += 1
            wrote_one = False
            cont = False
            for i, entry in enumerate(pending):
                if entry is None:
                    continue
                # At least one file still has input, so keep going.
                cont = True
                entry_id, lines = entry
                if entry_id == sent_id:
                    sys.stdout.write("".join(lines))
                    wrote_one = True
                    pending[i] = _read_entry(files[i])
                    # The expected ID was found; move on to the next one.
                    break
                if entry_id < sent_id:
                    sys.stderr.write(
                        "ERROR! DUPLICATED ENTRY %d\n" % entry_id)
                    sys.exit(1)
            if cont and not wrote_one:
                sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id)
                sys.exit(1)
    finally:
        # Close every opened input, including on the error exits above.
        for handle in files:
            handle.close()

    # sent_id was advanced once past the last entry written.
    sys.stderr.write(
        "Combined %d files, totally %d sents \n" % (len(files), sent_id - 1))
# Run the merge only when this file is executed as a script.
if __name__ == "__main__":
    main()
|