Welcome to mirror list, hosted at ThFree Co, Russian Federation.

merge_alignment.py « scripts « v0.6.4 - github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 626bc68dffe2da7693830322e95eda0122ca529d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
# Author : Qin Gao
# Date   : Dec 31, 2007
# Purpose: Combine multiple alignment files into a single one. The files are
#          produced by MGIZA, each entry carries a sentence ID, and every
#          file is ordered internally.

import sys
import re

# Matches the first parenthesised integer in an entry's header line; that
# integer is the entry's sentence id.
SENT_ID_PATTERN = re.compile(r"\((\d+)\)")


class MergeError(Exception):
    """Raised when the input files cannot be merged consistently."""


def _read_entry(handle, name):
    """Read the next three-line entry from an open alignment file.

    Returns a ``(sentence_id, (line1, line2, line3))`` pair, or ``None`` when
    fewer than three lines remain (the file is then considered exhausted).
    Raises MergeError if the first line carries no ``(N)`` sentence id.
    """
    lines = (handle.readline(), handle.readline(), handle.readline())
    if not all(lines):
        # readline() returns "" only at end of file, so any empty slot means
        # there is no complete entry left.
        return None
    match = SENT_ID_PATTERN.search(lines[0])
    if match is None:
        raise MergeError(
            "ERROR! NO SENTENCE ID IN %s: %r" % (name, lines[0]))
    return int(match.group(1)), lines


def merge_alignments(paths, out=None, err=None):
    """Merge the alignment files in ``paths`` by ascending sentence id.

    Sentence ids are expected to start at 1 and each id to appear in exactly
    one file.  For every id in turn the files are scanned in the order given
    and the first file whose pending entry has that id is copied to ``out``.

    paths -- file names to merge; each file must be ordered by sentence id.
    out   -- writable stream for the merged entries (default: sys.stdout).
    err   -- writable stream for missing-entry warnings (default: sys.stderr).

    Returns the highest sentence id reached.  Ids that no file provided are
    reported on ``err`` but still counted.
    Raises MergeError when a pending entry's id is lower than the id being
    written (a duplicated or out-of-order entry) or when an entry header has
    no sentence id.
    """
    if out is None:
        out = sys.stdout
    if err is None:
        err = sys.stderr

    handles = []
    try:
        for path in paths:
            handles.append(open(path, "r"))

        # pending[k] is the next unwritten entry of file k, None once the
        # file is exhausted.
        pending = [_read_entry(handle, path)
                   for handle, path in zip(handles, paths)]

        sent_id = 0
        active = True
        while active:
            sent_id += 1
            active = False
            wrote = False
            for idx, entry in enumerate(pending):
                if entry is None:
                    continue
                active = True
                entry_id, lines = entry
                if entry_id < sent_id:
                    raise MergeError(
                        "ERROR! DUPLICATED ENTRY %d" % entry_id)
                if entry_id == sent_id:
                    out.write("".join(lines))
                    wrote = True
                    pending[idx] = _read_entry(handles[idx], paths[idx])
                    # Only one entry is written per id; a second file holding
                    # the same id is caught as a duplicate on the next pass.
                    break
            if active and not wrote:
                # Deliberately non-fatal: warn and move on to the next id.
                err.write("ERROR! MISSING ENTRy %d\n" % sent_id)
        # The final pass found every file exhausted, so it does not count.
        return sent_id - 1
    finally:
        for handle in handles:
            handle.close()


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    argv -- argument vector including the program name (default: sys.argv).

    Returns 0 on success and 1 on a usage or merge error, so that calling
    scripts can detect failure.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write("Provide me the file names (at least 1)\n")
        return 1

    paths = argv[1:]
    try:
        total = merge_alignments(paths)
    except MergeError as exc:
        sys.stderr.write("%s\n" % exc)
        return 1
    sys.stderr.write(
        "Combined %d files, totally %d sents \n" % (len(paths), total))
    return 0


if __name__ == "__main__":
    sys.exit(main())