Welcome to mirror list, hosted at ThFree Co, Russian Federation.

plain2snt-hasvcb.py « scripts « v0.6.4 - github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 490c493c6fabe7c02b392eae958510f03f9c7a44 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python

from sys import *

def loadvcb(fname,out):
	dict={};
	df = open(fname,"r");
	for line in df:
		out.write(line);
		ws = line.strip().split();
	        id = int(ws[0]);
		wd = ws[1];
		dict[wd]=id;
	return dict;

if len(argv)<9:
	stderr.write("Error, the input should be \n");
	stderr.write("%s evcb fvcb etxt ftxt esnt(out) fsnt(out) evcbx(out) fvcbx(out)\n" % argv[0]);
	stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n");
	exit();

ein = open(argv[3],"r");
fin = open(argv[4],"r");

eout = open(argv[5],"w");
fout = open(argv[6],"w");

evcbx = open(argv[7],"w");
fvcbx = open(argv[8],"w");
evcb = loadvcb(argv[1],evcbx);
fvcb = loadvcb(argv[2],fvcbx);

i=0
while True:
	i+=1;
	eline=ein.readline();
	fline=fin.readline();
	if len(eline)==0 or len(fline)==0:
		break;
	ewords = eline.strip().split();
	fwords = fline.strip().split();
	el = [];
	fl = [];
	j=0;
	for w in ewords:
		j+=1
		if evcb.has_key(w):
			el.append(evcb[w]);
		else:
			if evcb.has_key(w.lower()):
				el.append(evcb[w.lower()]);
			else:
				##stdout.write("#E %d %d %s\n" % (i,j,w))
				#el.append(1);
				nid = len(evcb)+1;
				evcb[w.lower()] = nid;
				evcbx.write("%d %s 1\n" % (nid, w));
				el.append(nid);

	j=0;
	for w in fwords:
		j+=1
		if fvcb.has_key(w):
			fl.append(fvcb[w]);
		else:
			if fvcb.has_key(w.lower()):
				fl.append(fvcb[w.lower()]);
			else:
				#stdout.write("#F %d %d %s\n" % (i,j,w))
				nid = len(fvcb)+1;
				fvcb[w.lower()] = nid;
				fvcbx.write("%d %s 1\n" % (nid, w));
				fl.append(nid);
				#fl.append(1);
	eout.write("1\n");
	fout.write("1\n");
	for I in el:
		eout.write("%d " % I);
	eout.write("\n");
	for I in fl:
		eout.write("%d " % I);
		fout.write("%d " % I);
	eout.write("\n");
	fout.write("\n");
	for I in el:
		fout.write("%d " % I);
	fout.write("\n");

fout.close();
eout.close();
fvcbx.close();
evcbx.close();