Merge branch 'master' of github.com:moses-smt/mgiza

author: Hieu Hoang <hieuhoang@gmail.com> 2014-11-17 12:31:41 +0300
committer: Hieu Hoang <hieuhoang@gmail.com> 2014-11-17 12:31:41 +0300
commit: a2825ccd1a3be70d6a4d6a33a75c57f8072591ce (patch)
tree: 80ba481a651aae3aaa210dacd688caf33ede3431
parent: 7bda1e27da84bdfd4b2733e578b1d31da89348fc (diff)
parent: c3be9fdcf82cb5a7a0ecc2d5ad993d0d73fbdf4b (diff)
3 files changed, 38 insertions, 12 deletions
diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py
index 626bc68..c4e8b95 100755
--- a/mgizapp/scripts/merge_alignment.py
+++ b/mgizapp/scripts/merge_alignment.py
@@ -5,8 +5,16 @@
 #          prodcuced by MGIZA, which has sentence IDs, and every file is 
 #          ordered inside
 
+from __future__ import unicode_literals
 import sys
 import re
+import codecs
+import io
+
+if sys.version_info < (3,0,0):
+    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
 
 if len(sys.argv)<2:
 	sys.stderr.write("Provide me the file names (at least 2)\n");
@@ -21,7 +29,7 @@ sents = [];
 done = [];
 
 for i in range(1,len(sys.argv)):
-	files.append(open(sys.argv[i],"r"));
+	files.append(io.open(sys.argv[i],"r", encoding="UTF-8"));
 	ids.append(0);
 	sents.append("");
 	done.append(False);
diff --git a/mgizapp/scripts/plain2snt-hasvcb.py b/mgizapp/scripts/plain2snt-hasvcb.py
index 490c493..5e7c6b0 100755
--- a/mgizapp/scripts/plain2snt-hasvcb.py
+++ b/mgizapp/scripts/plain2snt-hasvcb.py
@@ -1,10 +1,18 @@
 #!/usr/bin/env python
 
+from __future__ import unicode_literals
 from sys import *
+import codecs
+import io
+import sys  # needed: "from sys import *" does not bind the name "sys" used by the guard below
+if sys.version_info < (3,0,0):
+    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
 
 def loadvcb(fname,out):
 	dict={};
-	df = open(fname,"r");
+	df = io.open(fname,"r", encoding="UTF-8");
 	for line in df:
 		out.write(line);
 		ws = line.strip().split();
@@ -19,14 +27,14 @@ if len(argv)<9:
 	stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n");
 	exit();
 
-ein = open(argv[3],"r");
-fin = open(argv[4],"r");
+ein = io.open(argv[3],"r", encoding="UTF-8");
+fin = io.open(argv[4],"r", encoding="UTF-8");
 
-eout = open(argv[5],"w");
-fout = open(argv[6],"w");
+eout = io.open(argv[5],"w", encoding="UTF-8");
+fout = io.open(argv[6],"w", encoding="UTF-8");
 
-evcbx = open(argv[7],"w");
-fvcbx = open(argv[8],"w");
+evcbx = io.open(argv[7],"w", encoding="UTF-8");
+fvcbx = io.open(argv[8],"w", encoding="UTF-8");
 evcb = loadvcb(argv[1],evcbx);
 fvcb = loadvcb(argv[2],fvcbx);
 
diff --git a/mgizapp/scripts/sntpostproc.py b/mgizapp/scripts/sntpostproc.py
index b3bf528..f2f1f35 100755
--- a/mgizapp/scripts/sntpostproc.py
+++ b/mgizapp/scripts/sntpostproc.py
@@ -3,15 +3,25 @@
 # This script post process the snt file -- either in single-line format or in multi-line format
 # The output, however, will always be in single-line format
 
+from __future__ import unicode_literals
 from sys import *
 from optparse import OptionParser
 import re;
+import codecs
+import io
+import sys  # needed: "from sys import *" does not bind the name "sys" used by the version guard
 usage = """
 The script post process the snt file, the input could be single-line snt 
 file or multi-line, (triple line) and can insert sentence weight to the
 file (-w) or add partial alignment to the file (-a)
 Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile
 """
+
+if sys.version_info < (3,0,0):
+    sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+    sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+    sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+
 parser = OptionParser(usage=usage)
 
 
@@ -37,21 +47,21 @@ if options.snt == None:
 	parser.print_help();
 	exit();
 else:
-	sfile = open(options.snt,"r");
+	sfile = io.open(options.snt,"r", encoding="UTF-8");
 
 if options.output=="-":
 	ofile = stdout;
 else:
-	ofile = open(options.output,"w");
+	ofile = io.open(options.output,"w", encoding="UTF-8");
 
 wfile = None;
 
 if options.weight <> None:
-	wfile = open(options.weight,"r");
+	wfile = io.open(options.weight,"r", encoding="UTF-8");
 
 afile = None;
 if options.align <> None:
-	afile = open(options.align,"r");
+	afile = io.open(options.align,"r", encoding="UTF-8");
 
 rr = re.compile("[\\|\\#\\*]");
 wt = 0.0;
author	Hieu Hoang <hieuhoang@gmail.com>	2014-11-17 12:31:41 +0300
committer	Hieu Hoang <hieuhoang@gmail.com>	2014-11-17 12:31:41 +0300
commit	a2825ccd1a3be70d6a4d6a33a75c57f8072591ce (patch)
tree	80ba481a651aae3aaa210dacd688caf33ede3431
parent	7bda1e27da84bdfd4b2733e578b1d31da89348fc (diff)
parent	c3be9fdcf82cb5a7a0ecc2d5ad993d0d73fbdf4b (diff)