Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hieu Hoang <hieuhoang@gmail.com> 2014-11-17 12:31:41 +0300
committer Hieu Hoang <hieuhoang@gmail.com> 2014-11-17 12:31:41 +0300
commit a2825ccd1a3be70d6a4d6a33a75c57f8072591ce (patch)
tree 80ba481a651aae3aaa210dacd688caf33ede3431
parent 7bda1e27da84bdfd4b2733e578b1d31da89348fc (diff)
parent c3be9fdcf82cb5a7a0ecc2d5ad993d0d73fbdf4b (diff)
Merge branch 'master' of github.com:moses-smt/mgiza
-rwxr-xr-x mgizapp/scripts/merge_alignment.py 10
-rwxr-xr-x mgizapp/scripts/plain2snt-hasvcb.py 22
-rwxr-xr-x mgizapp/scripts/sntpostproc.py 18
3 files changed, 38 insertions, 12 deletions
diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py
index 626bc68..c4e8b95 100755
--- a/mgizapp/scripts/merge_alignment.py
+++ b/mgizapp/scripts/merge_alignment.py
@@ -5,8 +5,16 @@
# prodcuced by MGIZA, which has sentence IDs, and every file is
# ordered inside
+from __future__ import unicode_literals
import sys
import re
+import codecs
+import io
+
+if sys.version_info < (3,0,0):
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
if len(sys.argv)<2:
sys.stderr.write("Provide me the file names (at least 2)\n");
@@ -21,7 +29,7 @@ sents = [];
done = [];
for i in range(1,len(sys.argv)):
- files.append(open(sys.argv[i],"r"));
+ files.append(io.open(sys.argv[i],"r", encoding="UTF-8"));
ids.append(0);
sents.append("");
done.append(False);
diff --git a/mgizapp/scripts/plain2snt-hasvcb.py b/mgizapp/scripts/plain2snt-hasvcb.py
index 490c493..5e7c6b0 100755
--- a/mgizapp/scripts/plain2snt-hasvcb.py
+++ b/mgizapp/scripts/plain2snt-hasvcb.py
@@ -1,10 +1,18 @@
#!/usr/bin/env python
+from __future__ import unicode_literals
from sys import *
+import codecs
+import io
+
+if sys.version_info < (3,0,0):
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
def loadvcb(fname,out):
dict={};
- df = open(fname,"r");
+ df = io.open(fname,"r", encoding="UTF-8");
for line in df:
out.write(line);
ws = line.strip().split();
@@ -19,14 +27,14 @@ if len(argv)<9:
stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n");
exit();
-ein = open(argv[3],"r");
-fin = open(argv[4],"r");
+ein = io.open(argv[3],"r", encoding="UTF-8");
+fin = io.open(argv[4],"r", encoding="UTF-8");
-eout = open(argv[5],"w");
-fout = open(argv[6],"w");
+eout = io.open(argv[5],"w", encoding="UTF-8");
+fout = io.open(argv[6],"w", encoding="UTF-8");
-evcbx = open(argv[7],"w");
-fvcbx = open(argv[8],"w");
+evcbx = io.open(argv[7],"w", encoding="UTF-8");
+fvcbx = io.open(argv[8],"w", encoding="UTF-8");
evcb = loadvcb(argv[1],evcbx);
fvcb = loadvcb(argv[2],fvcbx);
diff --git a/mgizapp/scripts/sntpostproc.py b/mgizapp/scripts/sntpostproc.py
index b3bf528..f2f1f35 100755
--- a/mgizapp/scripts/sntpostproc.py
+++ b/mgizapp/scripts/sntpostproc.py
@@ -3,15 +3,25 @@
# This script post process the snt file -- either in single-line format or in multi-line format
# The output, however, will always be in single-line format
+from __future__ import unicode_literals
from sys import *
from optparse import OptionParser
import re;
+import codecs
+import io
+
usage = """
The script post process the snt file, the input could be single-line snt
file or multi-line, (triple line) and can insert sentence weight to the
file (-w) or add partial alignment to the file (-a)
Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile
"""
+
+if sys.version_info < (3,0,0):
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+
parser = OptionParser(usage=usage)
@@ -37,21 +47,21 @@ if options.snt == None:
parser.print_help();
exit();
else:
- sfile = open(options.snt,"r");
+ sfile = io.open(options.snt,"r", encoding="UTF-8");
if options.output=="-":
ofile = stdout;
else:
- ofile = open(options.output,"w");
+ ofile = io.open(options.output,"w", encoding="UTF-8");
wfile = None;
if options.weight <> None:
- wfile = open(options.weight,"r");
+ wfile = io.open(options.weight,"r", encoding="UTF-8");
afile = None;
if options.align <> None:
- afile = open(options.align,"r");
+ afile = io.open(options.align,"r", encoding="UTF-8");
rr = re.compile("[\\|\\#\\*]");
wt = 0.0;