diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2014-11-17 12:31:41 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2014-11-17 12:31:41 +0300 |
commit | a2825ccd1a3be70d6a4d6a33a75c57f8072591ce (patch) | |
tree | 80ba481a651aae3aaa210dacd688caf33ede3431 | |
parent | 7bda1e27da84bdfd4b2733e578b1d31da89348fc (diff) | |
parent | c3be9fdcf82cb5a7a0ecc2d5ad993d0d73fbdf4b (diff) |
Merge branch 'master' of github.com:moses-smt/mgiza
-rwxr-xr-x | mgizapp/scripts/merge_alignment.py | 10 | ||||
-rwxr-xr-x | mgizapp/scripts/plain2snt-hasvcb.py | 22 | ||||
-rwxr-xr-x | mgizapp/scripts/sntpostproc.py | 18 |
3 files changed, 38 insertions, 12 deletions
diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py index 626bc68..c4e8b95 100755 --- a/mgizapp/scripts/merge_alignment.py +++ b/mgizapp/scripts/merge_alignment.py @@ -5,8 +5,16 @@ # prodcuced by MGIZA, which has sentence IDs, and every file is # ordered inside +from __future__ import unicode_literals import sys import re +import codecs +import io + +if sys.version_info < (3,0,0): + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) if len(sys.argv)<2: sys.stderr.write("Provide me the file names (at least 2)\n"); @@ -21,7 +29,7 @@ sents = []; done = []; for i in range(1,len(sys.argv)): - files.append(open(sys.argv[i],"r")); + files.append(io.open(sys.argv[i],"r", encoding="UTF-8")); ids.append(0); sents.append(""); done.append(False); diff --git a/mgizapp/scripts/plain2snt-hasvcb.py b/mgizapp/scripts/plain2snt-hasvcb.py index 490c493..5e7c6b0 100755 --- a/mgizapp/scripts/plain2snt-hasvcb.py +++ b/mgizapp/scripts/plain2snt-hasvcb.py @@ -1,10 +1,18 @@ #!/usr/bin/env python +from __future__ import unicode_literals from sys import * +import codecs +import io + +if sys.version_info < (3,0,0): + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) def loadvcb(fname,out): dict={}; - df = open(fname,"r"); + df = io.open(fname,"r", encoding="UTF-8"); for line in df: out.write(line); ws = line.strip().split(); @@ -19,14 +27,14 @@ if len(argv)<9: stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n"); exit(); -ein = open(argv[3],"r"); -fin = open(argv[4],"r"); +ein = io.open(argv[3],"r", encoding="UTF-8"); +fin = io.open(argv[4],"r", encoding="UTF-8"); -eout = open(argv[5],"w"); -fout = open(argv[6],"w"); +eout = io.open(argv[5],"w", encoding="UTF-8"); +fout = io.open(argv[6],"w", encoding="UTF-8"); -evcbx = open(argv[7],"w"); -fvcbx = open(argv[8],"w"); +evcbx = io.open(argv[7],"w", encoding="UTF-8"); +fvcbx = io.open(argv[8],"w", encoding="UTF-8"); evcb = loadvcb(argv[1],evcbx); fvcb = loadvcb(argv[2],fvcbx); diff --git a/mgizapp/scripts/sntpostproc.py b/mgizapp/scripts/sntpostproc.py index b3bf528..f2f1f35 100755 --- a/mgizapp/scripts/sntpostproc.py +++ b/mgizapp/scripts/sntpostproc.py @@ -3,15 +3,25 @@ # This script post process the snt file -- either in single-line format or in multi-line format # The output, however, will always be in single-line format +from __future__ import unicode_literals from sys import * from optparse import OptionParser import re; +import codecs +import io + usage = """ The script post process the snt file, the input could be single-line snt file or multi-line, (triple line) and can insert sentence weight to the file (-w) or add partial alignment to the file (-a) Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile """ + +if sys.version_info < (3,0,0): + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + parser = OptionParser(usage=usage) @@ -37,21 +47,21 @@ if options.snt == None: parser.print_help(); exit(); else: - sfile = open(options.snt,"r"); + sfile = io.open(options.snt,"r", encoding="UTF-8"); if options.output=="-": ofile = stdout; else: - ofile = open(options.output,"w"); + ofile = io.open(options.output,"w", encoding="UTF-8"); wfile = None; if options.weight <> None: - wfile = open(options.weight,"r"); + wfile = io.open(options.weight,"r", encoding="UTF-8"); afile = None; if options.align <> None: - afile = open(options.align,"r"); + afile = io.open(options.align,"r", encoding="UTF-8"); rr = re.compile("[\\|\\#\\*]"); wt = 0.0; |