From 91b12abd53538177b32644c26784be0bc67aaa16 Mon Sep 17 00:00:00 2001 From: Jeroen Vermeulen Date: Mon, 25 May 2015 17:44:45 +0700 Subject: Make merge_alignment.py fail on missing entry. Also, clean up lint. --- mgizapp/scripts/merge_alignment.py | 177 +++++++++++++++++++++---------------- 1 file changed, 99 insertions(+), 78 deletions(-) diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py index a1b7e90..4ff5284 100755 --- a/mgizapp/scripts/merge_alignment.py +++ b/mgizapp/scripts/merge_alignment.py @@ -1,9 +1,12 @@ #!/usr/bin/env python # Author : Qin Gao # Date : Dec 31, 2007 -# Purpose: Combine multiple alignment files into a single one, the files are -# prodcuced by MGIZA, which has sentence IDs, and every file is -# ordered inside + +"""Combine multiple alignment files into a single one. + +The files are prodcuced by MGIZA, which has sentence IDs, and every file is +ordered inside. +""" from __future__ import unicode_literals import sys @@ -12,86 +15,104 @@ import codecs import io import os + def normalize_path(path): + """Normalize a filesystem path. + + Convert Windows/Unix path separators to native ones, support "~" for + home directory portably, and convert the path to an absolute. + """ path = path.replace('\\', os.sep).replace('/', os.sep) path = os.path.expanduser(path) path = os.path.abspath(path) return path -if sys.version_info < (3,0,0): - sys.stdin = codecs.getreader('UTF-8')(sys.stdin) - sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) - sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) - -if len(sys.argv)<2: - sys.stderr.write("Provide me the file names (at least 2)\n"); - sys.exit(); - -sent_id = 0; - -files = []; -ids = []; - -sents = []; -done = []; - -for i in range(1,len(sys.argv)): - fname = normalize_path(sys.argv[i]) - files.append(io.open(fname, "r", encoding="UTF-8")); - ids.append(0); - sents.append(""); - done.append(False); - -r = re.compile("\\((\\d+)\\)"); -i = 0; -while i< len(files): - st1 = files[i].readline(); - st2 = files[i].readline(); - st3 = files[i].readline(); - if len(st1)==0 or len(st2)==0 or len(st3)==0: - done[i] = True; - else: - mt = r.search(st1); - id = int(mt.group(1)); - ids[i] = id; - sents[i] = (st1, st2, st3); - i += 1 - -cont = True; -while (cont): - sent_id += 1; - writeOne = False; +ID_PATTERN = re.compile("\\((\\d+)\\)") + + +def extract_id(line): + """Extract a sentence ID from `line`.""" + match = ID_PATTERN.search(line) + return int(match.group(1)) + + +def main(): + """Main body.""" + if sys.version_info < (3, 0, 0): + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + + if len(sys.argv) < 2: + sys.stderr.write("Provide me the file names (at least 2)\n") + sys.exit() + + sent_id = 0 + + files = [] + ids = [] + + sents = [] + done = [] + + for i in range(1, len(sys.argv)): + fname = normalize_path(sys.argv[i]) + files.append(io.open(fname, "r", encoding="UTF-8")) + ids.append(0) + sents.append("") + done.append(False) + + i = 0 + while i < len(files): + st1 = files[i].readline() + st2 = files[i].readline() + st3 = files[i].readline() + if len(st1) == 0 or len(st2) == 0 or len(st3) == 0: + done[i] = True + else: + ids[i] = extract_id(st1) + sents[i] = (st1, st2, st3) + i += 1 + + cont = True + while cont: + sent_id += 1 + write_one = False # Now try to read more sentences - i = 0; - cont = False; - while i < len(files): - if done[i]: - i+=1 - continue; - cont = True; - if ids[i] == sent_id: - sys.stdout.write("%s%s%s"%(sents[i][0],sents[i][1],sents[i][2])); - writeOne = True; - st1 = files[i].readline(); - st2 = files[i].readline(); - st3 = files[i].readline(); - if len(st1)==0 or len(st2)==0 or len(st3)==0: - done[i] = True; - else: - mt = r.search(st1); - id = int(mt.group(1)); - ids[i] = id; - sents[i] = (st1, st2, st3); - cont = True; - break; - elif ids[i] < sent_id: - sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i]); - sys.exit(); - else: - cont = True; - i+=1; - if (not writeOne) and cont: - sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id); - #sys.exit(); -sys.stderr.write("Combined %d files, totally %d sents \n" %(len(files),sent_id-1)); + i = 0 + cont = False + while i < len(files): + if done[i]: + i += 1 + continue + cont = True + if ids[i] == sent_id: + sys.stdout.write( + "%s%s%s" % (sents[i][0], sents[i][1], sents[i][2])) + write_one = True + st1 = files[i].readline() + st2 = files[i].readline() + st3 = files[i].readline() + if len(st1) == 0 or len(st2) == 0 or len(st3) == 0: + done[i] = True + else: + ids[i] = extract_id(st1) + sents[i] = (st1, st2, st3) + cont = True + break + elif ids[i] < sent_id: + sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i]) + sys.exit() + else: + cont = True + i += 1 + if (not write_one) and cont: + sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id) + sys.exit(1) + sys.stderr.write( + "Combined %d files, totally %d sents \n" % (len(files), sent_id - 1)) + + +if __name__ == '__main__': + main() -- cgit v1.2.3