Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeroen Vermeulen <jtv@precisiontranslationtools.com>2015-05-25 13:44:45 +0300
committerJeroen Vermeulen <jtv@precisiontranslationtools.com>2015-05-25 13:44:45 +0300
commit91b12abd53538177b32644c26784be0bc67aaa16 (patch)
tree08323df659365bcd6353c3a4c1ad4996d7b62f5f
parentd3597a69e2aab0a7111aae196fd81e310cc70fea (diff)
Make merge_alignment.py fail on missing entry.
Also, clean up lint.
-rwxr-xr-xmgizapp/scripts/merge_alignment.py177
1 files changed, 99 insertions, 78 deletions
diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py
index a1b7e90..4ff5284 100755
--- a/mgizapp/scripts/merge_alignment.py
+++ b/mgizapp/scripts/merge_alignment.py
@@ -1,9 +1,12 @@
#!/usr/bin/env python
# Author : Qin Gao
# Date : Dec 31, 2007
-# Purpose: Combine multiple alignment files into a single one, the files are
-# prodcuced by MGIZA, which has sentence IDs, and every file is
-# ordered inside
+
+"""Combine multiple alignment files into a single one.
+
+The files are prodcuced by MGIZA, which has sentence IDs, and every file is
+ordered inside.
+"""
from __future__ import unicode_literals
import sys
@@ -12,86 +15,104 @@ import codecs
import io
import os
+
def normalize_path(path):
+ """Normalize a filesystem path.
+
+ Convert Windows/Unix path separators to native ones, support "~" for
+ home directory portably, and convert the path to an absolute.
+ """
path = path.replace('\\', os.sep).replace('/', os.sep)
path = os.path.expanduser(path)
path = os.path.abspath(path)
return path
-if sys.version_info < (3,0,0):
- sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
- sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
- sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
-
-if len(sys.argv)<2:
- sys.stderr.write("Provide me the file names (at least 2)\n");
- sys.exit();
-
-sent_id = 0;
-
-files = [];
-ids = [];
-
-sents = [];
-done = [];
-
-for i in range(1,len(sys.argv)):
- fname = normalize_path(sys.argv[i])
- files.append(io.open(fname, "r", encoding="UTF-8"));
- ids.append(0);
- sents.append("");
- done.append(False);
-
-r = re.compile("\\((\\d+)\\)");
-i = 0;
-while i< len(files):
- st1 = files[i].readline();
- st2 = files[i].readline();
- st3 = files[i].readline();
- if len(st1)==0 or len(st2)==0 or len(st3)==0:
- done[i] = True;
- else:
- mt = r.search(st1);
- id = int(mt.group(1));
- ids[i] = id;
- sents[i] = (st1, st2, st3);
- i += 1
-
-cont = True;
-while (cont):
- sent_id += 1;
- writeOne = False;
+ID_PATTERN = re.compile("\\((\\d+)\\)")
+
+
+def extract_id(line):
+ """Extract a sentence ID from `line`."""
+ match = ID_PATTERN.search(line)
+ return int(match.group(1))
+
+
+def main():
+ """Main body."""
+ if sys.version_info < (3, 0, 0):
+ sys.stdin = codecs.getreader('UTF-8')(sys.stdin)
+ sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
+ sys.stderr = codecs.getwriter('UTF-8')(sys.stderr)
+
+ if len(sys.argv) < 2:
+ sys.stderr.write("Provide me the file names (at least 2)\n")
+ sys.exit()
+
+ sent_id = 0
+
+ files = []
+ ids = []
+
+ sents = []
+ done = []
+
+ for i in range(1, len(sys.argv)):
+ fname = normalize_path(sys.argv[i])
+ files.append(io.open(fname, "r", encoding="UTF-8"))
+ ids.append(0)
+ sents.append("")
+ done.append(False)
+
+ i = 0
+ while i < len(files):
+ st1 = files[i].readline()
+ st2 = files[i].readline()
+ st3 = files[i].readline()
+ if len(st1) == 0 or len(st2) == 0 or len(st3) == 0:
+ done[i] = True
+ else:
+ ids[i] = extract_id(st1)
+ sents[i] = (st1, st2, st3)
+ i += 1
+
+ cont = True
+ while cont:
+ sent_id += 1
+ write_one = False
# Now try to read more sentences
- i = 0;
- cont = False;
- while i < len(files):
- if done[i]:
- i+=1
- continue;
- cont = True;
- if ids[i] == sent_id:
- sys.stdout.write("%s%s%s"%(sents[i][0],sents[i][1],sents[i][2]));
- writeOne = True;
- st1 = files[i].readline();
- st2 = files[i].readline();
- st3 = files[i].readline();
- if len(st1)==0 or len(st2)==0 or len(st3)==0:
- done[i] = True;
- else:
- mt = r.search(st1);
- id = int(mt.group(1));
- ids[i] = id;
- sents[i] = (st1, st2, st3);
- cont = True;
- break;
- elif ids[i] < sent_id:
- sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i]);
- sys.exit();
- else:
- cont = True;
- i+=1;
- if (not writeOne) and cont:
- sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id);
- #sys.exit();
-sys.stderr.write("Combined %d files, totally %d sents \n" %(len(files),sent_id-1));
+ i = 0
+ cont = False
+ while i < len(files):
+ if done[i]:
+ i += 1
+ continue
+ cont = True
+ if ids[i] == sent_id:
+ sys.stdout.write(
+ "%s%s%s" % (sents[i][0], sents[i][1], sents[i][2]))
+ write_one = True
+ st1 = files[i].readline()
+ st2 = files[i].readline()
+ st3 = files[i].readline()
+ if len(st1) == 0 or len(st2) == 0 or len(st3) == 0:
+ done[i] = True
+ else:
+ ids[i] = extract_id(st1)
+ sents[i] = (st1, st2, st3)
+ cont = True
+ break
+ elif ids[i] < sent_id:
+ sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i])
+ sys.exit()
+ else:
+ cont = True
+ i += 1
+ if (not write_one) and cont:
+ sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id)
+ sys.exit(1)
+ sys.stderr.write(
+ "Combined %d files, totally %d sents \n" % (len(files), sent_id - 1))
+
+
+if __name__ == '__main__':
+ main()