diff options
Diffstat (limited to 'mgizapp/scripts/merge_alignment.py')
-rw-r--r-- | mgizapp/scripts/merge_alignment.py | 80 |
1 files changed, 0 insertions, 80 deletions
#!/usr/bin/env python
# Author : Qin Gao
# Date   : Dec 31, 2007
# Purpose: Combine multiple alignment files into a single one. The files are
#          produced by MGIZA, which has sentence IDs, and every file is
#          ordered inside.
#
# Provenance: recovered from the deletion diff of
#             mgizapp/scripts/merge_alignment.py (blob 626bc68).
#
# Usage: merge_alignment.py FILE [FILE ...] > merged
#
# Each input is read as a sequence of three-line records. The sentence ID of
# a record is the first "(<digits>)" group found on the record's first line.
# Records are written to stdout in increasing sentence-ID order, starting at
# 1; diagnostics and the final summary go to stderr.

import re
import sys

# First "(<digits>)" group on a record's first line carries the sentence ID.
SENT_ID_RE = re.compile(r"\((\d+)\)")


def _read_record(handle):
    """Read the next three-line record from *handle*.

    Returns a ``(sent_id, (line1, line2, line3))`` pair, or ``None`` when
    fewer than three lines remain (an incomplete trailing record is treated
    as end of input).

    Raises ``ValueError`` if the first line of a complete record contains no
    ``(<digits>)`` sentence ID.
    """
    # Always consume three lines, even if an early one is already empty.
    lines = (handle.readline(), handle.readline(), handle.readline())
    if not all(lines):
        return None
    match = SENT_ID_RE.search(lines[0])
    if match is None:
        raise ValueError(
            "ERROR! NO SENTENCE ID IN RECORD HEADER: %s" % lines[0].rstrip("\n")
        )
    return int(match.group(1)), lines


def merge_alignments(handles, out=None, err=None):
    """Merge the records of several ID-ordered inputs into one stream.

    handles -- sequence of objects with ``readline()``, one per input file.
    out     -- object with ``write()`` receiving the merged records
               (defaults to ``sys.stdout``).
    err     -- object with ``write()`` receiving "missing entry" warnings
               (defaults to ``sys.stderr``).

    Sentence IDs are visited as 1, 2, 3, ...; for each ID the inputs are
    scanned in the given order and the first one whose current record carries
    that ID is written and advanced. An ID that no input currently holds,
    while some input still has records, is reported on *err* and skipped.

    Returns the highest sentence ID visited (IDs reported as missing are
    included in that number).

    Raises ``ValueError`` if an input's current record has an ID lower than
    the one being looked for (a duplicated or out-of-order entry), or if a
    record header has no sentence ID.
    """
    if out is None:
        out = sys.stdout
    if err is None:
        err = sys.stderr

    # pending[k] is the not-yet-written record of input k, or None once that
    # input is exhausted.
    pending = [_read_record(handle) for handle in handles]

    sent_id = 0
    while any(record is not None for record in pending):
        sent_id += 1
        wrote_one = False
        for index, record in enumerate(pending):
            if record is None:
                continue
            record_id, lines = record
            if record_id == sent_id:
                out.write("%s%s%s" % lines)
                pending[index] = _read_record(handles[index])
                wrote_one = True
                # Only one record is emitted per sentence ID; a second input
                # holding the same ID is caught as a duplicate on the next
                # pass.
                break
            if record_id < sent_id:
                raise ValueError("ERROR! DUPLICATED ENTRY %d" % record_id)
        if not wrote_one:
            # Deliberately non-fatal: keep going with the next sentence ID.
            err.write("ERROR! MISSING ENTRy %d\n" % sent_id)
    return sent_id


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    argv -- full argument vector including the program name (defaults to
            ``sys.argv``).

    Returns 0 on success and 1 on a usage error or a fatal merge error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write("Provide me the file names (at least 2)\n")
        return 1

    handles = []
    try:
        for path in argv[1:]:
            handles.append(open(path, "r"))
        try:
            total = merge_alignments(handles, sys.stdout, sys.stderr)
        except ValueError as exc:
            sys.stderr.write("%s\n" % exc)
            return 1
    finally:
        # Close whatever was opened, including on a failed open() or abort.
        for handle in handles:
            handle.close()

    sys.stderr.write(
        "Combined %d files, totally %d sents \n" % (len(handles), total)
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())