diff options
Diffstat (limited to 'mgizapp/scripts/merge_alignment.py')
-rwxr-xr-x | mgizapp/scripts/merge_alignment.py | 80 |
1 files changed, 80 insertions, 0 deletions
#!/usr/bin/env python
# Author : Qin Gao
# Date   : Dec 31, 2007
# Purpose: Combine multiple alignment files into a single one. The files are
#          produced by MGIZA, carry sentence IDs, and each file is ordered
#          by sentence ID internally.
#
# Usage  : merge_alignment.py FILE [FILE ...] > merged
#          The merged records go to stdout; diagnostics go to stderr.
#          The exit status is 0 on success and 1 on a usage error or an
#          unmergeable input (duplicated sentence ID, header without an ID).

import sys
import re

# First parenthesised integer on a record's header line; the merge uses it as
# the sentence ID of that record.
SENT_ID_PATTERN = re.compile(r"\((\d+)\)")


class MergeError(Exception):
    """Raised when the inputs cannot be merged into one ordered stream."""


def _read_record(lines):
    """Read the next three-line record from the line iterator *lines*.

    Returns a ``(sentence_id, record)`` pair, where ``record`` is the tuple of
    the three raw lines, or ``None`` once the input is exhausted.  A trailing
    partial record (fewer than three lines left) also counts as exhausted.

    Raises MergeError if the first line of the record has no ``(<digits>)``
    group to take the sentence ID from.
    """
    # Always consume three lines, mirroring three consecutive readline() calls.
    record = (next(lines, ""), next(lines, ""), next(lines, ""))
    if not all(record):
        return None
    match = SENT_ID_PATTERN.search(record[0])
    if match is None:
        raise MergeError("ERROR! NO SENTENCE ID IN %r" % record[0])
    return int(match.group(1)), record


def merge_alignments(streams, out=None, err=None):
    """Merge three-line alignment records from *streams* in sentence-ID order.

    Parameters:
        streams: sequence of iterables of lines (open text files work); each
                 one must already be ordered by sentence ID.
        out:     object with ``write``; receives the merged records.
                 Defaults to ``sys.stdout``.
        err:     object with ``write``; receives missing-entry diagnostics.
                 Defaults to ``sys.stderr``.

    Returns the highest sentence ID that was processed (0 when every input is
    empty).  IDs that no input provides are reported on *err* and skipped.

    Raises MergeError when a sentence ID turns up after it has already been
    passed (a duplicate or out-of-order record), or when a record header
    carries no sentence ID.
    """
    out = sys.stdout if out is None else out
    err = sys.stderr if err is None else err

    readers = [iter(stream) for stream in streams]
    # pending[i] is the next unwritten (sentence_id, record) of input i, or
    # None once that input has run out of complete records.
    pending = [_read_record(reader) for reader in readers]

    sent_id = 0
    while any(entry is not None for entry in pending):
        sent_id += 1
        written = False
        for index, entry in enumerate(pending):
            if entry is None:
                continue
            entry_id, record = entry
            if entry_id == sent_id:
                out.write("".join(record))
                pending[index] = _read_record(readers[index])
                written = True
                # Only one record per sentence ID is emitted; a second copy in
                # another input is caught as a duplicate on the next round.
                break
            if entry_id < sent_id:
                raise MergeError("ERROR! DUPLICATED ENTRY %d" % entry_id)
        if not written:
            # Deliberately non-fatal: a gap in the IDs is reported and skipped.
            err.write("ERROR! MISSING ENTRy %d\n" % sent_id)
    return sent_id


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    *argv* defaults to ``sys.argv``; every element after the first is taken as
    an input file name.  A single file is accepted even though the usage
    message asks for two.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write("Provide me the file names (at least 2)\n")
        return 1

    files = []
    try:
        for name in argv[1:]:
            files.append(open(name, "r"))
        try:
            total = merge_alignments(files)
        except MergeError as exc:
            sys.stderr.write("%s\n" % exc)
            return 1
    finally:
        # Close whatever was opened, including when a later open() fails.
        for handle in files:
            handle.close()

    sys.stderr.write("Combined %d files, totally %d sents \n" % (len(files), total))
    return 0


if __name__ == "__main__":
    sys.exit(main())