Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to 'mgizapp/scripts/sntpostproc.py')
-rwxr-xr-x mgizapp/scripts/sntpostproc.py 116
1 files changed, 116 insertions, 0 deletions
diff --git a/mgizapp/scripts/sntpostproc.py b/mgizapp/scripts/sntpostproc.py
new file mode 100755
index 0000000..b3bf528
--- /dev/null
+++ b/mgizapp/scripts/sntpostproc.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+# This script post-processes the snt file, which may be in either single-line or multi-line format.
+# The output, however, will always be in single-line format.
+
+from sys import *
+from optparse import OptionParser
+import re;
+usage = """
+The script post process the snt file, the input could be single-line snt
+file or multi-line, (triple line) and can insert sentence weight to the
+file (-w) or add partial alignment to the file (-a)
+Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile
+"""
+# Command-line interface.  The parser is built exactly once so that the usage
+# text above is kept; re-creating it with a bare OptionParser() discards it.
+parser = OptionParser(usage=usage)
+
+parser.add_option("-s", "--snt", dest="snt", default=None,
+                  help="The input snt file", metavar="FILE")
+
+# Optional: one weight per sentence; when given it overrides the snt weight.
+parser.add_option("-w", "--weight", dest="weight", default=None,
+                  help="The input weight file", metavar="FILE")
+
+# "-" (the default) means standard output.
+parser.add_option("-o", "--output", dest="output", default="-",
+                  help="The output file, '-' for standard output", metavar="FILE")
+
+parser.add_option("-a", "--align", dest="align", default=None,
+                  help="The input partial alignment file, one sentence per line", metavar="FILE")
+
+# Positional arguments are accepted but never used by the script.
+(options, args) = parser.parse_args()
+
+# The snt file is mandatory: without it, show the help text and stop.
+if options.snt is None:
+    parser.print_help()
+    exit()
+sfile = open(options.snt, "r")
+
+# Write to standard output unless an explicit output file was requested.
+if options.output == "-":
+    ofile = stdout
+else:
+    ofile = open(options.output, "w")
+
+# Optional inputs stay None when the matching option was not given.
+# ("is not None" replaces the "<>" operator, which Python 3 rejects.)
+wfile = None
+if options.weight is not None:
+    wfile = open(options.weight, "r")
+afile = None
+if options.align is not None:
+    afile = open(options.align, "r")
+
+# Field separators recognised in a single-line snt record: "|", "#" or "*".
+rr = re.compile("[\\|\\#\\*]")
+# Per-sentence state: weight, alignment links and the two sentence fields.
+wt, al, e, f = 0.0, {}, "", ""
+
+def parse_ax(line):
+    """Parse whitespace-separated "a-b" alignment links from `line`.
+
+    Returns a dict mapping each (a, b) string pair to 1; tokens without
+    exactly one "-" are skipped.  Splitting on any whitespace (not just a
+    single space) keeps tab-separated links from being silently dropped.
+    """
+    links = {}
+    for token in line.split():
+        pair = token.split("-")
+        if len(pair) == 2:
+            links[tuple(pair)] = 1
+    return links
+
+
+# Main loop: read one sentence record per iteration and write it back out in
+# single-line form.  NOTE(review): nesting was reconstructed from the logic,
+# since the source lost its indentation -- confirm the "wt = 1" branch.
+while True:
+    l = sfile.readline()
+    if not l:  # readline() returns "" only at the end of the snt file
+        break
+    lp = rr.split(l.strip())
+    if len(lp) >= 3:
+        # Single-line record: weight, two sentences, optional alignment.
+        wt, e, f = float(lp[0]), lp[1], lp[2]
+        al = parse_ax(lp[3]) if len(lp) > 3 else {}
+    else:
+        # Multi-line record: the weight, then one sentence on each of the
+        # next two lines; this form carries no alignment.
+        wt = float(l)
+        e = sfile.readline().strip()
+        f = sfile.readline().strip()
+        al = {}
+    if wfile is not None:
+        # The weight file overrides the weight; a blank line means 1.
+        lw = wfile.readline().strip()
+        wt = float(lw) if lw else 1
+    if afile is not None:
+        # Links from the alignment file are merged into those already read.
+        al.update(parse_ax(afile.readline().strip()))
+
+    ofile.write("%g | %s | %s" % (wt, e, f))
+    if al:
+        ofile.write(" |")
+    for entry in al:
+        ofile.write(" %s-%s" % entry)
+    ofile.write("\n")
+
+# Release the files; standard output is left open for the interpreter.
+for fh in (sfile, wfile, afile, ofile):
+    if fh is not None and fh is not stdout:
+        fh.close()
+
+