Update

author: edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> 2010-01-25 02:36:54 +0300
committer: edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> 2010-01-25 02:36:54 +0300
commit: 2be195bdfab57981e178f86f2dc0252f78e923a8 (patch)
tree: 7c21643c74834102e0ce24feded9b48b8c7baf68 /mgizapp
parent: cd574c22ba2556e80e8cfbc035ccdaa5ecec10a8 (diff)
7 files changed, 451 insertions, 2 deletions
diff --git a/mgizapp/scripts/force-align-moses.sh b/mgizapp/scripts/force-align-moses.sh
new file mode 100755
index 0000000..fd4cf12
--- /dev/null
+++ b/mgizapp/scripts/force-align-moses.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+MGIZA=${QMT_HOME}/bin/mgiza
+
+if [ $# -lt 4 ]; then	# need PREFIX, src_tag, tgt_tag and root-dir
+	echo "OK, this is simple, put me into your Moses training directory, link your source/target corpus" 1>&2
+	echo "and run " $0 " PREFIX src_tag tgt_tag root-dir." 1>&2
+	echo "and get force-aligned data: root-dir/giza.[src-tgt|tgt-src]/*.A3.final.* " 1>&2
+	echo "make sure I can find PREFIX.src_tag and PREFIX.tgt_tag, and \${QMT_HOME} is set" 1>&2
+	exit 1	# usage error: a bare exit would return the status of the last echo (0)
+fi
+
+PRE=$1
+SRC=$2
+TGT=$3
+ROOT=$4
+
+mkdir -p $ROOT/giza.${SRC}-${TGT}
+mkdir -p $ROOT/giza.${TGT}-${SRC}
+mkdir -p $ROOT/corpus
+
+echo "Generating corpus file " 1>&2
+
+${QMT_HOME}/scripts/plain2snt-hasvcb.py corpus/$SRC.vcb corpus/$TGT.vcb ${PRE}.${SRC} ${PRE}.${TGT} $ROOT/corpus/${TGT}-${SRC}.snt $ROOT/corpus/${SRC}-${TGT}.snt $ROOT/corpus/$SRC.vcb $ROOT/corpus/$TGT.vcb
+
+ln -sf $PWD/corpus/$SRC.vcb.classes $PWD/corpus/$TGT.vcb.classes $ROOT/corpus/
+
+echo "Generating co-occurrence file " 1>&2
+
+${QMT_HOME}/bin/snt2cooc $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc $ROOT/corpus/$SRC.vcb $ROOT/corpus/$TGT.vcb $ROOT/corpus/${TGT}-${SRC}.snt
+${QMT_HOME}/bin//snt2cooc $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc $ROOT/corpus/$TGT.vcb $ROOT/corpus/$SRC.vcb $ROOT/corpus/${SRC}-${TGT}.snt
+
+echo "Running force alignment " 1>&2
+
+$MGIZA giza.$TGT-$SRC/$TGT-$SRC.gizacfg -c $ROOT/corpus/$TGT-$SRC.snt -o $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC} \
+-s $ROOT/corpus/$SRC.vcb -t $ROOT/corpus/$TGT.vcb -m1 0 -m2 0 -mh 0 -coocurrence $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc \
+-restart 11 -previoust giza.$TGT-$SRC/$TGT-$SRC.t3.final \
+-previousa giza.$TGT-$SRC/$TGT-$SRC.a3.final -previousd giza.$TGT-$SRC/$TGT-$SRC.d3.final \
+-previousn giza.$TGT-$SRC/$TGT-$SRC.n3.final -previousd4 giza.$TGT-$SRC/$TGT-$SRC.d4.final \
+-previousd42 giza.$TGT-$SRC/$TGT-$SRC.D4.final -m3 0 -m4 1
+
+$MGIZA giza.$SRC-$TGT/$SRC-$TGT.gizacfg -c $ROOT/corpus/$SRC-$TGT.snt -o $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT} \
+-s $ROOT/corpus/$TGT.vcb -t $ROOT/corpus/$SRC.vcb -m1 0 -m2 0 -mh 0 -coocurrence $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc \
+-restart 11 -previoust giza.$SRC-$TGT/$SRC-$TGT.t3.final \
+-previousa giza.$SRC-$TGT/$SRC-$TGT.a3.final -previousd giza.$SRC-$TGT/$SRC-$TGT.d3.final \
+-previousn giza.$SRC-$TGT/$SRC-$TGT.n3.final -previousd4 giza.$SRC-$TGT/$SRC-$TGT.d4.final \
+-previousd42 giza.$SRC-$TGT/$SRC-$TGT.D4.final -m3 0 -m4 1
+
diff --git a/mgizapp/scripts/giza2bal.pl b/mgizapp/scripts/giza2bal.pl
new file mode 100755
index 0000000..fb134c0
--- /dev/null
+++ b/mgizapp/scripts/giza2bal.pl
@@ -0,0 +1,112 @@
+#! /usr/bin/perl
+
+# $Id: giza2bal.pl 1562 2008-02-19 20:48:14Z redpony $
+#Converts direct and inverted alignments into a more compact 
+#bi-alignment format. It optionally reads the counting file 
+#produced by giza containing the frequency of each training sentence.
+
+#Copyright Marcello Federico, November 2004
+
+($cnt,$dir,$inv)=();
+
+while ($w=shift @ARGV){
+  $dir=shift(@ARGV),next  if $w eq "-d";
+  $inv=shift(@ARGV),next  if $w eq "-i";
+  $cnt=shift(@ARGV),next  if $w eq "-c";
+} 
+
+my $lc = 0;
+
+if (!$dir || !$inv){	# -d and -i are both mandatory ("!inv" tested a bareword string and never fired)
+ print  "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n"; 
+ print  "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
+ exit(0);
+}
+
+$|=1;
+
+open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n";	# try as a file first, then as a command
+open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $inv\n";	# report the file that actually failed
+
+if ($cnt){
+open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $cnt\n";
+}
+
+
+sub ReadBiAlign{
+    local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_;
+    local($dummy,$n);
+    # Read one record: 3 lines each from $fd0 (count), $fd1 and $fd2 (header, sentence, alignment).
+    chomp($c=<$fd0>); ## count; chomp (not chop) so an unterminated last line keeps its final char
+    $dummy=<$fd0>; ## header
+    $dummy=<$fd0>; ## header
+    $c=1 if !$c; ## a missing or zero count defaults to 1
+
+    $dummy=<$fd1>; ## header
+    chomp($s1=<$fd1>);
+    chomp($t1=<$fd1>);
+
+    $dummy=<$fd2>; ## header
+    chomp($s2=<$fd2>);
+    chomp($t2=<$fd2>);
+
+    @a=@b=();
+    $lc++;
+    # $a[k]=n: position k is listed by the n-th token of $t1 after the NULL entry is removed (likewise @b for $t2).
+    #get target statistics
+    $n=1;
+    $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
+    while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
+        $a[$_]=$n for split(/\s+/,$2);
+        $n++;
+    }
+
+    $m=1;
+    $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
+    while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
+        $b[$_]=$m for split(/\s+/,$2);
+        $m++;
+    }
+
+    $M=split(/\s+/,$s1);
+    $N=split(/\s+/,$s2);
+    # Each alignment line must carry as many tokens as the other file's sentence line has words.
+    if ($m != ($M+1) || $n != ($N+1)) {
+      print STDERR "Sentence mismatch error! Line #$lc\n";
+      $s1 = "ALIGN_ERR";
+      $s2 = "ALIGN_ERR";
+      @a=(); @b=();
+      for ($j=1;$j<2;$j++){ $a[$j]=1; }
+      for ($i=1;$i<2;$i++){ $b[$i]=1; }
+      return 1;
+    }
+    # Positions that no token listed are set to 0.
+    for ($j=1;$j<$m;$j++){
+        $a[$j]=0 if !$a[$j];
+    }
+
+    for ($i=1;$i<$n;$i++){
+        $b[$i]=0 if !$b[$i];
+    }
+
+
+    return 1;
+}
+
+$skip=0;
+$ccc=0;
+while(!eof(DIR)){
+    # One record per pass; ReadBiAlign returns 1 on every path, so the else branch below never runs.
+    if (ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c))
+    {
+	$ccc++;
+        print "$c\n";
+        print $#a," $src \# @a[1..$#a]\n";
+        print $#b," $tgt \# @b[1..$#b]\n";
+    }
+    else{
+    	print "\n";
+        print STDERR "." if !(++$skip % 1000);
+    }
+};
+print STDERR "skip=<$skip> counts=<$ccc>\n";
diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py
new file mode 100755
index 0000000..626bc68
--- /dev/null
+++ b/mgizapp/scripts/merge_alignment.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# Author : Qin Gao
+# Date   : Dec 31, 2007
+# Purpose: Combine multiple alignment files into a single one. The files are
+#          produced by MGIZA, carry sentence IDs, and each file is
+#          internally ordered by ID.
+
+import sys
+import re
+
+if len(sys.argv)<2:
+	sys.stderr.write("Provide me the file names (at least 2)\n");
+	sys.exit();
+
+sent_id = 0;
+
+files = [];
+ids = [];
+
+sents = [];
+done = [];
+
+for i in range(1,len(sys.argv)):
+	files.append(open(sys.argv[i],"r"));
+	ids.append(0);
+	sents.append("");
+	done.append(False);
+
+r = re.compile("\\((\\d+)\\)");	
+i = 0;
+while i< len(files):
+	st1 = files[i].readline();
+	st2 = files[i].readline();
+	st3 = files[i].readline();
+	if len(st1)==0 or len(st2)==0 or len(st3)==0:
+		done[i] = True;
+	else:
+		mt = r.search(st1);
+		id = int(mt.group(1));
+		ids[i] = id;
+		sents[i] = (st1, st2, st3);
+	i += 1
+		
+cont = True;
+while (cont):
+	sent_id += 1;
+	writeOne = False;
+# Now try to read more sentences
+	i = 0;
+	cont = False;
+	while i < len(files):
+		if done[i]:
+			i+=1
+			continue;
+		cont = True;
+		if ids[i] == sent_id:
+			sys.stdout.write("%s%s%s"%(sents[i][0],sents[i][1],sents[i][2]));
+			writeOne = True;
+			st1 = files[i].readline();
+			st2 = files[i].readline();
+			st3 = files[i].readline();
+			if len(st1)==0 or len(st2)==0 or len(st3)==0:
+				done[i] = True;
+			else:
+				mt = r.search(st1);
+				id = int(mt.group(1));
+				ids[i] = id;
+				sents[i] = (st1, st2, st3);
+				cont = True;
+			break;
+		elif ids[i] < sent_id:
+			sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i]);
+			sys.exit();
+		else:
+			cont = True;
+		i+=1;
+	if (not writeOne) and cont:
+		sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id);
+		#sys.exit();
+sys.stderr.write("Combined %d files, totally %d sents \n" %(len(files),sent_id-1));
diff --git a/mgizapp/scripts/plain2snt-hasvcb.py b/mgizapp/scripts/plain2snt-hasvcb.py
new file mode 100755
index 0000000..490c493
--- /dev/null
+++ b/mgizapp/scripts/plain2snt-hasvcb.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+
+from sys import *
+
+def loadvcb(fname,out):
+	dict={};
+	df = open(fname,"r");
+	for line in df:
+		out.write(line);
+		ws = line.strip().split();
+	        id = int(ws[0]);
+		wd = ws[1];
+		dict[wd]=id;
+	return dict;
+
+if len(argv)<9:
+	stderr.write("Error, the input should be \n");
+	stderr.write("%s evcb fvcb etxt ftxt esnt(out) fsnt(out) evcbx(out) fvcbx(out)\n" % argv[0]);
+	stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n");
+	exit();
+
+ein = open(argv[3],"r");
+fin = open(argv[4],"r");
+
+eout = open(argv[5],"w");
+fout = open(argv[6],"w");
+
+evcbx = open(argv[7],"w");
+fvcbx = open(argv[8],"w");
+evcb = loadvcb(argv[1],evcbx);
+fvcb = loadvcb(argv[2],fvcbx);
+
+i=0
+while True:
+	i+=1;
+	eline=ein.readline();
+	fline=fin.readline();
+	if len(eline)==0 or len(fline)==0:
+		break;
+	ewords = eline.strip().split();
+	fwords = fline.strip().split();
+	el = [];
+	fl = [];
+	j=0;
+	for w in ewords:
+		j+=1
+		if evcb.has_key(w):
+			el.append(evcb[w]);
+		else:
+			if evcb.has_key(w.lower()):
+				el.append(evcb[w.lower()]);
+			else:
+				##stdout.write("#E %d %d %s\n" % (i,j,w))
+				#el.append(1);
+				nid = len(evcb)+1;
+				evcb[w.lower()] = nid;
+				evcbx.write("%d %s 1\n" % (nid, w));
+				el.append(nid);
+
+	j=0;
+	for w in fwords:
+		j+=1
+		if fvcb.has_key(w):
+			fl.append(fvcb[w]);
+		else:
+			if fvcb.has_key(w.lower()):
+				fl.append(fvcb[w.lower()]);
+			else:
+				#stdout.write("#F %d %d %s\n" % (i,j,w))
+				nid = len(fvcb)+1;
+				fvcb[w.lower()] = nid;
+				fvcbx.write("%d %s 1\n" % (nid, w));
+				fl.append(nid);
+				#fl.append(1);
+	eout.write("1\n");
+	fout.write("1\n");
+	for I in el:
+		eout.write("%d " % I);
+	eout.write("\n");
+	for I in fl:
+		eout.write("%d " % I);
+		fout.write("%d " % I);
+	eout.write("\n");
+	fout.write("\n");
+	for I in el:
+		fout.write("%d " % I);
+	fout.write("\n");
+
+fout.close();
+eout.close();
+fvcbx.close();
+evcbx.close();
+
diff --git a/mgizapp/scripts/sntpostproc.py b/mgizapp/scripts/sntpostproc.py
new file mode 100755
index 0000000..b3bf528
--- /dev/null
+++ b/mgizapp/scripts/sntpostproc.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+# This script post-processes the snt file, which may be in single-line or multi-line format.
+# The output, however, will always be in single-line format.
+
+from sys import *
+from optparse import OptionParser
+import re;
+usage = """
+The script post process the snt file, the input could be single-line snt 
+file or multi-line, (triple line) and can insert sentence weight to the
+file (-w) or add partial alignment to the file (-a)
+Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile
+"""
+parser = OptionParser(usage=usage)
+
+
+parser = OptionParser()
+
+parser.add_option("-s", "--snt", dest="snt",default=None,
+		help="The input snt file", metavar="FILE")
+
+parser.add_option("-w", "--weight", dest="weight",default=None,
+		help="The input weight file", metavar="FILE")
+
+
+parser.add_option("-o", "--output", dest="output",default="-",
+		help="The input partial alignment file, one sentence per line", metavar="FILE")
+
+parser.add_option("-a", "--align", dest="align",default=None,
+		help="The input partial alignment file, one sentence per line", metavar="FILE")
+
+
+(options, args) = parser.parse_args()
+
+if options.snt == None:
+	parser.print_help();
+	exit();
+else:
+	sfile = open(options.snt,"r");
+
+if options.output=="-":
+	ofile = stdout;
+else:
+	ofile = open(options.output,"w");
+
+wfile = None;
+
+if options.weight <> None:
+	wfile = open(options.weight,"r");
+
+afile = None;
+if options.align <> None:
+	afile = open(options.align,"r");
+
+rr = re.compile("[\\|\\#\\*]");
+wt = 0.0;
+al = {};
+e = "";
+f = "";
+
+def parse_ax(line):
+	alq = {};
+	als = line.strip().split(" ");
+	for e in als:
+		if len(e.strip())>0:
+			alo = e.split("-");
+			if len(alo)==2:
+				alq[tuple(alo)] = 1;
+	return alq;
+	
+
+
+
+
+
+while True:
+	l = sfile.readline();
+	if len(l) == 0:
+		break;
+	lp = rr.split(l.strip());
+	if len(lp)>=3:
+		wt = float(lp[0]);
+		e = lp[1];
+		f = lp[2];
+		if len(lp) > 3:
+			al = parse_ax(lp[3]);
+		else:
+			al = {};
+	else:
+		wt = float(l);
+		e = sfile.readline().strip();
+		f = sfile.readline().strip();
+		al={}
+	if wfile <> None:
+		lw = wfile.readline().strip();
+		if len(lw)>0:
+			wt = float(lw);
+		else:
+			wt = 1;
+	if afile <> None:
+		la = afile.readline().strip();
+		if len(la)>0:
+			al1 = parse_ax(la);
+			for entry in al1.keys():
+				al[entry] = 1;
+
+	ofile.write("%g | %s | %s" % (wt, e, f));
+	if len(al)>0:
+		ofile.write(" |");
+
+		for entry in al.keys():
+			ofile.write(" %s-%s" % entry);
+	ofile.write("\n");
+
+	
diff --git a/mgizapp/src/hmm.cpp b/mgizapp/src/hmm.cpp
index dd8cde5..ca48477 100644
--- a/mgizapp/src/hmm.cpp
+++ b/mgizapp/src/hmm.cpp
@@ -124,7 +124,7 @@ int hmm::em_with_tricks(int noIterations,bool dumpCount,
         pair_no = 0;
         it_st = time(NULL) ;
         cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n';
-        dump_files = (dumpFreq != 0) && ((it % dumpFreq) == 0) && !NODUMPS;
+        dump_files = (dumpFreq != 0) && ((it % dumpFreq) == 0 || it == noIterations) && !NODUMPS;
         //dump_files = true;
         number = "";
         int n = it;
diff --git a/mgizapp/src/model1.cpp b/mgizapp/src/model1.cpp
index e649f8d..18fc22a 100644
--- a/mgizapp/src/model1.cpp
+++ b/mgizapp/src/model1.cpp
@@ -138,7 +138,7 @@ int model1::em_with_tricks(int noIterations, /*Perplexity& perp, sentenceHandler
         pair_no = 0 ;
         it_st = time(NULL);
         cout <<  "-----------\n" << modelName << ": Iteration " << it << '\n';
-        dump_files = (Model1_Dump_Freq != 0) &&  ((it % Model1_Dump_Freq)  == 0) && !NODUMPS ;
+        dump_files = (Model1_Dump_Freq != 0) &&  ((it % Model1_Dump_Freq)  == 0 || it == noIterations) && !NODUMPS ;
 	//dump_files = true;
         number = "";
         int n = it;
author	edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d>	2010-01-25 02:36:54 +0300
committer	edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d>	2010-01-25 02:36:54 +0300
commit	2be195bdfab57981e178f86f2dc0252f78e923a8 (patch)
tree	7c21643c74834102e0ce24feded9b48b8c7baf68 /mgizapp
parent	cd574c22ba2556e80e8cfbc035ccdaa5ecec10a8 (diff)