Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> 2010-01-25 02:36:54 +0300
committer: edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> 2010-01-25 02:36:54 +0300
commit: 2be195bdfab57981e178f86f2dc0252f78e923a8 (patch)
tree: 7c21643c74834102e0ce24feded9b48b8c7baf68 /mgizapp
parent: cd574c22ba2556e80e8cfbc035ccdaa5ecec10a8 (diff)
Update
Diffstat (limited to 'mgizapp')
-rwxr-xr-x mgizapp/scripts/force-align-moses.sh 48
-rwxr-xr-x mgizapp/scripts/giza2bal.pl 112
-rwxr-xr-x mgizapp/scripts/merge_alignment.py 80
-rwxr-xr-x mgizapp/scripts/plain2snt-hasvcb.py 93
-rwxr-xr-x mgizapp/scripts/sntpostproc.py 116
-rw-r--r-- mgizapp/src/hmm.cpp 2
-rw-r--r-- mgizapp/src/model1.cpp 2
7 files changed, 451 insertions, 2 deletions
diff --git a/mgizapp/scripts/force-align-moses.sh b/mgizapp/scripts/force-align-moses.sh
new file mode 100755
index 0000000..fd4cf12
--- /dev/null
+++ b/mgizapp/scripts/force-align-moses.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+# Force-align PREFIX.src_tag / PREFIX.tgt_tag in both directions with mgiza, loading the *.final tables found under ./giza.*
+MGIZA="${QMT_HOME}/bin/mgiza"
+
+if [ $# -lt 4 ] || [ -z "${QMT_HOME}" ]; then   # every tool below is located through QMT_HOME
+ echo "OK, this is simple, put me into your Moses training directory, link your source/target corpus" 1>&2
+ echo "and run " $0 " PREFIX src_tag tgt_tag root-dir." 1>&2
+ echo "and get force-aligned data: root-dir/giza.[src-tgt|tgt-src]/*.A3.final.* " 1>&2
+ echo "make sure I can find PREFIX.src_tag-tgt_tag and PREFIX.tgt_tag-src_tag, and \${QMT_HOME} is set" 1>&2
+ exit 1   # usage error: report failure instead of the status of the last echo
+fi
+
+PRE=$1
+SRC=$2
+TGT=$3
+ROOT=$4
+# Output directories: one per alignment direction plus the regenerated corpus files
+mkdir -p "$ROOT/giza.${SRC}-${TGT}"
+mkdir -p "$ROOT/giza.${TGT}-${SRC}"
+mkdir -p "$ROOT/corpus"
+
+echo "Generating corpus file " 1>&2
+# Reads the existing vocabularies from ./corpus and writes snt and vocabulary files under $ROOT/corpus
+"${QMT_HOME}/scripts/plain2snt-hasvcb.py" "corpus/$SRC.vcb" "corpus/$TGT.vcb" "${PRE}.${SRC}" "${PRE}.${TGT}" "$ROOT/corpus/${TGT}-${SRC}.snt" "$ROOT/corpus/${SRC}-${TGT}.snt" "$ROOT/corpus/$SRC.vcb" "$ROOT/corpus/$TGT.vcb"
+
+ln -sf "$PWD/corpus/$SRC.vcb.classes" "$PWD/corpus/$TGT.vcb.classes" "$ROOT/corpus/"
+
+echo "Generating co-occurrence file " 1>&2
+
+"${QMT_HOME}/bin/snt2cooc" "$ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc" "$ROOT/corpus/$SRC.vcb" "$ROOT/corpus/$TGT.vcb" "$ROOT/corpus/${TGT}-${SRC}.snt"
+"${QMT_HOME}/bin/snt2cooc" "$ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc" "$ROOT/corpus/$TGT.vcb" "$ROOT/corpus/$SRC.vcb" "$ROOT/corpus/${SRC}-${TGT}.snt"
+
+echo "Running force alignment " 1>&2
+# Both runs take their config and -previous* tables from ./giza.* and write their output under $ROOT/giza.*
+"$MGIZA" "giza.$TGT-$SRC/$TGT-$SRC.gizacfg" -c "$ROOT/corpus/$TGT-$SRC.snt" -o "$ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}" \
+-s "$ROOT/corpus/$SRC.vcb" -t "$ROOT/corpus/$TGT.vcb" -m1 0 -m2 0 -mh 0 -coocurrence "$ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc" \
+-restart 11 -previoust "giza.$TGT-$SRC/$TGT-$SRC.t3.final" \
+-previousa "giza.$TGT-$SRC/$TGT-$SRC.a3.final" -previousd "giza.$TGT-$SRC/$TGT-$SRC.d3.final" \
+-previousn "giza.$TGT-$SRC/$TGT-$SRC.n3.final" -previousd4 "giza.$TGT-$SRC/$TGT-$SRC.d4.final" \
+-previousd42 "giza.$TGT-$SRC/$TGT-$SRC.D4.final" -m3 0 -m4 1
+
+"$MGIZA" "giza.$SRC-$TGT/$SRC-$TGT.gizacfg" -c "$ROOT/corpus/$SRC-$TGT.snt" -o "$ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}" \
+-s "$ROOT/corpus/$TGT.vcb" -t "$ROOT/corpus/$SRC.vcb" -m1 0 -m2 0 -mh 0 -coocurrence "$ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc" \
+-restart 11 -previoust "giza.$SRC-$TGT/$SRC-$TGT.t3.final" \
+-previousa "giza.$SRC-$TGT/$SRC-$TGT.a3.final" -previousd "giza.$SRC-$TGT/$SRC-$TGT.d3.final" \
+-previousn "giza.$SRC-$TGT/$SRC-$TGT.n3.final" -previousd4 "giza.$SRC-$TGT/$SRC-$TGT.d4.final" \
+-previousd42 "giza.$SRC-$TGT/$SRC-$TGT.D4.final" -m3 0 -m4 1
+
diff --git a/mgizapp/scripts/giza2bal.pl b/mgizapp/scripts/giza2bal.pl
new file mode 100755
index 0000000..fb134c0
--- /dev/null
+++ b/mgizapp/scripts/giza2bal.pl
@@ -0,0 +1,112 @@
+#! /usr/bin/perl
+
+# $Id: giza2bal.pl 1562 2008-02-19 20:48:14Z redpony $
+#Converts direct and inverted alignments into a more compact
+#bi-alignment format. It optionally reads the counting file
+#produced by giza containing the frequency of each training sentence.
+
+#Copyright Marcello Federico, November 2004
+
+($cnt,$dir,$inv)=();
+# Parse the -c/-d/-i options; any other argument is silently ignored.
+while ($w=shift @ARGV){
+ $dir=shift(@ARGV),next if $w eq "-d";
+ $inv=shift(@ARGV),next if $w eq "-i";
+ $cnt=shift(@ARGV),next if $w eq "-c";
+}
+
+my $lc = 0; # sentence pairs read so far; reported in mismatch errors
+# Both alignment inputs are mandatory. ("!inv" tested a bareword string, which is always true, so a missing -i went unnoticed.)
+if (!$dir || !$inv){
+ print "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n";
+ print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
+ exit(0);
+}
+
+$|=1; # flush the selected output handle (STDOUT) after every print
+# Each input is tried as a plain file first and, failing that, as a command read through a pipe; errors name the input that failed.
+open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n";
+open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $inv\n";
+
+if ($cnt){
+open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $cnt\n";
+}
+
+
+sub ReadBiAlign{
+ local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_;
+ local($dummy,$n);
+ # Reads one record: 3 lines from $fd0 (count + 2 headers) and 3 lines each from $fd1 and $fd2 (header, sentence, alignment). *s1,*s2,*a,*b,*c are globs, so results land in the caller's variables.
+ chop($c=<$fd0>); ## count
+ $dummy=<$fd0>; ## header
+ $dummy=<$fd0>; ## header
+ $c=1 if !$c; # default count when none was read (or it was 0)
+
+ $dummy=<$fd1>; ## header
+ chop($s1=<$fd1>);
+ chop($t1=<$fd1>);
+
+ $dummy=<$fd2>; ## header
+ chop($s2=<$fd2>);
+ chop($t2=<$fd2>);
+ # $t1/$t2 hold the alignment lines; the substitutions below consume them token by token.
+ @a=@b=();
+ $lc++; # record counter used in the mismatch message below
+
+ #get target statistics
+ $n=1;
+ $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//; # drop the NULL ({ ... }) entry
+ while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){ # consume one "word ({ i j ... })" token per pass
+ grep($a[$_]=$n,split(/\s+/,$2)); # $a[i]=$n for every listed position i
+ $n++;
+ }
+
+ $m=1;
+ $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//; # same treatment for the inverse alignment line
+ while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
+ grep($b[$_]=$m,split(/\s+/,$2));
+ $m++;
+ }
+ # split in scalar context yields the number of fields in each sentence line.
+ $M=split(/\s+/,$s1);
+ $N=split(/\s+/,$s2);
+ # Cross-check: tokens parsed from $t2 must match the fields of $s1, and tokens parsed from $t1 the fields of $s2.
+ if ($m != ($M+1) || $n != ($N+1)) {
+ print STDERR "Sentence mismatch error! Line #$lc\n";
+ $s1 = "ALIGN_ERR";
+ $s2 = "ALIGN_ERR";
+ @a=(); @b=();
+ for ($j=1;$j<2;$j++){ $a[$j]=1; } # placeholder: only $a[1]=1 is set
+ for ($i=1;$i<2;$i++){ $b[$i]=1; } # placeholder: only $b[1]=1 is set
+ return 1; # the placeholder record is still reported as a successful read
+ }
+ # Positions that no token mentioned are filled with 0.
+ for ($j=1;$j<$m;$j++){
+ $a[$j]=0 if !$a[$j];
+ }
+
+ for ($i=1;$i<$n;$i++){
+ $b[$i]=0 if !$b[$i];
+ }
+
+ # Every path returns 1; no failure value is ever produced here.
+ return 1;
+}
+
+$skip=0; # records for which ReadBiAlign returned false
+$ccc=0; # records written to STDOUT
+while(!eof(DIR)){
+ # One record per pass, until the direct-alignment input is exhausted.
+ if (ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c))
+ {
+ $ccc++;
+ print "$c\n"; # count line
+ print $#a," $src \# @a[1..$#a]\n"; # last index of @a, sentence, then entries 1..last
+ print $#b," $tgt \# @b[1..$#b]\n";
+ }
+ else{
+ print "\n";
+ print STDERR "." if !(++$skip % 1000); # progress dot every 1000 skipped records
+ }
+};
+print STDERR "skip=<$skip> counts=<$ccc>\n";
diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py
new file mode 100755
index 0000000..626bc68
--- /dev/null
+++ b/mgizapp/scripts/merge_alignment.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# Author : Qin Gao
+# Date : Dec 31, 2007
+# Purpose: Combine multiple alignment files into a single one, the files are
+# produced by MGIZA, which has sentence IDs, and every file is
+# ordered inside
+
+import sys
+import re
+
+if len(sys.argv)<2: # NOTE(review): the message asks for 2 files, but only a call with no file at all is rejected
+ sys.stderr.write("Provide me the file names (at least 2)\n");
+ sys.exit();
+# ID the next merged sentence must carry; incremented before each lookup, so output starts at ID 1.
+sent_id = 0;
+# Parallel lists with one slot per input file:
+files = []; # open handles
+ids = []; # sentence ID of the record currently buffered for that file
+
+sents = []; # buffered record as a tuple of its three lines
+done = []; # True once the file has no complete record left
+
+for i in range(1,len(sys.argv)):
+ files.append(open(sys.argv[i],"r"));
+ ids.append(0);
+ sents.append("");
+ done.append(False);
+# The sentence ID is the first parenthesised number on a record's first line; buffer one record per file.
+r = re.compile("\\((\\d+)\\)");
+i = 0;
+while i< len(files):
+ st1 = files[i].readline();
+ st2 = files[i].readline();
+ st3 = files[i].readline();
+ if len(st1)==0 or len(st2)==0 or len(st3)==0:
+ done[i] = True;
+ else:
+ mt = r.search(st1);
+ id = int(mt.group(1));
+ ids[i] = id;
+ sents[i] = (st1, st2, st3);
+ i += 1
+# Emit records in increasing ID order, refilling from whichever file supplied the record just written.
+cont = True;
+while (cont):
+ sent_id += 1;
+ writeOne = False;
+# Now try to read more sentences
+ i = 0;
+ cont = False;
+ while i < len(files):
+ if done[i]:
+ i+=1
+ continue;
+ cont = True;
+ if ids[i] == sent_id:
+ sys.stdout.write("%s%s%s"%(sents[i][0],sents[i][1],sents[i][2]));
+ writeOne = True;
+ st1 = files[i].readline();
+ st2 = files[i].readline();
+ st3 = files[i].readline();
+ if len(st1)==0 or len(st2)==0 or len(st3)==0:
+ done[i] = True;
+ else:
+ mt = r.search(st1);
+ id = int(mt.group(1));
+ ids[i] = id;
+ sents[i] = (st1, st2, st3);
+ cont = True;
+ break;
+ elif ids[i] < sent_id: # a buffered ID lower than the expected one is reported as a duplicate
+ sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i]);
+ sys.exit();
+ else:
+ cont = True;
+ i+=1;
+ if (not writeOne) and cont: # no file held sent_id: reported, but merging continues
+ sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id);
+ #sys.exit();
+sys.stderr.write("Combined %d files, totally %d sents \n" %(len(files),sent_id-1));
diff --git a/mgizapp/scripts/plain2snt-hasvcb.py b/mgizapp/scripts/plain2snt-hasvcb.py
new file mode 100755
index 0000000..490c493
--- /dev/null
+++ b/mgizapp/scripts/plain2snt-hasvcb.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+
+from sys import *
+
+def loadvcb(fname,out):
+    """Copy vocabulary file `fname` line by line to `out` and return its {word: id} dict."""
+    vocab = {}
+    with open(fname, "r") as df:  # closed on exit; the original leaked the handle
+        for line in df:
+            out.write(line)
+            ws = line.strip().split()
+            if len(ws) >= 2:  # blank or one-field lines are copied but not indexed (was IndexError)
+                vocab[ws[1]] = int(ws[0])
+    return vocab
+
+if len(argv)<9:
+    stderr.write("Error, the input should be \n")
+    stderr.write("%s evcb fvcb etxt ftxt esnt(out) fsnt(out) evcbx(out) fvcbx(out)\n" % argv[0])
+    stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n")
+    exit(1)  # usage error: do not report success
+
+# Parallel input texts, read in lockstep one line at a time.
+ein = open(argv[3],"r")
+fin = open(argv[4],"r")
+
+# The two output snt files.
+eout = open(argv[5],"w")
+fout = open(argv[6],"w")
+
+# Output vocabularies: loadvcb copies the existing entries into them, and every
+# word missing from the existing vocabulary is appended below with count 1.
+evcbx = open(argv[7],"w")
+fvcbx = open(argv[8],"w")
+evcb = loadvcb(argv[1],evcbx)
+fvcb = loadvcb(argv[2],fvcbx)
+
+# Convert one sentence pair per iteration.
+while True:
+    eline = ein.readline()
+    fline = fin.readline()
+    if len(eline)==0 or len(fline)==0:
+        break  # stop as soon as either input is exhausted
+    el = []
+    fl = []
+    # Lookup order: exact word, then lower-cased word, else register a new entry.
+    # `in` replaces dict.has_key(), which does not exist in Python 3.
+    for w in eline.strip().split():
+        if w in evcb:
+            el.append(evcb[w])
+        elif w.lower() in evcb:
+            el.append(evcb[w.lower()])
+        else:
+            # NOTE(review): assumes existing ids are exactly 1..len(evcb); otherwise nid may collide - confirm
+            nid = len(evcb)+1
+            evcb[w.lower()] = nid
+            evcbx.write("%d %s 1\n" % (nid, w))
+            el.append(nid)
+
+    for w in fline.strip().split():
+        if w in fvcb:
+            fl.append(fvcb[w])
+        elif w.lower() in fvcb:
+            fl.append(fvcb[w.lower()])
+        else:
+            nid = len(fvcb)+1
+            fvcb[w.lower()] = nid
+            fvcbx.write("%d %s 1\n" % (nid, w))
+            fl.append(nid)
+
+    # Each pair is written as three lines: the count "1", then the two id
+    # sequences; eout lists e-ids then f-ids, fout lists f-ids then e-ids.
+    eout.write("1\n")
+    fout.write("1\n")
+    for I in el:
+        eout.write("%d " % I)
+    eout.write("\n")
+    for I in fl:
+        eout.write("%d " % I)
+        fout.write("%d " % I)
+    eout.write("\n")
+    fout.write("\n")
+    for I in el:
+        fout.write("%d " % I)
+    fout.write("\n")
+
+# Close the inputs as well (they were previously left open), then the outputs.
+ein.close()
+fin.close()
+fout.close()
+eout.close()
+fvcbx.close()
+evcbx.close()
+
diff --git a/mgizapp/scripts/sntpostproc.py b/mgizapp/scripts/sntpostproc.py
new file mode 100755
index 0000000..b3bf528
--- /dev/null
+++ b/mgizapp/scripts/sntpostproc.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+# This script post process the snt file -- either in single-line format or in multi-line format
+# The output, however, will always be in single-line format
+
+from sys import *
+from optparse import OptionParser
+import re;
+usage = """
+The script post process the snt file, the input could be single-line snt
+file or multi-line, (triple line) and can insert sentence weight to the
+file (-w) or add partial alignment to the file (-a)
+Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile
+"""
+# Build the parser once, with the usage text above. A second, bare OptionParser()
+# used to be created right after this one, which threw that text away.
+parser = OptionParser(usage=usage)
+
+
+parser.add_option("-s", "--snt", dest="snt",default=None,
+                  help="The input snt file", metavar="FILE")
+
+parser.add_option("-w", "--weight", dest="weight",default=None,
+                  help="The input weight file", metavar="FILE")
+
+
+parser.add_option("-o", "--output", dest="output",default="-",
+                  help="The output file, '-' (the default) writes to standard output", metavar="FILE")
+
+parser.add_option("-a", "--align", dest="align",default=None,
+                  help="The input partial alignment file, one sentence per line", metavar="FILE")
+
+
+(options, args) = parser.parse_args()
+
+if options.snt is None:
+    parser.print_help()
+    exit()
+else:
+    sfile = open(options.snt,"r")
+# "-" (the default) sends the result to standard output.
+if options.output=="-":
+    ofile = stdout
+else:
+    ofile = open(options.output,"w")
+# Optional inputs: each handle stays None when its option is absent.
+wfile = None
+# `is not None` replaces the `<>` operator, which is a syntax error in Python 3.
+if options.weight is not None:
+    wfile = open(options.weight,"r")
+
+afile = None
+if options.align is not None:
+    afile = open(options.align,"r")
+
+rr = re.compile("[\\|\\#\\*]")  # a line is split on any of the characters | # *
+wt = 0.0
+al = {}
+e = ""
+f = ""
+
+def parse_ax(line):
+    """Return {(i, j): 1} for every space-separated "i-j" token found in `line`."""
+    links = {}
+    for token in line.strip().split(" "):
+        pair = token.split("-")
+        # blank tokens and tokens without exactly one "-" are ignored
+        if len(token.strip())>0 and len(pair)==2:
+            links[tuple(pair)] = 1
+    return links
+
+
+
+
+
+
+while True: # one input record per pass; NOTE(review): indentation is flattened in this listing and is left untouched here
+ l = sfile.readline();
+ if len(l) == 0:
+ break;
+ lp = rr.split(l.strip());
+ if len(lp)>=3: # single-line record: weight, e, f and an optional alignment field
+ wt = float(lp[0]);
+ e = lp[1];
+ f = lp[2];
+ if len(lp) > 3:
+ al = parse_ax(lp[3]);
+ else:
+ al = {};
+ else: # otherwise the line is the weight line of a three-line record
+ wt = float(l);
+ e = sfile.readline().strip();
+ f = sfile.readline().strip();
+ al={}
+ if wfile is not None: # was `<>`, which Python 3 rejects
+ lw = wfile.readline().strip();
+ if len(lw)>0:
+ wt = float(lw);
+ else: # NOTE(review): which `if` this else pairs with cannot be recovered from the flattened layout - confirm
+ wt = 1;
+ if afile is not None: # was `<>`, which Python 3 rejects
+ la = afile.readline().strip();
+ if len(la)>0:
+ al1 = parse_ax(la);
+ for entry in al1.keys():
+ al[entry] = 1;
+ # Output is always the single-line form: weight | e | f, then " |" and the i-j links when there are any.
+ ofile.write("%g | %s | %s" % (wt, e, f));
+ if len(al)>0:
+ ofile.write(" |");
+
+ for entry in al.keys():
+ ofile.write(" %s-%s" % entry);
+ ofile.write("\n");
+
+
diff --git a/mgizapp/src/hmm.cpp b/mgizapp/src/hmm.cpp
index dd8cde5..ca48477 100644
--- a/mgizapp/src/hmm.cpp
+++ b/mgizapp/src/hmm.cpp
@@ -124,7 +124,7 @@ int hmm::em_with_tricks(int noIterations,bool dumpCount,
pair_no = 0;
it_st = time(NULL) ;
cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n';
- dump_files = (dumpFreq != 0) && ((it % dumpFreq) == 0) && !NODUMPS;
+ dump_files = (dumpFreq != 0) && ((it % dumpFreq) == 0 || it == noIterations) && !NODUMPS;
//dump_files = true;
number = "";
int n = it;
diff --git a/mgizapp/src/model1.cpp b/mgizapp/src/model1.cpp
index e649f8d..18fc22a 100644
--- a/mgizapp/src/model1.cpp
+++ b/mgizapp/src/model1.cpp
@@ -138,7 +138,7 @@ int model1::em_with_tricks(int noIterations, /*Perplexity& perp, sentenceHandler
pair_no = 0 ;
it_st = time(NULL);
cout << "-----------\n" << modelName << ": Iteration " << it << '\n';
- dump_files = (Model1_Dump_Freq != 0) && ((it % Model1_Dump_Freq) == 0) && !NODUMPS ;
+ dump_files = (Model1_Dump_Freq != 0) && ((it % Model1_Dump_Freq) == 0 || it == noIterations) && !NODUMPS ;
//dump_files = true;
number = "";
int n = it;