diff options
author | edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> | 2010-01-25 02:36:54 +0300 |
---|---|---|
committer | edwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d> | 2010-01-25 02:36:54 +0300 |
commit | 2be195bdfab57981e178f86f2dc0252f78e923a8 (patch) | |
tree | 7c21643c74834102e0ce24feded9b48b8c7baf68 /mgizapp | |
parent | cd574c22ba2556e80e8cfbc035ccdaa5ecec10a8 (diff) |
Update
Diffstat (limited to 'mgizapp')
-rwxr-xr-x | mgizapp/scripts/force-align-moses.sh | 48 | ||||
-rwxr-xr-x | mgizapp/scripts/giza2bal.pl | 112 | ||||
-rwxr-xr-x | mgizapp/scripts/merge_alignment.py | 80 | ||||
-rwxr-xr-x | mgizapp/scripts/plain2snt-hasvcb.py | 93 | ||||
-rwxr-xr-x | mgizapp/scripts/sntpostproc.py | 116 | ||||
-rw-r--r-- | mgizapp/src/hmm.cpp | 2 | ||||
-rw-r--r-- | mgizapp/src/model1.cpp | 2 |
7 files changed, 451 insertions, 2 deletions
diff --git a/mgizapp/scripts/force-align-moses.sh b/mgizapp/scripts/force-align-moses.sh new file mode 100755 index 0000000..fd4cf12 --- /dev/null +++ b/mgizapp/scripts/force-align-moses.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +MGIZA=${QMT_HOME}/bin/mgiza + +if [ $# -lt 4 ]; then + echo "OK, this is simple, put me into your Moses training directory, link your source/target corpus" 1>&2 + echo "and run " $0 " PREFIX src_tag tgt_tag root-dir." 1>&2 + echo "and get force-aligned data: root-dir/giza.[src-tgt|tgt-src]/*.A3.final.* " 1>&2 + echo "make sure I can find PREFIX.src_tag-tgt_tag and PREFIX.tgt_tag-src_tag, and \${QMT_HOME} is set" 1>&2 + exit +fi + +PRE=$1 +SRC=$2 +TGT=$3 +ROOT=$4 + +mkdir -p $ROOT/giza.${SRC}-${TGT} +mkdir -p $ROOT/giza.${TGT}-${SRC} +mkdir -p $ROOT/corpus + +echo "Generating corpus file " 1>&2 + +${QMT_HOME}/scripts/plain2snt-hasvcb.py corpus/$SRC.vcb corpus/$TGT.vcb ${PRE}.${SRC} ${PRE}.${TGT} $ROOT/corpus/${TGT}-${SRC}.snt $ROOT/corpus/${SRC}-${TGT}.snt $ROOT/corpus/$SRC.vcb $ROOT/corpus/$TGT.vcb + +ln -sf $PWD/corpus/$SRC.vcb.classes $PWD/corpus/$TGT.vcb.classes $ROOT/corpus/ + +echo "Generating co-occurrence file " 1>&2 + +${QMT_HOME}/bin/snt2cooc $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc $ROOT/corpus/$SRC.vcb $ROOT/corpus/$TGT.vcb $ROOT/corpus/${TGT}-${SRC}.snt +${QMT_HOME}/bin//snt2cooc $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc $ROOT/corpus/$TGT.vcb $ROOT/corpus/$SRC.vcb $ROOT/corpus/${SRC}-${TGT}.snt + +echo "Running force alignment " 1>&2 + +$MGIZA giza.$TGT-$SRC/$TGT-$SRC.gizacfg -c $ROOT/corpus/$TGT-$SRC.snt -o $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC} \ +-s $ROOT/corpus/$SRC.vcb -t $ROOT/corpus/$TGT.vcb -m1 0 -m2 0 -mh 0 -coocurrence $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc \ +-restart 11 -previoust giza.$TGT-$SRC/$TGT-$SRC.t3.final \ +-previousa giza.$TGT-$SRC/$TGT-$SRC.a3.final -previousd giza.$TGT-$SRC/$TGT-$SRC.d3.final \ +-previousn giza.$TGT-$SRC/$TGT-$SRC.n3.final -previousd4 giza.$TGT-$SRC/$TGT-$SRC.d4.final \ +-previousd42 giza.$TGT-$SRC/$TGT-$SRC.D4.final -m3 0 -m4 1 + +$MGIZA giza.$SRC-$TGT/$SRC-$TGT.gizacfg -c $ROOT/corpus/$SRC-$TGT.snt -o $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT} \ +-s $ROOT/corpus/$TGT.vcb -t $ROOT/corpus/$SRC.vcb -m1 0 -m2 0 -mh 0 -coocurrence $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc \ +-restart 11 -previoust giza.$SRC-$TGT/$SRC-$TGT.t3.final \ +-previousa giza.$SRC-$TGT/$SRC-$TGT.a3.final -previousd giza.$SRC-$TGT/$SRC-$TGT.d3.final \ +-previousn giza.$SRC-$TGT/$SRC-$TGT.n3.final -previousd4 giza.$SRC-$TGT/$SRC-$TGT.d4.final \ +-previousd42 giza.$SRC-$TGT/$SRC-$TGT.D4.final -m3 0 -m4 1 + diff --git a/mgizapp/scripts/giza2bal.pl b/mgizapp/scripts/giza2bal.pl new file mode 100755 index 0000000..fb134c0 --- /dev/null +++ b/mgizapp/scripts/giza2bal.pl @@ -0,0 +1,112 @@ +#! /usr/bin/perl + +# $Id: giza2bal.pl 1562 2008-02-19 20:48:14Z redpony $ +#Converts direct and inverted alignments into a more compact +#bi-alignment format. It optionally reads the counting file +#produced by giza containing the frequency of each traning sentence. + +#Copyright Marcello Federico, November 2004 + +($cnt,$dir,$inv)=(); + +while ($w=shift @ARGV){ + $dir=shift(@ARGV),next if $w eq "-d"; + $inv=shift(@ARGV),next if $w eq "-i"; + $cnt=shift(@ARGV),next if $w eq "-c"; +} + +my $lc = 0; + +if (!$dir || !inv){ + print "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n"; + print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n"; + exit(0); +} + +$|=1; + +open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n"; +open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $dir\n"; + +if ($cnt){ +open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $dir\n"; +} + + +sub ReadBiAlign{ + local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_; + local($dummy,$n); + + chop($c=<$fd0>); ## count + $dummy=<$fd0>; ## header + $dummy=<$fd0>; ## header + $c=1 if !$c; + + $dummy=<$fd1>; ## header + chop($s1=<$fd1>); + chop($t1=<$fd1>); + + $dummy=<$fd2>; ## header + chop($s2=<$fd2>); + chop($t2=<$fd2>); + + @a=@b=(); + $lc++; + + #get target statistics + $n=1; + $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//; + while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){ + grep($a[$_]=$n,split(/\s+/,$2)); + $n++; + } + + $m=1; + $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//; + while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){ + grep($b[$_]=$m,split(/\s+/,$2)); + $m++; + } + + $M=split(/\s+/,$s1); + $N=split(/\s+/,$s2); + + if ($m != ($M+1) || $n != ($N+1)) { + print STDERR "Sentence mismatch error! Line #$lc\n"; + $s1 = "ALIGN_ERR"; + $s2 = "ALIGN_ERR"; + @a=(); @b=(); + for ($j=1;$j<2;$j++){ $a[$j]=1; } + for ($i=1;$i<2;$i++){ $b[$i]=1; } + return 1; + } + + for ($j=1;$j<$m;$j++){ + $a[$j]=0 if !$a[$j]; + } + + for ($i=1;$i<$n;$i++){ + $b[$i]=0 if !$b[$i]; + } + + + return 1; +} + +$skip=0; +$ccc=0; +while(!eof(DIR)){ + + if (ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c)) + { + $ccc++; + print "$c\n"; + print $#a," $src \# @a[1..$#a]\n"; + print $#b," $tgt \# @b[1..$#b]\n"; + } + else{ + print "\n"; + print STDERR "." if !(++$skip % 1000); + } +}; +print STDERR "skip=<$skip> counts=<$ccc>\n"; diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py new file mode 100755 index 0000000..626bc68 --- /dev/null +++ b/mgizapp/scripts/merge_alignment.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python +# Author : Qin Gao +# Date : Dec 31, 2007 +# Purpose: Combine multiple alignment files into a single one, the files are +# prodcuced by MGIZA, which has sentence IDs, and every file is +# ordered inside + +import sys +import re + +if len(sys.argv)<2: + sys.stderr.write("Provide me the file names (at least 2)\n"); + sys.exit(); + +sent_id = 0; + +files = []; +ids = []; + +sents = []; +done = []; + +for i in range(1,len(sys.argv)): + files.append(open(sys.argv[i],"r")); + ids.append(0); + sents.append(""); + done.append(False); + +r = re.compile("\\((\\d+)\\)"); +i = 0; +while i< len(files): + st1 = files[i].readline(); + st2 = files[i].readline(); + st3 = files[i].readline(); + if len(st1)==0 or len(st2)==0 or len(st3)==0: + done[i] = True; + else: + mt = r.search(st1); + id = int(mt.group(1)); + ids[i] = id; + sents[i] = (st1, st2, st3); + i += 1 + +cont = True; +while (cont): + sent_id += 1; + writeOne = False; +# Now try to read more sentences + i = 0; + cont = False; + while i < len(files): + if done[i]: + i+=1 + continue; + cont = True; + if ids[i] == sent_id: + sys.stdout.write("%s%s%s"%(sents[i][0],sents[i][1],sents[i][2])); + writeOne = True; + st1 = files[i].readline(); + st2 = files[i].readline(); + st3 = files[i].readline(); + if len(st1)==0 or len(st2)==0 or len(st3)==0: + done[i] = True; + else: + mt = r.search(st1); + id = int(mt.group(1)); + ids[i] = id; + sents[i] = (st1, st2, st3); + cont = True; + break; + elif ids[i] < sent_id: + sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i]); + sys.exit(); + else: + cont = True; + i+=1; + if (not writeOne) and cont: + sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id); + #sys.exit(); +sys.stderr.write("Combined %d files, totally %d sents \n" %(len(files),sent_id-1)); diff --git a/mgizapp/scripts/plain2snt-hasvcb.py b/mgizapp/scripts/plain2snt-hasvcb.py new file mode 100755 index 0000000..490c493 --- /dev/null +++ b/mgizapp/scripts/plain2snt-hasvcb.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python + +from sys import * + +def loadvcb(fname,out): + dict={}; + df = open(fname,"r"); + for line in df: + out.write(line); + ws = line.strip().split(); + id = int(ws[0]); + wd = ws[1]; + dict[wd]=id; + return dict; + +if len(argv)<9: + stderr.write("Error, the input should be \n"); + stderr.write("%s evcb fvcb etxt ftxt esnt(out) fsnt(out) evcbx(out) fvcbx(out)\n" % argv[0]); + stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n"); + exit(); + +ein = open(argv[3],"r"); +fin = open(argv[4],"r"); + +eout = open(argv[5],"w"); +fout = open(argv[6],"w"); + +evcbx = open(argv[7],"w"); +fvcbx = open(argv[8],"w"); +evcb = loadvcb(argv[1],evcbx); +fvcb = loadvcb(argv[2],fvcbx); + +i=0 +while True: + i+=1; + eline=ein.readline(); + fline=fin.readline(); + if len(eline)==0 or len(fline)==0: + break; + ewords = eline.strip().split(); + fwords = fline.strip().split(); + el = []; + fl = []; + j=0; + for w in ewords: + j+=1 + if evcb.has_key(w): + el.append(evcb[w]); + else: + if evcb.has_key(w.lower()): + el.append(evcb[w.lower()]); + else: + ##stdout.write("#E %d %d %s\n" % (i,j,w)) + #el.append(1); + nid = len(evcb)+1; + evcb[w.lower()] = nid; + evcbx.write("%d %s 1\n" % (nid, w)); + el.append(nid); + + j=0; + for w in fwords: + j+=1 + if fvcb.has_key(w): + fl.append(fvcb[w]); + else: + if fvcb.has_key(w.lower()): + fl.append(fvcb[w.lower()]); + else: + #stdout.write("#F %d %d %s\n" % (i,j,w)) + nid = len(fvcb)+1; + fvcb[w.lower()] = nid; + fvcbx.write("%d %s 1\n" % (nid, w)); + fl.append(nid); + #fl.append(1); + eout.write("1\n"); + fout.write("1\n"); + for I in el: + eout.write("%d " % I); + eout.write("\n"); + for I in fl: + eout.write("%d " % I); + fout.write("%d " % I); + eout.write("\n"); + fout.write("\n"); + for I in el: + fout.write("%d " % I); + fout.write("\n"); + +fout.close(); +eout.close(); +fvcbx.close(); +evcbx.close(); + diff --git a/mgizapp/scripts/sntpostproc.py b/mgizapp/scripts/sntpostproc.py new file mode 100755 index 0000000..b3bf528 --- /dev/null +++ b/mgizapp/scripts/sntpostproc.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +# This script post process the snt file -- either in single-line format or in multi-line format +# The output, however, will always be in single-line format + +from sys import * +from optparse import OptionParser +import re; +usage = """ +The script post process the snt file, the input could be single-line snt +file or multi-line, (triple line) and can insert sentence weight to the +file (-w) or add partial alignment to the file (-a) +Usage %prog -s sntfile -w weight-file -a alignfile -o outputfile +""" +parser = OptionParser(usage=usage) + + +parser = OptionParser() + +parser.add_option("-s", "--snt", dest="snt",default=None, + help="The input snt file", metavar="FILE") + +parser.add_option("-w", "--weight", dest="weight",default=None, + help="The input weight file", metavar="FILE") + + +parser.add_option("-o", "--output", dest="output",default="-", + help="The input partial alignment file, one sentence per line", metavar="FILE") + +parser.add_option("-a", "--align", dest="align",default=None, + help="The input partial alignment file, one sentence per line", metavar="FILE") + + +(options, args) = parser.parse_args() + +if options.snt == None: + parser.print_help(); + exit(); +else: + sfile = open(options.snt,"r"); + +if options.output=="-": + ofile = stdout; +else: + ofile = open(options.output,"w"); + +wfile = None; + +if options.weight <> None: + wfile = open(options.weight,"r"); + +afile = None; +if options.align <> None: + afile = open(options.align,"r"); + +rr = re.compile("[\\|\\#\\*]"); +wt = 0.0; +al = {}; +e = ""; +f = ""; + +def parse_ax(line): + alq = {}; + als = line.strip().split(" "); + for e in als: + if len(e.strip())>0: + alo = e.split("-"); + if len(alo)==2: + alq[tuple(alo)] = 1; + return alq; + + + + + + +while True: + l = sfile.readline(); + if len(l) == 0: + break; + lp = rr.split(l.strip()); + if len(lp)>=3: + wt = float(lp[0]); + e = lp[1]; + f = lp[2]; + if len(lp) > 3: + al = parse_ax(lp[3]); + else: + al = {}; + else: + wt = float(l); + e = sfile.readline().strip(); + f = sfile.readline().strip(); + al={} + if wfile <> None: + lw = wfile.readline().strip(); + if len(lw)>0: + wt = float(lw); + else: + wt = 1; + if afile <> None: + la = afile.readline().strip(); + if len(la)>0: + al1 = parse_ax(la); + for entry in al1.keys(): + al[entry] = 1; + + ofile.write("%g | %s | %s" % (wt, e, f)); + if len(al)>0: + ofile.write(" |"); + + for entry in al.keys(): + ofile.write(" %s-%s" % entry); + ofile.write("\n"); + + diff --git a/mgizapp/src/hmm.cpp b/mgizapp/src/hmm.cpp index dd8cde5..ca48477 100644 --- a/mgizapp/src/hmm.cpp +++ b/mgizapp/src/hmm.cpp @@ -124,7 +124,7 @@ int hmm::em_with_tricks(int noIterations,bool dumpCount, pair_no = 0; it_st = time(NULL) ; cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n'; - dump_files = (dumpFreq != 0) && ((it % dumpFreq) == 0) && !NODUMPS; + dump_files = (dumpFreq != 0) && ((it % dumpFreq) == 0 || it == noIterations) && !NODUMPS; //dump_files = true; number = ""; int n = it; diff --git a/mgizapp/src/model1.cpp b/mgizapp/src/model1.cpp index e649f8d..18fc22a 100644 --- a/mgizapp/src/model1.cpp +++ b/mgizapp/src/model1.cpp @@ -138,7 +138,7 @@ int model1::em_with_tricks(int noIterations, /*Perplexity& perp, sentenceHandler pair_no = 0 ; it_st = time(NULL); cout << "-----------\n" << modelName << ": Iteration " << it << '\n'; - dump_files = (Model1_Dump_Freq != 0) && ((it % Model1_Dump_Freq) == 0) && !NODUMPS ; + dump_files = (Model1_Dump_Freq != 0) && ((it % Model1_Dump_Freq) == 0 || it == noIterations) && !NODUMPS ; //dump_files = true; number = ""; int n = it; |