From cd574c22ba2556e80e8cfbc035ccdaa5ecec10a8 Mon Sep 17 00:00:00 2001 From: edwardgao Date: Sat, 23 Jan 2010 13:43:04 +0000 Subject: File execute privillege --- mgizapp/autogen.sh | 0 mgizapp/configure | 0 mgizapp/scripts/TrimBlanks.sh | 2 - mgizapp/scripts/force-align-moses.sh | 48 --------------- mgizapp/scripts/giza2bal.pl | 112 ----------------------------------- mgizapp/scripts/merge_alignment.py | 80 ------------------------- mgizapp/scripts/plain2snt-hasvcb.py | 93 ----------------------------- mgizapp/scripts/symal.sh | 15 ----- 8 files changed, 350 deletions(-) mode change 100644 => 100755 mgizapp/autogen.sh mode change 100644 => 100755 mgizapp/configure delete mode 100644 mgizapp/scripts/TrimBlanks.sh delete mode 100644 mgizapp/scripts/force-align-moses.sh delete mode 100644 mgizapp/scripts/giza2bal.pl delete mode 100644 mgizapp/scripts/merge_alignment.py delete mode 100644 mgizapp/scripts/plain2snt-hasvcb.py delete mode 100644 mgizapp/scripts/symal.sh (limited to 'mgizapp') diff --git a/mgizapp/autogen.sh b/mgizapp/autogen.sh old mode 100644 new mode 100755 diff --git a/mgizapp/configure b/mgizapp/configure old mode 100644 new mode 100755 diff --git a/mgizapp/scripts/TrimBlanks.sh b/mgizapp/scripts/TrimBlanks.sh deleted file mode 100644 index 78bd28b..0000000 --- a/mgizapp/scripts/TrimBlanks.sh +++ /dev/null @@ -1,2 +0,0 @@ -sed -e 's/^[ \t]*//' -e 's/[ \t][ \t]*/ /g' -e 's/[ \t]*$//' - diff --git a/mgizapp/scripts/force-align-moses.sh b/mgizapp/scripts/force-align-moses.sh deleted file mode 100644 index fd4cf12..0000000 --- a/mgizapp/scripts/force-align-moses.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -MGIZA=${QMT_HOME}/bin/mgiza - -if [ $# -lt 4 ]; then - echo "OK, this is simple, put me into your Moses training directory, link your source/target corpus" 1>&2 - echo "and run " $0 " PREFIX src_tag tgt_tag root-dir." 1>&2 - echo "and get force-aligned data: root-dir/giza.[src-tgt|tgt-src]/*.A3.final.* " 1>&2 - echo "make sure I can find PREFIX.src_tag-tgt_tag and PREFIX.tgt_tag-src_tag, and \${QMT_HOME} is set" 1>&2 - exit -fi - -PRE=$1 -SRC=$2 -TGT=$3 -ROOT=$4 - -mkdir -p $ROOT/giza.${SRC}-${TGT} -mkdir -p $ROOT/giza.${TGT}-${SRC} -mkdir -p $ROOT/corpus - -echo "Generating corpus file " 1>&2 - -${QMT_HOME}/scripts/plain2snt-hasvcb.py corpus/$SRC.vcb corpus/$TGT.vcb ${PRE}.${SRC} ${PRE}.${TGT} $ROOT/corpus/${TGT}-${SRC}.snt $ROOT/corpus/${SRC}-${TGT}.snt $ROOT/corpus/$SRC.vcb $ROOT/corpus/$TGT.vcb - -ln -sf $PWD/corpus/$SRC.vcb.classes $PWD/corpus/$TGT.vcb.classes $ROOT/corpus/ - -echo "Generating co-occurrence file " 1>&2 - -${QMT_HOME}/bin/snt2cooc $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc $ROOT/corpus/$SRC.vcb $ROOT/corpus/$TGT.vcb $ROOT/corpus/${TGT}-${SRC}.snt -${QMT_HOME}/bin//snt2cooc $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc $ROOT/corpus/$TGT.vcb $ROOT/corpus/$SRC.vcb $ROOT/corpus/${SRC}-${TGT}.snt - -echo "Running force alignment " 1>&2 - -$MGIZA giza.$TGT-$SRC/$TGT-$SRC.gizacfg -c $ROOT/corpus/$TGT-$SRC.snt -o $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC} \ --s $ROOT/corpus/$SRC.vcb -t $ROOT/corpus/$TGT.vcb -m1 0 -m2 0 -mh 0 -coocurrence $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc \ --restart 11 -previoust giza.$TGT-$SRC/$TGT-$SRC.t3.final \ --previousa giza.$TGT-$SRC/$TGT-$SRC.a3.final -previousd giza.$TGT-$SRC/$TGT-$SRC.d3.final \ --previousn giza.$TGT-$SRC/$TGT-$SRC.n3.final -previousd4 giza.$TGT-$SRC/$TGT-$SRC.d4.final \ --previousd42 giza.$TGT-$SRC/$TGT-$SRC.D4.final -m3 0 -m4 1 - -$MGIZA giza.$SRC-$TGT/$SRC-$TGT.gizacfg -c $ROOT/corpus/$SRC-$TGT.snt -o $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT} \ --s $ROOT/corpus/$TGT.vcb -t $ROOT/corpus/$SRC.vcb -m1 0 -m2 0 -mh 0 -coocurrence $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc \ --restart 11 -previoust giza.$SRC-$TGT/$SRC-$TGT.t3.final \ --previousa giza.$SRC-$TGT/$SRC-$TGT.a3.final -previousd giza.$SRC-$TGT/$SRC-$TGT.d3.final \ --previousn giza.$SRC-$TGT/$SRC-$TGT.n3.final -previousd4 giza.$SRC-$TGT/$SRC-$TGT.d4.final \ --previousd42 giza.$SRC-$TGT/$SRC-$TGT.D4.final -m3 0 -m4 1 - diff --git a/mgizapp/scripts/giza2bal.pl b/mgizapp/scripts/giza2bal.pl deleted file mode 100644 index fb134c0..0000000 --- a/mgizapp/scripts/giza2bal.pl +++ /dev/null @@ -1,112 +0,0 @@ -#! /usr/bin/perl - -# $Id: giza2bal.pl 1562 2008-02-19 20:48:14Z redpony $ -#Converts direct and inverted alignments into a more compact -#bi-alignment format. It optionally reads the counting file -#produced by giza containing the frequency of each traning sentence. - -#Copyright Marcello Federico, November 2004 - -($cnt,$dir,$inv)=(); - -while ($w=shift @ARGV){ - $dir=shift(@ARGV),next if $w eq "-d"; - $inv=shift(@ARGV),next if $w eq "-i"; - $cnt=shift(@ARGV),next if $w eq "-c"; -} - -my $lc = 0; - -if (!$dir || !inv){ - print "usage: giza2bal.pl [-c ] -d -i \n"; - print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n"; - exit(0); -} - -$|=1; - -open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n"; -open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $dir\n"; - -if ($cnt){ -open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $dir\n"; -} - - -sub ReadBiAlign{ - local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_; - local($dummy,$n); - - chop($c=<$fd0>); ## count - $dummy=<$fd0>; ## header - $dummy=<$fd0>; ## header - $c=1 if !$c; - - $dummy=<$fd1>; ## header - chop($s1=<$fd1>); - chop($t1=<$fd1>); - - $dummy=<$fd2>; ## header - chop($s2=<$fd2>); - chop($t2=<$fd2>); - - @a=@b=(); - $lc++; - - #get target statistics - $n=1; - $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//; - while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){ - grep($a[$_]=$n,split(/\s+/,$2)); - $n++; - } - - $m=1; - $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//; - while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){ - grep($b[$_]=$m,split(/\s+/,$2)); - $m++; - } - - $M=split(/\s+/,$s1); - $N=split(/\s+/,$s2); - - if ($m != ($M+1) || $n != ($N+1)) { - print STDERR "Sentence mismatch error! Line #$lc\n"; - $s1 = "ALIGN_ERR"; - $s2 = "ALIGN_ERR"; - @a=(); @b=(); - for ($j=1;$j<2;$j++){ $a[$j]=1; } - for ($i=1;$i<2;$i++){ $b[$i]=1; } - return 1; - } - - for ($j=1;$j<$m;$j++){ - $a[$j]=0 if !$a[$j]; - } - - for ($i=1;$i<$n;$i++){ - $b[$i]=0 if !$b[$i]; - } - - - return 1; -} - -$skip=0; -$ccc=0; -while(!eof(DIR)){ - - if (ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c)) - { - $ccc++; - print "$c\n"; - print $#a," $src \# @a[1..$#a]\n"; - print $#b," $tgt \# @b[1..$#b]\n"; - } - else{ - print "\n"; - print STDERR "." if !(++$skip % 1000); - } -}; -print STDERR "skip=<$skip> counts=<$ccc>\n"; diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py deleted file mode 100644 index 626bc68..0000000 --- a/mgizapp/scripts/merge_alignment.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python -# Author : Qin Gao -# Date : Dec 31, 2007 -# Purpose: Combine multiple alignment files into a single one, the files are -# prodcuced by MGIZA, which has sentence IDs, and every file is -# ordered inside - -import sys -import re - -if len(sys.argv)<2: - sys.stderr.write("Provide me the file names (at least 2)\n"); - sys.exit(); - -sent_id = 0; - -files = []; -ids = []; - -sents = []; -done = []; - -for i in range(1,len(sys.argv)): - files.append(open(sys.argv[i],"r")); - ids.append(0); - sents.append(""); - done.append(False); - -r = re.compile("\\((\\d+)\\)"); -i = 0; -while i< len(files): - st1 = files[i].readline(); - st2 = files[i].readline(); - st3 = files[i].readline(); - if len(st1)==0 or len(st2)==0 or len(st3)==0: - done[i] = True; - else: - mt = r.search(st1); - id = int(mt.group(1)); - ids[i] = id; - sents[i] = (st1, st2, st3); - i += 1 - -cont = True; -while (cont): - sent_id += 1; - writeOne = False; -# Now try to read more sentences - i = 0; - cont = False; - while i < len(files): - if done[i]: - i+=1 - continue; - cont = True; - if ids[i] == sent_id: - sys.stdout.write("%s%s%s"%(sents[i][0],sents[i][1],sents[i][2])); - writeOne = True; - st1 = files[i].readline(); - st2 = files[i].readline(); - st3 = files[i].readline(); - if len(st1)==0 or len(st2)==0 or len(st3)==0: - done[i] = True; - else: - mt = r.search(st1); - id = int(mt.group(1)); - ids[i] = id; - sents[i] = (st1, st2, st3); - cont = True; - break; - elif ids[i] < sent_id: - sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i]); - sys.exit(); - else: - cont = True; - i+=1; - if (not writeOne) and cont: - sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id); - #sys.exit(); -sys.stderr.write("Combined %d files, totally %d sents \n" %(len(files),sent_id-1)); diff --git a/mgizapp/scripts/plain2snt-hasvcb.py b/mgizapp/scripts/plain2snt-hasvcb.py deleted file mode 100644 index 490c493..0000000 --- a/mgizapp/scripts/plain2snt-hasvcb.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python - -from sys import * - -def loadvcb(fname,out): - dict={}; - df = open(fname,"r"); - for line in df: - out.write(line); - ws = line.strip().split(); - id = int(ws[0]); - wd = ws[1]; - dict[wd]=id; - return dict; - -if len(argv)<9: - stderr.write("Error, the input should be \n"); - stderr.write("%s evcb fvcb etxt ftxt esnt(out) fsnt(out) evcbx(out) fvcbx(out)\n" % argv[0]); - stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n"); - exit(); - -ein = open(argv[3],"r"); -fin = open(argv[4],"r"); - -eout = open(argv[5],"w"); -fout = open(argv[6],"w"); - -evcbx = open(argv[7],"w"); -fvcbx = open(argv[8],"w"); -evcb = loadvcb(argv[1],evcbx); -fvcb = loadvcb(argv[2],fvcbx); - -i=0 -while True: - i+=1; - eline=ein.readline(); - fline=fin.readline(); - if len(eline)==0 or len(fline)==0: - break; - ewords = eline.strip().split(); - fwords = fline.strip().split(); - el = []; - fl = []; - j=0; - for w in ewords: - j+=1 - if evcb.has_key(w): - el.append(evcb[w]); - else: - if evcb.has_key(w.lower()): - el.append(evcb[w.lower()]); - else: - ##stdout.write("#E %d %d %s\n" % (i,j,w)) - #el.append(1); - nid = len(evcb)+1; - evcb[w.lower()] = nid; - evcbx.write("%d %s 1\n" % (nid, w)); - el.append(nid); - - j=0; - for w in fwords: - j+=1 - if fvcb.has_key(w): - fl.append(fvcb[w]); - else: - if fvcb.has_key(w.lower()): - fl.append(fvcb[w.lower()]); - else: - #stdout.write("#F %d %d %s\n" % (i,j,w)) - nid = len(fvcb)+1; - fvcb[w.lower()] = nid; - fvcbx.write("%d %s 1\n" % (nid, w)); - fl.append(nid); - #fl.append(1); - eout.write("1\n"); - fout.write("1\n"); - for I in el: - eout.write("%d " % I); - eout.write("\n"); - for I in fl: - eout.write("%d " % I); - fout.write("%d " % I); - eout.write("\n"); - fout.write("\n"); - for I in el: - fout.write("%d " % I); - fout.write("\n"); - -fout.close(); -eout.close(); -fvcbx.close(); -evcbx.close(); - diff --git a/mgizapp/scripts/symal.sh b/mgizapp/scripts/symal.sh deleted file mode 100644 index a446beb..0000000 --- a/mgizapp/scripts/symal.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -OUTPUT=$1 -shift -GIZA2BAL=$1 -shift -SYMAL=$1 -shift -STOT=$1 -shift -TTOS=$1 -shift - -perl $GIZA2BAL -d ${STOT} -i ${TTOS} | $SYMAL $* > $OUTPUT - -- cgit v1.2.3