Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoredwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d>2010-01-23 16:43:04 +0300
committeredwardgao <edwardgao@9a26d1b7-1c8f-445c-8fdd-6576f508279d>2010-01-23 16:43:04 +0300
commitcd574c22ba2556e80e8cfbc035ccdaa5ecec10a8 (patch)
tree6a3c863224a8522d0ce80fc462c1b1a720a42f21 /mgizapp
parente6691700cb8c925ef44942054c76141a682a5e22 (diff)
File execute privillege
Diffstat (limited to 'mgizapp')
-rwxr-xr-x[-rw-r--r--]mgizapp/autogen.sh0
-rwxr-xr-x[-rw-r--r--]mgizapp/configure0
-rw-r--r--mgizapp/scripts/TrimBlanks.sh2
-rw-r--r--mgizapp/scripts/force-align-moses.sh48
-rw-r--r--mgizapp/scripts/giza2bal.pl112
-rw-r--r--mgizapp/scripts/merge_alignment.py80
-rw-r--r--mgizapp/scripts/plain2snt-hasvcb.py93
-rw-r--r--mgizapp/scripts/symal.sh15
8 files changed, 0 insertions, 350 deletions
diff --git a/mgizapp/autogen.sh b/mgizapp/autogen.sh
index 9ab346a..9ab346a 100644..100755
--- a/mgizapp/autogen.sh
+++ b/mgizapp/autogen.sh
diff --git a/mgizapp/configure b/mgizapp/configure
index 2d5ca3d..2d5ca3d 100644..100755
--- a/mgizapp/configure
+++ b/mgizapp/configure
diff --git a/mgizapp/scripts/TrimBlanks.sh b/mgizapp/scripts/TrimBlanks.sh
deleted file mode 100644
index 78bd28b..0000000
--- a/mgizapp/scripts/TrimBlanks.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-sed -e 's/^[ \t]*//' -e 's/[ \t][ \t]*/ /g' -e 's/[ \t]*$//'
-
diff --git a/mgizapp/scripts/force-align-moses.sh b/mgizapp/scripts/force-align-moses.sh
deleted file mode 100644
index fd4cf12..0000000
--- a/mgizapp/scripts/force-align-moses.sh
+++ /dev/null
@@ -1,48 +0,0 @@
-#!/usr/bin/env bash
-
-MGIZA=${QMT_HOME}/bin/mgiza
-
-if [ $# -lt 4 ]; then
- echo "OK, this is simple, put me into your Moses training directory, link your source/target corpus" 1>&2
- echo "and run " $0 " PREFIX src_tag tgt_tag root-dir." 1>&2
- echo "and get force-aligned data: root-dir/giza.[src-tgt|tgt-src]/*.A3.final.* " 1>&2
- echo "make sure I can find PREFIX.src_tag-tgt_tag and PREFIX.tgt_tag-src_tag, and \${QMT_HOME} is set" 1>&2
- exit
-fi
-
-PRE=$1
-SRC=$2
-TGT=$3
-ROOT=$4
-
-mkdir -p $ROOT/giza.${SRC}-${TGT}
-mkdir -p $ROOT/giza.${TGT}-${SRC}
-mkdir -p $ROOT/corpus
-
-echo "Generating corpus file " 1>&2
-
-${QMT_HOME}/scripts/plain2snt-hasvcb.py corpus/$SRC.vcb corpus/$TGT.vcb ${PRE}.${SRC} ${PRE}.${TGT} $ROOT/corpus/${TGT}-${SRC}.snt $ROOT/corpus/${SRC}-${TGT}.snt $ROOT/corpus/$SRC.vcb $ROOT/corpus/$TGT.vcb
-
-ln -sf $PWD/corpus/$SRC.vcb.classes $PWD/corpus/$TGT.vcb.classes $ROOT/corpus/
-
-echo "Generating co-occurrence file " 1>&2
-
-${QMT_HOME}/bin/snt2cooc $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc $ROOT/corpus/$SRC.vcb $ROOT/corpus/$TGT.vcb $ROOT/corpus/${TGT}-${SRC}.snt
-${QMT_HOME}/bin//snt2cooc $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc $ROOT/corpus/$TGT.vcb $ROOT/corpus/$SRC.vcb $ROOT/corpus/${SRC}-${TGT}.snt
-
-echo "Running force alignment " 1>&2
-
-$MGIZA giza.$TGT-$SRC/$TGT-$SRC.gizacfg -c $ROOT/corpus/$TGT-$SRC.snt -o $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC} \
--s $ROOT/corpus/$SRC.vcb -t $ROOT/corpus/$TGT.vcb -m1 0 -m2 0 -mh 0 -coocurrence $ROOT/giza.${TGT}-${SRC}/$TGT-${SRC}.cooc \
--restart 11 -previoust giza.$TGT-$SRC/$TGT-$SRC.t3.final \
--previousa giza.$TGT-$SRC/$TGT-$SRC.a3.final -previousd giza.$TGT-$SRC/$TGT-$SRC.d3.final \
--previousn giza.$TGT-$SRC/$TGT-$SRC.n3.final -previousd4 giza.$TGT-$SRC/$TGT-$SRC.d4.final \
--previousd42 giza.$TGT-$SRC/$TGT-$SRC.D4.final -m3 0 -m4 1
-
-$MGIZA giza.$SRC-$TGT/$SRC-$TGT.gizacfg -c $ROOT/corpus/$SRC-$TGT.snt -o $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT} \
--s $ROOT/corpus/$TGT.vcb -t $ROOT/corpus/$SRC.vcb -m1 0 -m2 0 -mh 0 -coocurrence $ROOT/giza.${SRC}-${TGT}/$SRC-${TGT}.cooc \
--restart 11 -previoust giza.$SRC-$TGT/$SRC-$TGT.t3.final \
--previousa giza.$SRC-$TGT/$SRC-$TGT.a3.final -previousd giza.$SRC-$TGT/$SRC-$TGT.d3.final \
--previousn giza.$SRC-$TGT/$SRC-$TGT.n3.final -previousd4 giza.$SRC-$TGT/$SRC-$TGT.d4.final \
--previousd42 giza.$SRC-$TGT/$SRC-$TGT.D4.final -m3 0 -m4 1
-
diff --git a/mgizapp/scripts/giza2bal.pl b/mgizapp/scripts/giza2bal.pl
deleted file mode 100644
index fb134c0..0000000
--- a/mgizapp/scripts/giza2bal.pl
+++ /dev/null
@@ -1,112 +0,0 @@
-#! /usr/bin/perl
-
-# $Id: giza2bal.pl 1562 2008-02-19 20:48:14Z redpony $
-#Converts direct and inverted alignments into a more compact
-#bi-alignment format. It optionally reads the counting file
-#produced by giza containing the frequency of each traning sentence.
-
-#Copyright Marcello Federico, November 2004
-
-($cnt,$dir,$inv)=();
-
-while ($w=shift @ARGV){
- $dir=shift(@ARGV),next if $w eq "-d";
- $inv=shift(@ARGV),next if $w eq "-i";
- $cnt=shift(@ARGV),next if $w eq "-c";
-}
-
-my $lc = 0;
-
-if (!$dir || !inv){
- print "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n";
- print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
- exit(0);
-}
-
-$|=1;
-
-open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n";
-open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $dir\n";
-
-if ($cnt){
-open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $dir\n";
-}
-
-
-sub ReadBiAlign{
- local($fd0,$fd1,$fd2,*s1,*s2,*a,*b,*c)=@_;
- local($dummy,$n);
-
- chop($c=<$fd0>); ## count
- $dummy=<$fd0>; ## header
- $dummy=<$fd0>; ## header
- $c=1 if !$c;
-
- $dummy=<$fd1>; ## header
- chop($s1=<$fd1>);
- chop($t1=<$fd1>);
-
- $dummy=<$fd2>; ## header
- chop($s2=<$fd2>);
- chop($t2=<$fd2>);
-
- @a=@b=();
- $lc++;
-
- #get target statistics
- $n=1;
- $t1=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
- while ($t1=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
- grep($a[$_]=$n,split(/\s+/,$2));
- $n++;
- }
-
- $m=1;
- $t2=~s/NULL \(\{((\s+\d+)*)\s+\}\)//;
- while ($t2=~s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//){
- grep($b[$_]=$m,split(/\s+/,$2));
- $m++;
- }
-
- $M=split(/\s+/,$s1);
- $N=split(/\s+/,$s2);
-
- if ($m != ($M+1) || $n != ($N+1)) {
- print STDERR "Sentence mismatch error! Line #$lc\n";
- $s1 = "ALIGN_ERR";
- $s2 = "ALIGN_ERR";
- @a=(); @b=();
- for ($j=1;$j<2;$j++){ $a[$j]=1; }
- for ($i=1;$i<2;$i++){ $b[$i]=1; }
- return 1;
- }
-
- for ($j=1;$j<$m;$j++){
- $a[$j]=0 if !$a[$j];
- }
-
- for ($i=1;$i<$n;$i++){
- $b[$i]=0 if !$b[$i];
- }
-
-
- return 1;
-}
-
-$skip=0;
-$ccc=0;
-while(!eof(DIR)){
-
- if (ReadBiAlign(CNT,DIR,INV,*src,*tgt,*a,*b,*c))
- {
- $ccc++;
- print "$c\n";
- print $#a," $src \# @a[1..$#a]\n";
- print $#b," $tgt \# @b[1..$#b]\n";
- }
- else{
- print "\n";
- print STDERR "." if !(++$skip % 1000);
- }
-};
-print STDERR "skip=<$skip> counts=<$ccc>\n";
diff --git a/mgizapp/scripts/merge_alignment.py b/mgizapp/scripts/merge_alignment.py
deleted file mode 100644
index 626bc68..0000000
--- a/mgizapp/scripts/merge_alignment.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/env python
-# Author : Qin Gao
-# Date : Dec 31, 2007
-# Purpose: Combine multiple alignment files into a single one, the files are
-# prodcuced by MGIZA, which has sentence IDs, and every file is
-# ordered inside
-
-import sys
-import re
-
-if len(sys.argv)<2:
- sys.stderr.write("Provide me the file names (at least 2)\n");
- sys.exit();
-
-sent_id = 0;
-
-files = [];
-ids = [];
-
-sents = [];
-done = [];
-
-for i in range(1,len(sys.argv)):
- files.append(open(sys.argv[i],"r"));
- ids.append(0);
- sents.append("");
- done.append(False);
-
-r = re.compile("\\((\\d+)\\)");
-i = 0;
-while i< len(files):
- st1 = files[i].readline();
- st2 = files[i].readline();
- st3 = files[i].readline();
- if len(st1)==0 or len(st2)==0 or len(st3)==0:
- done[i] = True;
- else:
- mt = r.search(st1);
- id = int(mt.group(1));
- ids[i] = id;
- sents[i] = (st1, st2, st3);
- i += 1
-
-cont = True;
-while (cont):
- sent_id += 1;
- writeOne = False;
-# Now try to read more sentences
- i = 0;
- cont = False;
- while i < len(files):
- if done[i]:
- i+=1
- continue;
- cont = True;
- if ids[i] == sent_id:
- sys.stdout.write("%s%s%s"%(sents[i][0],sents[i][1],sents[i][2]));
- writeOne = True;
- st1 = files[i].readline();
- st2 = files[i].readline();
- st3 = files[i].readline();
- if len(st1)==0 or len(st2)==0 or len(st3)==0:
- done[i] = True;
- else:
- mt = r.search(st1);
- id = int(mt.group(1));
- ids[i] = id;
- sents[i] = (st1, st2, st3);
- cont = True;
- break;
- elif ids[i] < sent_id:
- sys.stderr.write("ERROR! DUPLICATED ENTRY %d\n" % ids[i]);
- sys.exit();
- else:
- cont = True;
- i+=1;
- if (not writeOne) and cont:
- sys.stderr.write("ERROR! MISSING ENTRy %d\n" % sent_id);
- #sys.exit();
-sys.stderr.write("Combined %d files, totally %d sents \n" %(len(files),sent_id-1));
diff --git a/mgizapp/scripts/plain2snt-hasvcb.py b/mgizapp/scripts/plain2snt-hasvcb.py
deleted file mode 100644
index 490c493..0000000
--- a/mgizapp/scripts/plain2snt-hasvcb.py
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env python
-
-from sys import *
-
-def loadvcb(fname,out):
- dict={};
- df = open(fname,"r");
- for line in df:
- out.write(line);
- ws = line.strip().split();
- id = int(ws[0]);
- wd = ws[1];
- dict[wd]=id;
- return dict;
-
-if len(argv)<9:
- stderr.write("Error, the input should be \n");
- stderr.write("%s evcb fvcb etxt ftxt esnt(out) fsnt(out) evcbx(out) fvcbx(out)\n" % argv[0]);
- stderr.write("You should concatenate the evcbx and fvcbx to existing vcb files\n");
- exit();
-
-ein = open(argv[3],"r");
-fin = open(argv[4],"r");
-
-eout = open(argv[5],"w");
-fout = open(argv[6],"w");
-
-evcbx = open(argv[7],"w");
-fvcbx = open(argv[8],"w");
-evcb = loadvcb(argv[1],evcbx);
-fvcb = loadvcb(argv[2],fvcbx);
-
-i=0
-while True:
- i+=1;
- eline=ein.readline();
- fline=fin.readline();
- if len(eline)==0 or len(fline)==0:
- break;
- ewords = eline.strip().split();
- fwords = fline.strip().split();
- el = [];
- fl = [];
- j=0;
- for w in ewords:
- j+=1
- if evcb.has_key(w):
- el.append(evcb[w]);
- else:
- if evcb.has_key(w.lower()):
- el.append(evcb[w.lower()]);
- else:
- ##stdout.write("#E %d %d %s\n" % (i,j,w))
- #el.append(1);
- nid = len(evcb)+1;
- evcb[w.lower()] = nid;
- evcbx.write("%d %s 1\n" % (nid, w));
- el.append(nid);
-
- j=0;
- for w in fwords:
- j+=1
- if fvcb.has_key(w):
- fl.append(fvcb[w]);
- else:
- if fvcb.has_key(w.lower()):
- fl.append(fvcb[w.lower()]);
- else:
- #stdout.write("#F %d %d %s\n" % (i,j,w))
- nid = len(fvcb)+1;
- fvcb[w.lower()] = nid;
- fvcbx.write("%d %s 1\n" % (nid, w));
- fl.append(nid);
- #fl.append(1);
- eout.write("1\n");
- fout.write("1\n");
- for I in el:
- eout.write("%d " % I);
- eout.write("\n");
- for I in fl:
- eout.write("%d " % I);
- fout.write("%d " % I);
- eout.write("\n");
- fout.write("\n");
- for I in el:
- fout.write("%d " % I);
- fout.write("\n");
-
-fout.close();
-eout.close();
-fvcbx.close();
-evcbx.close();
-
diff --git a/mgizapp/scripts/symal.sh b/mgizapp/scripts/symal.sh
deleted file mode 100644
index a446beb..0000000
--- a/mgizapp/scripts/symal.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env bash
-
-OUTPUT=$1
-shift
-GIZA2BAL=$1
-shift
-SYMAL=$1
-shift
-STOT=$1
-shift
-TTOS=$1
-shift
-
-perl $GIZA2BAL -d ${STOT} -i ${TTOS} | $SYMAL $* > $OUTPUT
-