Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'mgizapp/src/symal.cpp')
-rw-r--r--mgizapp/src/symal.cpp501
1 files changed, 501 insertions, 0 deletions
diff --git a/mgizapp/src/symal.cpp b/mgizapp/src/symal.cpp
new file mode 100644
index 0000000..5767859
--- /dev/null
+++ b/mgizapp/src/symal.cpp
@@ -0,0 +1,501 @@
+// $Id: symal.cpp 1905 2008-10-16 21:14:38Z phkoehn $
+
+#include <cassert>
+#include <iomanip>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <list>
+#include <vector>
+#include <set>
+#include <algorithm>
+#include <cstring>
+#include "cmd.h"
+
+using namespace std;
+
+#define MAX_WORD 1000 //maximum lengthsource/target strings
+#define MAX_M 200 //maximum length of source strings
+#define MAX_N 200 //maximum length of target strings
+
+#define UNION 1
+#define INTERSECT 2
+#define GROW 3
+#define SRCTOTGT 4
+#define TGTTOSRC 5
+#define BOOL_YES 1
+#define BOOL_NO 0
+
+#define END_ENUM { (char*)0, 0 }
+
+static Enum_T AlignEnum [] = {
+{ "union", UNION },
+{ "u", UNION },
+{ "intersect", INTERSECT},
+{ "i", INTERSECT},
+{ "grow", GROW },
+{ "g", GROW },
+{ "srctotgt", SRCTOTGT },
+{ "s2t", SRCTOTGT },
+{ "tgttosrc", TGTTOSRC },
+{ "t2s", TGTTOSRC },
+ END_ENUM
+};
+
+static Enum_T BoolEnum [] = {
+ { "true", BOOL_YES },
+ { "yes", BOOL_YES },
+ { "y", BOOL_YES },
+ { "false", BOOL_NO },
+ { "no", BOOL_NO },
+ { "n", BOOL_NO },
+ END_ENUM
+};
+
+
+
+// global variables and constants
+
+int* fa; //counters of covered foreign positions
+int* ea; //counters of covered english positions
+int** A; //alignment matrix with information symmetric/direct/inverse alignments
+
+int verbose=0;
+
+//read an alignment pair from the input stream.
+
+int lc = 0;
+
+int getals(fstream& inp,int& m, int *a,int& n, int *b,ostream& out)
+{
+ char w[MAX_WORD], dummy[10];
+ string tgtsent;
+ int i,j,freq;
+ if (inp >> freq){
+ ++lc;
+ //target sentence
+ inp >> n; assert(n<MAX_N);
+ for (i=1;i<=n;i++){
+ inp >> setw(MAX_WORD) >> w;
+ if (strlen(w)>=MAX_WORD-1) {
+ cerr << lc << ": target len=" << strlen(w) << " is not less than MAX_WORD-1="
+ << MAX_WORD-1 << endl;
+ assert(strlen(w)<MAX_WORD-1);
+ }
+ tgtsent+=w;
+ tgtsent+=" ";
+ }
+
+ inp >> dummy; //# separator
+ // inverse alignment
+ for (i=1;i<=n;i++) inp >> b[i];
+
+ //source sentence
+ inp >> m; assert(m<MAX_M);
+ for (j=1;j<=m;j++){
+ inp >> setw(MAX_WORD) >> w;
+ if (strlen(w)>=MAX_WORD-1) {
+ cerr << lc << ": source len=" << strlen(w) << " is not less than MAX_WORD-1="
+ << MAX_WORD-1 << endl;
+ assert(strlen(w)<MAX_WORD-1);
+ }
+ out << w << " ";
+ }
+ out << "{##} " << tgtsent << "{##} ";
+
+
+ inp >> dummy; //# separator
+
+ // direct alignment
+ for (j=1;j<=m;j++) {
+ inp >> a[j];
+ assert(0<=a[j] && a[j]<=n);
+ }
+
+ //check inverse alignemnt
+ for (i=1;i<=n;i++)
+ assert(0<=b[i] && b[i]<=m);
+
+ return 1;
+
+ }
+ else
+ return 0;
+};
+
+
+//compute union alignment
+int prunionalignment(fstream& out,int m,int *a,int n,int* b){
+
+ ostringstream sout;
+
+ for (int j=1;j<=m;j++)
+ if (a[j])
+ sout << j-1 << "-" << a[j]-1 << " ";
+
+ for (int i=1;i<=n;i++)
+ if (b[i] && a[b[i]]!=i)
+ sout << b[i]-1 << "-" << i-1 << " ";
+
+ //fix the last " "
+ string str = sout.str();
+ if (str.length() == 0)
+ str = "\n";
+ else
+ str.replace(str.length()-1,1,"\n");
+
+ out << str;
+ out.flush();
+
+ return 1;
+}
+
+
+//Compute intersection alignment
+
+int printersect(fstream& out,int m,int *a,int n,int* b){
+
+ ostringstream sout;
+
+ for (int j=1;j<=m;j++)
+ if (a[j] && b[a[j]]==j)
+ sout << j-1 << "-" << a[j]-1 << " ";
+
+ //fix the last " "
+ string str = sout.str();
+ if (str.length() == 0)
+ str = "\n";
+ else
+ str.replace(str.length()-1,1,"\n");
+
+ out << str;
+ out.flush();
+
+ return 1;
+}
+
+//Compute target-to-source alignment
+
+int printtgttosrc(fstream& out,int m,int *a,int n,int* b){
+
+ ostringstream sout;
+
+ for (int i=1;i<=n;i++)
+ if (b[i])
+ sout << b[i]-1 << "-" << i-1 << " ";
+
+ //fix the last " "
+ string str = sout.str();
+ if (str.length() == 0)
+ str = "\n";
+ else
+ str.replace(str.length()-1,1,"\n");
+
+ out << str;
+ out.flush();
+
+ return 1;
+}
+
+//Compute source-to-target alignment
+
+int printsrctotgt(fstream& out,int m,int *a,int n,int* b){
+
+ ostringstream sout;
+
+ for (int j=1;j<=m;j++)
+ if (a[j])
+ sout << j-1 << "-" << a[j]-1 << " ";
+
+ //fix the last " "
+ string str = sout.str();
+ if (str.length() == 0)
+ str = "\n";
+ else
+ str.replace(str.length()-1,1,"\n");
+
+ out << str;
+ out.flush();
+
+ return 1;
+}
+
+//Compute Grow Diagonal Alignment
+//Nice property: you will never introduce more points
+//than the unionalignment alignemt. Hence, you will always be able
+//to represent the grow alignment as the unionalignment of a
+//directed and inverted alignment
+
+int printgrow(fstream& out,int m,int *a,int n,int* b, bool diagonal=false,bool final=false,bool bothuncovered=false){
+
+ ostringstream sout;
+
+ vector <pair <int,int> > neighbors; //neighbors
+
+ pair <int,int> entry;
+
+ neighbors.push_back(make_pair(-1,-0));
+ neighbors.push_back(make_pair(0,-1));
+ neighbors.push_back(make_pair(1,0));
+ neighbors.push_back(make_pair(0,1));
+
+
+ if (diagonal){
+ neighbors.push_back(make_pair(-1,-1));
+ neighbors.push_back(make_pair(-1,1));
+ neighbors.push_back(make_pair(1,-1));
+ neighbors.push_back(make_pair(1,1));
+ }
+
+
+ int i,j,o;
+
+
+ //covered foreign and english positions
+
+ memset(fa,0,(m+1)*sizeof(int));
+ memset(ea,0,(n+1)*sizeof(int));
+
+ //matrix to quickly check if one point is in the symmetric
+ //alignment (value=2), direct alignment (=1) and inverse alignment
+
+ for (int i=1;i<=n;i++) memset(A[i],0,(m+1)*sizeof(int));
+
+ set <pair <int,int> > currentpoints; //symmetric alignment
+ set <pair <int,int> > unionalignment; //union alignment
+
+ pair <int,int> point; //variable to store points
+ set<pair <int,int> >::const_iterator k; //iterator over sets
+
+ //fill in the alignments
+ for (j=1;j<=m;j++){
+ if (a[j]){
+ unionalignment.insert(make_pair(a[j],j));
+ if (b[a[j]]==j){
+ fa[j]=1;ea[a[j]]=1;
+ A[a[j]][j]=2;
+ currentpoints.insert(make_pair(a[j],j));
+ }
+ else
+ A[a[j]][j]=-1;
+ }
+ }
+
+ for (i=1;i<=n;i++)
+ if (b[i] && a[b[i]]!=i){ //not intersection
+ unionalignment.insert(make_pair(i,b[i]));
+ A[i][b[i]]=1;
+ }
+
+
+ int added=1;
+
+ while (added){
+ added=0;
+ ///scan the current alignment
+ for (k=currentpoints.begin();k!=currentpoints.end();k++){
+ //cout << "{"<< (k->second)-1 << "-" << (k->first)-1 << "}";
+ for (o=0;o<neighbors.size();o++){
+ //cout << "go over check all neighbors\n";
+ point.first=k->first+neighbors[o].first;
+ point.second=k->second+neighbors[o].second;
+ //cout << point.second-1 << " " << point.first-1 << "\n";
+ //check if neighbor is inside 'matrix'
+ if (point.first>0 && point.first <=n && point.second>0 && point.second<=m)
+ //check if neighbor is in the unionalignment alignment
+ if (b[point.first]==point.second || a[point.second]==point.first){
+ //cout << "In unionalignment ";cout.flush();
+ //check if it connects at least one uncovered word
+ if (!(ea[point.first] && fa[point.second]))
+ {
+ //insert point in currentpoints!
+ currentpoints.insert(point);
+ A[point.first][point.second]=2;
+ ea[point.first]=1; fa[point.second]=1;
+ added=1;
+ //cout << "added grow: " << point.second-1 << "-" << point.first-1 << "\n";cout.flush();
+ }
+ }
+ }
+ }
+ }
+
+ if (final){
+ for (k=unionalignment.begin();k!=unionalignment.end();k++)
+ if (A[k->first][k->second]==1)
+ {
+ point.first=k->first;point.second=k->second;
+ //one of the two words is not covered yet
+ //cout << "{" << point.second-1 << "-" << point.first-1 << "} ";
+ if ((bothuncovered && !ea[point.first] && !fa[point.second]) ||
+ (!bothuncovered && !(ea[point.first] && fa[point.second])))
+ {
+ //add it!
+ currentpoints.insert(point);
+ A[point.first][point.second]=2;
+ //keep track of new covered positions
+ ea[point.first]=1;fa[point.second]=1;
+
+ //added=1;
+ //cout << "added final: " << point.second-1 << "-" << point.first-1 << "\n";
+ }
+ }
+
+ for (k=unionalignment.begin();k!=unionalignment.end();k++)
+ if (A[k->first][k->second]==-1)
+ {
+ point.first=k->first;point.second=k->second;
+ //one of the two words is not covered yet
+ //cout << "{" << point.second-1 << "-" << point.first-1 << "} ";
+ if ((bothuncovered && !ea[point.first] && !fa[point.second]) ||
+ (!bothuncovered && !(ea[point.first] && fa[point.second])))
+ {
+ //add it!
+ currentpoints.insert(point);
+ A[point.first][point.second]=2;
+ //keep track of new covered positions
+ ea[point.first]=1;fa[point.second]=1;
+
+ //added=1;
+ //cout << "added final: " << point.second-1 << "-" << point.first-1 << "\n";
+ }
+ }
+ }
+
+
+ for (k=currentpoints.begin();k!=currentpoints.end();k++)
+ sout << k->second-1 << "-" << k->first-1 << " ";
+
+
+ //fix the last " "
+ string str = sout.str();
+ if (str.length() == 0)
+ str = "\n";
+ else
+ str.replace(str.length()-1,1,"\n");
+
+ out << str;
+ out.flush();
+ return 1;
+
+ return 1;
+}
+
+
+
+//Main file here
+
+
+int main(int argc, char** argv){
+
+int alignment=0;
+char* input="/dev/stdin";
+char* output="/dev/stdout";
+int diagonal=false;
+int final=false;
+int bothuncovered=false;
+
+
+ DeclareParams("a", CMDENUMTYPE, &alignment, AlignEnum,
+ "alignment", CMDENUMTYPE, &alignment, AlignEnum,
+ "d", CMDENUMTYPE, &diagonal, BoolEnum,
+ "diagonal", CMDENUMTYPE, &diagonal, BoolEnum,
+ "f", CMDENUMTYPE, &final, BoolEnum,
+ "final", CMDENUMTYPE, &final, BoolEnum,
+ "b", CMDENUMTYPE, &bothuncovered, BoolEnum,
+ "both", CMDENUMTYPE, &bothuncovered, BoolEnum,
+ "i", CMDSTRINGTYPE, &input,
+ "o", CMDSTRINGTYPE, &output,
+ "v", CMDENUMTYPE, &verbose, BoolEnum,
+ "verbose", CMDENUMTYPE, &verbose, BoolEnum,
+
+ (char *)NULL);
+
+ GetParams(&argc, &argv, (char*) NULL);
+
+ if (alignment==0){
+ cerr << "usage: symal [-i=<inputfile>] [-o=<outputfile>] -a=[u|i|g] -d=[yes|no] -b=[yes|no] -f=[yes|no] \n"
+ << "Input file or std must be in .bal format (see script giza2bal.pl).\n";
+
+ exit(1);
+
+ }
+
+ fstream inp(input,ios::in);
+ fstream out(output,ios::out);
+
+ if (!inp.is_open()){
+ cerr << "cannot open " << input << "\n";
+ exit(1);
+ }
+
+ if (!out.is_open()){
+ cerr << "cannot open " << output << "\n";
+ exit(1);
+ }
+
+
+ int a[MAX_M],b[MAX_N],m,n;
+ fa=new int[MAX_M+1];
+ ea=new int[MAX_N+1];
+
+
+ int sents = 0;
+ A=new int *[MAX_N+1];
+ for (int i=1;i<=MAX_N;i++) A[i]=new int[MAX_M+1];
+
+ switch (alignment){
+ case UNION:
+ cerr << "symal: computing union alignment\n";
+ while(getals(inp,m,a,n,b,out)) {
+ prunionalignment(out,m,a,n,b);
+ sents++;
+ }
+ cerr << "Sents: " << sents << endl;
+ break;
+ case INTERSECT:
+ cerr << "symal: computing intersect alignment\n";
+ while(getals(inp,m,a,n,b,out)) {
+ printersect(out,m,a,n,b);
+ sents++;
+ }
+ cerr << "Sents: " << sents << endl;
+ break;
+ case GROW:
+ cerr << "symal: computing grow alignment: diagonal ("
+ << diagonal << ") final ("<< final << ")"
+ << "both-uncovered (" << bothuncovered <<")\n";
+
+ while(getals(inp,m,a,n,b,out))
+ printgrow(out,m,a,n,b,diagonal,final,bothuncovered);
+
+ break;
+ case TGTTOSRC:
+ cerr << "symal: computing target-to-source alignment\n";
+
+ while(getals(inp,m,a,n,b,out)){
+ printtgttosrc(out,m,a,n,b);
+ sents++;
+ }
+ cerr << "Sents: " << sents << endl;
+ break;
+ case SRCTOTGT:
+ cerr << "symal: computing source-to-target alignment\n";
+
+ while(getals(inp,m,a,n,b,out)){
+ printsrctotgt(out,m,a,n,b);
+ sents++;
+ }
+ cerr << "Sents: " << sents << endl;
+ break;
+ default:
+ exit(1);
+ }
+
+ delete [] fa; delete [] ea;
+ for (int i=1;i<=MAX_N;i++) delete [] A[i];
+ delete [] A;
+
+ exit(0);
+}