Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230>2006-08-14 19:03:49 +0400
committermfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230>2006-08-14 19:03:49 +0400
commite72010d6ceb1a4b6ff2bbc7e4c41041a9b2c0d18 (patch)
treef1cc16990a3482d9e483a44e28be43dfb53dbbb6 /scripts
parent032e0688ced293b916a817cb294a40b9eeb07f23 (diff)
A tool to compute symmetric alignments from GIZA++ alignments.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@716 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rw-r--r--scripts/training/symal/Makefile13
-rw-r--r--scripts/training/symal/cmd.c642
-rw-r--r--scripts/training/symal/cmd.h49
-rw-r--r--scripts/training/symal/symal.cpp394
4 files changed, 1098 insertions, 0 deletions
diff --git a/scripts/training/symal/Makefile b/scripts/training/symal/Makefile
new file mode 100644
index 000000000..c7ced672f
--- /dev/null
+++ b/scripts/training/symal/Makefile
@@ -0,0 +1,13 @@
+# Build rules for symal, the tool that symmetrizes GIZA++ alignments.
+# Recipe lines must begin with a TAB character.
+CPP=g++ -g
+CC=gcc -g
+
+# Neither target produces a file of that name.
+.PHONY: all clean
+
+all: symal
+
+# Remove the object files and the linked program.
+clean:
+	rm -f *.o symal
+
+cmd.o: cmd.c cmd.h
+	$(CC) -c -o cmd.o cmd.c
+
+# symal.cpp includes cmd.h, so the header is a prerequisite as well.
+symal: symal.cpp cmd.o cmd.h
+	$(CPP) -o $@ symal.cpp cmd.o
diff --git a/scripts/training/symal/cmd.c b/scripts/training/symal/cmd.c
new file mode 100644
index 000000000..ee607a254
--- /dev/null
+++ b/scripts/training/symal/cmd.c
@@ -0,0 +1,642 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+
+#include "cmd.h"
+/* Value table used for CMDBOOLTYPE parameters: DeclareParams() turns such a
+ * parameter into a CMDENUMTYPE one bound to this table, so the strings
+ * "FALSE" and "TRUE" are stored as 0 and 1. */
+static Enum_T BoolEnum[] = {
+ { "FALSE", 0 },
+ { "TRUE", 1 },
+ { 0, 0 } /* an entry with a null Name ends the table */
+};
+/* File-scope constants, forward declarations and state used by the functions below. */
+#ifdef NEEDSTRDUP
+char *strdup();
+#endif
+
+#define FALSE 0
+#define TRUE 1
+
+#define LINSIZ 10240 /* size in bytes of the line buffer GetParams() allocates */
+#define MAXPARAM 256 /* capacity of cmds[], not counting the terminating entry */
+
+static char *GetLine(),
+ **str2array();
+static int Scan(),
+ SetParam(),
+ SetEnum(),
+ SetSubrange(),
+ SetStrArray(),
+ SetGte(),
+ SetLte(),
+ CmdError(),
+ EnumError(),
+ SubrangeError(),
+ GteError(),
+ LteError(),
+ PrintParam(),
+ PrintEnum(),
+ PrintStrArray();
+
+static Cmd_T cmds[MAXPARAM+1]; /* declared parameters in strcmp() order of Name; the entry after the last one has a NULL Name */
+static char *SepString = " \t\n"; /* token separators for Scan() and, when a parameter declares none, for str2array() */
+
+/*
+ * DeclareParams - register the parameters the program understands.
+ *
+ * The arguments are a sequence of groups ended by a null name:
+ *     name, type, pointer to the value [, type-specific extras]
+ * Extras: an Enum_T table for CMDENUMTYPE, the two int limits for
+ * CMDSUBRANGETYPE, one int limit for CMDGTETYPE and CMDLTETYPE, and a
+ * separator string (or a null pointer for the default separators) for
+ * CMDSTRARRAYTYPE. CMDBOOLTYPE is stored as CMDENUMTYPE bound to BoolEnum.
+ *
+ * Entries are inserted into cmds[] in strcmp() order of their names; the
+ * lookup in Scan() stops at the first name that does not compare lower.
+ * The entry count is static, so the function may be called repeatedly.
+ * A name declared twice only produces a warning and a full table stops the
+ * registration with a message; an unknown type or a failed allocation
+ * terminates the program.
+ *
+ * Returns 0.
+ */
+#if defined(__STDC__)
+#include <stdarg.h>
+int DeclareParams(char *ParName, ...)
+#else
+#include <varargs.h>
+int DeclareParams(ParName, va_alist)
+char *ParName;
+va_dcl
+#endif
+{
+    va_list args;
+    static int ParamN = 0;      /* number of entries already in cmds[] */
+    int j,
+        c;
+    int *limits;
+    char *s;
+
+#if defined(__STDC__)
+    va_start(args, ParName);
+#else
+    va_start(args);
+#endif
+    for(;ParName;) {
+        if(ParamN==MAXPARAM) {
+            fprintf(stderr, "Too many parameters !!\n");
+            break;
+        }
+        /* Find the slot that keeps the table sorted; c ends up 0 only
+           when an entry with the same name is already present. */
+        for(j=0,c=1; j<ParamN&&(c=strcmp(cmds[j].Name,ParName))<0; j++)
+            ;
+        if(!c) {
+            fprintf(stderr,
+                    "Warning: parameter \"%s\" declared twice.\n",
+                    ParName);
+        }
+        /* Open the slot by moving the tail one position up. */
+        for(c=ParamN; c>j; c--) {
+            cmds[c] = cmds[c-1];
+        }
+        cmds[j].Name = ParName;
+        cmds[j].Type = va_arg(args, int);
+        cmds[j].Val = va_arg(args, void *);
+        /* The slot still holds a copy of the entry that was moved away:
+           clear the fields that are not assigned for every type. */
+        cmds[j].ArgStr = NULL;
+        cmds[j].p = NULL;
+        switch(cmds[j].Type) {
+        case CMDENUMTYPE:       /* get the pointer to Enum_T struct */
+            cmds[j].p = va_arg(args, void *);
+            break;
+        case CMDSUBRANGETYPE:   /* get the two extremes */
+            limits = (int*) calloc(2, sizeof(int));
+            if(!limits) {
+                fprintf(stderr, "DeclareParams(): out of memory\n");
+                exit(1);
+            }
+            limits[0] = va_arg(args, int);
+            limits[1] = va_arg(args, int);
+            cmds[j].p = limits;
+            break;
+        case CMDGTETYPE:        /* get lower or upper bound */
+        case CMDLTETYPE:
+            limits = (int*) calloc(1, sizeof(int));
+            if(!limits) {
+                fprintf(stderr, "DeclareParams(): out of memory\n");
+                exit(1);
+            }
+            limits[0] = va_arg(args, int);
+            cmds[j].p = limits;
+            break;
+        case CMDSTRARRAYTYPE:   /* get the separators string */
+            s = va_arg(args, char*);
+            if(s) {
+                cmds[j].p = (void*)strdup(s);
+                if(!cmds[j].p) {
+                    fprintf(stderr, "DeclareParams(): out of memory\n");
+                    exit(1);
+                }
+            }
+            break;
+        case CMDBOOLTYPE:       /* a boolean is an enum over BoolEnum */
+            cmds[j].Type = CMDENUMTYPE;
+            cmds[j].p = BoolEnum;
+            break;
+        case CMDDOUBLETYPE:     /* nothing else is needed */
+        case CMDINTTYPE:
+        case CMDSTRINGTYPE:
+            break;
+        default:
+            fprintf(stderr, "%s: %s %d %s \"%s\"\n",
+                    "DeclareParam()", "Unknown Type",
+                    cmds[j].Type, "for parameter", cmds[j].Name);
+            exit(1);
+        }
+        ParamN++;
+        ParName = va_arg(args, char *);
+    }
+    /* Keep the table terminated for the loops that walk it. */
+    cmds[ParamN].Name = NULL;
+    va_end(args);
+    return 0;
+}
+
+/*
+ * GetParams - read parameter values from command files and the command line.
+ *
+ * n, a         addresses of main()'s argc and argv; on return they describe
+ *              the arguments that were not consumed here
+ * CmdFileName  command file to read first, or NULL for none
+ *
+ * Command files: a leading argument "-=file" names a command file; when it
+ * is the first argument it is read instead of CmdFileName. The name "-"
+ * stands for stdin and a name starting with "@@" is run through popen().
+ * Lines are read with GetLine() up to end of file or a line "\End" and are
+ * passed to Scan().
+ *
+ * Options: every following leading argument of the form "-name=value" is
+ * rewritten to "<program>/name value" and passed to Scan() as well.
+ *
+ * A line or option that Scan() rejects ends the program through CmdError();
+ * an unreadable command file, a failed allocation or an option that does
+ * not fit into the line buffer ends it with status 1.
+ *
+ * Returns 0.
+ */
+int GetParams(n, a, CmdFileName)
+int *n;
+char ***a;
+char *CmdFileName;
+{
+    char *Line,
+         *ProgName;
+    int argc = *n;
+    char **argv = *a,
+         *s;
+    FILE *fp;
+    int IsPipe;
+
+#ifdef MSDOS
+#define PATHSEP '\\'
+    char *dot = NULL;
+#else
+#define PATHSEP '/'
+#endif
+
+    if(!(Line=malloc(LINSIZ))) {
+        fprintf(stderr, "GetParams(): Unable to alloc %d bytes\n",
+                LINSIZ);
+        exit(1);
+    }
+    /* The program name is argv[0] without its directory part. */
+    if((ProgName=strrchr(*argv, PATHSEP))) {
+        ++ProgName;
+    } else {
+        ProgName = *argv;
+    }
+#ifdef MSDOS
+    /* Cut the extension off for the duration of the call. */
+    if((dot=strchr(ProgName, '.'))) *dot = 0;
+#endif
+    --argc;
+    ++argv;
+    for(;;) {
+        if(argc && argv[0][0]=='-' && argv[0][1]=='=') {
+            CmdFileName = argv[0]+2;
+            ++argv;
+            --argc;
+        }
+        if(!CmdFileName) {
+            break;
+        }
+        IsPipe = !strncmp(CmdFileName, "@@", 2);
+        fp = IsPipe
+             ? popen(CmdFileName+2, "r")
+             : strcmp(CmdFileName, "-")
+               ? fopen(CmdFileName, "r")
+               : stdin;
+        if(!fp) {
+            fprintf(stderr, "Unable to open command file %s\n",
+                    CmdFileName);
+            exit(1);
+        }
+        while(GetLine(fp, LINSIZ, Line) && strcmp(Line, "\\End")) {
+            if(Scan(ProgName, cmds, Line)) {
+                CmdError(Line);
+            }
+        }
+        if(fp!=stdin) {
+            if(IsPipe) pclose(fp); else fclose(fp);
+        }
+        CmdFileName = NULL;
+    }
+    while(argc && **argv=='-' && (s=strchr(*argv, '='))) {
+        /* "<ProgName>/<argument without its '-'>" and the terminating
+           NUL need strlen(ProgName)+strlen(*argv)+1 bytes of Line. */
+        if(strlen(ProgName)+strlen(*argv)+1 > (size_t)LINSIZ) {
+            fprintf(stderr, "GetParams(): option too long: \"%.64s...\"\n",
+                    *argv);
+            exit(1);
+        }
+        /* Scan() expects "name value": blank out the '=' while copying. */
+        *s = ' ';
+        sprintf(Line, "%s/%s", ProgName, *argv+1);
+        *s = '=';
+        if(Scan(ProgName, cmds, Line)) CmdError(*argv);
+        --argc;
+        ++argv;
+    }
+    *n = argc;
+    *a = argv;
+#ifdef MSDOS
+    if(dot) *dot = '.';
+#endif
+    free(Line);
+    return 0;
+}
+
+/*
+ * PrintParams - list every declared parameter on fp.
+ *
+ * ValFlag  non-zero: print each parameter together with its current value
+ * fp       stream to write to; it is flushed before and after the listing
+ *
+ * Returns 0.
+ */
+int PrintParams(ValFlag, fp)
+int ValFlag;
+FILE *fp;
+{
+    Cmd_T *cmd = cmds;
+
+    fflush(fp);
+    fputs(ValFlag ? "Parameters Values:\n" : "Parameters:\n", fp);
+    /* cmds[] ends at the first entry without a name */
+    while(cmd->Name) {
+        PrintParam(cmd, ValFlag, fp);
+        cmd++;
+    }
+    fputc('\n', fp);
+    fflush(fp);
+    return 0;
+}
+
+/*
+ * SPrintParams - build one "<pfx><name>=<value>" string for every parameter
+ * that has been given a value (its ArgStr is set).
+ *
+ * a    a[0] receives an array obtained from calloc() that holds the
+ *      strings, each obtained from malloc(); nothing is freed here
+ * pfx  text placed in front of every string; NULL is treated as ""
+ *
+ * Returns the number of strings stored in a[0]. The program is terminated
+ * when memory cannot be obtained.
+ */
+int SPrintParams(a, pfx)
+char ***a,
+     *pfx;
+{
+    int l,
+        n;
+    Cmd_T *cmd;
+
+    if(!pfx) pfx="";
+    l = strlen(pfx);
+    for(n=0, cmd=cmds; cmd->Name; cmd++) n += !!cmd->ArgStr;
+    /* Ask for at least one element: calloc(0, ...) may return NULL, which
+       could not be told apart from a failure. */
+    a[0] = calloc(n ? n : 1, sizeof(char*));
+    if(!a[0]) {
+        fprintf(stderr, "SPrintParams(): out of memory\n");
+        exit(1);
+    }
+    for(n=0, cmd=cmds; cmd->Name; cmd++) {
+        if(!cmd->ArgStr) continue;
+        /* prefix + name + '=' + value + terminating NUL */
+        a[0][n] = malloc(strlen(cmd->Name)+strlen(cmd->ArgStr)+l+2);
+        if(!a[0][n]) {
+            fprintf(stderr, "SPrintParams(): out of memory\n");
+            exit(1);
+        }
+        sprintf(a[0][n], "%s%s=%s", pfx, cmd->Name, cmd->ArgStr);
+        ++n;
+    }
+    return n;
+}
+
+/*
+ * CmdError - report an option or command-file line that was rejected, list
+ * the parameters the program accepts on stderr and terminate the program.
+ *
+ * opt  the offending text, echoed in the message
+ *
+ * Declared int like the other helpers, but it never returns.
+ */
+static int CmdError(opt)
+char *opt;
+{
+    fprintf(stderr, "Invalid option \"%s\"\n", opt);
+    fprintf(stderr, "This program expects the following parameters:\n");
+    PrintParams(FALSE, stderr);
+    /* A rejected option is a failure, so the exit status must not be 0;
+       the other error paths of this file use 1 as well. */
+    exit(1);
+}
+
+/*
+ * PrintParam - print one parameter on fp, indented by four blanks.
+ *
+ * cmd      the table entry to print
+ * ValFlag  non-zero: append the current value after the name
+ * fp       output stream
+ *
+ * Enum and string-array parameters are delegated to PrintEnum() and
+ * PrintStrArray(). An entry with an unknown type terminates the program.
+ * Returns 0.
+ */
+static int PrintParam(cmd, ValFlag, fp)
+Cmd_T *cmd;
+int ValFlag;
+FILE *fp;
+{
+    fprintf(fp, "%4s", "");
+    switch(cmd->Type) {
+    case CMDENUMTYPE:
+        PrintEnum(cmd, ValFlag, fp);
+        break;
+    case CMDSTRARRAYTYPE:
+        PrintStrArray(cmd, ValFlag, fp);
+        break;
+    case CMDDOUBLETYPE:
+        if(ValFlag) {
+            fprintf(fp, "%s: %22.15e\n", cmd->Name, *(double *)cmd->Val);
+        } else {
+            fprintf(fp, "%s\n", cmd->Name);
+        }
+        break;
+    case CMDINTTYPE:            /* all of these keep their value in an int */
+    case CMDSUBRANGETYPE:
+    case CMDGTETYPE:
+    case CMDLTETYPE:
+        if(ValFlag) {
+            fprintf(fp, "%s: %d\n", cmd->Name, *(int *)cmd->Val);
+        } else {
+            fprintf(fp, "%s\n", cmd->Name);
+        }
+        break;
+    case CMDSTRINGTYPE:
+        if(!ValFlag) {
+            fprintf(fp, "%s\n", cmd->Name);
+        } else if(*(char **)cmd->Val) {
+            fprintf(fp, "%s: \"%s\"\n", cmd->Name, *(char **)cmd->Val);
+        } else {
+            /* a null string value is shown as the bare word NULL */
+            fprintf(fp, "%s: %s\n", cmd->Name, "NULL");
+        }
+        break;
+    default:
+        fprintf(stderr, "%s: %s %d %s \"%s\"\n",
+                "PrintParam",
+                "Unknown Type",
+                cmd->Type,
+                "for parameter",
+                cmd->Name);
+        exit(1);
+    }
+    return 0;
+}
+
+/*
+ * GetLine - read the next logical line of a command file.
+ *
+ * fp    stream to read from
+ * n     size of Line in bytes
+ * Line  buffer that receives the line
+ *
+ * Lines whose first character is '#' and lines that are empty after their
+ * leading white space has been removed are skipped. The trailing newline
+ * is dropped. A line ending in a backslash is joined with the line that
+ * follows it, the backslash being replaced by a blank.
+ *
+ * Returns Line, or NULL when the input ends before a line is complete.
+ * When continuation lines fill the buffer, the text collected so far is
+ * returned.
+ */
+static char *GetLine(fp, n, Line)
+FILE *fp;
+int n;
+char *Line;
+{
+    int j,
+        l,
+        offs=0;
+
+    for(;;) {
+        /* fgets() cannot store anything in fewer than two bytes; stop
+           here instead of asking for an empty string over and over. */
+        if(n-offs < 2) {
+            if(!offs) return NULL;
+            break;
+        }
+        if(!fgets(Line+offs, n-offs, fp)) {
+            return NULL;
+        }
+        if(Line[offs]=='#') continue;
+        /* Drop the newline only if there is one: the last line of a
+           file and an over-long line come without it. */
+        l = strlen(Line+offs);
+        if(l>0 && Line[offs+l-1]=='\n') {
+            --l;
+            Line[offs+l] = 0;
+        }
+        /* Skip leading white space; l becomes the length of the rest.
+           The cast keeps isspace() defined for negative char values. */
+        for(j=offs; Line[j] && isspace((unsigned char)Line[j]); j++, l--)
+            ;
+        if(l<1) continue;
+        if(j > offs) {
+            memmove(Line+offs, Line+j, l+1);
+        }
+        if(Line[offs+l-1]=='\\') {
+            /* continuation: the next piece goes right after this one */
+            offs += l;
+            Line[offs-1] = ' ';
+        } else {
+            break;
+        }
+    }
+    return Line;
+}
+
+/*
+ * Scan - interpret one "[program/]name value" line.
+ *
+ * ProgName  name of the running program
+ * cmds      parameter table, sorted by name and ended by a NULL name
+ * Line      text to interpret; it is modified only temporarily
+ *
+ * The first token may be qualified with "<program>/". A line qualified with
+ * the name of a different program is ignored. The parameter name is looked
+ * up in cmds and, when found, the rest of the line is passed to SetParam().
+ *
+ * Returns 0 when the line was accepted or ignored and non-zero when a
+ * line qualified with this program's name uses an unknown parameter name;
+ * an unknown name on an unqualified line is ignored.
+ */
+static int Scan(ProgName, cmds, Line)
+char *ProgName,
+     *Line;
+Cmd_T *cmds;
+{
+    char *name,
+         *slash,
+         *value;
+    int i,
+        len,
+        cmp,
+        saved,
+        qualified = FALSE;
+
+    name = Line+strspn(Line, SepString);
+    len = strcspn(name, SepString);
+    if(!len) {
+        return 0;
+    }
+    slash = strchr(name, '/');
+    if(slash && slash-name<len) {
+        /* the text in front of the '/' has to be exactly ProgName */
+        if(strlen(ProgName)!=(size_t)(slash-name)
+           || strncmp(name, ProgName, slash-name)) {
+            return 0;
+        }
+        qualified = TRUE;
+        name = slash+1;
+    }
+    len = strcspn(name, SepString);
+    if(!len) {
+        return 0;
+    }
+    /* terminate the name in place for the duration of the lookup */
+    saved = name[len];
+    name[len] = 0;
+    cmp = 1;
+    for(i=0; cmds[i].Name; i++) {
+        cmp = strcmp(cmds[i].Name, name);
+        if(cmp>=0) break;       /* found, or passed the sorted position */
+    }
+    name[len] = saved;
+    if(!cmp) {
+        value = name+len;
+        value += strspn(value, SepString);
+        return SetParam(cmds+i, value);
+    }
+    return qualified && cmp;
+}
+
+/*
+ * SetParam - convert the text s according to the type of cmd and store the
+ * result through cmd->Val.
+ *
+ * An empty s is accepted for string parameters only; for the other types
+ * a warning is printed and the value is left untouched. For strings the
+ * words "<NULL>" and "NULL" store a null pointer, anything else is stored
+ * as a copy made with strdup(). A value that cannot be converted, or an
+ * unknown type, terminates the program.
+ *
+ * After a successful conversion a copy of s is kept in cmd->ArgStr.
+ * Returns 0.
+ */
+static int SetParam(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+    if(cmd->Type != CMDSTRINGTYPE && *s == '\0') {
+        fprintf(stderr,
+                "WARNING: No value specified for parameter \"%s\"\n",
+                cmd->Name);
+        return 0;
+    }
+    switch(cmd->Type) {
+    case CMDINTTYPE:
+        if(sscanf(s, "%d", (int*)cmd->Val)!=1) {
+            fprintf(stderr,
+                    "Integer value required for parameter \"%s\"\n",
+                    cmd->Name);
+            exit(1);
+        }
+        break;
+    case CMDDOUBLETYPE:
+        if(sscanf(s, "%lf", (double*)cmd->Val)!=1) {
+            fprintf(stderr,
+                    "Float value required for parameter \"%s\"\n",
+                    cmd->Name);
+            exit(1);
+        }
+        break;
+    case CMDSTRINGTYPE:
+        if(strcmp(s, "<NULL>") && strcmp(s, "NULL")) {
+            *(char **)cmd->Val = strdup(s);
+        } else {
+            *(char **)cmd->Val = 0;
+        }
+        break;
+    case CMDENUMTYPE:
+        SetEnum(cmd, s);
+        break;
+    case CMDSTRARRAYTYPE:
+        SetStrArray(cmd, s);
+        break;
+    case CMDSUBRANGETYPE:
+        SetSubrange(cmd, s);
+        break;
+    case CMDGTETYPE:
+        SetGte(cmd, s);
+        break;
+    case CMDLTETYPE:
+        SetLte(cmd, s);
+        break;
+    default:
+        fprintf(stderr, "%s: %s %d %s \"%s\"\n",
+                "SetParam",
+                "Unknown Type",
+                cmd->Type,
+                "for parameter",
+                cmd->Name);
+        exit(1);
+    }
+    /* remember the text of the value as it was given */
+    cmd->ArgStr = strdup(s);
+    return 0;
+}
+
+/*
+ * SetEnum - look s up in the Enum_T table of cmd and store the Idx of the
+ * matching entry through cmd->Val.
+ *
+ * Entries with an empty name never match. Returns 0 on a match; without
+ * one the result of EnumError() is returned, which terminates the program.
+ */
+static int SetEnum(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+    Enum_T *en = (Enum_T *)cmd->p;
+
+    while(en->Name) {
+        if(en->Name[0] != '\0' && strcmp(s, en->Name) == 0) {
+            *(int *) cmd->Val = en->Idx;
+            return 0;
+        }
+        en++;
+    }
+    return EnumError(cmd, s);
+}
+
+/*
+ * SetSubrange - read an integer from s and store it through cmd->Val when
+ * it lies between the two limits kept in cmd->p (both included).
+ *
+ * Text that does not start with an integer terminates the program, and so
+ * does a value outside the range (through SubrangeError()). Returns 0.
+ */
+static int SetSubrange(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+    int value;
+    int *limits = (int *)cmd->p;    /* limits[0] = lowest, limits[1] = highest */
+
+    if(sscanf(s, "%d", &value)!=1) {
+        fprintf(stderr,
+                "Integer value required for parameter \"%s\"\n",
+                cmd->Name);
+        exit(1);
+    }
+    if(value >= limits[0] && value <= limits[1]) {
+        *(int *)cmd->Val = value;
+        return 0;
+    }
+    return SubrangeError(cmd, value);
+}
+
+/*
+ * SetGte - read an integer from s and store it through cmd->Val when it is
+ * not below the lower limit kept in cmd->p.
+ *
+ * Text that does not start with an integer terminates the program, and so
+ * does a value below the limit (through GteError()). Returns 0.
+ */
+static int SetGte(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+    int value;
+    int lowest = *(int *)cmd->p;
+
+    if(sscanf(s, "%d", &value)!=1) {
+        fprintf(stderr,
+                "Integer value required for parameter \"%s\"\n",
+                cmd->Name);
+        exit(1);
+    }
+    if(value >= lowest) {
+        *(int *)cmd->Val = value;
+        return 0;
+    }
+    return GteError(cmd, value);
+}
+
+/*
+ * SetStrArray - split s with str2array(), using the separator string kept
+ * in cmd->p, and store the resulting array through cmd->Val. Returns 0.
+ */
+static int SetStrArray(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+    char **words = str2array(s, (char*)cmd->p);
+
+    *(char***)cmd->Val = words;
+    return 0;
+}
+
+/*
+ * SetLte - read an integer from s and store it through cmd->Val when it is
+ * not above the upper limit kept in cmd->p.
+ *
+ * Text that does not start with an integer terminates the program, and so
+ * does a value above the limit (through LteError()). Returns 0.
+ */
+static int SetLte(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+    int value;
+    int highest = *(int *)cmd->p;
+
+    if(sscanf(s, "%d", &value)!=1) {
+        fprintf(stderr,
+                "Integer value required for parameter \"%s\"\n",
+                cmd->Name);
+        exit(1);
+    }
+    if(value <= highest) {
+        *(int *)cmd->Val = value;
+        return 0;
+    }
+    return LteError(cmd, value);
+}
+
+/*
+ * EnumError - report that s is not a valid value for the enum parameter
+ * cmd, list the names the parameter accepts (entries with an empty name
+ * are left out) and terminate the program with status 1.
+ * Declared int, but it never returns.
+ */
+static int EnumError(cmd, s)
+Cmd_T *cmd;
+char *s;
+{
+    Enum_T *en = (Enum_T *)cmd->p;
+
+    fprintf(stderr,
+            "Invalid value \"%s\" for parameter \"%s\"\n", s, cmd->Name);
+    fputs("Valid values are:\n", stderr);
+    while(en->Name) {
+        if(en->Name[0]) {
+            fprintf(stderr, " %s\n", en->Name);
+        }
+        en++;
+    }
+    fputc('\n', stderr);
+    exit(1);
+}
+
+/*
+ * GteError - report that n is below the lower limit of parameter cmd and
+ * terminate the program with status 1. Declared int, but never returns.
+ */
+static int GteError(cmd, n)
+Cmd_T *cmd;
+int n;
+{
+    int lowest = *(int *)cmd->p;
+
+    fprintf(stderr,
+            "Value %d out of range for parameter \"%s\"\n", n, cmd->Name);
+    fprintf(stderr,
+            "Valid values must be greater than or equal to %d\n", lowest);
+    exit(1);
+}
+
+/*
+ * LteError - report that n is above the upper limit of parameter cmd and
+ * terminate the program with status 1. Declared int, but never returns.
+ */
+static int LteError(cmd, n)
+Cmd_T *cmd;
+int n;
+{
+    int highest = *(int *)cmd->p;
+
+    fprintf(stderr,
+            "Value %d out of range for parameter \"%s\"\n", n, cmd->Name);
+    fprintf(stderr,
+            "Valid values must be less than or equal to %d\n", highest);
+    exit(1);
+}
+
+/*
+ * SubrangeError - report that n lies outside the range of parameter cmd
+ * and terminate the program with status 1. Declared int, but never
+ * returns.
+ */
+static int SubrangeError(cmd, n)
+Cmd_T *cmd;
+int n;
+{
+    int *limits = (int *)cmd->p;    /* limits[0] = lowest, limits[1] = highest */
+
+    fprintf(stderr,
+            "Value %d out of range for parameter \"%s\"\n", n, cmd->Name);
+    fprintf(stderr, "Valid values range from %d to %d\n",
+            limits[0], limits[1]);
+    exit(1);
+}
+
+/*
+ * PrintEnum - print the name of an enum parameter on fp and, when ValFlag
+ * is non-zero, the name that stands for its current value.
+ *
+ * A table may list several names for one value (aliases). Only the first
+ * entry with a non-empty name is printed, so the line reads "name: value"
+ * and not "name: value: alias". Nothing follows the parameter name when
+ * no entry carries the current value. Returns 0.
+ */
+static int PrintEnum(cmd, ValFlag, fp)
+Cmd_T *cmd;
+int ValFlag;
+FILE *fp;
+{
+    Enum_T *en;
+
+    fprintf(fp, "%s", cmd->Name);
+    if(ValFlag) {
+        for(en=(Enum_T *)cmd->p; en->Name; en++) {
+            if(*en->Name && en->Idx==*(int *)cmd->Val) {
+                fprintf(fp, ": %s", en->Name);
+                break;          /* do not print the aliases as well */
+            }
+        }
+    }
+    fprintf(fp, "\n");
+    return 0;
+}
+
+/*
+ * PrintStrArray - print the name of a string-array parameter on fp and,
+ * when ValFlag is non-zero, its elements.
+ *
+ * The first element follows the name on the same line and every further
+ * element goes on a line of its own, indented so that it starts in the
+ * column of the first one. An array without elements is shown as NULL and
+ * a null array pointer as nothing. Returns 0.
+ */
+static int PrintStrArray(cmd, ValFlag, fp)
+Cmd_T *cmd;
+int ValFlag;
+FILE *fp;
+{
+    char **s = *(char***)cmd->Val;
+    /* the four blanks printed in front of the name plus the name itself */
+    int l = 4+strlen(cmd->Name);
+
+    fprintf(fp, "%s", cmd->Name);
+    if(ValFlag) {
+        fprintf(fp, ": %s", s ? (*s ? *s++ : "NULL") : "");
+        if(s) while(*s) {
+            /* A field width pads with l+1 blanks, so no indent string
+               has to be allocated. */
+            fprintf(fp, "\n%*s %s", l+1, "", *s++);
+        }
+    }
+    fprintf(fp, "\n");
+    return 0;
+}
+
+/*
+ * str2array - split s into its tokens.
+ *
+ * s    text to split; it is not modified
+ * sep  characters that separate the tokens; a null pointer selects
+ *      SepString
+ *
+ * Separators in front of, between and behind the tokens are skipped, so
+ * no empty token is produced. Returns an array obtained from calloc() that
+ * holds one string obtained from malloc() per token and is ended by a null
+ * pointer, or NULL when memory cannot be obtained (everything allocated up
+ * to that point is released first).
+ */
+static char **str2array(s, sep)
+char *s,
+     *sep;
+{
+    char *p,
+         **a;
+    int n = 0,
+        l;
+
+    if(!sep) sep = SepString;
+    /* first pass: count the tokens */
+    p = s += strspn(s, sep);
+    while(*p) {
+        p += strcspn(p, sep);
+        p += strspn(p, sep);
+        ++n;
+    }
+    a = calloc(n+1, sizeof(char *));
+    if(!a) {
+        return NULL;
+    }
+    /* second pass: copy them; calloc() already supplied the final NULL */
+    p = s;
+    n = 0;
+    while(*p) {
+        l = strcspn(p, sep);
+        a[n] = malloc(l+1);
+        if(!a[n]) {
+            while(n > 0) free(a[--n]);
+            free(a);
+            return NULL;
+        }
+        memcpy(a[n], p, l);
+        a[n][l] = 0;
+        ++n;
+        p += l;
+        p += strspn(p, sep);
+    }
+    return a;
+}
diff --git a/scripts/training/symal/cmd.h b/scripts/training/symal/cmd.h
new file mode 100644
index 000000000..c6fa57e71
--- /dev/null
+++ b/scripts/training/symal/cmd.h
@@ -0,0 +1,49 @@
+
+/* cmd.h - interface of the command-line and command-file parameter parser. */
+#if !defined(CMD_H)
+
+#define CMD_H
+
+#include <stdio.h> /* FILE, used by PrintParams() */
+
+/*
+ * Parameter types for DeclareParams(). Each parameter is declared as
+ *     name, type, pointer to the value [, extras]
+ * where the extras depend on the type.
+ */
+#define CMDDOUBLETYPE 1 /* value: double */
+#define CMDENUMTYPE 2 /* value: int; extra: Enum_T table */
+#define CMDINTTYPE 3 /* value: int */
+#define CMDSTRINGTYPE 4 /* value: char * */
+#define CMDSUBRANGETYPE 5 /* value: int; extras: lowest and highest int */
+#define CMDGTETYPE 6 /* value: int; extra: lowest int */
+#define CMDLTETYPE 7 /* value: int; extra: highest int */
+#define CMDSTRARRAYTYPE 8 /* value: char **; extra: separator string or null */
+#define CMDBOOLTYPE 9 /* value: int; set from the words FALSE and TRUE */
+
+/* One name of an enum parameter and the value it stands for. A table of
+   these is ended by an entry whose Name is a null pointer. */
+typedef struct {
+    char *Name;
+    int Idx;
+} Enum_T;
+
+/* One declared parameter. */
+typedef struct {
+    int Type;           /* one of the CMD...TYPE constants */
+    char *Name,         /* name used in options and command files */
+         *ArgStr;       /* text of the value last assigned, or null */
+    void *Val,          /* where the converted value is stored */
+         *p;            /* type-specific data: enum table, limits, separators */
+} Cmd_T;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* An empty parameter list means "no arguments" in C++, so the variadic
+   prototype has to be visible there as well. */
+#if defined(__STDC__) || defined(__cplusplus)
+int DeclareParams(char *, ...);
+#else
+int DeclareParams();
+#endif
+
+/* Full prototypes, so that C++ callers can pass arguments and C callers
+   get their arguments checked. */
+int GetParams(int *n, char ***a,char *CmdFileName),
+    SPrintParams(char ***a, char *pfx),
+    PrintParams(int ValFlag, FILE *fp);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+
+
+
diff --git a/scripts/training/symal/symal.cpp b/scripts/training/symal/symal.cpp
new file mode 100644
index 000000000..5424d8b8c
--- /dev/null
+++ b/scripts/training/symal/symal.cpp
@@ -0,0 +1,394 @@
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <list>
+#include <vector>
+#include <set>
+#include <utility>
+#include <algorithm>
+#include "cmd.h"
+
+using namespace std;
+
+#define MAX_WORD 100 //size of the buffer getals() reads one word into
+#define MAX_M 200 //getals() requires the source sentence length m to be below this
+#define MAX_N 200 //getals() requires the target sentence length n to be below this
+
+#define UNION 1 //symmetrization methods, as stored by the -a / -alignment option
+#define INTERSECT 2
+#define GROW 3
+#define BOOL_YES 1
+#define BOOL_NO 0
+
+#define END_ENUM { (char*)0, 0 } //table terminator: an entry with a null name
+
+static Enum_T AlignEnum [] = { //spellings accepted for the symmetrization method
+{ "union", UNION },
+{ "u", UNION },
+{ "intersect", INTERSECT},
+{ "i", INTERSECT},
+{ "grow", GROW },
+{ "g", GROW },
+
+ END_ENUM
+};
+
+static Enum_T BoolEnum [] = { //spellings accepted by the yes/no options (-d, -f, -b, -v)
+ { "true", BOOL_YES },
+ { "yes", BOOL_YES },
+ { "y", BOOL_YES },
+ { "false", BOOL_NO },
+ { "no", BOOL_NO },
+ { "n", BOOL_NO },
+ END_ENUM
+};
+
+
+
+// global work areas: main() allocates them, printgrow() clears and fills them for every sentence pair
+
+int* fa; //fa[j] is set to 1 once source (foreign) position j is covered
+int* ea; //ea[i] is set to 1 once target (english) position i is covered
+int** A; //A[i][j]: 2 = point of the symmetric alignment, -1 = only in the direct alignment, 1 = only in the inverse alignment, 0 = no point
+
+int verbose=0; //set by the -v / -verbose option
+
+//Read one sentence pair with its two alignments from the input stream.
+//
+//Layout of a record, all items separated by white space:
+//  freq
+//  n  <n target words>  <separator>  b[1] ... b[n]   (inverse alignment)
+//  m  <m source words>  <separator>  a[1] ... a[m]   (direct alignment)
+//
+//On return a[j] is the target position (1..n, 0 = none) aligned to source
+//position j, and b[i] is the source position (1..m, 0 = none) aligned to
+//target position i. Element 0 of both arrays is not used. The words and
+//the separators are skipped, whatever their length.
+//
+//Returns 1 when a record was read and 0 when the input is exhausted. A
+//truncated record, a sentence length outside 0..MAX_N-1 / 0..MAX_M-1 or an
+//alignment value out of range ends the program with a message on cerr;
+//these checks also hold when the program is built with NDEBUG.
+
+//report an unusable record and stop the program
+static void badrecord(const char* what)
+{
+  cerr << "symal: malformed input: " << what << "\n";
+  exit(1);
+}
+
+int getals(fstream& inp,int& m, int *a,int& n, int *b)
+{
+  string token; //words and separators are read only to be skipped
+  int i,j,freq;
+
+  if (!(inp >> freq)) return 0;
+
+  //target sentence
+  if (!(inp >> n) || n<0 || n>=MAX_N) badrecord("bad target sentence length");
+  for (i=1;i<=n;i++) inp >> token;
+
+  inp >> token; //# separator
+
+  //inverse alignment
+  for (i=1;i<=n;i++)
+    if (!(inp >> b[i])) badrecord("bad inverse alignment");
+
+  //source sentence
+  if (!(inp >> m) || m<0 || m>=MAX_M) badrecord("bad source sentence length");
+  for (j=1;j<=m;j++) inp >> token;
+
+  inp >> token; //# separator
+
+  //direct alignment
+  for (j=1;j<=m;j++)
+    if (!(inp >> a[j]) || a[j]<0 || a[j]>n) badrecord("bad direct alignment");
+
+  //the inverse alignment can only be checked now that m is known
+  for (i=1;i<=n;i++)
+    if (b[i]<0 || b[i]>m) badrecord("inverse alignment out of range");
+
+  return 1;
+}
+
+
+//Print the union of the direct alignment a and the inverse alignment b as
+//one line of "source-target" pairs with 0-based positions.
+//
+//All points of a are written first, followed by the points of b that a does
+//not contain. A sentence pair without any point yields an empty line.
+//Always returns 1.
+int prunionalignment(fstream& out,int m,int *a,int n,int* b){
+
+  ostringstream sout;
+
+  for (int j=1;j<=m;j++)
+    if (a[j])
+      sout << j-1 << "-" << a[j]-1 << " ";
+
+  for (int i=1;i<=n;i++)
+    if (b[i] && a[b[i]]!=i)
+      sout << b[i]-1 << "-" << i-1 << " ";
+
+  //turn the trailing " " into the end of the line; with no point at all
+  //the string is empty and replace() would throw std::out_of_range
+  string str = sout.str();
+  if (str.empty())
+    str = "\n";
+  else
+    str.replace(str.length()-1,1,"\n");
+
+  out << str;
+
+  return 1;
+}
+
+
+
+//Print the intersection of the direct alignment a and the inverse alignment
+//b as one line of "source-target" pairs with 0-based positions.
+//
+//A point is written when both alignments contain it. A sentence pair
+//without any such point yields an empty line. Always returns 1.
+int printersect(fstream& out,int m,int *a,int n,int* b){
+
+  ostringstream sout;
+
+  for (int j=1;j<=m;j++)
+    if (a[j] && b[a[j]]==j)
+      sout << j-1 << "-" << a[j]-1 << " ";
+
+  //turn the trailing " " into the end of the line; with no point at all
+  //the string is empty and replace() would throw std::out_of_range
+  string str = sout.str();
+  if (str.empty())
+    str = "\n";
+  else
+    str.replace(str.length()-1,1,"\n");
+
+  out << str;
+
+  return 1;
+}
+
+
+//Compute the "grow" symmetrization of the direct alignment a and the inverse
+//alignment b and print it as one line of "source-target" pairs (0-based).
+//
+//Starting from the intersection, neighbors of the current points are added
+//as long as they belong to the union and link at least one word that is not
+//covered yet. The neighborhood consists of the four horizontal and vertical
+//cells, plus the four diagonal cells when 'diagonal' is set. With 'final'
+//the remaining union points are examined as well, first those found only in
+//the inverse alignment and then those found only in the direct one: such a
+//point is added when both of its words are uncovered ('bothuncovered') or,
+//otherwise, when at least one of them is.
+//
+//Nice property: you will never introduce more points than the union
+//alignment. Hence, you will always be able to represent the grow alignment
+//as the union of a directed and an inverted alignment.
+//
+//Uses the global work areas: fa needs m+1 elements, ea needs n+1 and the
+//rows A[1..n] need m+1 each. A sentence pair without any point yields an
+//empty line. Always returns 1.
+
+int printgrow(fstream& out,int m,int *a,int n,int* b, bool diagonal=false,bool final=false,bool bothuncovered=false){
+
+  ostringstream sout;
+
+  vector <pair <int,int> > neighbors; //offsets (target,source) of the cells to try
+
+  neighbors.push_back(make_pair(-1,0));
+  neighbors.push_back(make_pair(0,-1));
+  neighbors.push_back(make_pair(1,0));
+  neighbors.push_back(make_pair(0,1));
+
+  if (diagonal){
+    neighbors.push_back(make_pair(-1,-1));
+    neighbors.push_back(make_pair(-1,1));
+    neighbors.push_back(make_pair(1,-1));
+    neighbors.push_back(make_pair(1,1));
+  }
+
+  int i,j;
+  size_t o;
+
+  //covered foreign and english positions
+  memset(fa,0,(m+1)*sizeof(int));
+  memset(ea,0,(n+1)*sizeof(int));
+
+  //matrix to quickly check if one point is in the symmetric alignment
+  //(value 2), only in the direct alignment (-1) or only in the inverse
+  //alignment (1)
+  for (i=1;i<=n;i++) memset(A[i],0,(m+1)*sizeof(int));
+
+  //points are stored as (target position, source position)
+  set <pair <int,int> > currentpoints; //symmetric alignment
+  set <pair <int,int> > unionalignment; //union alignment
+
+  pair <int,int> point; //variable to store points
+  set<pair <int,int> >::const_iterator k; //iterator over sets
+
+  //fill in the alignments
+  for (j=1;j<=m;j++){
+    if (a[j]){
+      unionalignment.insert(make_pair(a[j],j));
+      if (b[a[j]]==j){
+        fa[j]=1;ea[a[j]]=1;
+        A[a[j]][j]=2;
+        currentpoints.insert(make_pair(a[j],j));
+      }
+      else
+        A[a[j]][j]=-1;
+    }
+  }
+
+  for (i=1;i<=n;i++)
+    if (b[i] && a[b[i]]!=i){ //not intersection
+      unionalignment.insert(make_pair(i,b[i]));
+      A[i][b[i]]=1;
+    }
+
+  //grow: repeat the scan until a whole pass adds nothing
+  int added=1;
+
+  while (added){
+    added=0;
+    //scan the current alignment; inserting into a set leaves the
+    //iterator k valid
+    for (k=currentpoints.begin();k!=currentpoints.end();k++){
+      for (o=0;o<neighbors.size();o++){
+        point.first=k->first+neighbors[o].first;
+        point.second=k->second+neighbors[o].second;
+        //check if neighbor is inside 'matrix'
+        if (point.first>0 && point.first <=n && point.second>0 && point.second<=m)
+          //check if neighbor is in the union alignment
+          if (b[point.first]==point.second || a[point.second]==point.first){
+            //check if it connects at least one uncovered word
+            if (!(ea[point.first] && fa[point.second]))
+              {
+                //insert point in currentpoints!
+                currentpoints.insert(point);
+                A[point.first][point.second]=2;
+                ea[point.first]=1; fa[point.second]=1;
+                added=1;
+              }
+          }
+      }
+    }
+  }
+
+  if (final){
+    //first the points found only in the inverse alignment (1), then
+    //those found only in the direct alignment (-1)
+    const int leftover[2]={1,-1};
+
+    for (int pass=0;pass<2;pass++)
+      for (k=unionalignment.begin();k!=unionalignment.end();k++)
+        if (A[k->first][k->second]==leftover[pass])
+          {
+            point.first=k->first;point.second=k->second;
+            if ((bothuncovered && !ea[point.first] && !fa[point.second]) ||
+                (!bothuncovered && !(ea[point.first] && fa[point.second])))
+              {
+                //add it!
+                currentpoints.insert(point);
+                A[point.first][point.second]=2;
+                //keep track of new covered positions
+                ea[point.first]=1;fa[point.second]=1;
+              }
+          }
+  }
+
+  for (k=currentpoints.begin();k!=currentpoints.end();k++)
+    sout << k->second-1 << "-" << k->first-1 << " ";
+
+  //turn the trailing " " into the end of the line; with no point at all
+  //the string is empty and replace() would throw std::out_of_range
+  string str = sout.str();
+  if (str.empty())
+    str = "\n";
+  else
+    str.replace(str.length()-1,1,"\n");
+
+  out << str;
+  out.flush();
+
+  return 1;
+}
+
+
+
+//Main program.
+//
+//Options, given as -name=value:
+//  a, alignment   symmetrization method: union, intersect or grow
+//  d, diagonal    grow: also try the diagonal neighbors
+//  f, final       grow: add left-over union points at the end
+//  b, both        grow/final: require both words to be uncovered
+//  i, o           input and output file (default: /dev/stdin, /dev/stdout)
+//  v, verbose     verbose flag
+//
+//One output line is written per input sentence pair.
+//
+//Exit status: 0 on success; 1 when no method was selected or a file cannot
+//be opened. The program leaves through return, not exit(), so that the
+//streams are destroyed and buffered output is written.
+
+int main(int argc, char** argv){
+
+  int alignment=0;
+  char* input=(char*)"/dev/stdin";
+  char* output=(char*)"/dev/stdout";
+  int diagonal=false;
+  int final=false;
+  int bothuncovered=false;
+
+  DeclareParams((char*)"a", CMDENUMTYPE, &alignment, AlignEnum,
+                "alignment", CMDENUMTYPE, &alignment, AlignEnum,
+                "d", CMDENUMTYPE, &diagonal, BoolEnum,
+                "diagonal", CMDENUMTYPE, &diagonal, BoolEnum,
+                "f", CMDENUMTYPE, &final, BoolEnum,
+                "final", CMDENUMTYPE, &final, BoolEnum,
+                "b", CMDENUMTYPE, &bothuncovered, BoolEnum,
+                "both", CMDENUMTYPE, &bothuncovered, BoolEnum,
+                "i", CMDSTRINGTYPE, &input,
+                "o", CMDSTRINGTYPE, &output,
+                "v", CMDENUMTYPE, &verbose, BoolEnum,
+                "verbose", CMDENUMTYPE, &verbose, BoolEnum,
+
+                (char *)NULL);
+
+  GetParams(&argc, &argv, (char*) NULL);
+
+  //the method is mandatory
+  if (alignment==0){
+    cerr << "usage: symal [-i=<inputfile>] [-o=<outputfile>] -a=[u|i|g] -d=[yes|no] -b=[yes|no] -f=[yes|no] \n"
+         << "Input file or std must be in .bal format (see script giza2bal.pl).\n";
+
+    return 1;
+  }
+
+  fstream inp(input,ios::in);
+  fstream out(output,ios::out);
+
+  if (!inp.is_open()){
+    cerr << "cannot open " << input << "\n";
+    return 1;
+  }
+
+  if (!out.is_open()){
+    cerr << "cannot open " << output << "\n";
+    return 1;
+  }
+
+  //alignments of the current pair; position 0 is not used
+  int a[MAX_M],b[MAX_N],m,n;
+
+  //work areas of printgrow(); the rows of A are indexed from 1
+  fa=new int[MAX_M+1];
+  ea=new int[MAX_N+1];
+
+  A=new int *[MAX_N+1];
+  for (int i=1;i<=MAX_N;i++) A[i]=new int[MAX_M+1];
+
+  switch (alignment){
+  case UNION:
+    cerr << "symal: computing union alignment\n";
+    while(getals(inp,m,a,n,b)) prunionalignment(out,m,a,n,b);
+    break;
+  case INTERSECT:
+    cerr << "symal: computing intersect alignment\n";
+    while(getals(inp,m,a,n,b)) printersect(out,m,a,n,b);
+    break;
+  case GROW:
+    cerr << "symal: computing grow alignment: diagonal ("
+         << diagonal << ") final ("<< final << ")"
+         << "both-uncovered (" << bothuncovered <<")\n";
+
+    while(getals(inp,m,a,n,b))
+      printgrow(out,m,a,n,b,diagonal,final,bothuncovered);
+
+    break;
+  default:
+    //not expected: the option parser stores values from AlignEnum only
+    break;
+  }
+
+  delete [] fa; delete [] ea;
+  for (int i=1;i<=MAX_N;i++) delete [] A[i];
+  delete [] A;
+
+  return 0;
+}