Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/giza-pp.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorredpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b>2007-09-25 05:56:49 +0400
committerredpony <redpony@ca354974-eb3a-0410-8f5c-d3948404989b>2007-09-25 05:56:49 +0400
commit4bf5e78d59d755f5f97a78b5ff76829c81b16a80 (patch)
tree751561816a3f416950c78e7890c77247f97913a2
Initial check in, based on GIZA++-v2 modified to compile on gcc 4.1.1
.C and .cc suffixes were normalized to .cpp. This is for standardization and to prevent accidental overwrites with previous versions of the code by copying.
-rw-r--r--GIZA++-v2/ATables.cpp119
-rw-r--r--GIZA++-v2/ATables.h178
-rw-r--r--GIZA++-v2/AlignTables.cpp44
-rw-r--r--GIZA++-v2/AlignTables.h124
-rw-r--r--GIZA++-v2/Array.h5
-rw-r--r--GIZA++-v2/Array2.h111
-rw-r--r--GIZA++-v2/Array4.h78
-rw-r--r--GIZA++-v2/D4Tables.h460
-rw-r--r--GIZA++-v2/D5Tables.h235
-rw-r--r--GIZA++-v2/Dictionary.cpp92
-rw-r--r--GIZA++-v2/Dictionary.h48
-rw-r--r--GIZA++-v2/FlexArray.h58
-rw-r--r--GIZA++-v2/ForwardBackward.cpp242
-rw-r--r--GIZA++-v2/ForwardBackward.h62
-rw-r--r--GIZA++-v2/GNU.GPL282
-rw-r--r--GIZA++-v2/Globals.h73
-rw-r--r--GIZA++-v2/HMMTables.cpp177
-rw-r--r--GIZA++-v2/HMMTables.h178
-rw-r--r--GIZA++-v2/LICENSE282
-rw-r--r--GIZA++-v2/Makefile138
-rw-r--r--GIZA++-v2/Makefile.definitions0
-rw-r--r--GIZA++-v2/Makefile.src2
-rw-r--r--GIZA++-v2/MoveSwapMatrix.cpp231
-rw-r--r--GIZA++-v2/MoveSwapMatrix.h116
-rw-r--r--GIZA++-v2/NTables.cpp93
-rw-r--r--GIZA++-v2/NTables.h145
-rw-r--r--GIZA++-v2/Parameter.cpp144
-rw-r--r--GIZA++-v2/Parameter.h199
-rw-r--r--GIZA++-v2/Perplexity.cpp40
-rw-r--r--GIZA++-v2/Perplexity.h108
-rw-r--r--GIZA++-v2/Pointer.h175
-rw-r--r--GIZA++-v2/README508
-rw-r--r--GIZA++-v2/TTables.cpp323
-rw-r--r--GIZA++-v2/TTables.h424
-rw-r--r--GIZA++-v2/Vector.h424
-rw-r--r--GIZA++-v2/WordClasses.h95
-rw-r--r--GIZA++-v2/alignment.cpp38
-rw-r--r--GIZA++-v2/alignment.h227
-rw-r--r--GIZA++-v2/collCounts.cpp293
-rw-r--r--GIZA++-v2/collCounts.h80
-rw-r--r--GIZA++-v2/defs.h78
-rw-r--r--GIZA++-v2/dependencies635
-rw-r--r--GIZA++-v2/file_spec.h59
-rw-r--r--GIZA++-v2/getSentence.cpp340
-rw-r--r--GIZA++-v2/getSentence.h123
-rw-r--r--GIZA++-v2/hmm.cpp405
-rw-r--r--GIZA++-v2/hmm.h88
-rw-r--r--GIZA++-v2/logprob.cpp161
-rw-r--r--GIZA++-v2/logprob.h222
-rw-r--r--GIZA++-v2/main.cpp718
-rw-r--r--GIZA++-v2/model1.cpp283
-rw-r--r--GIZA++-v2/model1.h164
-rw-r--r--GIZA++-v2/model2.cpp232
-rw-r--r--GIZA++-v2/model2.h70
-rw-r--r--GIZA++-v2/model2to3.cpp398
-rw-r--r--GIZA++-v2/model3.cpp511
-rw-r--r--GIZA++-v2/model3.h138
-rw-r--r--GIZA++-v2/model345-peg.cpp191
-rw-r--r--GIZA++-v2/model3_viterbi.cpp656
-rw-r--r--GIZA++-v2/model3_viterbi_with_tricks.cpp690
-rw-r--r--GIZA++-v2/myassert.cpp20
-rw-r--r--GIZA++-v2/myassert.h20
-rw-r--r--GIZA++-v2/mymath.h9
-rw-r--r--GIZA++-v2/mystl.h322
-rw-r--r--GIZA++-v2/parse.cpp151
-rw-r--r--GIZA++-v2/plain2snt.cpp115
-rw-r--r--GIZA++-v2/reports.cpp211
-rw-r--r--GIZA++-v2/snt2cooc.cpp106
-rw-r--r--GIZA++-v2/snt2plain.cpp90
-rwxr-xr-xGIZA++-v2/trainGIZA++.sh19
-rw-r--r--GIZA++-v2/transpair_model1.h108
-rw-r--r--GIZA++-v2/transpair_model2.h52
-rw-r--r--GIZA++-v2/transpair_model3.cpp197
-rw-r--r--GIZA++-v2/transpair_model3.h84
-rw-r--r--GIZA++-v2/transpair_model4.cpp179
-rw-r--r--GIZA++-v2/transpair_model4.h79
-rw-r--r--GIZA++-v2/transpair_model5.cpp243
-rw-r--r--GIZA++-v2/transpair_model5.h74
-rw-r--r--GIZA++-v2/transpair_modelhmm.h223
-rw-r--r--GIZA++-v2/utility.cpp30
-rw-r--r--GIZA++-v2/utility.h54
-rw-r--r--GIZA++-v2/vocab.cpp90
-rw-r--r--GIZA++-v2/vocab.h103
-rw-r--r--Makefile14
-rw-r--r--README8
-rw-r--r--mkcls-v2/Array.h370
-rw-r--r--mkcls-v2/FixedArray.h287
-rw-r--r--mkcls-v2/FlexArray.h48
-rw-r--r--mkcls-v2/GDAOptimization.cpp159
-rw-r--r--mkcls-v2/GDAOptimization.h80
-rw-r--r--mkcls-v2/GNU.GPL282
-rw-r--r--mkcls-v2/HCOptimization.cpp57
-rw-r--r--mkcls-v2/HCOptimization.h54
-rw-r--r--mkcls-v2/IterOptimization.cpp199
-rw-r--r--mkcls-v2/IterOptimization.h123
-rw-r--r--mkcls-v2/KategProblem.cpp1001
-rw-r--r--mkcls-v2/KategProblem.h439
-rw-r--r--mkcls-v2/KategProblemKBC.cpp243
-rw-r--r--mkcls-v2/KategProblemKBC.h157
-rw-r--r--mkcls-v2/KategProblemTest.cpp700
-rw-r--r--mkcls-v2/KategProblemTest.h60
-rw-r--r--mkcls-v2/KategProblemWBC.cpp344
-rw-r--r--mkcls-v2/KategProblemWBC.h131
-rw-r--r--mkcls-v2/LICENSE282
-rw-r--r--mkcls-v2/MSBOptimization.cpp229
-rw-r--r--mkcls-v2/MSBOptimization.h77
-rw-r--r--mkcls-v2/MYOptimization.cpp85
-rw-r--r--mkcls-v2/MYOptimization.h61
-rw-r--r--mkcls-v2/Makefile23
-rw-r--r--mkcls-v2/Optimization.cpp30
-rw-r--r--mkcls-v2/Optimization.h49
-rw-r--r--mkcls-v2/PopOptimization.cpp105
-rw-r--r--mkcls-v2/PopOptimization.h89
-rw-r--r--mkcls-v2/Problem.cpp165
-rw-r--r--mkcls-v2/Problem.h159
-rw-r--r--mkcls-v2/ProblemTest.cpp264
-rw-r--r--mkcls-v2/ProblemTest.h65
-rw-r--r--mkcls-v2/README10
-rw-r--r--mkcls-v2/RRTOptimization.cpp217
-rw-r--r--mkcls-v2/RRTOptimization.h79
-rw-r--r--mkcls-v2/SAOptimization.cpp280
-rw-r--r--mkcls-v2/SAOptimization.h86
-rw-r--r--mkcls-v2/StatVar.cpp140
-rw-r--r--mkcls-v2/StatVar.h134
-rw-r--r--mkcls-v2/TAOptimization.cpp208
-rw-r--r--mkcls-v2/TAOptimization.h78
-rw-r--r--mkcls-v2/general.cpp120
-rw-r--r--mkcls-v2/general.h89
-rw-r--r--mkcls-v2/makePackage.sh43
-rw-r--r--mkcls-v2/mkcls.cpp618
-rw-r--r--mkcls-v2/my.h54
-rw-r--r--mkcls-v2/myassert.h44
-rw-r--r--mkcls-v2/myleda.h278
-rw-r--r--mkcls-v2/mystl.h124
134 files changed, 24433 insertions, 0 deletions
diff --git a/GIZA++-v2/ATables.cpp b/GIZA++-v2/ATables.cpp
new file mode 100644
index 0000000..4401194
--- /dev/null
+++ b/GIZA++-v2/ATables.cpp
@@ -0,0 +1,119 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "ATables.h"
+#include "Globals.h"
+#include "myassert.h"
+#include "Parameter.h"
+
+GLOBAL_PARAMETER(bool,CompactADTable,"compactadtable","1: only 3-dimensional alignment table for IBM-2 and IBM-3",PARLEV_MODELS,1);
+GLOBAL_PARAMETER(float,amodel_smooth_factor,"model23SmoothFactor","smoothing parameter for IBM-2/3 (interpolation with constant)",PARLEV_SMOOTH,0.0);
+
+template <class VALTYPE>
+void amodel<VALTYPE>::printTable(const char *filename) const
+  // Writes the table to the file `filename` (created or overwritten), one
+  // entry per line in the format
+  //     aj j l m val
+  // where aj is the source word position, j the target word position,
+  // l the source sentence length and m the target sentence length; a
+  // distortion table has the two position columns swapped.  Only entries
+  // whose smoothed value exceeds PROB_SMOOTH are written.  If the file
+  // cannot be opened, an error is reported and nothing is written.
+{
+  if (is_distortion)
+    cout << "Dumping distortion table (d) to file:" << filename <<'\n';
+  else
+    cout << "Dumping alignment table (a) to file:" << filename <<'\n';
+
+  ofstream of(filename);
+  if( !of )
+    {
+      cerr << "\nERROR: Cannot open " << filename << " for writing\n";
+      return;
+    }
+
+  for(WordIndex l=0; l < MaxSentLength; l++)
+    for(WordIndex m=0;m<MaxSentLength;m++)
+      {
+        // In compact mode only the diagonal l==m is visited, and one of
+        // L/M is pinned to MaxSentLength-1 (L for a distortion table,
+        // M for an alignment table).
+        if( CompactADTable && l!=m )
+          continue;
+        unsigned int L=((CompactADTable&&is_distortion)?MaxSentLength:(l+1))-1;
+        unsigned int M=((CompactADTable&&!is_distortion)?MaxSentLength:(m+1))-1;
+        if( is_distortion==0 )
+          {
+            // Alignment table: for each target position j, all source positions i.
+            for(WordIndex j=1;j<=M; j++)
+              for(WordIndex i=0;i<=L; i++)
+                {
+                  VALTYPE x=getValue(i, j, L, M);
+                  if( x>PROB_SMOOTH )
+                    of << i << ' ' << j << ' ' << L << ' ' << M << ' ' << x << '\n';
+                }
+          }
+        else
+          {
+            // Distortion table: for each source position i, all target positions j.
+            for(WordIndex i=0;i<=L;i++)
+              for(WordIndex j=1;j<=M;j++)
+                {
+                  VALTYPE x=getValue(j, i, L, M);
+                  if( x>PROB_SMOOTH )
+                    of << j << ' ' << i << ' ' << L << ' ' << M << ' ' << x << '\n';
+                }
+          }
+      }
+}
+
+extern short NoEmptyWord;
+
+template <class VALTYPE>
+void amodel<VALTYPE>::readTable(const char *filename)
+{
+  /* Reads an a (or d) table from `filename`.  Each line has the format
+       aj j l m val
+     (source position, target position, source length, target length; for
+     a d table the two positions are swapped).  The four indices are used
+     in file order as the key for val.  The NULL word is added to the
+     sentence length in the tables but discounted when they are written.
+     Entries whose indices violate the bounds asserted by getRef() are
+     skipped with a warning instead of being stored out of range.
+     NAS, 7/11/99
+  */
+  ifstream inf(filename);
+  cout << "Reading a/d table from " << filename << "\n";
+  if(!inf){
+    cerr << "\nERROR: Cannot open " << filename<<"\n";
+    return;
+  }
+  WordIndex w, x, l, m;
+  VALTYPE prob;
+  while(inf >> w >> x >> l >> m >> prob )
+    if( l<MaxSentLength && m<MaxSentLength &&
+        (is_distortion ? (w<=m && x<=l) : (w<=l && x<=m && x!=0)) )
+      setValue(w, x, l, m, prob);
+    else cerr << "WARNING: ignoring out-of-range a/d table entry in " << filename << "\n";
+}
+
+template class amodel<COUNT> ;
+//template class amodel<PROB> ;
diff --git a/GIZA++-v2/ATables.h b/GIZA++-v2/ATables.h
new file mode 100644
index 0000000..b4ebfcb
--- /dev/null
+++ b/GIZA++-v2/ATables.h
@@ -0,0 +1,178 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/* --------------------------------------------------------------------------*
+ * *
+ * Module :ATables *
+ * *
+ * Prototypes File: ATables.h *
+ * *
+ * Objective: Defines classes and methods for handling I/O for distortion & *
+ * alignment tables. *
+ *****************************************************************************/
+
+#ifndef _atables_h
+#define _atables_h 1
+
+#include "defs.h"
+#include <cassert>
+#include <iostream>
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <set>
+#include "Vector.h"
+#include <utility>
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+#else
+#include <hash_map>
+#endif
+#include <fstream.h>
+#include "Array4.h"
+#include "myassert.h"
+#include "Globals.h"
+
+extern bool CompactADTable;
+extern float amodel_smooth_factor;
+extern short NoEmptyWord;
+
+/* ------------------- Class Definitions of amodel --------------------------*/
+/* Class Name: amodel:
+ Objective: This defines the underlying data structure for distortion prob.
+ and count tables. They are defined as a hash table. Each entry in the hash
+ table is the probability (d(j/l,m,i), where j is word target position, i is
+ source word position connected to it, m is target sentence length, and l is
+ source sentence length) or count collected for it. The probability and the
+ count are represented as log integer probability as
+ defined by the class LogProb .
+
+ This class is used to represent a tables (probability) and d (distortion)
+ tables and also their corresponding count tables .
+
+ *--------------------------------------------------------------------------*/
+
+// Absolute value of an int.
+// NOTE(review): -a overflows for the most negative int, as in any plain
+// negation-based abs; callers presumably never pass it - confirm.
+inline int Mabs(int a)
+{
+  return (a<0) ? -a : a;
+}
+
+template <class VALTYPE>
+class amodel
+{
+ public:
+  Array4<VALTYPE> a;        // table storage, addressed as (aj, j, l, m)
+  bool is_distortion ;      // true: distortion (d) table, false: alignment (a) table
+  WordIndex MaxSentLength;  // l and m must stay below this (see the masserts)
+  bool ignoreL, ignoreM;    // not read anywhere in this class; start out false
+  // Raw (unsmoothed) lookup; compact mode pins one length index to MaxSentLength.
+  VALTYPE get(WordIndex aj, WordIndex j, WordIndex l, WordIndex m)const
+  {
+    massert( (!is_distortion) || aj<=m );massert( (!is_distortion) || j<=l );massert( (!is_distortion) || aj!=0 );
+    massert( is_distortion || aj<=l );massert( is_distortion || j<=m );massert( (is_distortion) || j!=0 );
+    massert( l<MaxSentLength );massert( m<MaxSentLength );
+    return a.get(aj, j, (CompactADTable&&is_distortion)?MaxSentLength:(l+1),(CompactADTable&&!is_distortion)?MaxSentLength:(m+1));
+  }
+  static float smooth_factor;
+  amodel(bool flag)
+    : a(MAX_SENTENCE_LENGTH+1,0.0), is_distortion(flag), MaxSentLength(MAX_SENTENCE_LENGTH),
+      ignoreL(false), ignoreM(false) {}
+  // Writable reference to an entry, indexed exactly like get().
+  VALTYPE&getRef(WordIndex aj, WordIndex j, WordIndex l, WordIndex m)
+  {
+    massert( (!is_distortion) || aj<=m );massert( (!is_distortion) || j<=l );
+    massert( is_distortion || aj<=l );massert( is_distortion || j<=m );massert( (is_distortion) || j!=0 );
+    massert( l<MaxSentLength );massert( m<MaxSentLength );
+    return a(aj, j, (CompactADTable&&is_distortion)?MaxSentLength:(l+1),(CompactADTable&&!is_distortion)?MaxSentLength:(m+1));
+  }
+  void setValue(WordIndex aj, WordIndex j, WordIndex l, WordIndex m, VALTYPE val) { getRef(aj, j, l, m)=val; }
+  // Smoothed lookup: the stored value is interpolated with a constant
+  // (weight amodel_smooth_factor) and never drops below PROB_SMOOTH.
+  VALTYPE getValue(WordIndex aj, WordIndex j, WordIndex l, WordIndex m) const
+  {
+    if( is_distortion==0 )
+      return max(double(PROB_SMOOTH),amodel_smooth_factor/(l+1)+(1.0-amodel_smooth_factor)*get(aj, j, l, m));
+    else
+      return max(double(PROB_SMOOTH),amodel_smooth_factor/m+(1.0-amodel_smooth_factor)*get(aj, j, l, m));
+  }
+  void printTable(const char* filename)const ;
+  // Normalizes this table into aTable: each entry is divided by its row total
+  // (rows with zero total are skipped); the d branch also interpolates with
+  // amodel_smooth_factor/M.  With NoEmptyWord set, i==0 entries become 0.
+  template<class COUNT>
+  void normalize(amodel<COUNT>& aTable)const
+  {
+    WordIndex i, j, l, m ;
+    COUNT total;
+    int nParam=0;
+    for(l=0;l<MaxSentLength;l++)
+      for(m=0;m<MaxSentLength;m++)
+        {
+          if( CompactADTable && l!=m )
+            continue;
+          unsigned int L=((CompactADTable&&is_distortion)?MaxSentLength:(l+1))-1;
+          unsigned int M=((CompactADTable&&!is_distortion)?MaxSentLength:(m+1))-1;
+          if( is_distortion==0 )
+            for(j=1;j<=M; j++)
+              {
+                total=0.0;
+                for(i=0;i<=L;i++)
+                  total+=get(i, j, L, M);
+                if( total )
+                  for(i=0;i<=L;i++)
+                    {
+                      nParam++;
+                      aTable.getRef(i, j, L, M)=get(i, j, L, M)/total;
+                      massert(aTable.getRef(i,j,L,M)<=1.0);
+                      if( NoEmptyWord&&i==0 )
+                        aTable.getRef(i,j,L,M)=0;
+                    }
+              }
+          else
+            for(i=0;i<=L;i++)
+              {
+                total=0.0;
+                for(j=1;j<=M;j++)
+                  total+=get(j, i, L, M);
+                if( total )
+                  for(j=1;j<=M;j++)
+                    {
+                      aTable.getRef(j, i, L, M)=amodel_smooth_factor/M+(1.0-amodel_smooth_factor)*get(j, i, L, M)/total;
+                      nParam++;
+                      massert(aTable.getRef(j,i,L,M)<=1.0);
+                      if( NoEmptyWord&&i==0 )
+                        aTable.getRef(j,i,L,M)=0;
+                    }
+              }
+        }
+    cout << "A/D table contains " << nParam << " parameters.\n";
+  }
+  void readTable(const char *filename);
+  void clear() {a.clear();}
+};
+
+/* ------------------- End of amodel Class Definitions ----------------------*/
+
+#endif
diff --git a/GIZA++-v2/AlignTables.cpp b/GIZA++-v2/AlignTables.cpp
new file mode 100644
index 0000000..de75107
--- /dev/null
+++ b/GIZA++-v2/AlignTables.cpp
@@ -0,0 +1,44 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "AlignTables.h"
+
+// Stores aj -> val.  Returns true only if val is positive and aj was not
+// already present; an existing entry is never overwritten.
+bool alignmodel::insert(Vector<WordIndex>& aj, LogProb val)
+{
+  if(val <= 0)
+    return false ;
+  // insert() keeps an existing key and reports novelty in .second.
+  return a.insert(pair<const Vector<WordIndex>, LogProb>(aj, val)).second;
+}
+
+
+// Returns the probability stored for `align`.  An alignment that is not
+// in the table yields a zero LogProb; the table itself is never modified
+// by a lookup.
+LogProb alignmodel::getValue(Vector<WordIndex>& align) const
+{
+  typedef hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment > AlignHash;
+  const AlignHash::const_iterator hit = a.find(align);
+  const LogProb zero = 0.0 ;
+  return (hit == a.end()) ? zero : (*hit).second;
+}
diff --git a/GIZA++-v2/AlignTables.h b/GIZA++-v2/AlignTables.h
new file mode 100644
index 0000000..773b172
--- /dev/null
+++ b/GIZA++-v2/AlignTables.h
@@ -0,0 +1,124 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _aligntables_h
+#define _aligntables_h 1
+
+#include "defs.h"
+
+
+#include <cassert>
+
+#include <iostream>
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <set>
+//#include <vector>
+#include "Vector.h"
+#include <utility>
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+#else
+#include <hash_map>
+#endif
+#include <math.h>
+#include <fstream>
+#include "transpair_model1.h"
+
+
+/* ----------------- Class Definitions for hashmyalignment -------------------
+ Objective: This class is used to define a hash mapping function to map
+ an alignment (defined as a vector of integers) into a hash key
+ ----------------------------------------------------------------------------*/
+
+class hashmyalignment : public unary_function< Vector<WordIndex>, size_t >
+{
+public:
+  // Maps an alignment (a vector of positions) to a hash key in
+  // [0, 1000000).  Position j contributes key[j]^((j % 6)+1); index 0 is
+  // skipped, matching equal_to_myalignment.  The power is computed in
+  // size_t arithmetic (wrapping on overflow) because converting an
+  // out-of-range pow() result to int is undefined behaviour.
+  size_t operator() (const Vector<WordIndex>& key) const
+  {
+    size_t key_sum = 0 ;
+    for (WordIndex j = 1 ; j < key.size() ; j++){
+      const size_t base = key[j] ;
+      size_t term = 1 ;
+      for (WordIndex e = 0 ; e <= j % 6 ; e++)
+        term *= base ;
+      key_sum += term ;
+    }
+    return key_sum % 1000000 ;
+  }
+};
+
+class equal_to_myalignment{
+  // Equality predicate for alignments: true if both vectors have the same
+  // size and the same entries from index 1 on (index 0 is not compared).
+public:
+  // Taken by const reference: by-value parameters copied both vectors on every call.
+  bool operator()(const Vector<WordIndex>& t1, const Vector<WordIndex>& t2) const
+  {
+    if (t1.size() != t2.size())
+      return(false);
+    for (WordIndex j = 1 ; j < t1.size() ; j++)
+      if (t1[j] != t2[j])
+        return(false);
+    return(true);
+  }
+};
+
+/* ---------------- End of Class Definition for hashmyalignment -------------*/
+
+
+/* ------------------ Class Definitions for alignmodel ----------------------
+ Class Name: alignmodel
+ Objective: Alignment neighborhoods (collections of alignments) are stored in
+ a hash table (for easy lookup). Each alignment vector is mapped into a hash
+ key using the operator defined above.
+ *--------------------------------------------------------------------------*/
+
+class alignmodel{
+  // Hash table from alignment vectors to their LogProb values.
+private:
+  typedef hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment > table_type;
+  table_type a;   // alignment -> probability
+  // void erase(Vector<WordIndex>&);
+public:
+  // Iteration over the stored (alignment, probability) pairs.
+  inline table_type::iterator begin(void){return a.begin();}   // beginning of hash
+  inline table_type::iterator end(void){return a.end();}       // end of hash
+  inline const table_type& getHash() const {return a;}         // reference to hash table
+  // Adds an alignment (defined in AlignTables.cpp).
+  bool insert(Vector<WordIndex>&, LogProb val=0.0);
+  // void setValue(Vector<WordIndex>&, LogProb val); // not needed
+  // Retrieves the probability stored for an alignment.
+  LogProb getValue(Vector<WordIndex>&)const;
+  inline void clear(void){ a.clear();}                         // clear hash table
+  // void printTable(const char* filename);
+  inline void resize(WordIndex n) {a.resize(n);}               // resize table
+};
+
+/* -------------- End of alignmodel Class Definitions -----------------------*/
+#endif
diff --git a/GIZA++-v2/Array.h b/GIZA++-v2/Array.h
new file mode 100644
index 0000000..eae58d4
--- /dev/null
+++ b/GIZA++-v2/Array.h
@@ -0,0 +1,5 @@
+#ifndef GIZA_ARRAY_H_DEFINED
+#define GIZA_ARRAY_H_DEFINED
+#include "Vector.h"
+#define Array Vector
+#endif
diff --git a/GIZA++-v2/Array2.h b/GIZA++-v2/Array2.h
new file mode 100644
index 0000000..dbb5194
--- /dev/null
+++ b/GIZA++-v2/Array2.h
@@ -0,0 +1,111 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/*--
+Array2: Implementation of a two-dimensional checked array allowing for
+a specified underlying one-dimensional data-structure.
+
+Franz Josef Och (30/07/99)
+--*/
+#ifndef CLASS_Array2_DEFINED
+#define CLASS_Array2_DEFINED
+
+#include "mystl.h"
+#include <string>
+#include <vector>
+
+template<class T, class Y=vector<T> > class Array2
+{
+private:
+  Y p;
+  // short h1, h2;
+  unsigned int h1, h2;
+public:
+  Array2(unsigned int _h1, unsigned int _h2)
+    : p(_h1*_h2), h1(_h1), h2(_h2) {}
+  Array2(unsigned int _h1, unsigned int _h2, const T&_init)
+    : p(_h1*_h2, _init), h1(_h1), h2(_h2) {}
+  Array2()
+    : h1(0), h2(0) {}
+  // Element access, bounds-checked with assert; element (i,j) is p[i*h2+j].
+  inline T &operator()(unsigned int i, unsigned int j)
+  { assert(i<h1);assert(j<h2);return p[i*h2+j]; }
+  inline const T&operator()(unsigned int i, unsigned int j) const
+  { assert(i<h1);assert(j<h2);return p[i*h2+j]; }
+  inline T get(unsigned int i, unsigned int j)
+  { assert(i<h1);assert(j<h2);return p[i*h2+j]; }
+  inline void set(unsigned int i, unsigned int j, T x)
+  { assert(i<h1);assert(j<h2);p[i*h2+j]=x; }
+  inline const T get(unsigned int i, unsigned int j) const
+  { assert(i<h1);assert(j<h2);return p[i*h2+j]; }
+  inline unsigned int getLen1() const
+  { return h1; }
+  inline unsigned int getLen2() const
+  { return h2; }
+  // Raw pointers into the linear storage: begin() is the first element,
+  // end() one past the last.  Both are null when the array holds no
+  // elements, so p[0] is never evaluated on an empty container.  The
+  // const overloads take the element address as well, because Y's
+  // iterators need not convert to const T*.
+  inline T*begin()
+  { return p.empty() ? 0 : &(p[0]); }
+  inline T*end()
+  { return p.empty() ? 0 : &(p[0])+p.size(); }
+  inline const T*begin()const
+  { return p.empty() ? 0 : &(p[0]); }
+  inline const T*end()const
+  { return p.empty() ? 0 : &(p[0])+p.size(); }
+  // Prints one row per line, elements separated by blanks.
+  friend ostream&operator<<(ostream&out, const Array2<T, Y>&ar)
+  {
+    for(unsigned int i=0;i<ar.getLen1();i++)
+      {
+        //out << i << ": ";
+        for(unsigned int j=0;j<ar.getLen2();j++)
+          out << ar(i, j) << ' ';
+        out << '\n';
+      }
+    return out << endl;
+  }
+  // Changes the dimensions; the linear storage is resized, not re-laid-out.
+  inline void resize(unsigned int a,unsigned int b)
+  {
+    if( !(a==h1&&b==h2))
+      {
+        h1=a;
+        h2=b;
+        p.resize(h1*h2);
+      }
+  }
+  // As above, then fills every element with t; a no-op if the size is unchanged.
+  inline void resize(unsigned int a,unsigned int b,const T&t)
+  {
+    if( !(a==h1&&b==h2))
+      {
+        h1=a;
+        h2=b;
+        p.resize(h1*h2);
+        fill(p.begin(),p.end(),t);
+      }
+  }
+};
+
+#endif
diff --git a/GIZA++-v2/Array4.h b/GIZA++-v2/Array4.h
new file mode 100644
index 0000000..4e57a2e
--- /dev/null
+++ b/GIZA++-v2/Array4.h
@@ -0,0 +1,78 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef AlignmentArray4_h_DEFINED
+#define AlignmentArray4_h_DEFINED
+
+#include "Array2.h"
+template<class T> class Array4
+{
+  // Sparse 4-dimensional array: A(l,m) owns a lazily allocated square
+  // Array2 slice; reads of slices that were never written yield `init`.
+  // NOTE(review): the slices are owned raw pointers but copying is not
+  // handled, so a copied Array4 would delete them twice - do not copy.
+ private:
+  Array2< Array2<T>* > A;
+  int M;
+  T init;
+ public:
+  Array4(int m,const T&_init)
+    : A(m,m,0),M(m),init(_init) {}
+  ~Array4()
+  {
+    for(int l=0;l<M;++l)
+      for(int m=0;m<M;++m)
+        delete A(l,m);
+  }
+  const T&operator()(int i, int j, int l, int m)const
+  {
+    if( A(l,m)==0 )
+      return init;
+    else
+      return (*A(l,m))(i,j);
+  }
+  const T&get(int i, int j, int l, int m)const
+  { return (*this)(i,j,l,m); }
+  // Writable access; allocates the (l,m) slice on first use.
+  T&operator()(int i, int j, int l, int m)
+  {
+    if( A(l,m)==0 )
+      A(l,m)=new Array2<T>(max(l+1,m+1),max(l+1,m+1),init);
+    return (*A(l,m))(i,j);
+  }
+  // Zeroes every element of every allocated slice.  Slices are max(l+1,m+1)
+  // square, so entries can lie outside the (l+1) x (m+1) corner.
+  void clear()
+  {
+    for(int l=0;l<M;++l)
+      for(int m=0;m<M;++m)
+        if( A(l,m) )
+          {
+            Array2<T>&a=*A(l,m);
+            for(unsigned int i=0;i<a.getLen1();++i)
+              for(unsigned int j=0;j<a.getLen2();++j)
+                a(i,j)=0.0;
+          }
+  }
+};
+
+#endif
diff --git a/GIZA++-v2/D4Tables.h b/GIZA++-v2/D4Tables.h
new file mode 100644
index 0000000..a6d1335
--- /dev/null
+++ b/GIZA++-v2/D4Tables.h
@@ -0,0 +1,460 @@
+/*
+
+Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _d4tables_h_define
+#define _d4tables_h_define
+#include <math.h>
+#include "WordClasses.h"
+#include "Globals.h"
+#include "myassert.h"
+
+extern float d4modelsmooth_factor;
+
+// Lookup key for the Model 4 / Model 5 distortion tables.  Which of the
+// fields actually take part in ordering is decided by the comparator
+// (compare1 / compareb1) together with the dependency bit mask `deps`.
+class m4_key
+{
+ public:
+  int deps;        // dependency bit mask (DEP_MODEL_* / DEP_MODELb_* flags)
+  int l;
+  int m;
+  int F;           // class id, rendered through the `wcf` WordClasses when printed
+  int E;           // class id, rendered through the `wce` WordClasses when printed
+  int prevj;
+  int vacancies1;  // -1 when unused
+  int vacancies2;  // -1 when unused
+  m4_key(int _deps,int _l,int _m,int _F,int _E,int _prevj,int _v1,int _v2)
+    : deps(_deps),l(_l),m(_m),F(_F),E(_E),prevj(_prevj),vacancies1(_v1),vacancies2(_v2) {}
+  // Writes the active head-of-cept key fields as "name: value " pairs
+  // followed by a newline.
+  friend ostream&print1(ostream&out,const m4_key&x,const WordClasses&wce,const WordClasses&wcf)
+  {
+    if(x.deps&DEP_MODEL_l)
+      out << "l: " << x.l<<' ';
+    if(x.deps&DEP_MODEL_m)
+      out << "m: " << x.m<<' ';
+    if(x.deps&DEP_MODEL_F)
+      out << "F: " << wcf.classString(x.F)<< ' ';
+    if(x.deps&DEP_MODEL_E)
+      out << "E: " << wce.classString(x.E)<< ' ';
+    if(x.vacancies1!=-1)
+      out << "v1: " << x.vacancies1 << ' ';
+    if(x.vacancies2!=-1)
+      out << "v2: " << x.vacancies2 << ' ';
+    out << '\n';
+    return out;
+  }
+  // Writes the key as the four-column prefix "E F v1 v2 " (no newline);
+  // an inactive class dependency is written as "0".
+  friend ostream&print1_m5(ostream&out,const m4_key&x,const WordClasses&wce,const WordClasses&wcf)
+  {
+    out << ((x.deps&DEP_MODEL_E)?wce.classString(x.E):string("0"))<< ' ';
+    out << ((x.deps&DEP_MODEL_F)?wcf.classString(x.F):string("0"))<< ' ';
+    out << x.vacancies1 << ' ' << x.vacancies2 << ' ';
+    return out;
+  }
+  // Same layout as print1, but tests the DEP_MODELb_* flags.
+  friend ostream&printb1(ostream&out,const m4_key&x,const WordClasses&wce,const WordClasses&wcf)
+  {
+    if(x.deps&DEP_MODELb_l)
+      out << "l: " << x.l<<' ';
+    if(x.deps&DEP_MODELb_m)
+      out << "m: " << x.m<<' ';
+    if(x.deps&DEP_MODELb_F)
+      out << "F: " << wcf.classString(x.F) << ' ';
+    if(x.deps&DEP_MODELb_E)
+      out << "E: " << wce.classString(x.E) << ' ';
+    if(x.vacancies1!=-1)
+      out << "v1: " << x.vacancies1 << ' ';
+    if(x.vacancies2!=-1)
+      out << "v2: " << x.vacancies2 << ' ';
+    out << '\n';
+    return out;
+  }
+  // Writes the key as "-1 F v1 v2 " (no newline); the leading "-1" takes
+  // the place of the E column.
+  // NOTE(review): this tests DEP_MODEL_F while printb1 tests DEP_MODELb_F
+  // -- confirm whether that is intended.
+  friend ostream&printb1_m5(ostream&out,const m4_key&x,const WordClasses&wcf)
+  {
+    out << "-1 " << ((x.deps&DEP_MODEL_F)?wcf.classString(x.F):string("0"))<< ' ';
+    out << x.vacancies1 << ' ' << x.vacancies2 << ' ';
+    return out;
+  }
+};
+
+// Strict weak ordering on m4_key for the head-of-cept table.  Fields l, m,
+// F and E are compared (in that order) only when the matching DEP_MODEL_*
+// bit is set; vacancies1 and vacancies2 are always compared last.  prevj
+// never takes part in the ordering.
+class compare1
+{
+ private:
+  int deps;
+ public:
+  compare1(int _deps) : deps(_deps) {}
+  bool operator()(const m4_key&a,const m4_key&b)const
+  {
+    if( (deps&DEP_MODEL_l) && a.l!=b.l )
+      return a.l<b.l;
+    if( (deps&DEP_MODEL_m) && a.m!=b.m )
+      return a.m<b.m;
+    if( (deps&DEP_MODEL_F) && a.F!=b.F )
+      return a.F<b.F;
+    if( (deps&DEP_MODEL_E) && a.E!=b.E )
+      return a.E<b.E;
+    if( a.vacancies1!=b.vacancies1 )
+      return a.vacancies1<b.vacancies1;
+    return a.vacancies2<b.vacancies2;
+  }
+};
+
+// Strict weak ordering on m4_key for the non-head table.  Works like
+// compare1 but consults the DEP_MODELb_* bits; vacancies1 and vacancies2
+// are always compared last and prevj is ignored.
+class compareb1
+{
+ private:
+  int deps;
+ public:
+  compareb1(int _deps) : deps(_deps) {}
+  bool operator()(const m4_key&a,const m4_key&b)const
+  {
+    if( (deps&DEP_MODELb_l) && a.l!=b.l )
+      return a.l<b.l;
+    if( (deps&DEP_MODELb_m) && a.m!=b.m )
+      return a.m<b.m;
+    if( (deps&DEP_MODELb_F) && a.F!=b.F )
+      return a.F<b.F;
+    if( (deps&DEP_MODELb_E) && a.E!=b.E )
+      return a.E<b.E;
+    if( a.vacancies1!=b.vacancies1 )
+      return a.vacancies1<b.vacancies1;
+    return a.vacancies2<b.vacancies2;
+  }
+};
+
+// Appends the whitespace-separated tokens of `in` to `out`, in order.
+// `out` is not cleared first.
+inline void tokenize(const string&in,Vector<string>&out)
+{
+  istrstream words(in.c_str());
+  for(string token; words>>token; )
+    out.push_back(token);
+}
+
+class d4model
+{
+ public:
+ typedef Vector<pair<COUNT,PROB> > Vpff;
+ map<m4_key,Vpff,compare1 > D1;
+ map<m4_key,Vpff,compareb1> Db1;
+ PositionIndex msl;
+ WordClasses ewordclasses;
+ WordClasses fwordclasses;
+ // Loads the two word-class tables from `efile` and `ffile`.  A file
+ // that cannot be opened is reported on cerr and its table is left
+ // untouched; the other file is still processed.
+ template<class MAPPER>
+ void makeWordClasses(const MAPPER&m1,const MAPPER&m2,string efile,string ffile)
+ {
+   ifstream estrm(efile.c_str()),fstrm(ffile.c_str());
+   if( estrm )
+     ewordclasses.read(estrm,m1);
+   else
+   {
+     cerr << "ERROR: can not read " << efile << endl;
+   }
+   if( fstrm )
+     fwordclasses.read(fstrm,m2);
+   else
+     cerr << "ERROR: can not read " << ffile << endl;
+ }
+ // Creates empty tables whose key ordering follows M4_Dependencies;
+ // `_msl` fixes the row width (2*msl+1 entries per key).
+ d4model(PositionIndex _msl)
+   : D1(compare1(M4_Dependencies)),
+     Db1(compareb1(M4_Dependencies)),
+     msl(_msl)
+ {
+ }
+ // Returns a mutable reference to the head-of-cept count for displacement
+ // j-j_cp under the key built from (l,m,F,E).  A zero-filled row of
+ // 2*msl+1 entries is inserted when the key is new.
+ COUNT&getCountRef_first(WordIndex j,WordIndex j_cp,int E,int F,int l,int m)
+ {
+   assert(j>=1);
+   const m4_key key(M4_Dependencies,l,m,F,E,j_cp,-1,-1);
+   map<m4_key,Vpff,compare1 >::iterator entry=D1.find(key);
+   if(entry==D1.end())
+   {
+     const Vpff zeroRow(msl*2+1,pair<COUNT,PROB>(0.0,0.0));
+     entry=D1.insert(make_pair(key,zeroRow)).first;
+   }
+   assert(entry!=D1.end());
+   Vpff&row=entry->second;
+   return row[j-j_cp+msl].first;
+ }
+ // Returns a mutable reference to the non-head count for displacement
+ // j-j_prev under the key built from (l,m,F,E).  A zero-filled row of
+ // 2*msl+1 entries is inserted when the key is new.
+ COUNT&getCountRef_bigger(WordIndex j,WordIndex j_prev,int E,int F,int l,int m)
+ {
+   assert(j>=1);
+   assert(j_prev>=1);
+   const m4_key key(M4_Dependencies,l,m,F,E,j_prev,-1,-1);
+   map<m4_key,Vpff,compareb1 >::iterator entry=Db1.find(key);
+   if(entry==Db1.end())
+   {
+     const Vpff zeroRow(msl*2+1,pair<COUNT,PROB>(0.0,0.0));
+     entry=Db1.insert(make_pair(key,zeroRow)).first;
+   }
+   assert(entry!=Db1.end());
+   Vpff&row=entry->second;
+   return row[j-j_prev+msl].first;
+ }
+ // Looks up the head-of-cept row for (l,m,F,E) once, so that the result
+ // can be reused by getProb_first_withiterator.  Yields D1.end() when the
+ // key is absent.
+ map<m4_key,Vpff,compare1 >::const_iterator getProb_first_iterator(int E,int F,int l,int m)const
+ {
+   const m4_key key(M4_Dependencies,l,m,F,E,0,-1,-1);
+   return D1.find(key);
+ }
+ // Head-of-cept probability for displacement j-j_cp, read through a row
+ // iterator obtained from getProb_first_iterator.  An absent row yields
+ // PROB_SMOOTH; otherwise the stored probability is interpolated with
+ // d4modelsmooth_factor/(2*m-1) and floored at PROB_SMOOTH.
+ PROB getProb_first_withiterator(WordIndex j,WordIndex j_cp,int m,const map<m4_key,Vpff,compare1 >::const_iterator& p)const
+ {
+   assert(j>=1);//assert(j_cp>=0);
+   assert(j<=msl);assert(j_cp<=msl);
+   if(p!=D1.end())
+   {
+     const Vpff&row=p->second;
+     massert(row[j-j_cp+msl].second<=1.0);
+     return max(PROB_SMOOTH,d4modelsmooth_factor/(2*m-1)+(1-d4modelsmooth_factor)*row[j-j_cp+msl].second);
+   }
+   return PROB_SMOOTH;
+ }
+ // Head-of-cept probability for displacement j-j_cp under (l,m,F,E).
+ // Performs the key lookup and leaves the smoothing to
+ // getProb_first_withiterator, which applies the same formula.
+ PROB getProb_first(WordIndex j,WordIndex j_cp,int E,int F,int l,int m)const
+ {
+   assert(j>=1);//assert(j_cp>=0);
+   assert(j<=msl);assert(j_cp<=msl);
+   const m4_key key(M4_Dependencies,l,m,F,E,j_cp,-1,-1);
+   const map<m4_key,Vpff,compare1 >::const_iterator row=D1.find(key);
+   return getProb_first_withiterator(j,j_cp,m,row);
+ }
+ // Looks up the non-head row for (l,m,F,E) once, so that the result can
+ // be reused by getProb_bigger_withiterator.  Yields Db1.end() when the
+ // key is absent.
+ map<m4_key,Vpff,compareb1 >::const_iterator getProb_bigger_iterator(int E,int F,int l,int m)const
+ {
+   const m4_key key(M4_Dependencies,l,m,F,E,0,-1,-1);
+   return Db1.find(key);
+ }
+ // Non-head probability for displacement j-j_prev, read through a row
+ // iterator obtained from getProb_bigger_iterator.  An absent row yields
+ // PROB_SMOOTH; otherwise the stored probability is interpolated with
+ // d4modelsmooth_factor/(m-1) and floored at PROB_SMOOTH.
+ PROB getProb_bigger_withiterator(WordIndex j,WordIndex j_prev,int m,const map<m4_key,Vpff,compareb1 >::const_iterator&p)const
+ {
+   massert(j>=1);massert(j_prev>=1);
+   massert(j>j_prev);
+   massert(j<=msl);massert(j_prev<=msl);
+   if(p!=Db1.end())
+   {
+     const Vpff&row=p->second;
+     massert(row[j-j_prev+msl].second<=1.0 );
+     return max(PROB_SMOOTH,d4modelsmooth_factor/(m-1)+(1-d4modelsmooth_factor)*row[j-j_prev+msl].second);
+   }
+   return PROB_SMOOTH;
+ }
+
+ // Non-head probability for displacement j-j_prev under (l,m,F,E).
+ // Performs the key lookup and leaves the smoothing to
+ // getProb_bigger_withiterator, which applies the same formula.
+ PROB getProb_bigger(WordIndex j,WordIndex j_prev,int E,int F,int l,int m)const
+ {
+   massert(j>=1);massert(j_prev>=1);
+   massert(j>j_prev);
+   massert(j<=msl);massert(j_prev<=msl);
+   const m4_key key(M4_Dependencies,l,m,F,E,j_prev,-1,-1);
+   const map<m4_key,Vpff,compareb1 >::const_iterator row=Db1.find(key);
+   return getProb_bigger_withiterator(j,j_prev,m,row);
+ }
+ // Recomputes the probability (.second) of every entry as its count
+ // (.first) divided by the row total; a row whose total is zero becomes
+ // uniform.  Prints the number of entries processed to cout.
+ void normalizeTable()
+ {
+   int nParams=0;
+   for(map<m4_key,Vpff,compare1 >::iterator it=D1.begin();it!=D1.end();++it)
+   {
+     Vpff&row=it->second;
+     double total=0.0;
+     for(PositionIndex k=0;k<row.size();k++)
+       total+=row[k].first;
+     for(PositionIndex k=0;k<row.size();k++,nParams++)
+       row[k].second=total?(row[k].first/total):(1.0/row.size());
+   }
+   for(map<m4_key,Vpff,compareb1 >::iterator it=Db1.begin();it!=Db1.end();++it)
+   {
+     Vpff&row=it->second;
+     double total=0.0;
+     for(PositionIndex k=0;k<row.size();k++)
+       total+=row[k].first;
+     for(PositionIndex k=0;k<row.size();k++,nParams++)
+       row[k].second=total?(row[k].first/total):(1.0/row.size());
+   }
+   cout << "D4 table contains " << nParams << " parameters.\n";
+ }
+ // Resets every count (.first) in both tables to zero.  Keys and the
+ // stored probabilities (.second) are kept.
+ void clear()
+ {
+   for(map<m4_key,Vpff,compare1 >::iterator it=D1.begin();it!=D1.end();++it)
+   {
+     Vpff&row=it->second;
+     for(PositionIndex k=0;k<row.size();k++)
+       row[k].first=0.0;
+   }
+   for(map<m4_key,Vpff,compareb1 >::iterator it=Db1.begin();it!=Db1.end();++it)
+   {
+     Vpff&row=it->second;
+     for(PositionIndex k=0;k<row.size();k++)
+       row[k].first=0.0;
+   }
+ }
+
+ // Writes the count tables to `fname1` in the sectioned text layout
+ // (key line, "SUM:" line, one "displacement count" line per non-zero
+ // count, blank line), ending with a "FULL-SUM:" line.  Rows whose counts
+ // add up to zero are skipped.  When M4_Dependencies==76 the
+ // probabilities are additionally written to `fname2`, one
+ // "E F displacement prob" line per non-zero count (E is -1 for the
+ // non-head table).
+ void printProbTable(const char*fname1,const char*fname2)
+ {
+   ofstream out(fname1);
+   double grandTotal=0.0;
+   out << "# Translation tables for Model 4 .\n";
+   out << "# Table for head of cept.\n";
+   for(map<m4_key,Vpff,compare1 >::const_iterator it=D1.begin();it!=D1.end();++it)
+   {
+     const Vpff&row=it->second;
+     double rowTotal=0.0;
+     for(PositionIndex k=0;k<row.size();k++)
+       rowTotal+=row[k].first;
+     if ( rowTotal )
+     {
+       print1(out,it->first,ewordclasses,fwordclasses);
+       out << "SUM: " << rowTotal << ' '<< '\n';
+       for(unsigned k=0;k<row.size();k++)
+       {
+         if( row[k].first )
+           out << (int)(k)-(int)(msl) << ' ' << row[k].first << '\n';
+       }
+       out << endl;
+     }
+     grandTotal+=rowTotal;
+   }
+   out << "# Table for non-head of cept.\n";
+   for(map<m4_key,Vpff,compareb1 >::const_iterator it=Db1.begin();it!=Db1.end();++it)
+   {
+     const Vpff&row=it->second;
+     double rowTotal=0.0;
+     for(PositionIndex k=0;k<row.size();++k)
+       rowTotal+=row[k].first;
+     if( rowTotal )
+     {
+       printb1(out,it->first,ewordclasses,fwordclasses);
+       out << "SUM: " << rowTotal << ' '<<'\n';
+       for(unsigned k=0;k<row.size();k++)
+       {
+         if( row[k].first )
+           out << (int)(k)-(int)(msl) << ' ' << row[k].first << '\n';
+       }
+       out << endl;
+     }
+     grandTotal+=rowTotal;
+   }
+   out << endl << "FULL-SUM: " << grandTotal << endl;
+   if( M4_Dependencies!=76 )
+     return;
+   ofstream out2(fname2);
+   for(map<m4_key,Vpff,compare1 >::const_iterator it=D1.begin();it!=D1.end();++it)
+   {
+     const Vpff&row=it->second;
+     for(unsigned k=0;k<row.size();k++)
+     {
+       if( row[k].first )
+         out2 << ewordclasses.classString(it->first.E) << ' ' << fwordclasses.classString(it->first.F) << ' ' << (int)(k)-(int)(msl) << ' ' << row[k].second << '\n';
+     }
+   }
+   for(map<m4_key,Vpff,compareb1 >::const_iterator it=Db1.begin();it!=Db1.end();++it)
+   {
+     const Vpff&row=it->second;
+     for(unsigned k=0;k<row.size();k++)
+     {
+       if( row[k].first )
+         out2 << -1 << ' ' << fwordclasses.classString(it->first.F) << ' ' << (int)(k)-(int)(msl) << ' ' << row[k].second << '\n';
+     }
+   }
+ }
+ // Reads the count tables from a file in the layout produced by
+ // printProbTable: a head-of-cept section followed by a non-head section,
+ // each introduced by '#' comment lines.  Every record is a key line
+ // ("l: .. m: .. F: .. E: .."), a "SUM: total" line, then
+ // "displacement count" lines up to a blank line.  Each stored entry is
+ // (count, count/total).
+ //
+ // Returns 0 when the file cannot be opened, 1 otherwise.
+ //
+ // Differences from the previous implementation:
+ //  - a missing or truncated file no longer spins forever in the
+ //    blank-line skipping loops; every loop now also tests the stream;
+ //  - key lines with an odd number of tokens no longer read one token
+ //    past the end of the token list;
+ //  - the trailing "FULL-SUM:" line ends parsing instead of falling
+ //    through to a failed "SUM:" read and a spurious error message;
+ //  - displacements outside [-msl,msl] are reported and skipped instead
+ //    of indexing outside the row.
+ bool readProbTable(const char *fname)
+ {
+   cerr << "Reading D4Tables from " << fname << endl;
+   ifstream file(fname);
+   if( !file )
+   {
+     cerr << "ERROR: can not read " << fname << endl;
+     return 0;
+   }
+   const int rowSize=(int)(msl*2+1);
+   string line;
+   // Skip the '#' header lines of the head-of-cept section.
+   do
+   {
+     getline(file,line);
+   } while(file&&line.length()&&line[0]=='#');
+
+   // ---- head-of-cept section ----
+   do
+   {
+     while(file&&line.length()==0)
+       getline(file,line);
+     // Stop at end of input or at the '#' header of the next section.
+     if( !file||line.length()==0||line[0]=='#')
+       break;
+     Vector<string> linestr;
+     tokenize(line,linestr);
+     m4_key k(M4_Dependencies,0,0,0,0,0,-1,-1);
+     // Tokens come in "name: value" pairs; a dangling last token is ignored.
+     for(unsigned int i=0;i+1<linestr.size();i+=2)
+     {
+       if( linestr[i]=="l:" ){k.l=atoi(linestr[i+1].c_str());iassert(M4_Dependencies&DEP_MODEL_l);}
+       if( linestr[i]=="m:" ){k.m=atoi(linestr[i+1].c_str());iassert(M4_Dependencies&DEP_MODEL_m);}
+       if( linestr[i]=="F:" ){k.F=fwordclasses(linestr[i+1]);iassert(M4_Dependencies&DEP_MODEL_F);}
+       if( linestr[i]=="E:" ){k.E=ewordclasses(linestr[i+1]);iassert(M4_Dependencies&DEP_MODEL_E);}
+     }
+     string str;
+     double sum=0.0;
+     file >> str >> sum;
+     iassert(str=="SUM:");
+     if( str!="SUM:")
+       cerr << "ERROR: string is " << str << " and not sum " << endl;
+
+     // The first getline consumes the remainder of the "SUM:" line.
+     do
+     {
+       int value;
+       double count;
+       getline(file,line);
+       istrstream twonumbers(line.c_str());
+       if(twonumbers >> value >> count)
+       {
+         const int idx=value+(int)msl;
+         if( idx<0||idx>=rowSize )
+           cerr << "ERROR: displacement " << value << " out of range in d4 table." << endl;
+         else
+         {
+           map<m4_key,Vpff,compare1 >::iterator p=D1.find(k);
+           if( p==D1.end() )
+             p=D1.insert(make_pair(k,Vpff(msl*2+1,pair<COUNT,PROB>(0.0,0.0)))).first;
+           (p->second)[idx]=make_pair(count,count/sum);
+         }
+       }
+     }while(file&&line.length());
+   }while(file);
+
+   // Skip the '#' header lines of the non-head section.
+   do
+   {
+     getline(file,line);
+   } while(file&&line.length()&&line[0]=='#');
+
+   // ---- non-head section ----
+   do
+   {
+     while(file&&line.length()==0)
+       getline(file,line);
+     if( !file||line.length()==0||line[0]=='#')
+       break;
+     Vector<string> linestr;
+     tokenize(line,linestr);
+     m4_key k(M4_Dependencies,0,0,0,0,0,-1,-1);
+     bool sumRead=0;
+     bool fullSumRead=0;
+     for(unsigned int i=0;i+1<linestr.size();i+=2)
+     {
+       if( linestr[i]=="l:" ){k.l=atoi(linestr[i+1].c_str());iassert(M4_Dependencies&DEP_MODELb_l);}
+       else if( linestr[i]=="m:" ){k.m=atoi(linestr[i+1].c_str());iassert(M4_Dependencies&DEP_MODELb_m);}
+       else if( linestr[i]=="F:" ){k.F=fwordclasses(linestr[i+1]);iassert(M4_Dependencies&DEP_MODELb_F);}
+       else if( linestr[i]=="E:" ){k.E=ewordclasses(linestr[i+1]);iassert(M4_Dependencies&DEP_MODELb_E);}
+       else if( linestr[i]=="SUM:" )
+       {
+         // The key line was empty, so the "SUM:" line was read in its place.
+         cerr << "Warning: obviously no dependency.\n";
+         sumRead=1;
+       }
+       else if( linestr[i]=="FULL-SUM:" )
+       {
+         fullSumRead=1;
+         break;
+       }
+       else
+       {
+         cerr << "ERROR: error in reading d4 tables: " << linestr[i] << ' ' << linestr[i+1] << endl;
+       }
+     }
+     // "FULL-SUM:" is the trailer written after the last record.
+     if( fullSumRead )
+       break;
+     string str;
+     double sum=0.0;
+     if( sumRead==0 )
+       file >> str >> sum;
+     else
+     {
+       str=linestr[0];
+       sum=atof(linestr[1].c_str());
+     }
+     if( str!="SUM:" )
+       cerr << "ERROR: should read SUM but read " << str << endl;
+     do
+     {
+       int value;
+       double count;
+       getline(file,line);
+       istrstream twonumbers(line.c_str());
+       if(twonumbers >> value >> count)
+       {
+         const int idx=value+(int)msl;
+         if( idx<0||idx>=rowSize )
+           cerr << "ERROR: displacement " << value << " out of range in d4 table." << endl;
+         else
+         {
+           map<m4_key,Vpff,compareb1 >::iterator p=Db1.find(k);
+           if( p==Db1.end() )
+             p=Db1.insert(make_pair(k,Vpff(msl*2+1,pair<COUNT,PROB>(0.0,0.0)))).first;
+           (p->second)[idx]=make_pair(count,count/sum);
+         }
+       }
+     }while(file&&line.length());
+   }while(file);
+   return 1;
+ }
+};
+
+#endif
diff --git a/GIZA++-v2/D5Tables.h b/GIZA++-v2/D5Tables.h
new file mode 100644
index 0000000..c662795
--- /dev/null
+++ b/GIZA++-v2/D5Tables.h
@@ -0,0 +1,235 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _d5tables_h_define
+#define _d5tables_h_define
+#include <math.h>
+#include "D4Tables.h"
+
+extern float d5modelsmooth_countoffset;
+extern float d5modelsmooth_factor;
+
+#define UNSEENPROB (1.0/vacancies_total)
+
+class d5model
+{
+ private:
+ typedef Vector < pair < COUNT,PROB > >Vpff;
+ map< m4_key,Vpff,compare1 > D1;
+ map< m4_key,Vpff,compareb1 > Db1;
+ public:
+ d4model&d4m;
+ WordClasses ewordclasses,fwordclasses;
+ // Loads the two word-class tables from `efile` and `ffile`.  A file
+ // that cannot be opened is reported on cerr and its table is left
+ // untouched; the other file is still processed.
+ template<class MAPPER>
+ void makeWordClasses(const MAPPER&m1,const MAPPER&m2,string efile,string ffile)
+ {
+   ifstream estrm(efile.c_str()),fstrm(ffile.c_str());
+   if( estrm )
+     ewordclasses.read(estrm,m1);
+   else
+     cerr << "ERROR: can not read classes from " << efile << endl;
+   if( fstrm )
+     fwordclasses.read(fstrm,m2);
+   else
+     cerr << "ERROR: can not read classes from " << ffile << endl;
+ }
+ // Creates empty tables whose key ordering follows M5_Dependencies and
+ // keeps a reference to the given Model 4 table.
+ d5model (d4model&_d4m)
+   : D1 (compare1(M5_Dependencies)),
+     Db1 (compareb1(M5_Dependencies)),
+     d4m(_d4m)
+ {
+ }
+ // Returns a mutable reference to the head-of-cept count at index
+ // vacancies_j under the key (l,m,F,vacancies_jp,vacancies_total).  A new
+ // key gets a row of vacancies_total+1 entries with count 0 and
+ // probability UNSEENPROB.
+ COUNT &getCountRef_first (PositionIndex vacancies_j,
+                           PositionIndex vacancies_jp, int F,
+                           PositionIndex l, PositionIndex m,
+                           PositionIndex vacancies_total)
+ {
+   massert(vacancies_j>0);
+   massert(vacancies_total>0);
+   //massert(vacancies_jp<=vacancies_total);
+   massert(vacancies_j <=vacancies_total);
+   massert(vacancies_total<=m);
+   m4_key key(M5_Dependencies,l,m,F,0,0,vacancies_jp,vacancies_total);
+   map<m4_key,Vpff,compare1 >::iterator entry=D1.find(key);
+   if(entry==D1.end())
+   {
+     // !!! constrain length
+     entry=D1.insert(make_pair(key,Vpff(vacancies_total+1,make_pair(0,UNSEENPROB)))).first;
+   }
+   massert(entry!=D1.end());
+   Vpff&row=entry->second;
+   return row[vacancies_j].first;
+ }
+ // Returns a mutable reference to the non-head count at index
+ // vacancies_j-vacancies_jp under the key (l,m,F,-1,vacancies_total).  A
+ // new key gets a row of vacancies_total+1 entries with count 0 and
+ // probability UNSEENPROB.
+ COUNT &getCountRef_bigger (PositionIndex vacancies_j,
+                            PositionIndex vacancies_jp, int F,
+                            PositionIndex l, PositionIndex m,
+                            PositionIndex vacancies_total)
+ {
+   massert(vacancies_j>0);
+   massert(vacancies_total>0);
+   massert (vacancies_jp <= vacancies_j);
+   massert (vacancies_j-vacancies_jp <= vacancies_total);
+   m4_key key(M5_Dependencies,l,m,F,0,0,-1,vacancies_total);
+   map<m4_key,Vpff,compareb1 >::iterator entry=Db1.find(key);
+   if(entry==Db1.end())
+   {
+     // !!! constrain length
+     entry=Db1.insert(make_pair(key,Vpff(vacancies_total+1,make_pair(0,UNSEENPROB)))).first;
+   }
+   massert(entry!=Db1.end());
+   Vpff&row=entry->second;
+   return row[vacancies_j - vacancies_jp].first;
+ }
+ // Head-of-cept probability at index vacancies_j under the key
+ // (l,m,F,vacancies_jp,vacancies_total).  An absent key yields
+ // UNSEENPROB; otherwise the stored probability is interpolated with
+ // d5modelsmooth_factor/vacancies_total and floored at PROB_SMOOTH.
+ PROB getProb_first (PositionIndex vacancies_j, PositionIndex vacancies_jp,
+                     int F, PositionIndex l, PositionIndex m,
+                     PositionIndex vacancies_total) const
+ {
+   massert(vacancies_j>0);
+   massert(vacancies_total>0);
+   //massert(vacancies_jp<=vacancies_total);
+   massert(vacancies_j <=vacancies_total);
+   massert(vacancies_total<=m);
+   m4_key key(M5_Dependencies,l,m,F,0,0,vacancies_jp,vacancies_total);
+   map<m4_key,Vpff,compare1 >::const_iterator entry=D1.find(key);
+   if( entry!=D1.end() )
+   {
+     const Vpff&row=entry->second;
+     return max(PROB_SMOOTH,d5modelsmooth_factor/(vacancies_total)+(1-d5modelsmooth_factor)*row[vacancies_j].second);
+   }
+   return UNSEENPROB;
+ }
+ // Non-head probability at index vacancies_j-vacancies_jp under the key
+ // (l,m,F,-1,vacancies_total).  An absent key yields UNSEENPROB;
+ // otherwise the stored probability is interpolated with
+ // d5modelsmooth_factor/vacancies_total and floored at PROB_SMOOTH.
+ PROB getProb_bigger (PositionIndex vacancies_j, PositionIndex vacancies_jp,
+                      int F, PositionIndex l, PositionIndex m,
+                      PositionIndex vacancies_total) const
+ {
+   massert(vacancies_j>0);
+   massert(vacancies_total>0);
+   massert (vacancies_jp <= vacancies_j);
+   massert (vacancies_j-vacancies_jp <= vacancies_total);
+   m4_key key(M5_Dependencies,l,m,F,0,0,-1,vacancies_total);
+   map<m4_key,Vpff,compareb1 >::const_iterator entry=Db1.find(key);
+   if( entry!=Db1.end() )
+   {
+     const Vpff&row=entry->second;
+     return max(PROB_SMOOTH,d5modelsmooth_factor/(vacancies_total)+(1-d5modelsmooth_factor)*row[vacancies_j - vacancies_jp].second);
+   }
+   return UNSEENPROB;
+ }
+ // Recomputes the probability (.second) of every entry as
+ // (count + d5modelsmooth_countoffset) divided by the row total of those
+ // offset counts; a row whose total is zero becomes uniform.  Prints the
+ // number of entries processed to cout.
+ void normalizeTable ()
+ {
+   int nParams=0;
+   for(map<m4_key,Vpff,compare1 >::iterator it=D1.begin();it!=D1.end();++it)
+   {
+     Vpff&row=it->second;
+     COUNT total=0.0;
+     for(PositionIndex k=0;k<row.size();k++)
+       total+=row[k].first+d5modelsmooth_countoffset;
+     for(PositionIndex k=0;k<row.size();k++,nParams++)
+       row[k].second=total?((row[k].first+d5modelsmooth_countoffset)/total):(1.0/row.size());
+   }
+   for(map<m4_key,Vpff,compareb1 >::iterator it=Db1.begin();it!=Db1.end();++it)
+   {
+     Vpff&row=it->second;
+     double total=0.0;
+     for(PositionIndex k=0;k<row.size();k++)
+       total+=row[k].first+d5modelsmooth_countoffset;
+     for(PositionIndex k=0;k<row.size();k++,nParams++)
+       row[k].second=total?((row[k].first+d5modelsmooth_countoffset)/total):(1.0/row.size());
+   }
+   cout << "D5 table contains " << nParams << " parameters.\n";
+ }
+
+ // Dumps both tables as text.  For each row with a non-zero count total,
+ // every entry is written as "<key columns> index prob count" and the row
+ // is closed with a blank line.  Key columns come from print1_m5 (head
+ // table) or printb1_m5 (non-head table).
+ friend ostream&operator<<(ostream&out,d5model&d5m)
+ {
+   out << "# Translation tables for Model 5 .\n";
+   out << "# Table for head of cept.\n";
+   for(map<m4_key,Vpff,compare1 >::const_iterator it=d5m.D1.begin();it!=d5m.D1.end();++it)
+   {
+     const Vpff&row=it->second;
+     COUNT total=0.0;
+     for(PositionIndex k=0;k<row.size();k++)
+       total+=row[k].first;
+     if ( !total )
+       continue;
+     for(unsigned k=0;k<row.size();k++)
+     {
+       print1_m5(out,it->first,d5m.ewordclasses,d5m.fwordclasses);
+       out << (int)(k) << ' ' << row[k].second << ' ' << row[k].first << '\n';
+     }
+     out << endl;
+   }
+   out << "# Table for non-head of cept.\n";
+   for(map<m4_key,Vpff,compareb1 >::const_iterator it=d5m.Db1.begin();it!=d5m.Db1.end();++it)
+   {
+     const Vpff&row=it->second;
+     double total=0.0;
+     for(PositionIndex k=0;k<row.size();++k)
+       total+=row[k].first;
+     if( !total )
+       continue;
+     for(unsigned k=0;k<row.size();k++)
+     {
+       printb1_m5(out,it->first,d5m.fwordclasses);
+       out << (int)(k) << ' ' << row[k].second << ' ' << row[k].first << '\n';
+     }
+     out << endl;
+   }
+   return out;
+ }
+ // Loads counts from a text dump with lines of the form
+ // "E F v1 v2 index prob count".  Lines starting with '#' and lines that
+ // do not parse are skipped, as are entries whose count is not positive.
+ // E=="-1" marks a non-head entry, anything else a head entry; the count
+ // is added to the matching table using the constant 1000 for both l and
+ // m.  Afterwards the tables are normalized and written to the file
+ // "M5FILE" in the current directory.
+ void readProbTable(const char*x)
+ {
+   ifstream f(x);
+   string l;
+   while(getline(f,l))
+   {
+     if(l.length()&&l[0]=='#')
+       continue;
+     istrstream is(l.c_str());
+     string E,F;
+     int v1,v2,ii;
+     double prob,count;
+     if( !(is>>E>>F>>v1>>v2>>ii>>prob>>count) )
+       continue;
+     if( !(count>0) )
+       continue;
+     if( E=="-1")
+     {
+       getCountRef_bigger(ii,0,fwordclasses(F),1000,1000,v2)+=count;
+     }
+     else
+     {
+       getCountRef_first(ii,v1,fwordclasses(F),1000,1000,v2)+=count;
+     }
+   }
+   normalizeTable();
+   ofstream of("M5FILE");
+   of << (*this);
+ }
+ // Resets every count (.first) in both tables to zero.  Keys and the
+ // stored probabilities (.second) are kept.
+ void clear()
+ {
+   for(map<m4_key,Vpff,compare1 >::iterator it=D1.begin();it!=D1.end();++it)
+   {
+     Vpff&row=it->second;
+     for(PositionIndex k=0;k<row.size();k++)
+       row[k].first=0.0;
+   }
+   for(map<m4_key,Vpff,compareb1 >::iterator it=Db1.begin();it!=Db1.end();++it)
+   {
+     Vpff&row=it->second;
+     for(PositionIndex k=0;k<row.size();k++)
+       row[k].first=0.0;
+   }
+ }
+};
+
+#endif
+
+
+
diff --git a/GIZA++-v2/Dictionary.cpp b/GIZA++-v2/Dictionary.cpp
new file mode 100644
index 0000000..ee77ff5
--- /dev/null
+++ b/GIZA++-v2/Dictionary.cpp
@@ -0,0 +1,92 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/* Noah A. Smith
+ Dictionary object for dictionary filter in Model 1 training
+
+ Dictionary file must be in order (sorted) by Foreign vocab id, but English
+ vocab ids may be in any order.
+
+ 9 August 1999
+*/
+
+#include "Dictionary.h"
+
+// Loads whitespace-separated integer pairs from `filename` into the two
+// parallel vectors.  An empty file name marks the dictionary as dead
+// (nothing is loaded); a file that cannot be opened terminates the
+// program with exit status 1.
+Dictionary::Dictionary(const char *filename){
+  dead = (strcmp(filename, "") == 0);
+  if(dead)
+    return;
+  cout << "Reading dictionary from: " << filename << '\n';
+  ifstream dFile(filename);
+  if(!dFile){
+    cerr << "ERROR: Can't open dictionary: " << filename << '\n';
+    exit(1);
+  }
+
+  // Start with the lookup cache pointing at entry 0 for value 0.
+  currindexmin = 0;
+  currindexmax = 0;
+  currval = 0;
+  for(int first, second; (dFile >> first >> second); ){
+    pairs[0].push_back(first);
+    pairs[1].push_back(second);
+  }
+  cout << "Dictionary read; " << pairs[0].size() << " pairs loaded." << '\n';
+  dFile.close();
+}
+
+
+// Returns true when the pair (p,q) is present in the dictionary.
+//
+// pairs[0] must be sorted ascending (see the note at the top of this
+// file).  The run of entries whose first component equals the most
+// recently queried p is cached in [currindexmin,currindexmax], so
+// repeated queries for the same p scan only that run.
+//
+// Always false for a dead dictionary and for the pair (0,0).
+//
+// Fixes over the previous version:
+//  - the backward scan used an unsigned index that wrapped below zero
+//    and read far outside the vector when the run started at index 0;
+//  - the forward scan read pairs[0][t] before testing t against the size;
+//  - an empty dictionary was indexed at position 0;
+//  - the initial cache state (value 0, range [0,0]) could report a match
+//    against an entry whose first component is not 0.
+bool Dictionary::indict(int p, int q){
+  if(dead) return false;
+  if(p == 0 && q == 0) return false;
+  const int n = (int)pairs[0].size();
+  if(currval == p){
+    // Re-checking the first component keeps a stale or initial cache
+    // range from producing a false hit; the bound keeps it in range.
+    for(int i = currindexmin; i <= currindexmax && i < n; i++)
+      if(pairs[0][i] == p && pairs[1][i] == q) return true;
+    return false;
+  }
+
+  // Binary search for any entry whose first component is p.
+  int begin = 0, end = n - 1, middle = 0;
+  bool found = false;
+  while(begin <= end){
+    middle = begin + ((end - begin) >> 1);
+    if(p < pairs[0][middle]) end = middle - 1;
+    else if(p > pairs[0][middle]) begin = middle + 1;
+    else{
+      found = true;
+      break;
+    }
+  }
+
+  currval = p;
+  if(!found){
+    // Cache an empty range so repeated misses for p return immediately.
+    currindexmin = 0;
+    currindexmax = -1;
+    return false;
+  }
+
+  // Grow the hit to the full run of entries equal to p; signed indices
+  // and bounds tested before each access.
+  int lo = middle, hi = middle;
+  while(lo > 0 && pairs[0][lo - 1] == p) --lo;
+  while(hi + 1 < n && pairs[0][hi + 1] == p) ++hi;
+  currindexmin = lo;
+  currindexmax = hi;
+
+  bool ret = false;
+  for(int i = lo; i <= hi; i++)
+    if(pairs[1][i] == q) ret = true;
+  return ret;
+}
+
+
diff --git a/GIZA++-v2/Dictionary.h b/GIZA++-v2/Dictionary.h
new file mode 100644
index 0000000..3a5c71e
--- /dev/null
+++ b/GIZA++-v2/Dictionary.h
@@ -0,0 +1,48 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/* Noah A. Smith
+ Dictionary object for dictionary filter in Model 1 training
+
+ 9 August 1999
+*/
+
+#include <iostream>
+#include <fstream>
+
+#include "Vector.h"
+
+#ifndef DICTIONARY_H
+#define DICTIONARY_H
+
+class Dictionary{ // table of integer pairs loaded from a file, queried with indict()
+ private:
+ Vector<int> pairs[2]; // parallel columns: pairs[0] = first number of each pair, pairs[1] = second
+ int currval; // first-column value of the most recent lookup (lookup cache)
+ int currindexmin; // first index of the cached run of entries for currval
+ int currindexmax; // last index of the cached run of entries for currval
+ bool dead; // set when the file name was empty; indict() then always returns false
+ public:
+ Dictionary(const char *); // loads the pairs from the named file
+ bool indict(int, int); // true when the pair (first, second) is in the table
+};
+
+#endif
diff --git a/GIZA++-v2/FlexArray.h b/GIZA++-v2/FlexArray.h
new file mode 100644
index 0000000..c7365f7
--- /dev/null
+++ b/GIZA++-v2/FlexArray.h
@@ -0,0 +1,58 @@
+/*
+
+Copyright (C) 1988,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef CLASS_FlexArray_defined
+#define CLASS_FlexArray_defined
+#include "Array.h"
+#include <iostream>
+#include <fstream>
+// Array wrapper whose valid index range is [start,End] instead of
+// [0,size): element i lives at position i-start of the underlying
+// Array.  The default range (0,-1) is empty.
+template<class T>
+class FlexArray
+{
+private:
+  Array<T> p;      // storage for End-start+1 elements
+  int start,End;   // inclusive lower and upper index bounds
+public:
+  FlexArray(int _start=0,int _end=-1)
+    : p(_end-_start+1),start(_start),End(_end) {}
+  FlexArray(int _start,int _end,const T&init)
+    : p(_end-_start+1,init),start(_start),End(_end) {}
+  // Element access by external index; no range checking is done here.
+  T&operator[](int i)
+    {return p[i-start];}
+  const T&operator[](int i)const
+    {return p[i-start];}
+  int low()const{return start;}
+  int high()const{return End;}
+  // Pointers to the first element and one past the last.  The conversion
+  // helper is instantiated for the element type T; it used to be
+  // hard-coded to double, which only matched the return type for
+  // FlexArray<double>.
+  // NOTE(review): assumes conv<T> accepts Array<T>'s iterator, as the
+  // conv<double> call did for Array<double> -- confirm against Vector.h.
+  T*begin(){return conv<T>(p.begin());}
+  T*end(){return conv<T>(p.end());}
+};
+
+// Prints every element as "index:value; " from low() to high() inclusive.
+template<class T>
+inline ostream&operator<<(ostream&out,const FlexArray<T>&x)
+{
+  const int last=x.high();
+  for(int idx=x.low();idx<=last;++idx)
+  {
+    out << idx << ':' << x[idx] << ';' << ' ';
+  }
+  return out;
+}
+
+
+#endif
diff --git a/GIZA++-v2/ForwardBackward.cpp b/GIZA++-v2/ForwardBackward.cpp
new file mode 100644
index 0000000..969316a
--- /dev/null
+++ b/GIZA++-v2/ForwardBackward.cpp
@@ -0,0 +1,242 @@
+/*
+
+Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef NO_TRAINING
+#include "ForwardBackward.h"
+#include "Globals.h"
+#include "myassert.h"
+#include "HMMTables.h"
+#include "mymath.h"
+
+
+double ForwardBackwardTraining(const HMMNetwork&net,Array<double>&g,Array<Array2<double> >&E){ // forward-backward pass over net: fills g with per-column normalized alpha*beta and E with normalized transition terms; returns sum[0] (1.0 if sum is empty)
+ const int I=net.size1(),J=net.size2(),N=I*J; // alpha/beta/g are flat arrays indexed as j*I+i
+ Array<double> alpha(N,0),beta(N,0),sum(J);
+ for(int i=0;i<I;i++)
+ beta[N-I+i]=net.getBetainit(i); // last column of beta starts from the network's beta init values
+ double * cur_beta=conv<double>(beta.begin())+N-I-1; // walks backwards, starting at the last cell of column J-2
+ for(int j=J-2;j>=0;--j) // backward pass: columns J-2 .. 0
+ for(int ti=I-1;ti>=0;--ti,--cur_beta) {
+ const double *next_beta=conv<double>(beta.begin())+(j+1)*I; // start of column j+1 in beta
+ const double *alprob=&net.outProb(j,ti,0),*next_node=&net.nodeProb(0,j+1); // the masserts below pin these to outProb(j,ti,ni) and nodeProb(ni,j+1)
+ for(int ni=0;ni<I;++ni,(next_node+=J)){ // stepping next_node by J is checked against &nodeProb(ni,j+1) below
+ massert(cur_beta<next_beta&& &net.outProb(j,ti,ni)==alprob);
+ massert(next_node == &net.nodeProb(ni,j+1));
+ /* if( VERB&&(*next_beta)*(*alprob)*(*next_node) )
+ cout << "B= " << (int)(cur_beta-beta.begin()) << " += " << (*next_beta) << "("
+ << next_beta-beta.begin() << ") alprob:" << (*alprob) << " lexprob:" << (*next_node) << endl;*/
+ (*cur_beta)+=(*next_beta++)*(*alprob++)*(*next_node); // beta[j*I+ti] += beta[(j+1)*I+ni]*outProb(j,ti,ni)*nodeProb(ni,j+1)
+ }
+ }
+ for(int i=0;i<I;i++)
+ alpha[i]=net.getAlphainit(i)*net.nodeProb(i,0); // first column of alpha
+ double* cur_alpha=conv<double>(alpha.begin())+I; // forward pass starts at column 1
+ cur_beta=conv<double>(beta.begin())+I;
+ for(int j=1;j<J;j++){
+ Array2<double>&e=E[ (E.size()==1)?0:(j-1) ]; // a single-element E is shared by all columns, otherwise E[j-1] belongs to column j
+ if( (E.size()!=1) || j==1 ) // the shared matrix is reset only once, per-column matrices every time
+ {
+ e.resize(I,I);
+ fill(e.begin(),e.end(),0.0);
+ }
+
+ for(int ti=0;ti<I;++ti,++cur_alpha,++cur_beta) {
+ const double * prev_alpha=conv<double>(alpha.begin())+I*(j-1); // start of column j-1 in alpha
+ double *cur_e= &e(ti,0);
+ double this_node=net.nodeProb(ti,j);
+ const double* alprob= &net.outProb(j-1,0,ti);
+ for(int pi=0;pi<I;++pi,++prev_alpha,(alprob+=I)){ // stepping alprob by I is checked against &outProb(j-1,pi,ti) below
+ massert(prev_alpha<cur_alpha&& &net.outProb(j-1,pi,ti)==alprob);
+ massert(&e(ti,pi)==cur_e);
+ const double alpha_increment= *prev_alpha*(*alprob)*this_node;
+ (*cur_alpha)+=alpha_increment; // alpha[j*I+ti] accumulates over all predecessors pi
+ (*cur_e++)+=alpha_increment*(*cur_beta); // e(ti,pi) accumulates the increment times beta[j*I+ti]
+ }
+ }
+ }
+ g.resize(N);
+ transform(alpha.begin(),alpha.end(),beta.begin(),g.begin(),multiplies<double>()); // g = alpha*beta, element by element
+ double bsum=0,esum=0,esum2;
+ for(int i=0;i<I;i++)
+ bsum+=beta[i]*net.nodeProb(i,0)*net.getAlphainit(i); // total computed from column 0
+ for(unsigned int j=0;j<(unsigned int)E.size();j++)
+ {
+ Array2<double>&e=E[j];
+ const double *epe=e.end();
+ for(const double*ep=e.begin();ep!=epe;++ep)
+ esum+=*ep; // total over every entry of every matrix in E
+ }
+ if( J>1 )
+ esum2=esum/(J-1); // average of esum per transition column
+ else
+ esum2=0.0;
+ if(!(esum2==0.0||mfabs(esum2-bsum)/bsum<1e-3*I)) // consistency check: esum2 and bsum should agree within tolerance
+ cout << "ERROR2: " << esum2 <<" " <<bsum << " " << esum << net << endl;
+ double * sumptr=conv<double>(sum.begin());
+ double* ge=conv<double>(g.end());
+ for(double* gp=conv<double>(g.begin());gp!=ge;gp+=I) // process g one column (I entries) at a time
+ {
+ *sumptr++=normalize_if_possible(gp,gp+I); // presumably normalizes the column in place and returns its previous sum -- TODO confirm
+ if(bsum && !(mfabs((*(sumptr-1)-bsum)/bsum)<1e-3*I)) // consistency check: every column value should agree with bsum
+ cout << "ERROR: " << *(sumptr-1) << " " << bsum << " " << mfabs((*(sumptr-1)-bsum)/bsum) << ' ' << I << ' ' << J << endl;
+ }
+ for(unsigned int j=0;j<(unsigned int)E.size();j++)
+ {
+ Array2<double>&e=E[j];
+ double* epe=e.end();
+ if( esum )
+ for(double*ep=e.begin();ep!=epe;++ep)
+ *ep/=esum; // scale every entry by the grand total
+ else
+ for(double*ep=e.begin();ep!=epe;++ep)
+ *ep/=1.0/(max(I*I,I*I*(J-1))); // esum is zero here: entries are divided by 1/max(I*I,I*I*(J-1))
+ }
+ if( sum.size() )
+ return sum[0];
+ else
+ return 1.0;
+}
+// Runs ForwardBackwardTraining on `net` and stores, for every column j,
+// the index of the largest value among g[j*I .. j*I+I-1] in vit[j].
+void HMMViterbi(const HMMNetwork&net,Array<int>&vit) {
+  const int I=net.size1();
+  const int J=net.size2();
+  vit.resize(J);
+  Array<double>g;
+  Array<Array2<double> >e(1);
+  ForwardBackwardTraining(net,g,e);
+  for(int col=0;col<J;col++) {
+    double*first=conv<double>(g.begin())+I*col;
+    double*best=max_element(first,first+I);
+    vit[col]=best-first;
+  }
+}
+// Same selection as the two-argument overload, but reads the values from
+// a `g` supplied by the caller instead of recomputing them.
+void HMMViterbi(const HMMNetwork&net,Array<double>&g,Array<int>&vit) {
+  const int I=net.size1();
+  const int J=net.size2();
+  vit.resize(J);
+  for(int col=0;col<J;col++) {
+    double*first=conv<double>(g.begin())+I*col;
+    double*best=max_element(first,first+I);
+    vit[col]=best-first;
+  }
+}
+
+double HMMRealViterbi(const HMMNetwork&net,Array<int>&vitar,int pegi,int pegj,bool verbose){ // max-product path search over net: writes the chosen state of each column into vitar and returns the best final score; (pegi,pegj) restricts the states allowed in column pegj
+ const int I=net.size1(),J=net.size2(),N=I*J; // alpha and bp are flat arrays indexed as j*I+i
+ Array<double> alpha(N,-1); // presumably every cell starts at -1 so that any non-negative product replaces it -- TODO confirm Array(n,value) semantics
+ Array<double*> bp(N,(double*)0); // back-pointers: address of the chosen predecessor cell inside alpha
+ vitar.resize(J);
+ if( J==0 )
+ return 1.0;
+ for(int i=0;i<I;i++)
+ {
+ alpha[i]=net.getAlphainit(i)*net.nodeProb(i,0);
+ if( i>I/2 )
+ alpha[i]=0; // only first empty word can be chosen
+ bp[i]=0; // column 0 has no predecessor; a null back-pointer ends the backtrace
+ }
+ double *cur_alpha=conv<double>(alpha.begin())+I; // first cell of column 1
+ double **cur_bp=conv<double*>(bp.begin())+I;
+ for(int j=1;j<J;j++)
+ {
+ if( pegj+1==j) // about to leave the pegged column: zero the disallowed states of column j-1
+ for(int ti=0;ti<I;ti++)
+ if( (pegi!=-1&&ti!=pegi)||(pegi==-1&&ti<I/2) ) // with pegi given keep only state pegi; with pegi==-1 keep only states >= I/2
+ (cur_alpha-I)[ti]=0.0;
+ for(int ti=0;ti<I;++ti,++cur_alpha,++cur_bp) {
+ double* prev_alpha=conv<double>(alpha.begin())+I*(j-1); // start of column j-1
+ double this_node=net.nodeProb(ti,j);
+ const double *alprob= &net.outProb(j-1,0,ti);
+ for(int pi=0;pi<I;++pi,++prev_alpha,(alprob+=I)){ // stepping alprob by I is checked against &outProb(j-1,pi,ti) below
+ massert(prev_alpha<cur_alpha&& &net.outProb(j-1,pi,ti)==alprob);
+ const double alpha_increment= *prev_alpha*(*alprob)*this_node;
+ if( alpha_increment> *cur_alpha ) // strict comparison: the first predecessor reaching the maximum is kept
+ {
+ (*cur_alpha)=alpha_increment;
+ (*cur_bp)=prev_alpha;
+ }
+ }
+ }
+ }
+ for(int i=0;i<I;i++)
+ alpha[N-I+i]*=net.getBetainit(i); // weight the last column by the beta init values
+ if( pegj==J-1) // the pegged column is the last one: apply the same restriction here
+ for(int ti=0;ti<I;ti++)
+ if( (pegi!=-1&&ti!=pegi)||(pegi==-1&&ti<I/2) )
+ (alpha)[N-I+ti]=0.0;
+
+ int j=J-1;
+ cur_alpha=conv<double>(alpha.begin())+j*I; // start of the last column
+ vitar[J-1]=max_element(cur_alpha,cur_alpha+I)-cur_alpha; // best final state
+ double ret= *max_element(cur_alpha,cur_alpha+I); // its score is the return value
+ while(bp[vitar[j]+j*I]) // follow back-pointers until a null one is reached
+ {
+ cur_alpha-=I; // start of column j-1
+ vitar[j-1]=bp[vitar[j]+j*I]-cur_alpha; // pointer difference = state index within column j-1
+ massert(vitar[j-1]<I&&vitar[j-1]>=0);
+ j--;
+ }
+ massert(j==0); // the backtrace is expected to end in column 0
+ if( verbose )
+ {
+ cout << "VERB:PEG: " << pegi << ' ' << pegj << endl;
+ for(int j=0;j<J;j++)
+ cout << "NP " << net.nodeProb(vitar[j],j) << ' ' << "AP " << ((j==0)?net.getAlphainit(vitar[j]):net.outProb(j-1,vitar[j-1],vitar[j])) << " j:" << j << " i:" << vitar[j] << "; ";
+ cout << endl;
+ }
+ return ret;
+}
+
+// Hard-assignment counterpart of ForwardBackwardTraining: takes the path
+// found by HMMRealViterbi, sets g to 1.0 at the chosen state of every
+// column (0.0 elsewhere) and increments the transition entry
+// e(vitar[j],vitar[j-1]) for every step.  With a single-element E all
+// transitions are accumulated in E[0]; otherwise E[j-1] is reset and
+// receives only the transition into column j.  Returns the score
+// reported by HMMRealViterbi.
+double MaximumTraining(const HMMNetwork&net,Array<double>&g,Array<Array2<double> >&E){
+  Array<int> vitar;
+  double ret=HMMRealViterbi(net,vitar);
+  const int I=net.size1(),J=net.size2();
+  const bool sharedMatrix=( E.size()==1 );
+  if( sharedMatrix )
+  {
+    Array2<double>&e=E[0];
+    e.resize(I,I);
+    fill(e.begin(),e.end(),0.0);
+  }
+  g.resize(I*J);
+  fill(g.begin(),g.end(),0.0);
+  for(int col=0;col<J;++col)
+  {
+    g[col*I+vitar[col]]=1.0;
+    if( col==0 )
+      continue;
+    if( sharedMatrix )
+    {
+      E[0](vitar[col],vitar[col-1])++;
+    }
+    else
+    {
+      Array2<double>&e=E[col-1];
+      e.resize(I,I);
+      fill(e.begin(),e.end(),0.0);
+      e(vitar[col],vitar[col-1])++;
+    }
+  }
+  return ret;
+}
+
+#endif
+
diff --git a/GIZA++-v2/ForwardBackward.h b/GIZA++-v2/ForwardBackward.h
new file mode 100644
index 0000000..42449d3
--- /dev/null
+++ b/GIZA++-v2/ForwardBackward.h
@@ -0,0 +1,62 @@
+/*
+
+Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef NO_EM_MARKOF_ZEUGS_DEFINED
+#define NO_EM_MARKOF_ZEUGS_DEFINED
+#ifndef NO_TRAINING
+#include "myassert.h"
+#include "Array.h"
+#include "Array2.h"
+
+class HMMNetwork // plain data holder for one HMM trellis: node scores, transition scores and initial/final weights
+{
+ public:
+ int as,bs; // the two trellis dimensions: as = I (returned by size1), bs = J (returned by size2)
+ Array2<double> n; // node scores, constructed as (as,bs) and read as n(i,j)
+ Array<Array2<double> > e; // transition matrices, read through outProb; starts out as Array(0) and is filled by the user of the class
+ Array<double> alphainit; // initial weights, constructed with (as,1.0/as) - presumably as entries of 1/as each; confirm against Array's constructor
+ Array<double> betainit; // final weights, constructed with (as,1.0)
+ int ab; // as*bs, the total number of trellis nodes
+ double finalMultiply; // initialised to 1.0; not otherwise used inside this class
+ HMMNetwork(int I,int J)
+ : as(I),bs(J),n(as,bs),/*e(as,as,0.0),*/e(0),alphainit(as,1.0/as),betainit(as,1.0),ab(as*bs),finalMultiply(1.0) // initialisers read as/bs, which are declared first and therefore already set; I==0 would make 1.0/as a division by zero
+ {}
+ double getAlphainit(int i)const{return alphainit[i];}
+ double getBetainit(int i)const{return betainit[i];}
+ inline int size1()const{return as;}
+ inline int size2()const{return bs;}
+ inline const double&nodeProb(int i,int j)const // score of node (i,j); returns a reference into n
+ {return n(i,j);}
+ inline const double&outProb(int j,int i1,int i2)const // transition score for step j, indexed (i1,i2)
+ {/*massert(e[min(int(e.size())-1,j)](i1,i2) );*/ return e[min(int(e.size())-1,j)](i1,i2);} // j is clamped to the last matrix, so a single matrix serves every step; if e is empty the index becomes -1
+ friend ostream&operator<<(ostream&out,const HMMNetwork&x) // dumps n, e, alphainit and betainit under the labels N:, E:, A:, B:
+ {
+ return out <<"N: \n"<< x.n << endl << "E: \n" << x.e << "A:\n" << x.alphainit << "B:\n" << x.betainit << endl;
+ }
+};
+double ForwardBackwardTraining(const HMMNetwork&mc,Array<double>&gamma,Array<Array2<double> >&epsilon);
+void HMMViterbi(const HMMNetwork&mc,Array<int>&vit);
+double HMMRealViterbi(const HMMNetwork&net,Array<int>&vit,int pegi=-1,int pegj=-1,bool verbose=0); // fills vit with the best state per position and returns the maximum of the last alpha column; pegi/pegj default to -1; verbose prints the chosen path to cout
+double MaximumTraining(const HMMNetwork&net,Array<double>&g,Array<Array2<double> >&e); // runs HMMRealViterbi and writes 0/1 counts for that single path into g and e
+void HMMViterbi(const HMMNetwork&net,Array<double>&g,Array<int>&vit); // overload of HMMViterbi taking an additional Array<double>
+#endif
+#endif
diff --git a/GIZA++-v2/GNU.GPL b/GIZA++-v2/GNU.GPL
new file mode 100644
index 0000000..5b2225e
--- /dev/null
+++ b/GIZA++-v2/GNU.GPL
@@ -0,0 +1,282 @@
+
+
+Preamble
+
+The licenses for most software are designed to take away your freedom
+to share and change it. By contrast, the GNU General Public License is
+intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the software, or if you modify it.
+
+For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on,
+we want its recipients to know that what they have is not the
+original, so that any problems introduced by others will not reflect
+on the original authors' reputations.
+
+Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at
+all.
+
+The precise terms and conditions for copying, distribution and
+modification follow.
+
+
+TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+0. This License applies to any program or other work which contains a
+notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the Program
+(independent of having been made by running the Program). Whether that
+is true depends on what the Program does.
+
+1. You may copy and distribute verbatim copies of the Program's source
+code as you receive it, in any medium, provided that you conspicuously
+and appropriately publish on each copy an appropriate copyright notice
+and disclaimer of warranty; keep intact all the notices that refer to
+this License and to the absence of any warranty; and give any other
+recipients of the Program a copy of this License along with the
+Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a
+fee.
+
+2. You may modify your copy or copies of the Program or any portion of
+it, thus forming a work based on the Program, and copy and distribute
+such modifications or work under the terms of Section 1 above,
+provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that
+ in whole or in part contains or is derived from the Program or
+ any part thereof, to be licensed as a whole at no charge to all
+ third parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you
+ provide a warranty) and that users may redistribute the program
+ under these conditions, and telling the user how to view a copy
+ of this License. (Exception: if the Program itself is interactive
+ but does not normally print such an announcement, your work based
+ on the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of
+ Sections 1 and 2 above on a medium customarily used for software
+ interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt otherwise
+to copy, modify, sublicense or distribute the Program is void, and
+will automatically terminate your rights under this License. However,
+parties who have received copies, or rights, from you under this
+License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted
+herein. You are not responsible for enforcing compliance by third
+parties to this License.
+
+
+7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+9. The Free Software Foundation may publish revised and/or new
+versions of the General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Program does not specify a
+version number of this License, you may choose any version ever
+published by the Free Software Foundation.
+
+10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the
+author to ask for permission. For software which is copyrighted by the
+Free Software Foundation, write to the Free Software Foundation; we
+sometimes make exceptions for this. Our decision will be guided by the
+two goals of preserving the free status of all derivatives of our free
+software and of promoting the sharing and reuse of software generally.
+
+NO WARRANTY
+
+11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE
+LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS
+AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF
+ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+
+12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+END OF TERMS AND CONDITIONS
diff --git a/GIZA++-v2/Globals.h b/GIZA++-v2/Globals.h
new file mode 100644
index 0000000..fc2953c
--- /dev/null
+++ b/GIZA++-v2/Globals.h
@@ -0,0 +1,73 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef Globals_asdf_defined
+#define Globals_asdf_defined
+#include <string>
+#include <fstream>
+#include <map>
+#include "defs.h"
+#include "Vector.h"
+
+extern float PROB_SMOOTH;
+extern bool Verbose, Log, Peg, Transfer, Transfer2to3, useDict ;
+extern string Prefix, LogFilename, OPath,
+ SourceVocabFilename, TargetVocabFilename, CorpusFilename, TestCorpusFilename,
+ t_Filename, a_Filename, p0_Filename, d_Filename, n_Filename, dictionary_Filename;
+extern ofstream logmsg ;
+extern double M5P0,P0 ;
+extern bool NODUMPS, FEWDUMPS ;
+extern string Usage ;
+extern unsigned int MAX_SENTENCE_LENGTH ;
+extern int PegUntil;
+
+extern short DeficientDistortionForEmptyWord;
+
+extern int M4_Dependencies;
+extern int M5_Dependencies;
+
+extern short OutputInAachenFormat;
+
+#define DEP_MODEL_l 1 // the DEP_* constants are distinct powers of two; presumably OR-ed together in M4_Dependencies / M5_Dependencies - confirm against their users
+#define DEP_MODEL_m 2
+#define DEP_MODEL_F 4
+#define DEP_MODEL_E 8
+
+#define DEP_MODELb_l 16 // the "b" group repeats the l/m/F/E suffixes with every value multiplied by 16
+#define DEP_MODELb_m 32
+#define DEP_MODELb_F 64
+#define DEP_MODELb_E 128
+
+#define DEP_SUM 256
+
+class vcbList;
+
+extern vcbList *globeTrainVcbList, *globfTrainVcbList;
+
+extern short PredictionInAlignments;
+extern short SmoothHMM;
+#define VERB Verbose
+
+double ErrorsInAlignment(const map< pair<int,int>,char >&reference,const Vector<WordIndex>&test,int l,int&missing,int&toomuch,int&eventsMissing,int&eventsToomuch,int);
+extern Vector<map< pair<int,int>,char > > ReferenceAlignment;
+void printGIZAPars(ostream&out);
+
+#endif
diff --git a/GIZA++-v2/HMMTables.cpp b/GIZA++-v2/HMMTables.cpp
new file mode 100644
index 0000000..f037289
--- /dev/null
+++ b/GIZA++-v2/HMMTables.cpp
@@ -0,0 +1,177 @@
+/*
+
+Copyright (C) 1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "HMMTables.h"
+#include <fstream>
+#include "Globals.h"
+#include "Parameter.h"
+
+template<class CLS,class MAPPERCLASSTOSTRING>
+void HMMTables<CLS,MAPPERCLASSTOSTRING>::writeJumps(ostream&out) const // writes every stored alProb distribution to out as text, with a per-distribution sum and a grand total
+{
+ double ssum=0.0; // running total over all distributions
+ for(typename map<AlDeps<CLS>,FlexArray<double> >::const_iterator i=alProb.begin();i!=alProb.end();++i)
+ {
+ double sum=0.0; // total of the entries printed for this distribution
+ out << "\n\nDistribution for: ";
+ printAlDeps(out,i->first,*mapper1,*mapper2); // prints the conditioning key
+ out << ' ';
+ for(int a=i->second.low();a<=i->second.high();++a) // low()..high() inclusive
+ if( i->second[a] ) // zero entries are skipped entirely
+ {
+ out << a << ':' << i->second[a] << ';' << ' ';
+ sum+=i->second[a];
+ }
+ out << '\n' << '\n';
+ out << "SUM: " << sum << '\n';
+ ssum+=sum;
+ }
+ out << "FULL-SUM: " << ssum << '\n';
+}
+template<class CLS,class MAPPERCLASSTOSTRING>
+void HMMTables<CLS,MAPPERCLASSTOSTRING>::readJumps(istream&) // no-op: the stream argument is ignored and nothing is read back
+{
+}
+template<class CLS,class MAPPERCLASSTOSTRING>
+double HMMTables<CLS,MAPPERCLASSTOSTRING>::getAlProb(int istrich,int k,int sentLength,int J,CLS w1,CLS w2,int j,int iter) const // looks up the stored value for moving from istrich to k under the key (sentLength,istrich,j,w1,w2); falls back to a uniform value if the key is unknown
+{
+ massert(k<sentLength&&k>=0);
+ massert(istrich<sentLength&&istrich>=-1); // istrich may be -1, k may not
+ int pos=istrich-k; // overwritten by every non-aborting case below
+ switch(PredictionInAlignments) // selects which quantity indexes the distribution
+ {
+ case 0: pos=istrich-k; break; // difference between the two positions
+ case 1: pos=k; break; // the position k itself
+ case 2:
+ pos=(k*J-j*sentLength); // then divided by J below
+ if( pos>0 ) pos+=J/2; else pos-=J/2; // add J/2 away from zero so the truncating division rounds to nearest
+ pos/=J;
+ break;
+ default:abort(); // any other setting terminates the program
+ }
+ typename map<AlDeps<CLS>,FlexArray<double> >::const_iterator p=alProb.find(AlDeps<CLS>(sentLength,istrich,j,w1,w2)); // which key fields actually matter is decided by CompareAlDeps in AlDeps::operator<
+ if( p!=alProb.end() )
+ {
+ return (p->second)[pos]; // pos is not range-checked here
+ }
+ else
+ {
+ if( iter>0&&iter<5000 ) // the default iter=0 keeps the warning silent
+ cout << "WARNING: Not found: " << ' ' << J << ' ' << sentLength << '\n';;
+ return 1.0/(2*sentLength-1); // uniform fallback over 2*sentLength-1 values
+ }
+}
+
+template<class CLS,class MAPPERCLASSTOSTRING>
+void HMMTables<CLS,MAPPERCLASSTOSTRING>::addAlCount(int istrich,int k,int sentLength,int J,CLS w1,CLS w2,int j,double value,double valuePredicted) // adds value to alProb and, when non-zero, valuePredicted to alProbPredicted, at the same slot getAlProb reads; missing distributions are created on demand
+{
+ int pos=istrich-k; // overwritten by every non-aborting case below
+ switch(PredictionInAlignments) // same slot computation as in getAlProb
+ {
+ case 0: pos=istrich-k; break;
+ case 1: pos=k; break;
+ case 2:
+ pos=(k*J-j*sentLength);
+ if( pos>0 ) pos+=J/2; else pos-=J/2; // round to nearest when dividing by J
+ pos/=J;
+ break;
+ default:abort();
+ }
+ AlDeps<CLS> deps(AlDeps<CLS>(sentLength,istrich,j,w1,w2)); // key shared by both tables
+
+ {
+ typename map<AlDeps<CLS>,FlexArray<double> >::iterator p=alProb.find(deps);
+ if( p==alProb.end() )
+ {
+ if( (CompareAlDeps&1)==0 ) // sentence length is not part of the key, so size the array for the longest sentence
+ p=alProb.insert(make_pair(deps,FlexArray<double> (-MAX_SENTENCE_LENGTH,MAX_SENTENCE_LENGTH,0.0))).first; // NOTE(review): MAX_SENTENCE_LENGTH is declared unsigned int in Globals.h, so the negation wraps before conversion to FlexArray's bound type - confirm this yields the intended negative bound
+ else
+ p=alProb.insert(make_pair(deps,FlexArray<double> (-sentLength,sentLength,0.0))).first; // key is per sentence length: range -sentLength..sentLength
+ }
+ p->second[pos]+=value;
+ }
+
+ if( valuePredicted ) // the predicted table is only touched for a non-zero contribution
+ {
+ typename map<AlDeps<CLS>,FlexArray<double> >::iterator p=alProbPredicted.find(deps);
+ if( p==alProbPredicted.end() )
+ {
+ if( (CompareAlDeps&1)==0 )
+ p=alProbPredicted.insert(make_pair(deps,FlexArray<double> (-MAX_SENTENCE_LENGTH,MAX_SENTENCE_LENGTH,0.0))).first;
+ else
+ p=alProbPredicted.insert(make_pair(deps,FlexArray<double> (-sentLength,sentLength,0.0))).first;
+ }
+ p->second[pos]+=valuePredicted;
+ }
+}
+
+template<class CLS,class MAPPERCLASSTOSTRING>
+Array<double>&HMMTables<CLS,MAPPERCLASSTOSTRING>::doGetAlphaInit(int I) // mutable access to the init_alpha entry for I, creating it on first use
+{
+ if( !init_alpha.count(I) )
+ init_alpha[I]=Array<double>(I,0); // first request for this I: store Array<double>(I,0)
+ return init_alpha[I]; // reference into the hash_map
+}
+template<class CLS,class MAPPERCLASSTOSTRING>
+Array<double>&HMMTables<CLS,MAPPERCLASSTOSTRING>::doGetBetaInit(int I) // mutable access to the init_beta entry for I, creating it on first use
+{
+ if( !init_beta.count(I) )
+ init_beta[I]=Array<double>(I,0); // first request for this I: store Array<double>(I,0)
+ return init_beta[I]; // reference into the hash_map
+}
+
+template<class CLS,class MAPPERCLASSTOSTRING>
+bool HMMTables<CLS,MAPPERCLASSTOSTRING>::getAlphaInit(int I,Array<double>&x)const // copies the init_alpha entry for I into x with the upper part zeroed; returns false and leaves x untouched if there is no entry
+{
+ hash_map<int,Array<double> >::const_iterator i=init_alpha.find(I);
+ if( i==init_alpha.end() )
+ return 0;
+ else
+ {
+ x=i->second; // work on a copy; the stored array is not modified
+ for(unsigned int j=x.size()/2+1;j<x.size();++j) // only first empty word can be chosen
+ x[j]=0; // clears indices size/2+1 .. size-1; index size/2 is kept
+ return 1;
+ }
+}
+template<class CLS,class MAPPERCLASSTOSTRING>
+bool HMMTables<CLS,MAPPERCLASSTOSTRING>::getBetaInit(int I,Array<double>&x)const // copies the init_beta entry for I into x unchanged; returns false and leaves x untouched if there is no entry
+{
+ hash_map<int,Array<double> >::const_iterator i=init_beta.find(I);
+ if( i==init_beta.end() )
+ return 0;
+ else
+ {
+ x=i->second; // unlike getAlphaInit, nothing is zeroed afterwards
+ return 1;
+ }
+}
+
+template<class CLS,class MAPPERCLASSTOSTRING>
+HMMTables<CLS,MAPPERCLASSTOSTRING>:: HMMTables(double _probForEmpty,const MAPPERCLASSTOSTRING&m1,const MAPPERCLASSTOSTRING&m2): // the sign of _probForEmpty acts as a flag and its magnitude as the value; m1/m2 are stored by address and must outlive this object
+ probabilityForEmpty(mfabs(_probForEmpty)), // magnitude only
+ updateProbabilityForEmpty(_probForEmpty<0.0), // a negative argument sets this flag
+ globalCounter(0),divSum(0.0),p0_count(0.0),np0_count(0.0), // zero the counters (in declaration order) so they never hold indeterminate values
+ mapper1(&m1),mapper2(&m2) // non-owning pointers; dereferenced by writeJumps
+{}
+template<class CLS,class MAPPERCLASSTOSTRING>
+HMMTables<CLS,MAPPERCLASSTOSTRING>::~HMMTables() {} // empty body: the mapper pointers are not deleted here
diff --git a/GIZA++-v2/HMMTables.h b/GIZA++-v2/HMMTables.h
new file mode 100644
index 0000000..3f35129
--- /dev/null
+++ b/GIZA++-v2/HMMTables.h
@@ -0,0 +1,178 @@
+/*
+
+Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef HMM_TABLES_H_ASDF_DEFINED
+#define HMM_TABLES_H_ASDF_DEFINED
+#include "FlexArray.h"
+
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+#else
+#include <hash_map>
+#endif
+#include "Array.h"
+#include <map>
+#include "mymath.h"
+
+template<class T>
+T normalize_if_possible(T*a,T*b) // scales the range [a,b) in place so it sums to 1 and returns the sum it had before scaling
+{
+ T sum=0;
+ for(T*i=a;i!=b;++i)
+ sum+=*i;
+ if( sum )
+ for(T*i=a;i!=b;++i)
+ *i/=sum;
+ else
+ fill(a,b,1.0/(b-a)); // all-zero input cannot be scaled: fall back to a uniform distribution
+ return sum; // 0 tells the caller that the uniform fallback was used
+}
+
+extern short CompareAlDeps;
+template<class CLS>
+class AlDeps // key of the alignment tables; the global bit mask CompareAlDeps decides which fields take part in comparisons
+{
+ public:
+ int englishSentenceLength; // compared when CompareAlDeps&1
+ CLS classPrevious; // compared when CompareAlDeps&2
+ int previous; // compared when CompareAlDeps&4
+ int j; // compared when CompareAlDeps&8
+ CLS Cj; // compared when CompareAlDeps&16
+ AlDeps(int l,int p=0,int _j=0,CLS s1=0,CLS _Cj=0) // careful: parameter order (length, previous, j, classPrevious, Cj) differs from the member declaration order
+ : englishSentenceLength(l),classPrevious(s1),previous(p),j(_j),Cj(_Cj)
+ {}
+ friend bool operator<(const AlDeps&x,const AlDeps&y) // lexicographic order over the enabled fields only, in the order listed below
+ {
+ if( (CompareAlDeps&1) && x.englishSentenceLength<y.englishSentenceLength ) return 1;
+ if( (CompareAlDeps&1) && y.englishSentenceLength<x.englishSentenceLength ) return 0;
+ if( (CompareAlDeps&2) && x.classPrevious<y.classPrevious ) return 1;
+ if( (CompareAlDeps&2) && y.classPrevious<x.classPrevious ) return 0;
+ if( (CompareAlDeps&4) && x.previous<y.previous ) return 1;
+ if( (CompareAlDeps&4) && y.previous<x.previous ) return 0;
+ if( (CompareAlDeps&8) && x.j<y.j ) return 1;
+ if( (CompareAlDeps&8) && y.j<x.j ) return 0;
+ if( (CompareAlDeps&16) && x.Cj<y.Cj ) return 1;
+ if( (CompareAlDeps&16) && y.Cj<x.Cj ) return 0;
+ return 0; // all enabled fields are equal
+ }
+ friend bool operator==(const AlDeps&x,const AlDeps&y) // equal when neither is less: keys differing only in disabled fields compare equal
+ { return !( x<y || y<x ); }
+};
+
+template<class CLS>
+class Hash_AlDeps // hash functor for AlDeps; the alProb/alProbPredicted members of HMMTables are std::map and do not use it
+{
+ public:
+ unsigned
+ int
+ operator()
+ (const AlDeps<CLS>&x)
+ const
+ {
+ unsigned int hash=0; // each enabled field is added and the result multiplied by 31
+ if( (CompareAlDeps&1) ) { hash=hash+x.englishSentenceLength;hash*=31;} // only fields enabled in CompareAlDeps are mixed in, matching operator== on AlDeps
+ if( (CompareAlDeps&2) ) { hash=hash+x.classPrevious;hash*=31;}
+ if( (CompareAlDeps&4) ) { hash=hash+x.previous;hash*=31;}
+ if( (CompareAlDeps&8) ) { hash=hash+x.j;hash*=31;}
+ if( (CompareAlDeps&16) ) { hash=hash+x.Cj;hash*=31;}
+ return hash;
+
+ }
+};
+
+template<class CLS,class MAPPERCLASSTOSTRING>
+class HMMTables // per-key alignment distributions (alProb / alProbPredicted) plus initial alpha/beta vectors cached per size I
+{
+ protected:
+ double probabilityForEmpty; // returned by getProbabilityForEmpty
+ bool updateProbabilityForEmpty;
+ hash_map<int,Array<double> > init_alpha; // keyed by I; filled through doGetAlphaInit
+ hash_map<int,Array<double> > init_beta; // keyed by I; filled through doGetBetaInit
+ map<AlDeps<CLS>,FlexArray<double> > alProb; // accumulated by addAlCount, read by getAlProb
+ map<AlDeps<CLS>,FlexArray<double> > alProbPredicted; // accumulated by addAlCount from valuePredicted; consumed by performGISIteration
+ int globalCounter;
+ double divSum;
+ double p0_count,np0_count;
+ const MAPPERCLASSTOSTRING*mapper1; // non-owning; passed to printAlDeps by writeJumps
+ const MAPPERCLASSTOSTRING*mapper2; // non-owning; passed to printAlDeps by writeJumps
+ public:
+ const HMMTables<CLS,MAPPERCLASSTOSTRING>*getThis()const {return this;}
+ HMMTables(double _probForEmpty,const MAPPERCLASSTOSTRING&m1,const MAPPERCLASSTOSTRING&m2);
+ virtual ~HMMTables();
+ virtual double getAlProb(int i,int k,int sentLength,int J,CLS w1,CLS w2,int j,int iter=0) const;
+ virtual void writeJumps(ostream&) const;
+ void addAlCount(int i,int k,int sentLength,int J,CLS w1,CLS w2,int j,double value,double valuePredicted);
+ virtual void readJumps(istream&);
+ virtual bool getAlphaInit(int I,Array<double>&x)const;
+ virtual bool getBetaInit(int I,Array<double>&x)const;
+ Array<double>&doGetAlphaInit(int I);
+ Array<double>&doGetBetaInit(int I);
+ virtual double getProbabilityForEmpty()const
+ {return probabilityForEmpty;}
+ void performGISIteration(const HMMTables<CLS,MAPPERCLASSTOSTRING>*old) // rescales each alProb entry to old * observed / predicted after normalising both tables; old may be null, in which case the old value is taken as 1
+ {
+ cout << "OLDSIZE: " << (old?(old->alProb.size()):0) << " NEWSIZE:"<< alProb.size()<< endl;
+ for(typename map<AlDeps<CLS>,FlexArray<double> >::iterator i=alProb.begin();i!=alProb.end();++i)
+ {
+ if( alProbPredicted.count(i->first)) // a key needs predicted counts to be updated; otherwise only an error is logged
+ {
+ normalize_if_possible(i->second.begin(),i->second.end()); // observed counts -> distribution
+ normalize_if_possible(alProbPredicted[i->first].begin(),alProbPredicted[i->first].end()); // predicted counts -> distribution
+ for(int j=i->second.low();j<=i->second.high();++j)
+ {
+ if( i->second[j] ) // zero observed entries are left as they are
+ if(alProbPredicted[i->first][j]>0.0 )
+ {
+ double op=1.0; // old value; stays 1 when there is no old table or no old entry for this key
+ if( old && old->alProb.count(i->first) )
+ op=(old->alProb.find(i->first)->second)[j];
+ //cerr << "GIS: " << j << ' ' << " OLD:"
+ // << op << "*true:"
+ // << i->second[j] << "/pred:" << alProbPredicted[i->first][j] << " -> ";
+ i->second[j]= op*(i->second[j]/alProbPredicted[i->first][j]);
+ //cerr << i->second[j] << endl;
+ }
+ else // binds to the inner if: observed mass present but predicted mass not positive
+ {
+ cerr << "ERROR2 in performGISiteration: " << i->second[j] << endl;
+ }
+ }
+ }
+ else
+ cerr << "ERROR in performGISIteration: " << alProbPredicted.count(i->first) << endl; // the printed count is always 0 on this branch
+ }
+ }
+};
+
+template<class CLS,class MAPPERCLASSTOSTRING>
+inline void printAlDeps(ostream&out,const AlDeps<CLS>&x,const MAPPERCLASSTOSTRING&mapper1,const MAPPERCLASSTOSTRING&mapper2) // prints the fields of x that are enabled in CompareAlDeps as "label: value " pairs on one line, without a trailing newline
+{
+ if( (CompareAlDeps&1) ) out << "sentenceLength: " << x.englishSentenceLength<< ' ';
+ if( (CompareAlDeps&2) ) out << "previousClass: " << mapper1.classString(x.classPrevious) << ' '; // class ids are turned into strings through mapper1.classString
+ if( (CompareAlDeps&4) ) out << "previousPosition: " << x.previous << ' ';
+ if( (CompareAlDeps&8) ) out << "FrenchPosition: " << x.j << ' ';
+ if( (CompareAlDeps&16) ) out << "FrenchClass: " << mapper2.classString(x.Cj) << ' '; // Cj is turned into a string through mapper2.classString
+ //out << '\n';
+}
+
+#endif
diff --git a/GIZA++-v2/LICENSE b/GIZA++-v2/LICENSE
new file mode 100644
index 0000000..5b2225e
--- /dev/null
+++ b/GIZA++-v2/LICENSE
@@ -0,0 +1,282 @@
+
+
+Preamble
+
+The licenses for most software are designed to take away your freedom
+to share and change it. By contrast, the GNU General Public License is
+intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the software, or if you modify it.
+
+For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on,
+we want its recipients to know that what they have is not the
+original, so that any problems introduced by others will not reflect
+on the original authors' reputations.
+
+Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at
+all.
+
+The precise terms and conditions for copying, distribution and
+modification follow.
+
+
+TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+0. This License applies to any program or other work which contains a
+notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the Program
+(independent of having been made by running the Program). Whether that
+is true depends on what the Program does.
+
+1. You may copy and distribute verbatim copies of the Program's source
+code as you receive it, in any medium, provided that you conspicuously
+and appropriately publish on each copy an appropriate copyright notice
+and disclaimer of warranty; keep intact all the notices that refer to
+this License and to the absence of any warranty; and give any other
+recipients of the Program a copy of this License along with the
+Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a
+fee.
+
+2. You may modify your copy or copies of the Program or any portion of
+it, thus forming a work based on the Program, and copy and distribute
+such modifications or work under the terms of Section 1 above,
+provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that
+ in whole or in part contains or is derived from the Program or
+ any part thereof, to be licensed as a whole at no charge to all
+ third parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you
+ provide a warranty) and that users may redistribute the program
+ under these conditions, and telling the user how to view a copy
+ of this License. (Exception: if the Program itself is interactive
+ but does not normally print such an announcement, your work based
+ on the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of
+ Sections 1 and 2 above on a medium customarily used for software
+ interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt otherwise
+to copy, modify, sublicense or distribute the Program is void, and
+will automatically terminate your rights under this License. However,
+parties who have received copies, or rights, from you under this
+License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted
+herein. You are not responsible for enforcing compliance by third
+parties to this License.
+
+
+7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+9. The Free Software Foundation may publish revised and/or new
+versions of the General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Program does not specify a
+version number of this License, you may choose any version ever
+published by the Free Software Foundation.
+
+10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the
+author to ask for permission. For software which is copyrighted by the
+Free Software Foundation, write to the Free Software Foundation; we
+sometimes make exceptions for this. Our decision will be guided by the
+two goals of preserving the free status of all derivatives of our free
+software and of promoting the sharing and reuse of software generally.
+
+NO WARRANTY
+
+11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE
+LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS
+AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF
+ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+
+12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+END OF TERMS AND CONDITIONS
diff --git a/GIZA++-v2/Makefile b/GIZA++-v2/Makefile
new file mode 100644
index 0000000..b81a02a
--- /dev/null
+++ b/GIZA++-v2/Makefile
@@ -0,0 +1,138 @@
+# ---- Build configuration -------------------------------------------------
+# Suffix list handed to make's suffix-rule machinery.
+.SUFFIXES: .out .o .c .e .r .f .y .l .s .p .cpp .alpha2o .pentiumo .sgio .alphao
+
+# Install location; ?= only assigns when INSTALLDIR is not already defined.
+INSTALLDIR ?= /usr/local/bin/
+
+CXX = g++
+
+# Common warning flags, followed by one flag set per build flavour.
+CFLAGS = $(CFLAGS_GLOBAL) -Wall -W -Wno-deprecated
+CFLAGS_OPT = $(CFLAGS) -O3 -DNDEBUG -DWORDINDEX_WITH_4_BYTE -DBINARY_SEARCH_FOR_TTABLE
+CFLAGS_PRF = $(CFLAGS) -O2 -pg -DNDEBUG -DWORDINDEX_WITH_4_BYTE
+CFLAGS_DBG = $(CFLAGS) -g -DDEBUG -DWORDINDEX_WITH_4_BYTE
+CFLAGS_NRM = $(CFLAGS) -DWORDINDEX_WITH_4_BYTE
+CFLAGS_VDBG = $(CFLAGS) -g -DDEBUG -DWORDINDEX_WITH_4_BYTE -DVDEBUG
+# Fallback source glob; Makefile.src (included below) assigns SRC again.
+SRC = *.cpp
+TYPE =
+LDFLAGS = -static
+include Makefile.src
+
+# One object directory per build flavour; the trailing slash is part of the value.
+OBJ_DIR_PRF = profile/
+OBJ_DIR_OPT = optimized/
+OBJ_DIR_DBG = debug/
+OBJ_DIR_VDBG = vdebug/
+OBJ_DIR_NRM = norm/
+# Object lists: each .cpp name mapped to a .o inside the flavour's directory.
+OBJ_OPT2 = $(SRC2:%.cpp=$(OBJ_DIR_OPT)%.o)
+OBJ_OPT = $(SRC:%.cpp=$(OBJ_DIR_OPT)%.o)
+OBJ_DBG = $(SRC:%.cpp=$(OBJ_DIR_DBG)%.o)
+OBJ_VDBG = $(SRC:%.cpp=$(OBJ_DIR_VDBG)%.o)
+OBJ_NRM = $(SRC:%.cpp=$(OBJ_DIR_NRM)%.o)
+OBJ_PRF = $(SRC:%.cpp=$(OBJ_DIR_PRF)%.o)
+OBJ_DIR =
+DATE = `date +%d-%m-%Y`
+
+# Default flavour: the optimized aligner plus the three corpus helper tools.
+opt: GIZA++ snt2plain.out plain2snt.out snt2cooc.out
+
+# Each binary needs its object directory and its object list; $@ is the binary name.
+GIZA++: $(OBJ_DIR_OPT) $(OBJ_OPT)
+	$(CXX) $(CFLAGS_OPT) $(OBJ_OPT) $(LDFLAGS) -o $@
+
+prf: GIZA++.prf
+
+GIZA++.prf: $(OBJ_DIR_PRF) $(OBJ_PRF)
+	$(CXX) $(CFLAGS_PRF) $(OBJ_PRF) -o $@ $(LDFLAGS)
+
+dbg: GIZA++.dbg
+
+GIZA++.dbg: $(OBJ_DIR_DBG) $(OBJ_DBG)
+	$(CXX) $(CFLAGS_DBG) $(OBJ_DBG) -o $@ $(LDFLAGS)
+
+vdbg: GIZA++.vdbg
+
+GIZA++.vdbg: $(OBJ_DIR_VDBG) $(OBJ_VDBG)
+	$(CXX) $(CFLAGS_VDBG) $(OBJ_VDBG) -o $@ $(LDFLAGS)
+
+nrm: GIZA++.nrm
+
+GIZA++.nrm: $(OBJ_DIR_NRM) $(OBJ_NRM)
+	$(CXX) $(CFLAGS_NRM) $(OBJ_NRM) -o $@ $(LDFLAGS)
+
+all: dbg opt nrm prf
+
+# Object directories. The leading '-' makes make carry on when mkdir fails
+# (for example because the directory is already there).
+$(OBJ_DIR_PRF): $(OBJ_DIR)
+	-mkdir $@
+
+$(OBJ_DIR_OPT): $(OBJ_DIR)
+	-mkdir $@
+
+$(OBJ_DIR_DBG): $(OBJ_DIR)
+	-mkdir $@
+
+$(OBJ_DIR_VDBG): $(OBJ_DIR)
+	-mkdir $@
+
+$(OBJ_DIR_NRM): $(OBJ_DIR)
+	-mkdir $@
+
+$(OBJ_DIR):
+	-mkdir $(OBJ_DIR)
+
+# Compile rules: one pattern per flavour, each with that flavour's flag set.
+$(OBJ_DIR_DBG)%.o: %.cpp
+	$(CXX) $(CFLAGS_DBG) -c $< -o $@
+
+$(OBJ_DIR_VDBG)%.o: %.cpp
+	$(CXX) $(CFLAGS_VDBG) -c $< -o $@
+
+$(OBJ_DIR_NRM)%.o: %.cpp
+	$(CXX) $(CFLAGS_NRM) -c $< -o $@
+
+$(OBJ_DIR_PRF)%.o: %.cpp
+	$(CXX) $(CFLAGS_PRF) -c $< -o $@
+
+$(OBJ_DIR_OPT)%.o: %.cpp
+	$(CXX) $(CFLAGS_OPT) -c $< -o $@
+
+# Copies the optimized, profiling and debug binaries into INSTALLDIR.
+# Every step is best effort: the leading '-' makes make ignore a failing command.
+# mkdir -p creates missing parent directories and is silent when the directory
+# already exists.
+# NOTE(review): ARCH is not set in this Makefile and the copies below do not
+# use the $(INSTALLDIR)/$(ARCH) directory -- confirm the intent.
+iinstall: opt prf dbg
+	-mkdir -p $(INSTALLDIR)/$(ARCH)
+	-cp GIZA++ $(INSTALLDIR)/GIZA++
+	-cp GIZA++.prf $(INSTALLDIR)/GIZA++.prf
+	-cp GIZA++.dbg $(INSTALLDIR)/GIZA++.dbg
+
+# Copies only the optimized binary into INSTALLDIR (same best-effort policy).
+install: opt
+	-mkdir -p $(INSTALLDIR)
+	-cp GIZA++ $(INSTALLDIR)/GIZA++
+
+# Removes everything the build rules in this Makefile produce: the objects, the
+# per-flavour object directories, the helper tools and the GIZA++ binary of
+# every flavour (opt, prf, dbg, vdbg, nrm).
+clean:
+	-rm -f $(OBJ_DIR_NRM)/*.o $(OBJ_DIR_DBG)/*.o $(OBJ_DIR_VDBG)/*.o $(OBJ_DIR_PRF)/*.o $(OBJ_DIR_OPT)/*.o
+	-rm -rf $(OBJ_DIR_NRM) $(OBJ_DIR_DBG) $(OBJ_DIR_VDBG) $(OBJ_DIR_PRF) $(OBJ_DIR_OPT)
+	-rm -f snt2plain.out plain2snt.out snt2cooc.out GIZA++ GIZA++.prf GIZA++.dbg GIZA++.vdbg GIZA++.nrm
+
+
+# Archives the cleaned source directory into ../GIZA++src.tar.gz.
+backup: clean
+	tar cf - . | gzip -9 > ../GIZA++src.tar.gz
+
+# Regenerates the "dependencies" file from scratch.
+depend: depend_CLEAN dependencies
+
+# rm -f so that a missing dependencies file does not abort "make depend".
+depend_CLEAN:
+	rm -f dependencies
+
+# Appends one compiler-generated (-MM) dependency list per build flavour; the
+# perl filter prefixes every "name.o:" target with that flavour's object directory.
+dependencies:
+	@(echo "#Automatically generated dependency list" >> dependencies ;\
+	$(CXX) -Wno-deprecated -MM *.cpp $(CFLAGS_OPT) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_OPT)\1?g;print;}'>> dependencies)
+	@(echo "#Automatically generated dependency list" >> dependencies ;\
+	$(CXX) -Wno-deprecated -MM *.cpp $(CFLAGS_DBG) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_DBG)\1?g;print;}'>> dependencies)
+	@(echo "#Automatically generated dependency list" >> dependencies ;\
+	$(CXX) -Wno-deprecated -MM *.cpp $(CFLAGS_VDBG) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_VDBG)\1?g;print;}'>> dependencies)
+	@(echo "#Automatically generated dependency list" >> dependencies ;\
+	$(CXX) -Wno-deprecated -MM *.cpp $(CFLAGS_NRM) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_NRM)\1?g;print;}'>> dependencies)
+	@(echo "#Automatically generated dependency list" >> dependencies ;\
+	$(CXX) -Wno-deprecated -MM *.cpp $(CFLAGS_PRF) | perl -e 'while(<>){s?^([^\:]+\.o:)?$(OBJ_DIR_PRF)\1?g;print;}'>> dependencies)
+
+-include dependencies
+
+# Stand-alone helper tools, each built from one source file
+# ($< is that source file, $@ the resulting program).
+snt2plain.out: snt2plain.cpp
+	$(CXX) -O3 -Wno-deprecated -W -Wall $< -o $@
+
+plain2snt.out: plain2snt.cpp
+	$(CXX) -O3 -Wno-deprecated -W -Wall $< -o $@
+
+snt2cooc.out: snt2cooc.cpp
+	$(CXX) -O3 -g -W -Wall -Wno-deprecated $< -o $@
+
diff --git a/GIZA++-v2/Makefile.definitions b/GIZA++-v2/Makefile.definitions
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/GIZA++-v2/Makefile.definitions
diff --git a/GIZA++-v2/Makefile.src b/GIZA++-v2/Makefile.src
new file mode 100644
index 0000000..a6b8be7
--- /dev/null
+++ b/GIZA++-v2/Makefile.src
@@ -0,0 +1,2 @@
+SRC = Parameter.cpp myassert.cpp Perplexity.cpp model1.cpp model2.cpp model3.cpp getSentence.cpp TTables.cpp ATables.cpp AlignTables.cpp main.cpp NTables.cpp model2to3.cpp collCounts.cpp alignment.cpp vocab.cpp MoveSwapMatrix.cpp transpair_model3.cpp transpair_model5.cpp transpair_model4.cpp utility.cpp parse.cpp reports.cpp model3_viterbi.cpp model3_viterbi_with_tricks.cpp Dictionary.cpp model345-peg.cpp hmm.cpp HMMTables.cpp ForwardBackward.cpp
+
diff --git a/GIZA++-v2/MoveSwapMatrix.cpp b/GIZA++-v2/MoveSwapMatrix.cpp
new file mode 100644
index 0000000..2b0c3a3
--- /dev/null
+++ b/GIZA++-v2/MoveSwapMatrix.cpp
@@ -0,0 +1,231 @@
+/*
+
+Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "MoveSwapMatrix.h"
+
+// Copies alignment _a, keeps a reference to _ef and sizes the score tables from
+// ef.get_l() and ef.get_m(). Lazy evaluation starts switched off, so every
+// column j in 1..m is scored right away through updateJ.
+template<class TRANSPAIR>
+MoveSwapMatrix<TRANSPAIR>::MoveSwapMatrix(const TRANSPAIR&_ef, const alignment&_a)
+   : alignment(_a),
+     ef(_ef),
+     l(ef.get_l()),
+     m(ef.get_m()),
+     _cmove(l+1, m+1),
+     _cswap(m+1, m+1),
+     delmove(l+1, m+1,0),
+     delswap(m+1, m+1,0),
+     changed(l+2, 0),
+     changedCounter(1),
+     modelnr(_ef.modelnr()),
+     lazyEvaluation(0),
+     centerDeleted(0)
+{
+   double baseScore=ef.scoreOfAlignmentForChange((*this));
+   if( lazyEvaluation )
+     return;
+   for(WordIndex column=1;column<=m;++column)
+     updateJ(column, 0,baseScore);
+}
+
+// Refreshes every cached score that involves column j: the move scores
+// _cmove(i, j) for i in 0..l and the swap scores pairing j with every other
+// column. With useChanged set, rows i whose changed[] stamp equals
+// changedCounter are skipped. A move onto the currently aligned row and a swap
+// of two columns aligned to the same row are stored as 1.0.
+template<class TRANSPAIR>
+void MoveSwapMatrix<TRANSPAIR>::updateJ(WordIndex j, bool useChanged,double thisValue)
+{
+   massert( lazyEvaluation==0 );
+   for(WordIndex row=0;row<=l;++row)
+   {
+     if( useChanged!=0 && changed[row]==changedCounter )
+       continue;
+     if( get_al(j)!=row )
+     {
+       _cmove(row, j)=ef.scoreOfMove((*this), row, j,thisValue);
+     }
+     else
+     {
+       _cmove(row, j)=1.0;
+     }
+   }
+   // Pairs (j, later) with later > j.
+   for(WordIndex later=j+1;later<=m;++later)
+   {
+     if( get_al(j)!=get_al(later) )
+     {
+       _cswap(j, later)=ef.scoreOfSwap((*this), j, later,thisValue);
+     }
+     else
+     {
+       _cswap(j, later)=1.0;
+     }
+   }
+   // Pairs (earlier, j) with earlier < j.
+   for(WordIndex earlier=1;earlier<j;++earlier)
+   {
+     if( get_al(j)!=get_al(earlier) )
+     {
+       _cswap(earlier, j)=ef.scoreOfSwap((*this), earlier, j,thisValue);
+     }
+     else
+     {
+       _cswap(earlier, j)=1.0;
+     }
+   }
+}
+// Refreshes the cached move scores of row i for every column j in 1..m;
+// the entry of a column that is already aligned to i is stored as 1.0.
+template<class TRANSPAIR>
+void MoveSwapMatrix<TRANSPAIR>::updateI(WordIndex i,double thisValue)
+{
+   massert( lazyEvaluation==0);
+   for(WordIndex column=1;column<=m;++column)
+   {
+     if( get_al(column)!=i )
+     {
+       _cmove(i, column)=ef.scoreOfMove((*this), i, column,thisValue);
+     }
+     else
+     {
+       _cmove(i, column)=1.0;
+     }
+   }
+}
+
+// Debug dump to cout comparing the cached tables with freshly computed scores,
+// followed by massert(0).
+// Move table, one character per (i, j): "A" = j is aligned to i, 'b' = relative
+// deviation above 1e-3, 'e' = above 1e-10, 'E' = unequal but within 1e-10,
+// ' ' = equal.
+// Swap table, for j1 > j only: 'A' = both columns share a row, otherwise the
+// result of comparing the cached and the fresh swap score for equality.
+template<class TRANSPAIR>
+void MoveSwapMatrix<TRANSPAIR>::printWrongs()const
+{
+   for(WordIndex i=0;i<=l;++i)
+   {
+     for(WordIndex j=1;j<=m;++j)
+     {
+       if( get_al(j)==i )
+       {
+         cout << "A";
+         continue;
+       }
+       LogProb cached=_cmove(i, j);
+       LogProb fresh=ef.scoreOfMove((*this), i, j);
+       if( fabs(1.0-cached/fresh)>1e-3 )
+         cout << 'b';
+       else if( fabs(1.0-cached/fresh)>1e-10 )
+         cout << 'e';
+       else if( cached!=fresh )
+         cout << 'E';
+       else
+         cout << ' ';
+     }
+     cout << endl;
+   }
+   cout << endl;
+   for(WordIndex j=1;j<=m;++j)
+   {
+     for(WordIndex j1=1;j1<=m;++j1)
+     {
+       if( j1<=j )
+         cout << ' ';
+       else if( get_al(j)==get_al(j1) )
+         cout << 'A';
+       else
+         cout << (_cswap(j, j1)==ef.scoreOfSwap((*this), j, j1));
+     }
+     cout << endl;
+   }
+   massert(0);
+}
+// Consistency check of the cached tables against freshly computed scores.
+// Always true in lazy mode. Otherwise the first move entry (for a row the
+// column is not aligned to) or swap entry (j1 > j, different rows) that fails
+// doubleEqual is printed to cerr and false is returned.
+template<class TRANSPAIR>
+bool MoveSwapMatrix<TRANSPAIR>::isRight()const
+{
+   if( lazyEvaluation )
+     return true;
+   for(WordIndex i=0;i<=l;++i)
+   {
+     for(WordIndex j=1;j<=m;++j)
+     {
+       if( get_al(j)==i )
+         continue;
+       if( doubleEqual(_cmove(i, j), ef.scoreOfMove((*this), i, j)) )
+         continue;
+       cerr << "DIFF: " << i << " " << j << " " << _cmove(i, j) << " " << ef.scoreOfMove((*this), i, j) << endl;
+       return false;
+     }
+   }
+   for(WordIndex j=1;j<=m;++j)
+   {
+     for(WordIndex j1=1;j1<=m;++j1)
+     {
+       if( j1<=j || get_al(j)==get_al(j1) )
+         continue;
+       if( doubleEqual(_cswap(j, j1), ef.scoreOfSwap((*this), j, j1)) )
+         continue;
+       cerr << "DIFFERENT: " << j << " " << j1 << " " << _cswap(j, j1) << " " << ef.scoreOfSwap((*this), j, j1) << endl;
+       return false;
+     }
+   }
+   return true;
+}
+
+// Re-aligns column _j to row _i and brings the cached scores up to date.
+// In lazy mode only the alignment changes. Otherwise the amount of recomputation
+// depends on modelnr: 5 and 6 rescore every column; 4 rescores the rows stamped
+// around the old and the new row (prev_cept..next_cept, taken both before and
+// after the change) and the columns aligned to them; 3 (asserted) rescores the
+// two rows involved and the columns aligned to either of them.
+template<class TRANSPAIR>
+void MoveSwapMatrix<TRANSPAIR>::doMove(WordIndex _i, WordIndex _j)
+{
+   WordIndex previousRow=get_al(_j);
+   if( lazyEvaluation )
+   {
+     set(_j,_i);
+     return;
+   }
+   switch( modelnr )
+   {
+     case 5:
+     case 6:
+     {
+       set(_j, _i);
+       double thisValue=ef.scoreOfAlignmentForChange((*this));
+       for(WordIndex column=1;column<=m;++column)
+         updateJ(column, 0,thisValue);
+       break;
+     }
+     case 4:
+     {
+       changedCounter++;
+       // Stamp the neighbourhood of both rows as it is before the change ...
+       for(unsigned int k=prev_cept(previousRow);k<=next_cept(previousRow);++k)
+         changed[k]=changedCounter;
+       for(unsigned int k=prev_cept(_i);k<=next_cept(_i);++k)
+         changed[k]=changedCounter;
+       set(_j, _i);
+       // ... and again as it is after the change.
+       for(unsigned int k=prev_cept(previousRow);k<=next_cept(previousRow);++k)
+         changed[k]=changedCounter;
+       for(unsigned int k=prev_cept(_i);k<=next_cept(_i);++k)
+         changed[k]=changedCounter;
+       double thisValue=ef.scoreOfAlignmentForChange((*this));
+       for(unsigned int row=0;row<=l;++row)
+       {
+         if( changed[row]==changedCounter )
+           updateI(row,thisValue);
+       }
+       for(unsigned int column=1;column<=m;++column)
+       {
+         if( changed[get_al(column)]==changedCounter )
+           updateJ(column, 1,thisValue);
+       }
+       break;
+     }
+     default:
+     {
+       assert(modelnr==3);
+       set(_j, _i);
+       changedCounter++;
+       double thisValue=ef.scoreOfAlignmentForChange((*this));
+       updateI(previousRow,thisValue);
+       changed[previousRow]=changedCounter;
+       updateI(_i,thisValue);
+       changed[_i]=changedCounter;
+       for(WordIndex column=1;column<=m;++column)
+       {
+         if( get_al(column)==_i || get_al(column)==previousRow )
+           updateJ(column, 1,thisValue);
+       }
+       break;
+     }
+   }
+}
+// Exchanges the rows that columns _j1 and _j2 are aligned to and brings the
+// cached scores up to date. Asserts that the swap score is above 1.
+// In lazy mode only the alignment changes. Otherwise the amount of recomputation
+// depends on modelnr: 5 and 6 rescore every column; 4 rescores the rows stamped
+// around both rows (prev_cept..next_cept, taken before the change) and the
+// columns aligned to them; 3 (asserted) rescores the two rows and the two columns.
+template<class TRANSPAIR>
+void MoveSwapMatrix<TRANSPAIR>::doSwap(WordIndex _j1, WordIndex _j2)
+{
+   assert( cswap(_j1, _j2)>1 );
+   WordIndex row1=get_al(_j1), row2=get_al(_j2);
+   if( lazyEvaluation==1 )
+   {
+     set(_j1, row2);
+     set(_j2, row1);
+     return;
+   }
+   switch( modelnr )
+   {
+     case 5:
+     case 6:
+     {
+       set(_j1, row2);
+       set(_j2, row1);
+       double thisValue=ef.scoreOfAlignmentForChange((*this));
+       for(WordIndex column=1;column<=m;++column)
+         updateJ(column, 0,thisValue);
+       break;
+     }
+     case 4:
+     {
+       changedCounter++;
+       for(unsigned int k=prev_cept(row1);k<=next_cept(row1);++k)
+         changed[k]=changedCounter;
+       for(unsigned int k=prev_cept(row2);k<=next_cept(row2);++k)
+         changed[k]=changedCounter;
+       set(_j1, row2);
+       set(_j2, row1);
+       double thisValue=ef.scoreOfAlignmentForChange((*this));
+       for(unsigned int row=0;row<=l;++row)
+       {
+         if( changed[row]==changedCounter )
+           updateI(row,thisValue);
+       }
+       for(unsigned int column=1;column<=m;++column)
+       {
+         if( changed[get_al(column)]==changedCounter )
+           updateJ(column, 1,thisValue);
+       }
+       break;
+     }
+     default:
+     {
+       assert(modelnr==3);
+       set(_j1, row2);
+       set(_j2, row1);
+       changedCounter++;
+       double thisValue=ef.scoreOfAlignmentForChange((*this));
+       updateI(row1,thisValue);
+       changed[row1]=changedCounter;
+       updateI(row2,thisValue);
+       changed[row2]=changedCounter;
+       updateJ(_j1, 1,thisValue);
+       updateJ(_j2, 1,thisValue);
+       break;
+     }
+   }
+}
+
+#include "transpair_model3.h"
+#include "transpair_model4.h"
+#include "transpair_model5.h"
+#include "transpair_modelhmm.h"
+template class MoveSwapMatrix<transpair_model3>; // explicit instantiations: the member functions defined in this .cpp are compiled here for each listed transpair type
+template class MoveSwapMatrix<transpair_model4>;
+template class MoveSwapMatrix<transpair_model5>;
+template class MoveSwapMatrix<transpair_modelhmm>;
diff --git a/GIZA++-v2/MoveSwapMatrix.h b/GIZA++-v2/MoveSwapMatrix.h
new file mode 100644
index 0000000..b1bbf15
--- /dev/null
+++ b/GIZA++-v2/MoveSwapMatrix.h
@@ -0,0 +1,116 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/*--
+MoveSwapMatrix: Efficient representation for moving and swapping
+around in IBM3 training.
+Franz Josef Och (30/07/99)
+--*/
+#ifndef moveswap2_costs_h_defined
+#define moveswap2_costs_h_defined
+#include "alignment.h"
+#include "transpair_model3.h"
+#include "myassert.h"
+
+extern short DoViterbiTraining;
+
+// An alignment together with tables of move scores (_cmove, indexed (i, j)) and
+// swap scores (_cswap, indexed (j1, j2) with j1 < j2) computed by the TRANSPAIR
+// model ef. In lazy mode the tables are bypassed and every score is computed on
+// request. delmove/delswap flag individual moves/swaps as deleted.
+// ef is held by reference, so the TRANSPAIR passed to the constructor must
+// outlive this object.
+template<class TRANSPAIR>
+class MoveSwapMatrix : public alignment
+{
+  private:
+   const TRANSPAIR&ef;                             // scoring model (not owned)
+   const WordIndex l, m;                           // set from ef.get_l() / ef.get_m() by the constructor
+   Array2<LogProb, Vector<LogProb> > _cmove, _cswap; // cached move / swap scores
+   Array2<char,Vector<char> > delmove,delswap;     // nonzero = that move / swap is flagged as deleted
+   Vector<int> changed;                            // per-row stamp, compared with changedCounter
+   int changedCounter;                             // current stamp value
+   const int modelnr;                              // taken from ef.modelnr()
+   bool lazyEvaluation;                            // true: compute scores on request, ignore the tables
+   bool centerDeleted;                             // set by delCenter()
+  public:
+   // Always reports success.
+   bool check()const
+   {
+     return 1;
+   }
+   const TRANSPAIR&get_ef()const
+   {
+     return ef;
+   }
+   bool isCenterDeleted()const
+   {
+     return centerDeleted;
+   }
+   bool isLazy()const
+   {
+     return lazyEvaluation;
+   }
+   MoveSwapMatrix(const TRANSPAIR&_ef, const alignment&_a);
+   void updateJ(WordIndex j, bool,double thisValue);
+   void updateI(WordIndex i,double thisValue);
+   void doMove(WordIndex _i, WordIndex _j);
+   void doSwap(WordIndex _j1, WordIndex _j2);
+   void delCenter()
+   {
+     centerDeleted=1;
+   }
+   // Flags the move (x, y) as deleted.
+   void delMove(WordIndex x, WordIndex y)
+   {
+     delmove(x,y)=1;
+   }
+   // Flags the swap of x and y as deleted, in both index orders; requires y > x.
+   void delSwap(WordIndex x, WordIndex y)
+   {
+     massert(y>x);
+     delswap(x,y)=1;
+     delswap(y,x)=1;
+   }
+   // With DoViterbiTraining set, every move counts as deleted.
+   bool isDelMove(WordIndex x, WordIndex y)const
+   {
+     return DoViterbiTraining||delmove(x,y);
+   }
+   // With DoViterbiTraining set, every swap counts as deleted; requires y > x.
+   bool isDelSwap(WordIndex x, WordIndex y)const
+   {
+     massert(y>x);
+     return DoViterbiTraining||delswap(x,y);
+   }
+   // Score of re-aligning y to x. Asserts that y is not already aligned to x
+   // and that the move is not flagged as deleted.
+   LogProb cmove(WordIndex x, WordIndex y)const
+   {
+     massert( get_al(y)!=x );
+     massert( delmove(x,y)==0 );
+     if( lazyEvaluation )
+       return ef.scoreOfMove(*this,x,y);
+     return _cmove(x, y);
+   }
+   // Score of swapping x and y. Asserts x < y, that the swap is not flagged as
+   // deleted and that the two columns are aligned to different rows.
+   LogProb cswap(WordIndex x, WordIndex y)const
+   {
+     massert(x<y);
+     massert(delswap(x,y)==0);
+     massert(get_al(x)!=get_al(y));
+     if( lazyEvaluation )
+       return ef.scoreOfSwap(*this,x,y);
+     return _cswap(x, y);
+   }
+   void printWrongs()const;
+   bool isRight()const;
+   // Prints the alignment part, the model and both score tables.
+   // The base part is streamed through a reference; the earlier C-style cast
+   // to alignment copied the whole base object just to print it.
+   friend ostream&operator<<(ostream&out, const MoveSwapMatrix<TRANSPAIR>&m)
+   {
+     return out << static_cast<const alignment&>(m) << "\nEF:\n"<< m.ef << "\nCMOVE\n"<<m._cmove << "\nCSWAP\n" << m._cswap << endl;
+   }
+};
+#endif
diff --git a/GIZA++-v2/NTables.cpp b/GIZA++-v2/NTables.cpp
new file mode 100644
index 0000000..e02a7c9
--- /dev/null
+++ b/GIZA++-v2/NTables.cpp
@@ -0,0 +1,93 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "NTables.h"
+#include <iostream>
+#include "defs.h"
+#include <fstream>
+#include "Parameter.h"
+
+GLOBAL_PARAMETER(double,NTablesFactorGraphemes,"nSmooth","smoothing for fertility parameters (good value: 64): weight for wordlength-dependent fertility parameters",PARLEV_SMOOTH,64.0); // weight of the word-length-dependent smoothing term in nmodel::normalize(); initial value 64.0
+GLOBAL_PARAMETER(double,NTablesFactorGeneral,"nSmoothGeneral","smoothing for fertility parameters (default: 0): weight for word-independent fertility parameters",PARLEV_SMOOTH,0.0); // weight of the word-independent smoothing term in nmodel::normalize(); initial value 0.0
+
+template <class VALTYPE>
+void nmodel<VALTYPE>::printNTable(int noEW, const char* filename,
+                                  const Vector<WordEntry>& evlist,
+                                  bool actual) const
+  // Writes the fertility table to `filename`. One line is written for every
+  // word id i in [1, noEW) with evlist[i].freq > 0: the source word itself
+  // when `actual` is set (otherwise its id), followed by MAX_FERTILITY
+  // values. Values that do not exceed PROB_SMOOTH are written as 0.
+{
+  cerr << "Dumping nTable to: " << filename << '\n';
+  ofstream of(filename);
+  if (!of) {
+    // An output file that cannot be opened used to be ignored, silently
+    // losing the whole dump; report it the way readNTable() reports an
+    // unreadable input file.
+    cerr << "\nERROR: Cannot open " << filename << "\n";
+    return;
+  }
+  for (WordIndex i = 1; int(i) < noEW; i++) {
+    if (evlist[i].freq > 0) {
+      if (actual)
+        of << evlist[i].word << ' ';
+      else
+        of << i << ' ';
+      for (WordIndex k = 0; k < MAX_FERTILITY; k++) {
+        VALTYPE p = getValue(i, k);
+        // getValue() never returns less than PROB_SMOOTH, so this maps
+        // entries that carry only the smoothing floor back to 0.
+        if (p <= PROB_SMOOTH)
+          p = 0;
+        of << p << ' ';
+      }
+      of << '\n';
+    }
+  }
+}
+
+template <class VALTYPE>
+void nmodel<VALTYPE>::readNTable(const char *filename){
+  /* This function reads the n table from a file.
+     Each line is of the format: source_word_id p0 p1 p2 ... pn
+     (MAX_FERTILITY values per word id).
+     This is the inverse operation of the printNTable function.
+     NAS, 7/11/99
+
+     If the file cannot be opened an error is printed and the table is left
+     untouched. A token id above MAX_VOCAB_SIZE terminates the program.
+     Reading stops at the first entry that is cut short; values read for
+     that entry before the cut remain stored.
+  */
+  ifstream inf(filename);
+  cerr << "Reading fertility table from " << filename << "\n";
+  if(!inf){
+    cerr << "\nERROR: Cannot open " << filename <<"\n";
+    return;
+  }
+
+  WordIndex tok;
+  int nFert=0;
+  // Loop on the success of the extraction itself. Testing eof() before
+  // reading executed one extra pass after the last entry (the stream only
+  // reports eof once a read has failed), which counted a phantom entry and
+  // stored the results of failed extractions.
+  while(inf >> tok){
+    if (tok > MAX_VOCAB_SIZE){
+      cerr << "NTables:readNTable(): unrecognized token id: " << tok
+           <<'\n';
+      exit(-1);
+    }
+    WordIndex i;
+    for(i = 0; i < MAX_FERTILITY; i++){
+      VALTYPE prob;
+      if(!(inf >> prob))
+        break;
+      getRef(tok, i)=prob;
+    }
+    if(i < MAX_FERTILITY){
+      cerr << "WARNING: NTables:readNTable(): incomplete entry for token id: "
+           << tok << '\n';
+      break;
+    }
+    nFert++;
+  }
+  cerr << "Read " << nFert << " entries in fertility table.\n";
+  inf.close();
+}
+
+template class nmodel<COUNT>; // explicit instantiation of the out-of-line members defined in this file for VALTYPE = COUNT
+//template class nmodel<PROB>;
diff --git a/GIZA++-v2/NTables.h b/GIZA++-v2/NTables.h
new file mode 100644
index 0000000..9ca086e
--- /dev/null
+++ b/GIZA++-v2/NTables.h
@@ -0,0 +1,145 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _ntables_h
+#define _ntables_h 1
+#include "Array2.h"
+#include "Vector.h"
+#include <assert.h>
+#include "defs.h"
+#include "vocab.h"
+#include "myassert.h"
+#include "Globals.h"
+
+extern double NTablesFactorGraphemes,NTablesFactorGeneral;
+
+template <class VALTYPE>
+class nmodel
+{
+ private:
+  // ntab(w, n): value stored for word id w and fertility n.
+  Array2<VALTYPE, Vector<VALTYPE> > ntab;
+ public:
+  // Creates a maxw x maxn table with every entry set to 0.
+  nmodel(int maxw, int maxn)
+    : ntab(maxw, maxn, 0.0)
+    {}
+  // Returns the entry for word w and fertility n, but never less than
+  // PROB_SMOOTH. A fertility at or beyond the second table dimension
+  // yields 0.
+  VALTYPE getValue(int w, unsigned int n)const
+    {
+      massert(w!=0);
+      if(n>=ntab.getLen2())
+        return 0.0;
+      else
+        return max(ntab(w, n), VALTYPE(PROB_SMOOTH));
+    }
+  // Returns a writable reference to the raw table entry (no smoothing floor).
+  VALTYPE&getRef(int w, int n)
+    {
+      //massert(w!=0);
+      return ntab(w, n);
+    }
+  // Writes into `write` a row-normalized copy of this table (rows 1..h1-1).
+  //
+  // Without a word list, or with both smoothing weights equal to 0, every
+  // row is simply divided by its sum. Otherwise each cell is first
+  // augmented by
+  //   NTablesFactorGraphemes * (fertility values pooled over all words with
+  //                             the same length, normalized per length)
+  //   NTablesFactorGeneral   * (fertility values pooled over all words,
+  //                             normalized)
+  // and the augmented row is then divided by its sum.
+  template<class COUNT>
+  void normalize(nmodel<COUNT>&write,const Vector<WordEntry>* _evlist)const
+    {
+      int h1=ntab.getLen1(), h2=ntab.getLen2();
+      int nParams=0;
+      if( _evlist&&(NTablesFactorGraphemes||NTablesFactorGeneral) )
+        {
+          size_t maxlen=0;
+          const Vector<WordEntry>&evlist=*_evlist;
+          for(unsigned int i=1;i<evlist.size();i++)
+            maxlen=max(maxlen,evlist[i].word.length());
+          // counts(len, k): values pooled over all words of length len.
+          Array2<COUNT,Vector<COUNT> > counts(maxlen+1,MAX_FERTILITY+1,0.0);
+          // nprob_general[k]: values pooled over all words.
+          Vector<COUNT> nprob_general(MAX_FERTILITY+1,0.0);
+          for(unsigned int i=1;i<min((unsigned int)h1,(unsigned int)evlist.size());i++)
+            {
+              int l=evlist[i].word.length();
+              for(int k=0;k<h2;k++)
+                {
+                  counts(l,k)+=getValue(i,k);
+                  nprob_general[k]+=getValue(i,k);
+                }
+            }
+          COUNT sum2=0;
+          for(unsigned int i=1;i<maxlen+1;i++)
+            {
+              COUNT sum=0.0;
+              for(int k=0;k<h2;k++)
+                sum+=counts(i,k);
+              sum2+=sum;
+              if( sum )
+                for(int k=0;k<h2;k++)
+                  counts(i,k)/=sum;
+            }
+          // Only divide when something was pooled: dividing the all-zero
+          // vector by a zero total would fill it with NaN, and NaN survives
+          // even a smoothing weight of 0 in the sums below.
+          if( sum2 )
+            for(unsigned int k=0;k<nprob_general.size();k++)
+              nprob_general[k]/=sum2;
+
+          for(int i=1;i<h1;i++)
+            {
+              // l stays -1 for ids beyond the word list: no length smoothing.
+              int l=-1;
+              if((unsigned int)i<evlist.size())
+                l=evlist[i].word.length();
+              COUNT sum=0.0;
+              for(int k=0;k<h2;k++)
+                sum+=getValue(i, k)+((l==-1)?0.0:(counts(l,k)*NTablesFactorGraphemes)) + NTablesFactorGeneral*nprob_general[k];
+              assert(sum);
+              for(int k=0;k<h2;k++)
+                {
+                  // The general term belongs inside the division: it is part
+                  // of `sum`, so adding it afterwards (as was done before)
+                  // produced rows that do not sum to 1.
+                  write.getRef(i, k)=(getValue(i, k)+((l==-1)?0.0:(counts(l,k)*NTablesFactorGraphemes)) + NTablesFactorGeneral*nprob_general[k])/sum;
+                  nParams++;
+                }
+            }
+        }
+      else
+        for(int i=1;i<h1;i++)
+          {
+            COUNT sum=0.0;
+            for(int k=0;k<h2;k++)
+              sum+=getValue(i, k);
+            assert(sum);
+            for(int k=0;k<h2;k++)
+              {
+                write.getRef(i, k)=getValue(i, k)/sum;
+                nParams++;
+              }
+          }
+      cerr << "NTable contains " << nParams << " parameter.\n";
+    }
+
+  // Resets every table entry (including row 0) to 0.
+  void clear()
+    {
+      int h1=ntab.getLen1(), h2=ntab.getLen2();
+      for(int i=0;i<h1;i++)for(int k=0;k<h2;k++)
+        ntab(i, k)=0;
+    }
+  void printNTable(int noEW, const char* filename, const Vector<WordEntry>& evlist, bool) const;
+  void readNTable(const char *filename);
+
+};
+
+#endif
diff --git a/GIZA++-v2/Parameter.cpp b/GIZA++-v2/Parameter.cpp
new file mode 100644
index 0000000..7af6916
--- /dev/null
+++ b/GIZA++-v2/Parameter.cpp
@@ -0,0 +1,144 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "Parameter.h"
+#include "fstream.h"
+#include "unistd.h"
+#include <strstream>
+
+
+bool absolutePathNames=0; // when set, writeParameters() prefixes relative filename values with the current working directory
+string ParameterPathPrefix; // when non-empty, writeParameters() prefixes relative filename values with this path
+bool ParameterChangedFlag=0; // flag handed to every Parameter created by addGlobalParameter(); set to true via _Parameter::setChanged()
+
+// Writes one "name value" line to `of` for every parameter in `parset`
+// whose level equals `level` (level == -1 selects all levels) and whose
+// onlyCopy flag is 0.
+// For filename parameters whose printed value is non-empty and does not
+// start with '/', the value is prefixed with the current working directory
+// (if absolutePathNames is set) and/or with ParameterPathPrefix (if it is
+// non-empty).
+// Returns 0 if the stream is not usable on entry, 1 otherwise.
+bool writeParameters(ofstream&of,const ParSet&parset,int level)
+{
+  if(!of)return 0;
+  for(ParSet::const_iterator i=parset.begin();i!=parset.end();++i)
+    {
+      if(((*i)->getLevel()==level||level==-1)&&(*i)->onlyCopy==0)
+        {
+          ostrstream os;
+          (*i)->printValue(os);
+          os << ends;
+          string s(os.str());
+          // str() freezes the strstream buffer, which makes the destructor
+          // skip freeing it; unfreeze so the buffer is not leaked once per
+          // parameter.
+          os.freeze(false);
+          const bool relativeFilename=(*i)->isFilename()&&s.length()&&s[0]!='/';
+          of << (*i)->getString() << " ";
+          if( absolutePathNames&&relativeFilename )
+            {
+              char path[1024];
+              // getcwd() returns a null pointer on failure and then leaves
+              // the buffer contents unspecified, so never print it unchecked.
+              if( getcwd(path,sizeof(path)) )
+                of << path << '/';
+              else
+                cerr << "ERROR: could not determine the current working directory.\n";
+            }
+          if( ParameterPathPrefix.length()&&relativeFilename )
+            of << ParameterPathPrefix << '/';
+          (*i)->printValue(of);
+          of << endl;
+        }
+    }
+  return 1;
+}
+
+// Reads parameter settings from `f`, one per line. The first two
+// whitespace-separated fields of each line are handed to makeSetCommand()
+// as name and value; a line that cannot be applied is reported on cerr and
+// reading continues.
+// Returns 0 if the stream is not usable on entry, 1 otherwise.
+bool readParameters(ifstream&f,const ParSet&parset,int verb,int level)
+{
+  if( !f )
+    return 0;
+  for(string line;getline(f,line);)
+    {
+      string key,value;
+      istrstream fields(line.c_str());
+      fields >> key;
+      fields >> value;
+      const bool applied=makeSetCommand(key,value,parset,verb,level);
+      if( !applied )
+        cerr << "ERROR: could not set: (C) " << key << " " << value << endl;
+    }
+  return 1;
+}
+
+
+// Sets the parameter named _s1 (compared in simpleString() form) to s2.
+// An exact name match wins immediately. Otherwise _s1 is treated as a
+// prefix and must match exactly one parameter name.
+// The value is only applied when level is -1 or equals the parameter's
+// level; a level mismatch is reported on cerr when verb > 1, but still
+// counts as "found".
+// Returns 1 when a parameter was found (exactly or by unique prefix),
+// 0 when the name is ambiguous or unknown (an error is printed).
+bool makeSetCommand(string _s1,string s2,const ParSet&parset,int verb,int level)
+{
+  const string s1=simpleString(_s1);
+  ParPtr prefixHit;
+  int prefixHits=0;
+  ParSet::const_iterator it=parset.begin();
+  while( it!=parset.end() )
+    {
+      if( *(*it)==s1 )
+        {
+          const bool levelOk=( level==-1 || level==(*it)->getLevel() );
+          if( levelOk )
+            (*it)->setParameter(s2,verb);
+          else if( verb>1 )
+            cerr << "ERROR: Could not set: (A) " << s1 << " " << s2 << " " << level << " " << (*it)->getLevel() << endl;
+          return 1;
+        }
+      if( (*it)->getString().substr(0,s1.length())==s1 )
+        {
+          // Remember the most recent prefix match and how many there were.
+          prefixHit=(*it);
+          ++prefixHits;
+        }
+      ++it;
+    }
+  if( prefixHits==1 )
+    {
+      const bool levelOk=( level==-1 || level==prefixHit->getLevel() );
+      if( levelOk )
+        prefixHit->setParameter(s2,verb);
+      else if( verb>1 )
+        cerr << "ERROR: Could not set: (B) " << s1 << " " << s2 << " " << level << " " << prefixHit->getLevel() << endl;
+      return 1;
+    }
+  if( prefixHits>1 )
+    cerr << "ERROR: ambiguous parameter '" << s1 << "'.\n";
+  else
+    cerr << "ERROR: parameter '" << s1 << "' does not exist.\n";
+  return 0;
+}
+
+// Prints every parameter of `parset` whose level equals `level`
+// (level == -1 selects all levels) and whose onlyCopy flag is 0, one per
+// line, using the parameter's printAt(). Returns the stream.
+ostream& printPars(ostream&of,const ParSet&parset,int level)
+{
+  if( !of )
+    return of;
+  ParSet::const_iterator cur=parset.begin();
+  const ParSet::const_iterator last=parset.end();
+  for(;cur!=last;++cur)
+    {
+      const bool levelMatches=( level==-1 || (*cur)->getLevel()==level );
+      if( !levelMatches || (*cur)->onlyCopy!=0 )
+        continue;
+      (*cur)->printAt(of);
+      of << endl;
+    }
+  return of;
+}
+
+// Returns the canonical form of a parameter name: every character is
+// lower-cased and only the characters a-z and 0-9 are kept, so that e.g.
+// "nSmooth-General" and "nsmoothgeneral" compare equal.
+string simpleString(const string s)
+{
+  string k;
+  for(unsigned int i=0;i<s.length();++i)
+    {
+      // tolower() is only defined for values representable as unsigned char
+      // (or EOF); passing a negative plain char is undefined behaviour, so
+      // convert first.
+      const char c=static_cast<char>(tolower(static_cast<unsigned char>(s[i])));
+      if( (c>='a'&&c<='z')||(c>='0'&&c<='9') )
+        k += c;
+    }
+  return k;
+}
+
+
+// Returns the single, program-wide parameter set. It is a function-local
+// static, so it is constructed on the first call rather than at an
+// unspecified point during static initialization.
+ParSet&getGlobalParSet()
+{
+  static ParSet globalParameters;
+  return globalParameters;
+}
diff --git a/GIZA++-v2/Parameter.h b/GIZA++-v2/Parameter.h
new file mode 100644
index 0000000..5125f92
--- /dev/null
+++ b/GIZA++-v2/Parameter.h
@@ -0,0 +1,199 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef PARAMETER_H_DEFINED
+#define PARAMETER_H_DEFINED
+
+#include "mystl.h"
+#include <set>
+#include "Pointer.h"
+#include <string>
+#include "Globals.h"
+#include <fstream>
+
+// Shared implementation of the integral mConvert() overloads below.
+// The words yes/y/true/t yield 1 and no/n/false/f yield 0 (compared
+// case-insensitively; "TRUE" / "FALSE" is echoed on cerr). Any other text
+// is converted with atoi(). The result is stored in n and returned.
+template<class INTEGRAL>
+inline INTEGRAL mConvertIntegral(const string&s,INTEGRAL&n)
+{
+  const char*const text=s.c_str();
+  if( strcasecmp(text,"yes")==0 || strcasecmp(text,"y")==0 || strcasecmp(text,"true")==0 || strcasecmp(text,"t")==0 ) { cerr << "TRUE\n";return n=1; }
+  if( strcasecmp(text,"no")==0 || strcasecmp(text,"n")==0 || strcasecmp(text,"false")==0 || strcasecmp(text,"f")==0 ) { cerr << "FALSE\n";return n=0; }
+  return n=atoi(text);
+}
+// mConvert(s, x): parses s into x and returns the parsed value. One
+// overload per supported parameter type; the integral ones also accept the
+// boolean words described above.
+inline unsigned int mConvert(const string&s,unsigned int &i) { return mConvertIntegral(s,i); }
+inline int mConvert(const string&s,int &i) { return mConvertIntegral(s,i); }
+inline double mConvert(const string&s,double &d) { return d=atof(s.c_str()); }
+inline double mConvert(const string&s,float &d) { return d=atof(s.c_str()); }
+inline string mConvert(const string&s,string&n) { return n=s; }
+inline bool mConvert(const string&s,bool&n) { return mConvertIntegral(s,n); }
+inline short mConvert(const string&s,short&n) { return mConvertIntegral(s,n); }
+inline unsigned short mConvert(const string&s,unsigned short&n) { return mConvertIntegral(s,n); }
+
+string simpleString(const string s);
+
+// Polynomial string hash: starting from 0, h = 5*h + c for every character
+// c of s, in order.
+inline int Hashstring(const string& s)
+{
+  // Accumulate in unsigned arithmetic: the running value overflows an int
+  // for all but short strings, and signed overflow is undefined behaviour,
+  // whereas unsigned wrap-around is well defined and yields the same bit
+  // pattern on two's-complement machines.
+  unsigned int sum=0;
+  for(string::const_iterator i=s.begin(),end=s.end();i!=end;++i)
+    sum=5u*sum+static_cast<unsigned int>(*i);
+  return static_cast<int>(sum);
+}
+
+// Type-independent base of all parameters: holds the canonical name, a
+// description, a level, and a pointer to the flag that records whether any
+// parameter was changed. Value handling is left to the derived class.
+class _Parameter
+{
+ protected:
+  string name;        // stored in simpleString() form
+  bool *ifChanged;    // set to true by setChanged()
+  string description;
+  int level;
+  bool filename;      // true once setFilename() marked the value as a file name
+ public:
+  int onlyCopy;
+  _Parameter(string n,bool&b,string desc,int _level,bool _onlyCopy)
+    : name(simpleString(n)),ifChanged(&b),description(desc),level(_level),filename(0),onlyCopy(_onlyCopy) {}
+  virtual ~_Parameter(){}
+  // Compares the stored name with the simpleString() form of s.
+  bool operator==(const string&s)const
+    { return name== simpleString(s); }
+  void setChanged()
+    { *ifChanged=true; }
+  // Parses s2 into the parameter value; the int is a verbosity level.
+  virtual bool setParameter(string s2,int)=0;
+  // Prints name, value and description.
+  virtual ostream&printAt(ostream&out)=0;
+  // Prints the value only.
+  virtual ostream&printValue(ostream&out)=0;
+  const string&getString() const { return name; }
+  int getLevel() const { return level;}
+  // const-qualified so that it can be queried through const references.
+  bool isFilename() const { return filename;}
+  void setFilename(bool x=1) { filename=x;}
+  // Parameters are identified, ordered and hashed by name only.
+  friend bool operator==(const _Parameter&a,const _Parameter&b)
+    { return a.name==b.name; }
+  friend bool operator<(const _Parameter&a,const _Parameter&b)
+    { return a.name<b.name; }
+  friend int Hash(const _Parameter&aaa)
+    { return Hashstring(aaa.name); }
+  friend ostream&operator<<(ostream&out,const _Parameter&p)
+    { return out<<"Parameter: "<<p.name <<endl;}
+};
+
+// Parameter bound to a variable of type T that lives elsewhere; only a
+// pointer to that variable is kept.
+template<class T>
+class Parameter : public _Parameter
+{
+ private:
+  T*t;
+ public:
+  Parameter(string n,bool&b,string desc,T&_t,int level=0,bool onlyCopy=0)
+    : _Parameter(n,b,desc,level,onlyCopy),t(&_t)
+    {}
+  virtual ~Parameter()
+    {}
+  // Converts s2 with mConvert(). If the result differs from the current
+  // value, the bound variable is updated, the change flag is raised and 1
+  // is returned (with verb > 1 the old and new value are printed on cout).
+  // Returns 0 when the value is unchanged.
+  virtual bool setParameter(string s2,int verb)
+    {
+      T parsed;
+      if( *t==mConvert(s2,parsed) )
+        return 0;
+      const bool announce=( verb>1 );
+      if( announce )
+        cout << "Parameter '"<<name <<"' changed from '"<<*t<<"' to '";
+      mConvert(s2,*t);
+      if( announce )
+        cout << *t <<"'\n";
+      setChanged();
+      return 1;
+    }
+  // Prints "name = value (description)".
+  virtual ostream&printAt(ostream&out)
+    {
+      return out << name << " = " << *t << " (" << description << ")";
+    }
+  // Prints the bare value.
+  virtual ostream&printValue(ostream&out)
+    {
+      return out << *t;
+    }
+};
+
+typedef MP<_Parameter> ParPtr;
+
+// Set of parameter pointers whose insert() complains about duplicates.
+class ParSet : public set<ParPtr>
+{
+ public:
+  // Inserts x; if an equivalent element is already present an error is
+  // printed on cerr (the underlying set insert is still attempted).
+  void insert(const ParPtr&x)
+    {
+      const bool duplicate=( count(x)!=0 );
+      if( duplicate )
+        cerr << "ERROR: element " << x->getString() << " already inserted.\n";
+      set<ParPtr>::insert(x);
+    }
+};
+
+bool makeSetCommand(string s1,string s2,const ParSet&pars,int verb=1,int level= -1);
+ostream&printPars(ostream&out,const ParSet&pars,int level=-1);
+bool writeParameters(ofstream&of,const ParSet&parset,int level=0);
+bool readParameters(ifstream&f,const ParSet&parset,int verb=2,int level=0);
+ParSet&getGlobalParSet();
+extern bool ParameterChangedFlag;
+// addGlobalParameter: stores `init` in *adr and registers *adr in the
+// global parameter set under `name` with the given level. The overloads
+// taking name2 / name3 / name4 register the same variable under those
+// additional names with level -1. All overloads return `init`.
+template<class T>const T&addGlobalParameter(const char *name,const char *description,int level,T*adr,const T&init)
+{
+  *adr=init;
+  ParSet&registry=getGlobalParSet();
+  registry.insert(new Parameter<T>(name,ParameterChangedFlag,description,*adr,level));
+  return init;
+}
+template<class T>const T&addGlobalParameter(const char *name,const char *name2,const char *description,int level,T*adr,const T&init)
+{
+  // Primary name first, then the alias.
+  addGlobalParameter<T>(name,description,level,adr,init);
+  getGlobalParSet().insert(new Parameter<T>(name2,ParameterChangedFlag,description,*adr,-1));
+  return init;
+}
+template<class T>const T&addGlobalParameter(const char *name,const char *name2,const char *name3,const char *description,int level,T*adr,const T&init)
+{
+  addGlobalParameter<T>(name,name2,description,level,adr,init);
+  getGlobalParSet().insert(new Parameter<T>(name3,ParameterChangedFlag,description,*adr,-1));
+  return init;
+}
+template<class T>const T&addGlobalParameter(const char *name,const char *name2,const char *name3,const char *name4,const char *description,int level,T*adr,const T&init)
+{
+  addGlobalParameter<T>(name,name2,name3,description,level,adr,init);
+  getGlobalParSet().insert(new Parameter<T>(name4,ParameterChangedFlag,description,*adr,-1));
+  return init;
+}
+void MakeParameterOptimizing(istream&file,string resultingParameters);
+
+#define GLOBAL_PARAMETER(TYP,VARNAME,NAME,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,DESCRIPTION,LEVEL,&VARNAME,INIT); // defines variable VARNAME and registers it under NAME; the expansion already ends in ';'
+#define GLOBAL_PARAMETER2(TYP,VARNAME,NAME,NAME2,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,NAME2,DESCRIPTION,LEVEL,&VARNAME,INIT); // same, with one additional name NAME2
+#define GLOBAL_PARAMETER3(TYP,VARNAME,NAME,NAME2,NAME3,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,NAME2,NAME3,DESCRIPTION,LEVEL,&VARNAME,INIT); // same, with two additional names
+#define GLOBAL_PARAMETER4(TYP,VARNAME,NAME,NAME2,NAME3,NAME4,DESCRIPTION,LEVEL,INIT) TYP VARNAME=addGlobalParameter< TYP >(NAME,NAME2,NAME3,NAME4,DESCRIPTION,LEVEL,&VARNAME,INIT); // same, with three additional names
+
+void setParameterLevelName(unsigned int i,string x);
+
+#endif
diff --git a/GIZA++-v2/Perplexity.cpp b/GIZA++-v2/Perplexity.cpp
new file mode 100644
index 0000000..d44dec5
--- /dev/null
+++ b/GIZA++-v2/Perplexity.cpp
@@ -0,0 +1,40 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/* Perplexity.cc
+ * =============
+ * Mike Jahr, 7/21/99
+ * Machine Translation group, WS99
+ * Center for Language and Speech Processing
+ *
+ * Last Modified by: Yaser Al-Onaizan, August 17, 1999
+ *
+ * Simple class used to calculate cross entropy and perplexity
+ * of models.
+ */
+
+#include "Perplexity.h"
+
+// Appends the label `model` together with the current perplexity() and
+// cross_entropy() values to the recorded history.
+void Perplexity::record(string model)
+{
+  const double currentPerplexity = perplexity();
+  const double currentCrossEntropy = cross_entropy();
+  modelid.push_back(model);
+  perp.push_back(currentPerplexity);
+  ce.push_back(currentCrossEntropy);
+}
diff --git a/GIZA++-v2/Perplexity.h b/GIZA++-v2/Perplexity.h
new file mode 100644
index 0000000..e363680
--- /dev/null
+++ b/GIZA++-v2/Perplexity.h
@@ -0,0 +1,108 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/* Perplexity.h
+ * ============
+ * Mike Jahr, 7/15/99
+ * Machine Translation group, WS99
+ * Center for Language and Speech Processing
+ *
+ * Last Modified by: Yaser Al-Onaizan, August 17, 1999
+ *
+ * Simple class used to calculate cross entropy and perplexity
+ * of models.
+ */
+
+#ifndef _PERPLEXITY_H
+#define _PERPLEXITY_H
+
+#include <math.h>
+#include <fstream.h>
+#include "Vector.h"
+#include "defs.h"
+#include "Array2.h"
+#include "Globals.h"
+
+#define CROSS_ENTROPY_BASE 2
+
+// Accumulates weighted log-probabilities and word counts (addFactor) and
+// derives perplexity and cross entropy from them. record() stores a
+// labelled snapshot of both values.
+class Perplexity {
+ private:
+  double sum;   // accumulated by addFactor()
+  double wc;    // accumulated count * m, see addFactor()
+  // Owned table: (l, m) -> log of the Poisson probability of m with mean
+  // LAMBDA * l, filled in by the constructor.
+  Array2<double, Vector<double> > *E_M_L;
+  Vector<string> modelid;
+  Vector<double > perp;
+  Vector<double > ce;
+  Vector<string> name ;
+ public:
+  ~Perplexity() { delete E_M_L;}
+  Perplexity() {
+    E_M_L = new Array2<double, Vector<double> >(MAX_SENTENCE_LENGTH,MAX_SENTENCE_LENGTH);
+    unsigned int l, m ;
+    // fact[m] = m!
+    Vector<double> fact(MAX_SENTENCE_LENGTH, 1.0);
+    for (m = 2 ; m < MAX_SENTENCE_LENGTH ; m++)
+      fact[m] = fact[m-1] * m ;
+    for (m = 1 ; m < MAX_SENTENCE_LENGTH ; m++)
+      for (l = 1 ; l < MAX_SENTENCE_LENGTH ; l++) {
+        (*E_M_L)(l, m) = log (pow((LAMBDA * l), double(m)) * exp(-LAMBDA * double(l)) /
+                              (fact[m])) ;
+      }
+    sum = 0 ;
+    wc = 0;
+    perp.clear();
+    ce.clear();
+    name.clear();
+  }
+  // The class owns E_M_L through a raw pointer, so the compiler-generated
+  // copy operations would make two objects delete the same table. Copy the
+  // table instead.
+  Perplexity(const Perplexity&other)
+    : sum(other.sum), wc(other.wc),
+      E_M_L(new Array2<double, Vector<double> >(*other.E_M_L)),
+      modelid(other.modelid), perp(other.perp), ce(other.ce), name(other.name) {}
+  Perplexity&operator=(const Perplexity&other) {
+    if (this != &other) {
+      // Allocate the copy before releasing the old table so that a failed
+      // allocation leaves this object intact.
+      Array2<double, Vector<double> > *copy =
+        new Array2<double, Vector<double> >(*other.E_M_L);
+      delete E_M_L;
+      E_M_L = copy;
+      sum = other.sum;
+      wc = other.wc;
+      modelid = other.modelid;
+      perp = other.perp;
+      ce = other.ce;
+      name = other.name;
+    }
+    return *this;
+  }
+  // Resets the accumulators; the recorded history is kept.
+  inline void clear() {
+    sum = 0 ;
+    wc = 0 ;
+  }
+  // Number of recorded snapshots.
+  const size_t size()const{return(min(perp.size(), ce.size()));}
+  // Adds one observation: log-probability p, weighted by count, for a
+  // sentence pair with lengths l and m; with withPoisson the tabulated
+  // length term for (l, m) is added to p.
+  inline void addFactor(const double p, const double count, const int l,
+                        const int m,bool withPoisson) {
+    wc += count * m ; // number of french words
+    sum += count * ( (withPoisson?((*E_M_L)(l, m)):0.0) + p) ;
+  }
+  // exp(-sum / wc); not meaningful while wc is 0.
+  inline double perplexity() const {
+    return exp( -1*sum / wc);
+  }
+
+  // -sum / (wc * log(CROSS_ENTROPY_BASE)); not meaningful while wc is 0.
+  inline double cross_entropy() const {
+    return (-1.0*sum / (log(double(CROSS_ENTROPY_BASE)) * wc));
+  }
+
+  inline double word_count() const {
+    return wc;
+  }
+
+  inline double getSum() const {
+    return sum ;
+  }
+
+  void record(string model);
+
+  friend void generatePerplexityReport(const Perplexity&, const Perplexity&,
+                                       const Perplexity&, const Perplexity&,
+                                       ostream&, int, int, bool);
+};
+
+
+#endif
diff --git a/GIZA++-v2/Pointer.h b/GIZA++-v2/Pointer.h
new file mode 100644
index 0000000..c6a2dc0
--- /dev/null
+++ b/GIZA++-v2/Pointer.h
@@ -0,0 +1,175 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef HEADER_Pointer_DEFINED
+#define HEADER_Pointer_DEFINED
+
+#include <assert.h>
+#include <stream.h>
+
+// Thin wrapper around a raw T*. It declares no destructor, so the pointee
+// is never deleted by this class itself.
+template<class T>
+class SmartPointer
+{
+ protected:
+  T*p;
+ public:
+  SmartPointer(T*_p=0)
+    : p(_p)
+    {}
+  // Dereference; the pointer is not checked for null.
+  T&operator*() const
+    { return *p; }
+  T*operator->() const
+    { return p; }
+  // True when a non-null pointer is held.
+  operator const bool() const
+    { return 0!=p; }
+  // Raw pointer access.
+  T*ptr() const
+    { return p; }
+};
+// Streams the pointee, or the text "nullpointer" when no object is held.
+template<class T> inline ostream &operator<<(ostream&out,const SmartPointer<T>&s)
+{
+  if( !s.ptr() )
+    return out << "nullpointer";
+  return out << *s;
+}
+
+
+// Counterpart of SmartPointer for pointers to const: the pointee can only
+// be read through this wrapper. No destructor is declared.
+template<class T>
+class SmartPointerConst
+{
+ protected:
+  const T*p;
+ public:
+  SmartPointerConst(const T*_p=0)
+    : p(_p)
+    {}
+  // Dereference; the pointer is not checked for null.
+  const T&operator*() const
+    { return *p; }
+  const T*operator->() const
+    { return p; }
+  // True when a non-null pointer is held.
+  operator const bool() const
+    { return 0!=p; }
+  // Raw pointer access.
+  const T*ptr() const
+    { return p; }
+};
+// Streams the pointee, or the text "nullpointer" when no object is held.
+template<class T> inline ostream &operator<<(ostream&out,const SmartPointerConst<T>&s)
+{
+  if( !s.ptr() )
+    return out << "nullpointer";
+  return out << *s;
+}
+
+// SmartPointer variant that is compared and ordered by the pointer value
+// itself (identity), not by the pointee.
+template <class T>
+class UP : public SmartPointer<T>
+{
+ public:
+  UP(T*_p=0)
+    : SmartPointer<T>(_p)
+    {}
+};
+// Equal when both wrap the same address.
+template<class T> inline bool operator==(const UP<T>&s1,const UP<T>&s2)
+{
+  return s1.ptr()==s2.ptr();
+}
+// Orders by address.
+template<class T> inline bool operator<(const UP<T>&s1,const UP<T>&s2)
+{
+  return s1.ptr() < s2.ptr();
+}
+// Hash of the pointee, or 0 for a null pointer.
+template<class T> inline int Hash(const UP<T> &wp)
+{
+  if( !wp.ptr() )
+    return 0;
+  return Hash(*wp);
+}
+
+
+// SmartPointerConst variant that is compared and ordered by the pointer
+// value itself (identity), not by the pointee.
+template <class T>
+class UPConst : public SmartPointerConst<T>
+{
+ public:
+  UPConst(const T*_p=0)
+    : SmartPointerConst<T>(_p)
+    {}
+};
+// Equal when both wrap the same address.
+template<class T> inline bool operator==(const UPConst<T>&s1,const UPConst<T>&s2)
+{
+  return s1.ptr()==s2.ptr();
+}
+// Orders by address.
+template<class T> inline bool operator<(const UPConst<T>&s1,const UPConst<T>&s2)
+{
+  return s1.ptr()<s2.ptr();
+}
+// Hash of the pointee, or 0 for a null pointer.
+template<class T> inline int Hash(const UPConst<T> &wp)
+{
+  if( !wp.ptr() )
+    return 0;
+  return Hash(*wp);
+}
+
+
+// SmartPointer variant that is compared and ordered through the pointees
+// (value semantics); both operands are asserted to be non-null.
+template <class T>
+class MP : public SmartPointer<T>
+{
+ public:
+  MP(T*_p=0)
+    : SmartPointer<T>(_p)
+    {}
+};
+// Equal when the pointed-to objects compare equal.
+template <class T> inline bool operator==(const MP<T>&s1,const MP<T>&s2)
+{
+  assert(s1);
+  assert(s2);
+  return *s1==*s2;
+}
+// Orders by the pointed-to objects.
+template <class T> inline bool operator<(const MP<T>&s1,const MP<T>&s2)
+{
+  assert(s1);
+  assert(s2);
+  return *s1 < *s2;
+}
+// Hash of the pointee, or 0 for a null pointer.
+template <class T> inline int Hash(const MP<T> &wp)
+{
+  if( !wp.ptr() )
+    return 0;
+  return Hash(*wp);
+}
+
+
+// SmartPointerConst variant that is compared and ordered through the
+// pointees (value semantics); both operands are asserted to be non-null.
+template <class T>
+class MPConst : public SmartPointerConst<T>
+{
+ public:
+  MPConst(const T*_p=0)
+    : SmartPointerConst<T>(_p)
+    {}
+};
+// Equal when the pointed-to objects compare equal.
+template <class T> inline bool operator==(const MPConst<T>&s1,const MPConst<T>&s2)
+{
+  assert(s1);
+  assert(s2);
+  return *s1== *s2;
+}
+// Orders by the pointed-to objects.
+template <class T> inline bool operator<(const MPConst<T>&s1,const MPConst<T>&s2)
+{
+  assert(s1);
+  assert(s2);
+  return *s1 < *s2;
+}
+// Hash of the pointee, or 0 for a null pointer.
+template <class T> inline int Hash(const MPConst<T> &wp)
+{
+  if( !wp.ptr() )
+    return 0;
+  return Hash(*wp);
+}
+
+
+// Owning pointer: deletes the held object in its destructor. Copy
+// construction is disabled (declared private, not defined); assignment
+// transfers ownership from the right-hand side, which is left null.
+template <class T>
+class DELP : public SmartPointer<T>
+{
+ private:
+  DELP(const DELP<T>&x);
+ public:
+  // Takes over the object held by x and deletes the one held so far.
+  const DELP<T>&operator=(DELP<T>&x)
+    {
+      // Without this guard, self-assignment deleted the held object and
+      // then reset the pointer to null.
+      if( this!=&x )
+        {
+          delete this->p;
+          this->p=x.p;x.p=0;
+        }
+      return *this;
+    }
+
+  ~DELP()
+    { delete this->p;this->p=0;}
+  DELP(T*_p=0)
+    : SmartPointer<T>(_p) {}
+  // Replaces the held object by _p, deleting the previous one.
+  void set(T*_p)
+    {
+      // Re-setting the pointer that is already held must not delete it,
+      // otherwise this object would keep a dangling pointer.
+      if( _p!=this->p )
+        delete this->p;
+      this->p=_p;
+    }
+  // Comparison goes through the pointees; neither pointer is checked for
+  // null.
+  friend bool operator==(const DELP<T>&s1,const DELP<T>&s2)
+    {
+      return *(s1.p)== *(s2.p);
+    }
+  friend bool operator<(const DELP<T>&s1,const DELP<T>&s2)
+    {
+      return *(s1.p) < *(s2.p);
+    }
+  // Hash of the pointee, or 0 for a null pointer.
+  friend inline int Hash(const DELP<T> &wp)
+    {
+      if(wp.p)
+        return Hash(*wp.p);
+      else
+        return 0;
+    }
+};
+#endif
+
+
+
+
+
+
+
diff --git a/GIZA++-v2/README b/GIZA++-v2/README
new file mode 100644
index 0000000..25af288
--- /dev/null
+++ b/GIZA++-v2/README
@@ -0,0 +1,508 @@
+========================================================================
+GIZA++ is an extension of the program GIZA.
+It is a program for learning statistical translation models from
+bitext. It is an implementation of the models described in
+(Brown et al., 1993), (Vogel et al., 1996), (Och et al., 2000a),
+(Och et al., 2000b).
+========================================================================
+
+
+
+CONTENTS of this README file:
+
+Part I: GIZA Package Contents
+Part II: How To Compile GIZA
+Part III: How to Run GIZA
+Part IV: Input File Formats
+ A. VOCABULARY FILES
+ B. Bitext Files
+ C. Dictionary File (optional)
+Part V: Output File Formats:
+ A. PROBABILITY TABLES
+ 1. T TABLE (translation table)
+ 2. N TABLE (Fertility table)
+ 3. P0 TABLE
+ 4. A TABLE
+ 5. D3 TABLE
+ 6. D4 TABLE
+ 7. D5 TABLE
+ 8. HMM TABLE
+ B. ALIGNMENT FILE
+ C. Cross Entropy and Perplexity Files
+ D. Revised Vocabulary files
+Part VI: Literature
+Part VII: New features
+
+HISTORY of this README file:
+
+GIZA++:
+edited: 11 Jan. 2000, Franz Josef Och
+GIZA:
+edited: 16 Aug. 1999, Dan Melamed
+edited: 13 Aug. 1999, Yaser Al-Onaizan
+edited: 20 July 1999, Yaser Al-Onaizan
+edited: 15 July 1999, Yaser Al-Onaizan
+edited: 13 July 1999, Noah Smith
+========================================================================
+
+Part 0: What is GIZA++
+
+GIZA++ is an extension of the program GIZA (part of the SMT toolkit
+EGYPT - http://www.clsp.jhu.edu/ws99/projects/mt/toolkit/ ) which was
+developed by the Statistical Machine Translation team during the
+summer workshop in 1999 at the Center for Language and Speech
+Processing at Johns-Hopkins University (CLSP/JHU). GIZA++ includes a
+lot of additional features. The extensions of GIZA++ were designed and
+written by Franz Josef Och.
+
+Features of GIZA++ not in GIZA:
+
+- Implements full IBM-4 alignment model with a dependency of word
+classes as described in (Brown et al. 1993)
+
+- Implements IBM-5: dependency on word classes, smoothing, ...
+
+- Implements HMM alignment model: Baum-Welch training, Forward-Backward
+algorithm, empty word, dependency on word classes, transfer to
+fertility models, ...
+
+- Implementation of a variant of the IBM-3 and IBM-4
+(-deficientDistortionModel 1) models which allow the training of -p0
+
+- Smoothing for fertility, distortion/alignment parameters
+
+- Significantly more efficient training of the fertility models
+
+- Correct implementation of pegging as described in (Brown et
+al. 1993), implemented a series of heuristics in order to make pegging
+sufficiently efficient
+
+- Completely new parameter mechanism: makes it easy to add additional
+parameters
+
+- Improved perplexity calculation for models IBM-1, IBM-2 and HMM (the
+parameter of the Poisson-distribution of the sentence lengths is
+computed automatically from the used training corpus)
+
+========================================================================
+Part I: GIZA++ Package Programs
+
+GIZA++: GIZA++ itself
+
+plain2snt.out: simple tool to transform plain text into GIZA text
+format
+
+snt2plain.out: simple tool to transform GIZA text format into plain
+text
+
+trainGIZA++.sh: Shell script to perform standard training given a
+corpus in GIZA text format
+
+========================================================================
+Part II: How To Compile GIZA++
+
+In order to compile GIZA++ you may need:
+- recent version of the GNU compiler (2.95 or higher)
+- recent version of assembler and linker which do not have restrictions
+ with respect to the length of symbol names
+
+There is a make file in the src directory that will take care of the
+compilation. The most important targets are:
+
+GIZA++: generates an optimized version
+
+GIZA++.dbg: generates the debug version
+
+depend: generates the "dependencies" file (make this whenever you add
+source or header files to the package).
+
+========================================================================
+Part III: How To run GIZA++
+
+It's simple:
+
+GIZA++ [config-file] [options]
+
+All options which expect a parameter could also be used in the
+parameter file. For example the command line options
+
+ GIZA++ -S S.vcb -T T.vcb -C ST.snt
+
+corresponds to the config file:
+
+ S: S.vcb
+ T: T.vcb
+ C: ST.snt
+
+If you call GIZA++ without a parameter you get a list of all the
+options. The option names from GIZA are normally still valid. The
+default values of the parameters typically are optimized with respect
+to the corpora I use and typically give good results. It is
+nevertheless important that these parameters are always optimized for
+every new task.
+
+==========================================================================
+Part IV: Input File Formats
+
+A. VOCABULARY FILES
+
+Each entry is stored on one line as follows:
+
+ uniq_id1 string1 no_occurrences1
+ uniq_id2 string2 no_occurrences2
+ uniq_id3 string3 no_occurrences3
+ ....
+
+Here is a sample from an English vocabulary file:
+
+627 abandon 10
+628 abandoned 17
+629 abandoning 2
+630 abandonment 12
+631 abatement 8
+632 abbotsford 2
+
+uniq_ids are sequential positive integer numbers. 0 is reserved for
+the special token NULL.
+
+
+B. Bitext Files
+
+Each sentence pair is stored in three lines. The first line
+is the number of times this sentence pair occurred. The second line is
+the source sentence where each token is replaced by its unique integer
+id from the vocabulary file and the third is the target sentence in
+the same format.
+
+Here's a sample of 3 sentence pairs from an English/French corpus:
+
+1
+1 1 226 5008 621 6492 226 6377 6813 226 9505 5100 6824 226 5100 5222 0 614 10243 613
+2769 155 7989 585 1 578 6503 585 8242 578 8142 8541 578 12328 6595 8550 578 6595 6710 1
+1
+1 1 226 6260 11856 11806 1293
+11 1 1 11 155 14888 2649 11447 9457 8488 4168
+1
+1 1 226 7652 1 226 5337 226 6940 12089 5582 8076 12050
+1 1 155 4140 6812 153 1 154 155 14668 15616 10524 9954 1392
+
+C. Dictionary File
+
+This is optional. The dictionary file is of the format:
+
+target_word_id source_word_id
+
+The list should be sorted by the target_word_id.
+
+C. Dictionary Files
+
+If you provide a dictionary and list it in the configuration file,
+GIZA++ will change the cooccurrence counting in the first iteration
+of model 1 to honor the so-called "Dictionary Constraint":
+
+ In parallel sentences "e1 ... en" and "f1 ... fm",
+ ei and fi are counted as a cooccurrence pair if one of two
+ conditions is met: 1.) ei and fi occur as an entry in the
+ dictionary, or 2.) ei does not occur in the dictionary with
+ any fj (1 <= j <= m) and fi does not occur in the dictionary
+ with any ej (1 <= j <= n).
+
+The dictionary must be a list of pairs, one per line:
+
+ F E
+
+where F is an integer of a target token, and E is the integer of a
+source token. F may be listed with other Es, and vice versa.
+
+Important: The dictionary must be sorted by the F integers!
+
+==========================================================================
+Part V: Output File Formats:
+
+For file names, we will use the prefix "prob_table". This can be
+changed using the -o switch. The default is a combination of user id
+and time stamp.
+
+
+A. PROBABILITY TABLES
+
+Normally, Model1 is trained first, and the result is used to start
+Model2 training. Then Model2 is transferred to Model3. Model3 viterbi
+training follows. This sequence can be adjusted by the various
+options, either on the command line or in a config file.
+
+1. T TABLE ( *.t3.* )
+
+(translation table)
+
+ prob_table.t1.n = t table after n iterations of Model1 training
+ prob_table.t2.n = t table after n iterations of Model2 training
+ prob_table.t2to3 = t table after transferring Model2 to Model3
+ prob_table.t3.n = t table after n iterations of Model3 training
+ prob_table.4.n = t table after n iterations of Model4 training
+
+Each line is of the following format:
+
+s_id t_id P(t_id/s_id)
+
+where:
+ s_id: is the unique id for the source token
+ t_id: is the unique id for the target token
+ P(t_id/s_id) the probability of translating s_id as t_id
+
+sample part of a file:
+
+3599 5697 0.0628115
+2056 10686 0.000259988
+8227 3738 3.57132e-13
+5141 13720 5.52332e-12
+10798 4102 6.53047e-06
+8227 3750 6.97502e-14
+7712 14080 6.0365e-20
+7712 14082 2.68323e-17
+7713 1083 3.94464e-15
+7712 14084 2.98768e-15
+
+Similar files will be generated (with the prefix
+"prob_table.actual.xxx" that has the actual tokens instead of their
+unique ids). This is also true for fertility tables. Also the inverse
+probability table will be generated for the final table and it will
+have the infix "ti" .
+
+2. N TABLE ( *.n3.* )
+
+(Fertility table)
+
+ prob_table.n2to3 = n table estimated during the transfer from M2 to M3
+ prob_table.n3.X = n table after X iterations of model3
+
+Each line in this file is of the following format:
+
+source_token_id p0 p1 p2 .... pn
+
+where p0 is the probability that the source token has zero fertility;
+p1, fertility one, ...., and n is the maximum possible fertility as
+defined in the program.
+
+sample:
+
+1 0.475861 0.282418 0.133455 0.0653083 0.0329326 0.00844979 0.0014008
+10 0.249747 0.000107778 0.307767 0.192208 0.0641439 0.15016 0.0358886
+11 0.397111 0.390421 0.19925 0.013382 2.21286e-05 0 0
+12 0.0163432 0.560621 0.374745 0.00231588 0 0 0
+13 1.78045e-07 0.545694 0.299573 0.132127 0.0230494 9.00322e-05 0
+14 1.41918e-18 0.332721 0.300773 0.0334969 0 0 0
+15 0 5.98626e-10 0.47729 0.0230955 0 0 0
+17 0 1.66346e-07 0.895883 0.103948 0 0 0
+
+
+3. P0 TABLE ( *.p0* )
+
+(1 - P0 is the probability of inserting a null after a
+ source word.)
+
+This file contains only one line with one real number which is the
+value of P0, the probability of not inserting a NULL token.
+
+
+4. A TABLE ( *.a[23].* )
+
+The file names follow the naming conventions above. The format of each
+line is as follows:
+
+i j l m p(i | j, l, m)
+
+where i, j, l, m are all integers and
+ j = position in target sentence
+ i = position in source sentence
+ l = length of source sentence
+ m = length of target sentence
+and p(i/j,l,m) is the probability that a source word in position i is
+moved to position j in a pair of sentences of length l and m.
+
+sample:
+
+15 14 15 14 0.630798
+15 14 15 15 0.414137
+15 14 15 16 0.268919
+15 14 15 17 0.23171
+15 14 15 18 0.117311
+15 14 15 19 0.119202
+15 14 15 20 0.111369
+15 14 15 21 0.0358169
+
+
+5. D3 TABLE ( *.d3.* )
+
+distortion table
+
+The format is similar to the A table with a slight difference --- the
+position of i & j are switched:
+
+j i l m p(j/i,l,m)
+
+sample:
+
+15 14 14 15 0.286397
+15 14 14 16 0.138898
+15 14 14 17 0.109712
+15 14 14 18 0.0868322
+15 14 14 19 0.0535823
+
+6. D4 TABLE: ( *.d4.* )
+
+distortion table for IBM-4
+
+7. D5 TABLE: ( *.d5.* )
+
+distortion table for IBM-5
+
+8. HMM TABLE: ( *.hhmm.* )
+
+alignment probability table for HMM alignment model
+
+B. ALIGNMENT FILE ( *.A3.* )
+
+In each iteration of the training, and for each sentence pair in the
+training set, the best alignment (viterbi alignment) is written to the
+alignment file (if the dump parameters are set accordingly). The
+alignment file is named prob_table.An.i, where n is the model number
+({1,2, 2to3, 3 or 4}), and i is the iteration number. The format of
+the alignments file is illustrated in the following sample:
+
+# Sentence pair (1)
+il s' agit de la même société qui a changé de propriétaires
+NULL ({ }) UNK ({ }) UNK ({ }) ( ({ }) this ({ 4 11 }) is ({ }) the ({ }) same ({ 6 }) agency ({ }) which ({ 8 }) has ({ }) undergone ({ 1 2 3 7 9 10 12 }) a ({ }) change ({ 5 }) of ({ }) UNK ({ })
+# Sentence pair (2)
+UNK UNK , le propriétaire , dit que cela s' est produit si rapidement qu' il n' en connaît pas la cause exacte
+NULL ({ 4 }) UNK ({ 1 2 }) UNK ({ }) , ({ 3 }) the ({ }) owner ({ 5 22 23 }) , ({ 6 }) says ({ 7 8 }) it ({ }) happened ({ 10 11 12 }) so ({ 13 }) fast ({ 14 19 }) he ({ 16 }) is ({ }) not ({ 20 }) sure ({ 15 17 }) what ({ }) went ({ 18 21 }) wrong ({ 9 })
+
+The alignment file is represented by three lines for each sentence
+pair. The first line is a label that can be used, e.g., as a caption
+for alignment visualization tools. It contains information about the
+sentence sequential number in the training corpus, sentence lengths,
+and alignment probability. The second line is the target sentence, the
+third line is the source sentence. Each token in the source sentence
+is followed by a set of zero or more numbers. These numbers represent
+the positions of the target words to which this source word is
+connected, according to the alignment.
+
+
+C. Perplexity File ( *.perp )
+
+This file will be generated at the end of training. It summarizes
+perplexity values for each training iteration. Here is a sample
+perplexity file that illustrates the format. The format is the same
+for cross entropy. If no test corpus was provided, the values for it
+will be set to "N/A".
+
+# train-size test-size iter. model train-perplexity test-perplexity final(y/n) train-viterbi-perp test-viterbi-perp
+ 447136 9625 0 1 187067 186722 n 3.34328e+06 3.35352e+06
+ 447136 9625 1 1 192.88 248.763 n 909.879 1203.13
+ 447136 9625 2 1 99.45 139.214 n 316.363 459.745
+ 447136 9625 3 1 83.4746 126.046 n 214.612 341.27
+ 447136 9625 4 1 78.6939 124.914 n 179.218 303.169
+ 447136 9625 5 2 76.6848 125.986 n 161.874 286.226
+ 447136 9625 6 2 50.7452 86.2273 n 84.7227 151.701
+ 447136 9625 7 2 42.9178 74.5574 n 63.6644 116.034
+ 447136 9625 8 2 40.0651 70.7444 n 56.3186 104.274
+ 447136 9625 9 2 38.8471 69.4105 n 53.1277 99.6044
+ 447136 9625 10 2to3 38.2561 68.9576 n 51.4856 97.4414
+ 447136 9625 11 3 129.993 248.885 n 86.6675 165.012
+ 447136 9625 12 3 79.2212 169.902 n 86.4842 171.367
+ 447136 9625 13 3 75.0746 164.488 n 84.9647 172.639
+ 447136 9625 14 3 73.412 162.765 n 83.5762 172.797
+ 447136 9625 15 3 72.6107 162.254 y 82.4575 172.688
+
+
+D. Revised Vocabulary files (*.src.vcb, *.trg.vcb)
+
+The revised vocabulary files are similar in format to the original
+vocabulary files. The only exception is that the frequency for each
+token is calculated from the given corpus (i.e. it is exact), which is
+not required in the input.
+
+E. final parameter file: ( *.gizacfg )
+
+This file includes all the parameter settings that were used in order
+to perform this training. This means that starting GIZA using this
+parameter file produces (should produce) the same training.
+
+
+
+Part VI: LITERATURE
+-------------------
+
+The following two articles include a comparison of the alignment
+models implemented in GIZA++:
+
+@INPROCEEDINGS{och00:isa,
+ AUTHOR = {F.~J.~Och and H.~Ney},
+ TITLE ={Improved Statistical Alignment Models},
+ BOOKTITLE = ACL00 ,
+ PAGES ={440--447},
+ ADDRESS={ Hongkong, China},
+ MONTH = {October},
+ YEAR = 2000}
+
+@INPROCEEDINGS{och00:aco,
+ AUTHOR = {F.~J.~Och and H.~Ney},
+ TITLE = {A Comparison of Alignment Models for Statistical Machine Translation},
+ BOOKTITLE = COLING00,
+ ADDRESS = {Saarbr\"ucken, Germany},
+ YEAR = {2000},
+ MONTH = {August},
+ PAGES = {1086--1090}
+ }
+
+The following article describes the statistical machine translation
+toolkit EGYPT:
+
+@MISC{ alonaizan99:smt,
+AUTHOR = {Y. Al-Onaizan and J. Curin and M. Jahr and K. Knight and J. Lafferty and I. D. Melamed and F. J. Och and D. Purdy and N. A. Smith and D. Yarowsky},
+TITLE = {Statistical Machine Translation, Final Report, {JHU} Workshop},
+YEAR = {1999},
+ADDRESS = {Baltimore, Maryland, MD},
+NOTE={{\tt http://www.clsp.jhu.edu/ws99/projects/ mt/final\_report/mt-final-report.ps}}
+}
+
+
+The implemented alignment models IBM-1 to IBM-5 and HMM were originally described in:
+
+@ARTICLE{brown93:tmo,
+ AUTHOR = {Brown, P. F. and Della Pietra, S. A. and Della Pietra, V. J. and Mercer, R. L.},
+ TITLE = {The Mathematics of Statistical Machine Translation: Parameter Estimation},
+ JOURNAL = {Computational Linguistics},
+ YEAR = 1993,
+ VOLUME = 19,
+ NUMBER = 2,
+ PAGES = {263--311}
+}
+
+@INPROCEEDINGS{ vogel96:hbw,
+ AUTHOR = {Vogel, S. and Ney, H. and Tillmann, C.},
+ TITLE = {{HMM}-Based Word Alignment in Statistical Translation},
+ YEAR = 1996,
+ PAGES = {836--841},
+ MONTH = {August},
+ ADDRESS = {Copenhagen},
+ BOOKTITLE = COLING96
+}
+
+
+Part VII: New features
+======================
+
+2003-06-09:
+
+- new parameter "-nbestalignments N": prints an N-best list of
+ alignments into a file *.NBEST
+
+- If program is compiled with "-DBINARY_SEARCH_FOR_TTABLE", it uses
+ more memory-efficient data structures for the t table (vector with
+ binary search instead of hash table). Then, the program expects a
+ parameter "-CoocurrenceFile FILE" which specifies a file which
+ includes all lexical cooccurrences in the training corpus. This file
+ can be produced by the snt2cooc.out tool.
+
+
diff --git a/GIZA++-v2/TTables.cpp b/GIZA++-v2/TTables.cpp
new file mode 100644
index 0000000..25c126f
--- /dev/null
+++ b/GIZA++-v2/TTables.cpp
@@ -0,0 +1,323 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "TTables.h"
+#include "Parameter.h"
+
+// PROB_CUTOFF: threshold compared against the normalized lexicon probability in
+// tmodel::normalizeTable (hash-table build); entries not above it are erased.
+GLOBAL_PARAMETER(float,PROB_CUTOFF,"PROB CUTOFF","Probability cutoff threshold for lexicon probabilities",PARLEV_OPTHEUR,1e-7);
+// COUNTINCREASE_CUTOFF: threshold used by tmodel::printCountTable (hash-table build);
+// only entries whose count exceeds it are written.
+GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF,"COUNTINCREASE CUTOFF","countCutoff","Counts increment cutoff threshold",PARLEV_OPTHEUR,1e-6);
+
+#ifdef BINARY_SEARCH_FOR_TTABLE
+// BINARY_SEARCH_FOR_TTABLE build: the body is empty, so calling this writes
+// nothing and all four arguments are ignored.
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::printCountTable(const char *,
+ const Vector<WordEntry>&,
+ const Vector<WordEntry>&,
+ const bool) const
+{
+}
+
+// Writes the probabilities stored in lexmat to `filename`, one entry per line:
+//
+//   e f prob
+//
+// where e is the row index into lexmat and f is the key stored with the entry.
+// Only entries whose probability exceeds PROB_SMOOTH are written. When `actual`
+// is true, e and f are printed as evlist[e].word and fvlist[f].word instead of
+// as numeric indices.
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::printProbTable(const char *filename,
+                                         const Vector<WordEntry>& evlist,
+                                         const Vector<WordEntry>& fvlist,
+                                         const bool actual) const
+{
+  ofstream of(filename);
+  for(unsigned int row=0;row<lexmat.size();++row)
+    {
+      if( lexmat[row] )
+        {
+          for(unsigned int col=0;col<lexmat[row]->size();++col)
+            {
+              const CPPair&entry=(*lexmat[row])[col].second;
+              WordIndex e=row;
+              WordIndex f=(*lexmat[row])[col].first;
+              if( entry.prob>PROB_SMOOTH )
+                {
+                  if( actual )
+                    {
+                      of << evlist[e].word << ' ' << fvlist[f].word << ' ' << entry.prob << '\n';
+                    }
+                  else
+                    {
+                      of << e << ' ' << f << ' ' << entry.prob << '\n';
+                    }
+                }
+            }
+        }
+    }
+}
+
+// BINARY_SEARCH_FOR_TTABLE build: the body is empty, so no inverse table is
+// written and all arguments are ignored.
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::printProbTableInverse(const char *,
+ const Vector<WordEntry>&,
+ const Vector<WordEntry>&,
+ const double,
+ const double,
+ const bool ) const
+{
+}
+// Turns the counts accumulated in lexmat into probabilities, one row at a time.
+// For every non-null row the counts are summed; each entry's prob becomes
+// count/sum (or the uniform value 1/rowSize when the sum is zero) and its count
+// is then reset to 0. The three arguments are unused in this build.
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::normalizeTable(const vcbList&, const vcbList&, int)
+{
+  for(unsigned int row=0;row<lexmat.size();++row)
+    {
+      double rowTotal=0.0;
+      if( lexmat[row] )
+        {
+          unsigned int rowSize=lexmat[row]->size();
+          unsigned int col;
+          // First pass: total count mass of this row.
+          for(col=0;col<rowSize;++col)
+            rowTotal+=(*lexmat[row])[col].second.count;
+          const bool noMass=(rowTotal==0);
+          // Second pass: write the probability and clear the count.
+          for(col=0;col<rowSize;++col)
+            {
+              if( noMass )
+                (*lexmat[row])[col].second.prob=1.0/(rowSize);
+              else
+                (*lexmat[row])[col].second.prob=(*lexmat[row])[col].second.count/rowTotal;
+              (*lexmat[row])[col].second.count=0;
+            }
+        }
+    }
+}
+
+// BINARY_SEARCH_FOR_TTABLE build: the body is empty, so nothing is read and
+// the file name argument is ignored.
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::readProbTable(const char *){
+}
+
+template class tmodel<COUNT,PROB> ;
+#else
+/* ------------------ Method Definitions for Class tmodel --------------------*/
+
+#
+// Dumps the count table held in ef to `filename`. Entries whose count is not
+// above COUNTINCREASE_CUTOFF are skipped. Each line written has the form
+//
+//   count source_word target_word prob
+//
+// The two words are printed as evlist/fvlist strings when `actual` is true and
+// as numeric ids otherwise.
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::printCountTable(const char *filename,
+                                          const Vector<WordEntry>& evlist,
+                                          const Vector<WordEntry>& fvlist,
+                                          const bool actual) const
+{
+  ofstream of(filename);
+  typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator it;
+  for(it = ef.begin(); it != ef.end(); ++it)
+    {
+      const wordPairIds& ids = it->first;
+      const CPPair& cell = it->second;
+      if( cell.count > COUNTINCREASE_CUTOFF )
+        {
+          of << cell.count << ' ';
+          if( actual )
+            of << evlist[ ids.first ].word << ' ' << fvlist[ ids.second ].word;
+          else
+            of << ids.first << ' ' << ids.second;
+          of << ' ' << cell.prob << '\n';
+        }
+    }
+}
+
+// Dumps the t table held in ef to `filename`. Each line written has the form
+//
+//   source_word target_word p(target_word/source_word)
+//
+// The two words are printed as evlist/fvlist strings when `actual` is true and
+// as numeric ids otherwise. Every entry of ef is written.
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::printProbTable(const char *filename,
+                                         const Vector<WordEntry>& evlist,
+                                         const Vector<WordEntry>& fvlist,
+                                         const bool actual) const
+{
+  ofstream of(filename);
+  typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator it;
+  for(it = ef.begin(); it != ef.end(); ++it)
+    {
+      const wordPairIds& ids = it->first;
+      if( actual )
+        of << evlist[ids.first].word << ' ' << fvlist[ids.second].word;
+      else
+        of << ids.first << ' ' << ids.second;
+      of << ' ' << it->second.prob << '\n';
+    }
+}
+
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::printProbTableInverse(const char *filename,
+ const Vector<WordEntry>& evlist,
+ const Vector<WordEntry>& fvlist,
+ const double,
+ const double,
+ const bool actual) const
+ // Dumps the inverse t table to `filename`. Each line is of the format:
+ //
+ // target_word_id source_word_id p(source_word/target_word)
+ //
+ // If the flag "actual" is true, the word strings from fvlist/evlist are
+ // printed instead of the token ids.
+ //
+ // The two unnamed double parameters are not used by this implementation.
+ // The inverse probability is computed as
+ // p(e/f) = p(f/e) * freq(e) / Sum over all e' of p(f/e') * freq(e')
+ // using the freq field of evlist as the weight of each source word.
+{
+ cerr << "Dumping the t table inverse to file: " << filename << '\n';
+ ofstream of(filename);
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
+ PROB p_inv = 0 ;
+ // static const PROB ratio(double(fTotal)/eTotal);
+ WordIndex e, f ;
+ // Number of out-of-range inverse probabilities seen so far.
+ int no_errors(0);
+ vector<PROB> total(fvlist.size(),PROB(0)) ; // Sum over all e of P(f/e) * p(e) - needed for normalization
+
+ // Pass 1: accumulate the normalization constant for every target word f.
+ for(i = ef.begin(); i != ef.end(); i++){
+ e = ((*i).first).first ;
+ f = ((*i).first).second ;
+ total[f] += (PROB) evlist[e].freq * ((*i).second.prob); //add P(f/ei) * F(ei)
+ }
+
+ // Pass 2: compute and write the inverse probability of every entry.
+ for(i = ef.begin(); i != ef.end(); i++){
+ e = ((*i).first).first ;
+ f = ((*i).first).second ;
+ // NOTE(review): total[f] is not checked for zero before the division;
+ // confirm that a zero sum cannot occur for entries present in ef.
+ p_inv = ((*i).second.prob) * (PROB) evlist[e].freq / total[f] ;
+ // Values outside [0, 1.0001] are reported on cerr, but only for the first
+ // ten occurrences; the value is still written to the file below.
+ if (p_inv > 1.0001 || p_inv < 0){
+ no_errors++;
+ if (no_errors <= 10){
+ cerr << "printProbTableInverse(): Error - P("<<evlist[e].word<<"("<<
+ e<<") / "<<fvlist[f].word << "("<<f<<")) = " << p_inv <<'\n';
+ cerr << "f(e) = "<<evlist[e].freq << " Sum(p(f/e).f(e)) = " << total[f] <<
+ " P(f/e) = " <<((*i).second.prob) <<'\n';
+ if (no_errors == 10)
+ cerr<<"printProbTableInverse(): Too many P inverse errors ..\n";
+ }
+ }
+ if (actual)
+ of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n';
+ else
+ of << f << ' ' << e << ' ' << p_inv << '\n';
+ }
+}
+/*
+
+
+
+{
+ cerr << "Dumping the t table inverse to file: " << filename << '\n';
+ ofstream of(filename);
+ hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
+ PROB p_inv = 0 ;
+ static const PROB ratio(double(fTotal)/eTotal);
+ WordIndex e, f ;
+ for(i = ef.begin(); i != ef.end(); i++){
+ e = ((*i).first).first ;
+ f = ((*i).first).second ;
+ p_inv = ((*i).second.prob) * ratio * (PROB) evlist[e].freq /
+ (PROB) fvlist[f].freq ;
+ if (actual)
+ of << fvlist[f].word << ' ' << evlist[e].word << ' ' << p_inv << '\n';
+ else
+ of << f << ' ' << e << ' ' << p_inv << '\n';
+ }
+}
+*/
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::normalizeTable(const vcbList&engl, const vcbList&french, int iter)
+ // Normalizes the conditional probability P(fj/ei),
+ // i.e. makes sure that Sum over all j of P(fj/e) = 1.
+ // This method reads the counts portion of the table and normalizes it into
+ // the probability portion; then the counts are cleared (i.e. zeroed).
+ // If the resulting probability of an entry is not above PROB_CUTOFF, the
+ // entry is removed from the table.
+ //
+ // iter: number of additional normalization rounds. While iter>0 the
+ // normalized value is stored back into the count field (prob is set to 0)
+ // and the method calls itself with iter-1; only the final round (iter==0)
+ // writes the prob field and zeroes the count.
+ //
+ // Side effects on members visible here: nFrench and nEng are resized and
+ // refilled on every call; total2 is resized and refilled only when iter==2.
+{
+ if( iter==2 )
+ {
+ total2.resize(engl.uniqTokens());for(unsigned int i=0;i<total2.size();i++)total2[i]=0.0;
+ }
+ nFrench.resize(engl.uniqTokens());for(unsigned int i=0;i<nFrench.size();i++)nFrench[i]=0;
+ nEng.resize(french.uniqTokens());for(unsigned int i=0;i<nEng.size();i++)nEng[i]=0;
+ // total[e]: sum of the counts of all entries whose source word is e.
+ Vector<double> total(engl.uniqTokens(),0.0);
+ //Vector<int> nFrench(engl.uniqTokens(), 0);
+ //Vector<int> nEng(french.uniqTokens(), 0);
+
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
+ // Pass 1: per-source-word count totals, plus the number of table entries
+ // per source word (nFrench) and per target word (nEng).
+ for(i = ef.begin(); i != ef.end(); i++){ // for all possible source words e
+ if( iter==2 )
+ total2[((*i).first).first] += (*i).second.count;
+ total[((*i).first).first] += (*i).second.count;
+ nFrench[((*i).first).first]++;
+ nEng[((*i).first).second]++;
+ }
+ // For every source word that has entries, enlarge the total by the factor
+ // 1/(1-probMass), where probMass is PROB_SMOOTH times the number of target
+ // words of the corpus that have no entry for this source word.
+ // NOTE(review): there is no guard against probMass==1 in the division.
+ for(unsigned int k=0;k<engl.uniqTokens();++k)
+ if( nFrench[k] )
+ {
+ double probMass=(french.uniqTokensInCorpus()-nFrench[k])*PROB_SMOOTH;
+ if( probMass<0.0 )
+ cout << k << " french.uniqTokensInCorpus(): " << french.uniqTokensInCorpus() << " nFrench[k]:"<< nFrench[k] << '\n';
+ total[k]+= total[k]*probMass/(1-probMass);
+ }
+ typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator j, k;
+ PROB p ;
+ // Counts the entries that are kept; it is not read again in this function.
+ int nParams=0;
+ // Pass 2: normalize every entry, erasing those not above PROB_CUTOFF.
+ for(j = ef.begin(); j != ef.end(); ){
+ // k is advanced to the successor of j before the body runs, because the
+ // entry at j may be erased below.
+ k = j;
+ k++ ;
+ if( (total[((*j).first).first])>0.0 )
+ p = ((((*j).second).count) /(total[((*j).first).first])) ;
+ else
+ p= 0.0;
+ if (p > PROB_CUTOFF)
+ {
+ if( iter>0 )
+ {
+ // Intermediate round: keep the normalized value in count so that
+ // the recursive call below normalizes it again.
+ ((*j).second).prob = 0 ;
+ ((*j).second).count = p ;
+ }
+ else
+ {
+ // Final round: publish the probability and clear the count.
+ ((*j).second).prob = p ;
+ ((*j).second).count = 0 ;
+ }
+ nParams++;
+ }
+ else {
+ erase(((*j).first).first, ((*j).first).second);
+ }
+ j = k ;
+ }
+ if( iter>0 )
+ return normalizeTable(engl, french, iter-1);
+ else
+ {
+ }
+}
+
+// Loads a t table from `filename`.
+// Every line is expected to hold: source_word_id target_word_id p(target_word|source_word)
+// Each triple read is passed to insert() with a count of 0.0. Progress and the
+// number of entries read are reported on cerr; if the file cannot be opened an
+// error is printed and nothing is inserted.
+// This is the inverse operation of the printTable function. (NAS, 7/11/99)
+template <class COUNT, class PROB>
+void tmodel<COUNT, PROB>::readProbTable(const char *filename)
+{
+  ifstream input(filename);
+  cerr << "Reading t prob. table from " << filename << "\n";
+  if(!input)
+    {
+      cerr << "\nERROR: Cannot open " << filename << "\n";
+      return;
+    }
+  WordIndex sourceId, targetId;
+  PROB value;
+  int entriesRead=0;
+  // Reading stops at the first triple that cannot be extracted.
+  for( ; input >> sourceId >> targetId >> value; ++entriesRead )
+    insert(sourceId, targetId, 0.0, value);
+  cerr << "Read " << entriesRead << " entries in prob. table.\n";
+}
+
+template class tmodel<COUNT,PROB> ;
+
+/* ---------------- End of Method Definitions of class tmodel ---------------*/
+
+
+#endif
diff --git a/GIZA++-v2/TTables.h b/GIZA++-v2/TTables.h
new file mode 100644
index 0000000..4b609ff
--- /dev/null
+++ b/GIZA++-v2/TTables.h
@@ -0,0 +1,424 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/* --------------------------------------------------------------------------*
+ * *
+ * Module : TTables *
+ * *
+ * Prototypes File: TTables.h *
+ * *
+ * Objective: Defines classes and methods for handling I/O for Probability & *
+ * Count tables and also alignment tables *
+ *****************************************************************************/
+
+#ifndef _ttables_h
+#define _ttables_h 1
+
+
+#include "defs.h"
+#include "vocab.h"
+
+#include <assert.h>
+
+#include <iostream>
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <set>
+#include "Vector.h"
+#include <utility>
+
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+#else
+#include <hash_map>
+#endif
+
+#include <fstream.h>
+
+#include "Globals.h"
+
+
+/* The tables defined in the following classes are implemented as hash tables.
+ For example, the t-table is a hash table keyed by a word pair; an alignment
+ is keyed by a vector of integer numbers (sentence positions), and so
+ on. */
+
+
+/*----------- Definition of Hash Function for class tmodel ------------------*/
+
+typedef pair<WordIndex, WordIndex> wordPairIds;
+
+
+class hashpair : public unary_function< pair<WordIndex, WordIndex>, size_t >
+{
+public:
+  // Hash of a word-id pair: the first id is scaled by MAX_W and the second id
+  // is added.  The original author notes that this yields a unique id for
+  // each unique pair.
+  size_t operator() (const pair<WordIndex, WordIndex>& key) const
+  {
+    const size_t scaledFirst = (size_t) MAX_W*key.first;
+    return scaledFirst + key.second;
+  }
+};
+
+
+
+/* ------------------ Class Prototype Definitions ---------------------------*
+ Class Name: tmodel
+ Objective: This defines the underlying data structure for t Tables and t
+ Count Tables. They are defined as a hash table. Each entry in the hash table
+ is the probability ( P(fj/ei) ) or the count collected for it ( C(fj/ei) ).
+ The probability and the count are represented as log integer probability as
+ defined by the class LogProb .
+
+ This class is used to represent t Tables (probability) and n (fertility)
+ Tables and also their corresponding count tables .
+
+ *---------------------------------------------------------------------------*/
+
+//typedef float COUNT ;
+//typedef LogProb PROB ;
+template <class COUNT, class PROB>
+class LpPair {
+ public:
+  COUNT count ;   // count stored for the entry
+  PROB prob ;     // probability stored for the entry
+
+  // Default entry: count and probability both start at zero.
+  LpPair() : count(0), prob(0) {}
+
+  // Entry built from an explicit count and probability.
+  LpPair(COUNT c, PROB p) : count(c), prob(p) {}
+} ;
+
+#ifdef BINARY_SEARCH_FOR_TTABLE
+
+
+template<class T>
+T*mbinary_search(T*x,T*y,unsigned int val)
+{
+  // Halving search over [x,y) for the element whose `first` equals val.
+  // The range is expected to be ordered by ascending `first`.  Only the left
+  // end of the current range is tested for equality; every step keeps either
+  // [x,mid) or [mid,y).  Returns 0 when there is no such element.
+  T*lo=x;
+  T*hi=y;
+  while( hi!=lo )
+    {
+      if( lo->first==val )
+        return lo;
+      if( hi-lo<2 )
+        break;
+      T*const mid=lo+(hi-lo)/2;
+      if( val < mid->first )
+        hi=mid;
+      else
+        lo=mid;
+    }
+  return 0;
+}
+
+template<class T>
+const T*mbinary_search(const T*x,const T*y,unsigned int val)
+{
+  // Read-only halving search over [x,y) for the element whose `first` equals
+  // val.  The range is expected to be ordered by ascending `first`.  Only the
+  // left end of the current range is tested for equality; every step keeps
+  // either [x,mid) or [mid,y).  Returns 0 when there is no such element.
+  const T*lo=x;
+  const T*hi=y;
+  while( hi!=lo )
+    {
+      if( lo->first==val )
+        return lo;
+      if( hi-lo<2 )
+        break;
+      const T*const mid=lo+(hi-lo)/2;
+      if( val < mid->first )
+        hi=mid;
+      else
+        lo=mid;
+    }
+  return 0;
+}
+
+template <class COUNT, class PROB>
+class tmodel{
+  // t-table variant used when BINARY_SEARCH_FOR_TTABLE is defined.  The set of
+  // (e,f) slots is fixed by the co-occurrence file read in the constructor;
+  // afterwards only the count/probability stored in a slot can change.
+  // NOTE(review): the rows allocated with new in the constructor are never
+  // released by this class.
+  typedef LpPair<COUNT, PROB> CPPair;
+ public:
+  int noEnglishWords; // total number of unique source words
+  int noFrenchWords; // total number of unique target words
+  //vector<pair<unsigned int,CPPair> > fs;
+  //vector<unsigned int> es;
+  // lexmat[e] points to the row of (f, count/prob) entries of source word e,
+  // ordered by ascending f; it is 0 for source ids that have no entries.
+  vector< vector<pair<unsigned int,CPPair> >* > lexmat;
+
+  // Resets the entry of (e,f) to zero count and zero probability.  The slot
+  // itself stays in the table; nothing happens when the pair has no slot.
+  void erase(WordIndex e, WordIndex f)
+  {
+    CPPair *p=find(e,f);
+    if(p)
+      *p=CPPair(0,0);
+  }
+
+  // Returns a pointer to the entry of (e,f), or 0 when the pair has no slot.
+  CPPair*find(int e,int f)
+  {
+    // A source id without a row (out of range, or a gap slot that the
+    // constructor set to 0) has no entries; the row pointer must not be
+    // dereferenced in that case.
+    if( e<0 || (unsigned int)e>=lexmat.size() || lexmat[e]==0 || lexmat[e]->empty() )
+      return 0;
+    //pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
+    //pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
+    pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
+    pair<unsigned int,CPPair> *en=be+(*lexmat[e]).size();
+    pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
+    if( x==0 )
+      {
+        //cerr << "A:DID NOT FIND ENTRY: " << e << " " << f << '\n';
+        //abort();
+        return 0;
+      }
+    return &(x->second);
+  }
+
+  // Read-only counterpart of find() above, with the same guards.
+  const CPPair*find(int e,int f)const
+  {
+    if( e<0 || (unsigned int)e>=lexmat.size() || lexmat[e]==0 || lexmat[e]->empty() )
+      return 0;
+    const pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
+    const pair<unsigned int,CPPair> *en=be+(*lexmat[e]).size();
+    //const pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
+    //const pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
+    const pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
+    if( x==0 )
+      {
+        //cerr << "B:DID NOT FIND ENTRY: " << e << " " << f << '\n';
+        //abort();
+        return 0;
+      }
+
+    return &(x->second);
+  }
+public:
+  // Stores count cval and probability pval in the slot of (e,f).  A pair
+  // without a slot cannot be stored in this variant, so it is skipped (as
+  // incCount() does) instead of being written through a null pointer.
+  void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){
+    CPPair *p=find(e,f);
+    if( p )
+      *p=CPPair(cval,pval);
+  }
+
+  // Pointer to the entry of (e,f), or 0 when the pair has no slot.
+  CPPair*getPtr(int e,int f){return find(e,f);}
+
+  // Builds the slot structure from the co-occurrence file fn.  Each record is
+  // a pair "e f"; records must be ordered by e and, within one e, by strictly
+  // increasing f (checked with assert).  All entries start at count 0 / prob 0.
+  tmodel(const string&fn)
+  {
+    int count=0,count2=0;
+    ifstream infile2(fn.c_str());
+    if( !infile2 )
+      cerr << "ERROR: can't read coocurrence file " << fn << '\n';
+    int e,f,olde=-1,oldf=-1;
+    pair<unsigned int,CPPair> cp;
+    vector< pair<unsigned int,CPPair> > cps;
+    while(infile2>>e>>f)
+      {
+        cp.first=f;
+        assert(e>=olde);
+        assert(e>olde ||f>oldf);
+        if( e!=olde&&olde>=0 )
+          {
+            // A new source word starts: store the finished row of olde and
+            // zero the slots of any source ids skipped since the last row.
+            int oldsize=lexmat.size();
+            lexmat.resize(olde+1);
+            for(unsigned int i=oldsize;i<lexmat.size();++i)
+              lexmat[i]=0;
+            lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
+            cps.clear();
+            if( !((*lexmat[olde]).size()==(*lexmat[olde]).capacity()) )
+              cerr << "eRROR: waste of memory: " << (*lexmat[olde]).size() << " " << (*lexmat[olde]).capacity() << endl;
+            count2+=lexmat[olde]->capacity();
+          }
+        cps.push_back(cp);
+        olde=e;
+        oldf=f;
+        count++;
+      }
+    // Store the last row.  When no record was read at all (missing or empty
+    // file) olde is still -1 and there is no row to store; indexing
+    // lexmat[-1] here would be out of bounds.
+    if( olde>=0 )
+      {
+        lexmat.resize(olde+1);
+        lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
+        count2+=lexmat[olde]->capacity();
+      }
+    cout << "There are " << count << " " << count2 << " entries in table" << '\n';
+  }
+
+
+  /* tmodel(const string&fn)
+  {
+  size_t count=0;
+  {
+  ifstream infile1(fn.c_str());
+  if( !infile1 )
+  {
+  cerr << "ERROR: can't read coocurrence file " << fn << '\n';
+  abort();
+  }
+  int e,f;
+  while(infile1>>e>>f)
+  count++;
+  }
+  cout << "There are " << count << " entries in table" << '\n';
+  ifstream infile2(fn.c_str());
+  fs.resize(count);
+  int e,f,olde=-1,oldf=-1;
+  pair<unsigned int,CPPair> cp;
+  count=0;
+  while(infile2>>e>>f)
+  {
+  assert(e>=olde);
+  assert(e>olde ||f>oldf);
+  if( e!=olde )
+  {
+  es.resize(e+1);
+  for(unsigned int i=olde+1;int(i)<=e;++i)
+  es[i]=count;
+  }
+  cp.first=f;
+  assert(count<fs.size());
+  fs[count]=cp;
+  //fs.push_back(cp);
+  olde=e;
+  oldf=f;
+  count++;
+  }
+  assert(count==fs.size());
+  es.push_back(fs.size());
+  cout << fs.size() << " " << count << " coocurrences read" << '\n';
+  }*/
+
+  // Adds inc to the count of (e,f).  Zero increments and pairs without a slot
+  // are ignored.
+  void incCount(WordIndex e, WordIndex f, COUNT inc)
+  {
+    if( inc )
+      {
+        CPPair *p=find(e,f);
+        if( p )
+          p->count += inc ;
+      }
+  }
+
+  // Probability of (e,f), never smaller than PROB_SMOOTH; PROB_SMOOTH is also
+  // returned for pairs without a slot.
+  PROB getProb(WordIndex e, WordIndex f) const
+  {
+    const CPPair *p=find(e,f);
+    if( p )
+      return max(p->prob, PROB_SMOOTH);
+    else
+      return PROB_SMOOTH;
+  }
+
+  // Count of (e,f); 0.0 for pairs without a slot.
+  COUNT getCount(WordIndex e, WordIndex f) const
+  {
+    const CPPair *p=find(e,f);
+    if( p )
+      return p->count;
+    else
+      return 0.0;
+  }
+
+  void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
+  void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
+  void printProbTableInverse(const char *filename,
+                             const Vector<WordEntry>& evlist,
+                             const Vector<WordEntry>& fvlist,
+                             const double eTotal,
+                             const double fTotal,
+                             const bool actual = false ) const;
+  void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
+  void readProbTable(const char *filename);
+};
+
+
+#else
+
+
+template <class COUNT, class PROB>
+class tmodel{
+  typedef LpPair<COUNT, PROB> CPPair;
+ public:
+  int noEnglishWords; // total number of unique source words
+  int noFrenchWords; // total number of unique target words
+  // The table itself: (source id, target id) -> count/probability entry.
+  hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> > ef;
+
+  // Removes the entry of source id e / target id f from the table.
+  void erase(WordIndex e, WordIndex f)
+  {
+    const wordPairIds key(e, f);
+    ef.erase(key);
+  }
+
+public:
+  Vector<PROB> total2;
+  Vector<int> nFrench;
+  Vector<int> nEng;
+
+
+  // methods;
+
+  // insert: stores count cval and probability pval for P(fj/ei), creating the
+  // entry when it is missing.  Both values default to 0.0.
+  void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){
+    CPPair&slot = ef[wordPairIds(e, f)];
+    slot.count = cval ;
+    slot.prob = pval ;
+  }
+
+  // Reference to the entry of the word pair; the entry is created when it
+  // does not exist yet.
+  CPPair&getRe(WordIndex e, WordIndex f)
+  {
+    return ef[wordPairIds(e, f)];
+  }
+
+  // Pointer to the entry of an existing word pair, or the zero pointer (NULL)
+  // when the pair is not in the table.  Never creates an entry.
+  CPPair*getPtr(WordIndex e, WordIndex f)
+  {
+    typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator pos = ef.find(wordPairIds(e, f));
+    if( pos == ef.end() )
+      return 0 ;
+    return &(pos->second) ;
+  }
+
+  // Adds inc to the count of the word pair; a missing pair is created first.
+  // A zero increment leaves the table untouched.
+  void incCount(WordIndex e, WordIndex f, COUNT inc)
+  {
+    if( inc )
+      {
+        CPPair&slot = ef[wordPairIds(e, f)];
+        slot.count += inc ;
+      }
+  }
+
+  // Probability P(fj/ei) from the table, never smaller than the floor value
+  // PROB_SMOOTH; the floor is also returned when the pair does not exist.
+  PROB getProb(WordIndex e, WordIndex f) const
+  {
+    typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator pos = ef.find(wordPairIds(e, f));
+    if( pos != ef.end() )
+      return max((pos->second).prob, PROB_SMOOTH);
+    return PROB_SMOOTH;
+  }
+
+  // Count stored for the pair (fj/ei); 0 when the pair does not exist.
+  COUNT getCount(WordIndex e, WordIndex f) const
+  {
+    typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator pos = ef.find(wordPairIds(e, f));
+    if( pos != ef.end() )
+      return (pos->second).count;
+    return 0;
+  }
+
+  // Read-only reference to the underlying hash table.
+  inline const hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >& getHash(void) const
+  {
+    return ef;
+  }
+
+  // Forwards n to the hash table's own resize().
+  inline void resize(WordIndex n)
+  {
+    ef.resize(n);
+  }
+
+  // Print the t table to the given file; with the flag set, the actual source
+  // and target words are printed instead of their token ids.
+  void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
+  void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
+
+  // Dump the inverse of the t table (i.e. P(ei/fj)) to the given file name;
+  // if the given flag is true, actual words are printed, not token ids.
+  void printProbTableInverse(const char *filename,
+                             const Vector<WordEntry>& evlist,
+                             const Vector<WordEntry>& fvlist,
+                             const double eTotal,
+                             const double fTotal,
+                             const bool actual = false ) const;
+
+  // Normalize the table, i.e. make sure P(fj/ei) summed over all j equals 1.
+  void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
+
+  void readProbTable(const char *filename);
+  // void readAsFertilityTable(const char *filename);
+};
+/*--------------- End of Class Definition for tmodel -----------------------*/
+
+#endif
+
+#endif
diff --git a/GIZA++-v2/Vector.h b/GIZA++-v2/Vector.h
new file mode 100644
index 0000000..a550e82
--- /dev/null
+++ b/GIZA++-v2/Vector.h
@@ -0,0 +1,424 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/*--
+Vector: checked vector implementation
+
+Franz Josef Och (30/07/99)
+--*/
+#ifndef ARRAY_H_DEFINED
+#define ARRAY_H_DEFINED
+#include "mystl.h"
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <functional>
+#include <assert.h>
+
+
+#ifdef NDEBUG
+
+#include <vector>
+#define Vector vector
+template<class T> ostream& operator<<(ostream&o, const Vector<T>&a)
+{
+  // Writes "Vector(<size>){ ", then " <index>: <value> ;" for every element,
+  // then the closing "}\n".
+  o << "Vector(" << a.size() << "){ ";
+  unsigned int pos=0;
+  while( pos<a.size() )
+    {
+      o << " " << pos << ": " << a[pos] << " ;";
+      pos++;
+    }
+  o << "}\n";
+  return o;
+}
+
+#else
+
+#define ARRAY_DEBUG
+#define memo_del(a, b)
+#define memo_new(a)
+
+template<class T> class Vector // growable array whose element access is range-checked (see operator[])
+{
+ private:
+ T *p; // buffer obtained with new T[...]; 0 while nothing is allocated
+ int realSize; // number of slots allocated in p
+ int maxWritten; // highest valid index; size() is maxWritten+1
+
+ void copy(T *a, const T *b, int n);
+ void copy(T *a, T *b, int n);
+ void _expand(); // enlarges the buffer to 2*realSize+1 slots
+
+ public:
+ Vector() // empty vector without storage
+ : p(0), realSize(0), maxWritten(-1)
+ {
+#ifdef VERY_ARRAY_DEBUG
+ cout << "MAKE ARRAY: " << this<<" "<<(void*)p << '\n';
+#endif
+ }
+ Vector(const Vector<T> &x) // copies the x.maxWritten+1 valid elements only, not x's spare slots
+ : p(new T[x.maxWritten+1]), realSize(x.maxWritten+1), maxWritten(x.maxWritten)
+ {
+ memo_new(p);
+ copy(p, x.p, realSize);
+#ifdef VERY_ARRAY_DEBUG
+ cout << "MAKE ARRAY copy: " << this << " " << realSize <<" "<<(void*)p<< '\n';
+#endif
+ }
+ explicit Vector(int n) // n elements as produced by new T[n]; no value is assigned here
+ : p(new T[n]), realSize(n), maxWritten(n-1)
+ {
+ memo_new(p);
+#ifdef VERY_ARRAY_DEBUG
+ cout << "MAKE ARRAY with parameter n: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+ }
+ Vector(int n, const T&_init) // n elements, each assigned _init
+ : p(new T[n]), realSize(n), maxWritten(n-1)
+ {
+ memo_new(p);
+ for(int iii=0;iii<n;iii++)p[iii]=_init;
+#ifdef VERY_ARRAY_DEBUG
+ cout << "MAKE ARRAY with parameter n and init: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+ }
+
+ ~Vector()
+ {
+#ifdef VERY_ARRAY_DEBUG
+ cout << "FREE ARRAY: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+ delete [] p;
+ memo_del(p, 1);
+#ifndef NDEBUG
+ p=0;realSize=-1;maxWritten=-1; // resets the fields after the buffer is freed (presumably to expose later use)
+#endif
+ }
+
+ Vector<T>& operator=(const Vector<T>&x) // replaces the contents with a copy of x's valid elements
+ {
+ if( this!= &x ) // self-assignment leaves the object untouched
+ {
+#ifdef VERY_ARRAY_DEBUG
+ cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+ delete [] p;
+ memo_del(p, 1);
+ realSize = x.maxWritten+1; // the new buffer holds exactly x.size() slots
+ maxWritten = x.maxWritten;
+ p = new T[realSize];
+ memo_new(p);
+ copy(p, x.p, realSize);
+#ifdef VERY_ARRAY_DEBUG
+ cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+ }
+ return *this;
+ }
+
+ Vector<T>& operator=(Vector<T>&x) // same statements as the const overload, for non-const right-hand sides
+ {
+ if( this!= &x )
+ {
+#ifdef VERY_ARRAY_DEBUG
+ cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+ delete [] p;
+ memo_del(p, 1);
+ realSize = x.maxWritten+1;
+ maxWritten = x.maxWritten;
+ p = new T[realSize];
+ memo_new(p);
+ copy(p, x.p, realSize);
+#ifdef VERY_ARRAY_DEBUG
+ cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+ }
+ return *this;
+ }
+
+ void allowAccess(int n) // makes index n valid: grows the buffer as needed and raises maxWritten to at least n
+ {
+ while( realSize<=n )
+ _expand();
+ maxWritten=max(maxWritten, n);
+ assert( maxWritten<realSize );
+ }
+ void resize(int n) // sets size() to n; the buffer only ever grows, shrinking just lowers maxWritten
+ {
+ while( realSize<n )
+ _expand();
+ maxWritten=n-1;
+ }
+ void clear() // size() becomes 0; the buffer is kept
+ {
+ resize(0);
+ }
+ void reserve(int n) // ensures at least n allocated slots and leaves size() unchanged
+ {
+ int maxOld=maxWritten;
+ resize(n);
+ maxWritten=maxOld;
+ }
+ void sort(int until=-1) // ascending std::sort of the first `until` elements; -1 means all of them
+ {
+ if( until== -1 ) until=size();
+ std::sort(p, p+until);
+ }
+ void invsort(int until=-1) // like sort(), but ordered with greater<T> (descending)
+ {
+ if( until== -1 ) until=size();
+ std::sort(p, p+until, greater<T>());
+ }
+ void init(int n, const T&_init) // discards the contents and refills the vector with n copies of _init
+ {
+#ifdef VERY_ARRAY_DEBUG
+ cout << "FREE ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+ delete []p;
+ memo_del(p, 1);
+ p=new T[n];
+ memo_new(p);
+ realSize=n;
+ maxWritten=n-1;
+ for(int iii=0;iii<n;iii++)p[iii]=_init;
+#ifdef VERY_ARRAY_DEBUG
+ cout << "NEW ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+ }
+ inline unsigned int size() const // number of valid elements
+ {assert( maxWritten<realSize );
+ return maxWritten+1;}
+ inline int low() const // lowest valid index, always 0
+ { return 0; }
+ inline int high() const // highest valid index, -1 when empty
+ { return maxWritten; }
+ int findMax() const;
+ int findMin() const;
+ const void errorAccess(int n) const; // called by the checked accessors when index n is out of range
+ inline T*getPointerToData(){return p;}
+ inline T*begin(){return p;}
+ inline T*end(){return p+maxWritten+1;} // one past the last valid element
+ inline T& operator[](int n)
+ {
+#ifndef NDEBUG
+ if( n<0 || n>maxWritten ) // valid indices are 0..maxWritten
+ errorAccess(n);
+#endif
+ return p[n];
+ }
+ inline const T& operator[](int n) const
+ {
+#ifndef NDEBUG
+ if(n<0 || n>maxWritten )
+ errorAccess(n);
+#endif
+ return p[n];
+ }
+ inline const T& get(int n) const // read-only access with the same range check as operator[]
+ {
+#ifndef NDEBUG
+ if(n<0 || n>maxWritten )
+ errorAccess(n);
+#endif
+ return p[n];
+ }
+ const T&top(int n=0) const // element n places before the last one; n=0 is the last element
+ {return (*this)[maxWritten-n];}
+ T&top(int n=0)
+ {return (*this)[maxWritten-n];}
+ const T&back(int n=0) const // same element as top(n)
+ {return (*this)[maxWritten-n];}
+ T&back(int n=0)
+ {return (*this)[maxWritten-n];}
+ T&push_back(const T&x) // appends a copy of x and returns a reference to the stored element
+ {
+ allowAccess(maxWritten+1);
+ (*this)[maxWritten]=x;
+ return top();
+ }
+ bool writeTo(ostream&out) const // writes "Vector <size> ..." followed by one element per line via writeOb
+ {
+ out << "Vector ";
+ out << size() << " ";
+ out << a << '\n'; // NOTE(review): no `a` is declared in this class - confirm where this name comes from
+ for(int iv=0;iv<=maxWritten;iv++)
+ {
+ writeOb(out, (*this)[iv]);
+ out << '\n';
+ }
+ return 1;
+ }
+ bool readFrom(istream&in) // reads the layout written by writeTo; returns 0 on a bad stream or a missing "Vector" tag
+ {
+ string s;
+ if( !in )
+ {
+ cerr << "ERROR(Vector): file cannot be opened.\n";
+ return 0;
+ }
+ in >> s;
+ if( !(s=="Vector") )
+ {
+ cerr << "ERROR(Vector): Vector!='"<<s<<"'\n";
+ return 0;
+ }
+ int biggest;
+ in >> biggest; // element count stored after the tag
+ in >> a; // NOTE(review): no `a` is declared in this class - confirm where this name comes from
+ resize(biggest);
+ for(int iv=0;iv<size();iv++)
+ {
+ readOb(in, (*this)[iv]);
+ }
+ return 1;
+ }
+};
+
+template<class T> bool operator==(const Vector<T> &x, const Vector<T> &y)
+{
+  // Equal when both arguments are the same object, or when they have the
+  // same length and all elements compare equal pairwise.
+  if( &x == &y )
+    return 1;
+  if( y.size()!=x.size() )
+    return 0;
+  for(unsigned int pos=0;pos<x.size();pos++)
+    {
+      if( !(x[pos]==y[pos]) )
+        return 0;
+    }
+  return 1;
+}
+template<class T> bool operator!=(const Vector<T> &x, const Vector<T> &y)
+{
+  // Defined as the negation of operator==.
+  const bool same=(x==y);
+  return !same;
+}
+
+template<class T> bool operator<(const Vector<T> &x, const Vector<T> &y)
+{
+  // Lexicographic "less than".  An object is never less than itself.  When x
+  // is the longer vector, the answer is derived from the swapped comparison.
+  if( &x == &y )
+    return 0;
+  if( y.size()<x.size() )
+    return !(y<x);
+  for(int pos=0;pos<x.size();pos++)
+    {
+      assert( pos!=y.size() );
+      if( x[pos]<y[pos] )
+        return 1;
+      if( y[pos]<x[pos] )
+        return 0;
+    }
+  // All of x matches the start of y: x is smaller only if y is longer.
+  return x.size()!=y.size();//??
+}
+
+
+template<class T> const void Vector<T>:: errorAccess(int n) const
+{
+  // Reports the offending index together with maxWritten, realSize and the
+  // buffer address, first on cerr and then on cout, and then stops the
+  // program through assert(0) / abort().
+  ostream*const sinks[2]={&cerr,&cout};
+  for(int s=0;s<2;s++)
+    *sinks[s] << "ERROR: Access to array element " << n
+              << " (" << maxWritten << ", " << realSize << ", " << (void*)p << ")\n";
+  assert(0);
+#ifndef DEBUG
+  abort();
+#endif
+}
+
+template<class T> ostream& operator<<(ostream&o, const Vector<T>&a)
+{
+  // Writes "Vector(<size>){ ", then " <index>: <value> ;" for every element,
+  // then the closing "}\n".
+  o << "Vector(" << a.size() << "){ ";
+  unsigned int pos=0;
+  while( pos<a.size() )
+    {
+      o << " " << pos << ": " << a[pos] << " ;";
+      pos++;
+    }
+  o << "}\n";
+  return o;
+}
+
+template<class T> istream& operator>>(istream&in, Vector<T>&)
+{
+  // Placeholder: nothing is extracted and the stream is returned unchanged.
+  return in;
+}
+
+template<class T> int Hash(const Vector<T>&a)
+{
+  // Sum of Hash(element)*(index+1) over all elements, plus 47 times the
+  // number of elements.
+  int acc=0;
+  int pos=0;
+  while( pos<a.size() )
+    {
+      acc+=Hash(a[pos])*(pos+1);
+      pos++;
+    }
+  return acc+a.size()*47;
+}
+template<class T> void Vector<T>::copy(T *aa, const T *bb, int n)
+{
+  // Assigns the first n elements of bb to aa, front to back; nothing is done
+  // when n is not positive.
+  int left=n;
+  while( left>0 )
+    {
+      *aa=*bb;
+      ++aa;
+      ++bb;
+      --left;
+    }
+}
+template<class T> void Vector<T>::copy(T *aa, T *bb, int n)
+{
+  // Overload for a non-const source: assigns the first n elements of bb to
+  // aa, front to back; nothing is done when n is not positive.
+  int left=n;
+  while( left>0 )
+    {
+      *aa=*bb;
+      ++aa;
+      ++bb;
+      --left;
+    }
+}
+
+template<class T> void Vector<T>::_expand()
+{
+  // Replaces the buffer by one with 2*realSize+1 slots, carries over all
+  // previously allocated slots and frees the old buffer.  maxWritten is not
+  // touched.
+#ifdef VERY_ARRAY_DEBUG
+  cout << "FREE ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+  T*const previous=p;
+  const int previousSize=realSize;
+  realSize=previousSize*2+1;
+  p=new T[realSize];
+  memo_new(p);
+  copy(p, previous, previousSize);
+  delete [] previous;
+  memo_del(previous, 1);
+#ifdef VERY_ARRAY_DEBUG
+  cout << "NEW ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << '\n';
+#endif
+}
+
+template<class T> int Vector<T>::findMax() const
+{
+  // Index of the largest element under operator<; on ties the earliest one
+  // is reported.  An empty vector yields -1.
+  if( size()==0 )
+    return -1;
+  int best=0;
+  for(int cand=1;cand<size();cand++)
+    {
+      if( (*this)[best]<(*this)[cand] )
+        best=cand;
+    }
+  return best;
+}
+template<class T> int Vector<T>::findMin() const
+{
+  // Index of the smallest element under operator<; on ties the earliest one
+  // is reported.  An empty vector yields -1.
+  if( size()==0 )
+    return -1;
+  int best=0;
+  for(int cand=1;cand<size();cand++)
+    {
+      if( (*this)[cand]<(*this)[best] )
+        best=cand;
+    }
+  return best;
+}
+
+#endif
+
+#endif
diff --git a/GIZA++-v2/WordClasses.h b/GIZA++-v2/WordClasses.h
new file mode 100644
index 0000000..1fea083
--- /dev/null
+++ b/GIZA++-v2/WordClasses.h
@@ -0,0 +1,95 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef WordClasses_h_DEFINED
+#define WordClasses_h_DEFINED
+#include <map>
+#include <string>
+#include <set>
+
+class WordClasses
+{
+ private:
+  map<string,string> Sw2c;   // word -> class name, as read by read()
+  map<string,int> Sc2int;    // class name -> class number
+  Vector<string> Sint2c;     // class number -> class name
+  Vector<int> w2c;           // word id (as given by the mapper) -> class number
+  unsigned int classes;      // number of classes registered so far
+ public:
+  // Starts out with the single class "0", which has number 0.
+  WordClasses()
+    : classes(1)
+  {
+    Sc2int["0"]=0;
+    Sint2c.push_back("0");
+  }
+
+  // Reads "word class" lines from in.  Each class name seen for the first
+  // time gets the next free class number; m maps a word to its integer id.
+  // Afterwards w2c holds the class number of every word id up to the largest
+  // id seen (ids without a class map to 0).  Always returns 1.
+  template<class MAPPER> bool read(istream&in,const MAPPER&m)
+  {
+    int maxword=0;
+    for(string sline;getline(in,sline);)
+      {
+        istrstream iline(sline.c_str());
+        string word,wclass;
+        iline>>word>>wclass;
+        maxword=max(m(word),maxword);
+        assert(Sw2c.count(word)==0);   // every word may be listed only once
+        Sw2c[word]=wclass;
+        if( Sc2int.count(wclass)==0 )
+          {
+            Sc2int[wclass]=classes++;
+            Sint2c.push_back(wclass);
+            assert(classes==Sint2c.size());
+          }
+      }
+    w2c=Vector<int>(maxword+1,0);
+    map<string,string>::const_iterator entry=Sw2c.begin();
+    while( entry!=Sw2c.end() )
+      {
+        w2c[m(entry->first)]=Sc2int[entry->second];
+        ++entry;
+      }
+    cout << "Read classes: #words: " << maxword << " " << " #classes: "<< classes <<endl;
+    return 1;
+  }
+
+  // Class number of word id w; 0 for ids outside the table.
+  int getClass(int w)const
+  {
+    if( w<0 || int(w)>=int(w2c.size()) )
+      return 0;
+    return w2c[w];
+  }
+
+  // Class number of the class named x; warns on cerr and yields 0 when the
+  // name is unknown.
+  const int operator()(const string&x)const
+  {
+    const map<string,int>::const_iterator hit=Sc2int.find(x);
+    if( hit!=Sc2int.end() )
+      return hit->second;
+    cerr << "WARNING: class " << x << " not found.\n";
+    return 0;
+  }
+
+  // Name of class number cnr; "0" for numbers outside the table.
+  string classString(unsigned int cnr)const
+  {
+    if( cnr>=Sint2c.size() )
+      return string("0");
+    return Sint2c[cnr];
+  }
+};
+
+#endif
diff --git a/GIZA++-v2/alignment.cpp b/GIZA++-v2/alignment.cpp
new file mode 100644
index 0000000..55a2e5c
--- /dev/null
+++ b/GIZA++-v2/alignment.cpp
@@ -0,0 +1,38 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/*--
+alignment: 'checked' alignment representation with automatic calculation
+ of fertilities
+Franz Josef Och (30/07/99)
+--*/
+#include "alignment.h"
+
+ostream&operator<<(ostream&out, const alignment&a)
+{
+  // Prints "AL(l:<l>,m:<m>)(a: ...)(fert: ...) c:\n", listing a(j) for
+  // j=1..m and fert(i) for i=0..l, each followed by a blank.  l and m are
+  // taken from the sizes of the internal vectors.
+  const int m=a.a.size()-1;
+  const int l=a.f.size()-1;
+  out << "AL(l:"<<l<<",m:"<<m<<")(a: ";
+  for(int j=1;j<=m;j++)
+    out << a(j) << ' ';
+  out << ")(fert: ";
+  for(int i=0;i<=l;i++)
+    out << a.fert(i) << ' ';
+  out << ") c:"<<"\n";
+  return out;
+}
+
diff --git a/GIZA++-v2/alignment.h b/GIZA++-v2/alignment.h
new file mode 100644
index 0000000..17774c6
--- /dev/null
+++ b/GIZA++-v2/alignment.h
@@ -0,0 +1,227 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/*--
+alignment: 'checked' alignment representation with autom. calc. of fertilities
+Franz Josef Och (30/07/99)
+--*/
+#ifndef alignment_h_fjo_defined
+#define alignment_h_fjo_defined
+#include "Vector.h"
+#include <assert.h>
+#include "defs.h"
+#include "myassert.h"
+
+class al_struct
+{
+ public:
+  // Links to the previous and the next position; both start out as 0.
+  PositionIndex prev,next;
+  al_struct()
+    : prev(0),next(0)
+  {}
+};
+
+
+class alignment
+{
+  // Alignment of positions j=1..m to positions i=0..l.  Besides the mapping
+  // a[j] itself, the class keeps per-i bookkeeping up to date on every
+  // change: the fertility f[i], the sum of aligned positions positionSum[i],
+  // and a linked list (als_i / als_j) of the positions j aligned to each i
+  // in ascending order.
+ private:
+  Vector<PositionIndex> a;              // a[j]: the i that position j is aligned to
+  Vector<PositionIndex> positionSum,f;  // positionSum[i]: sum of all j with a[j]==i; f[i]: number of such j
+ public:
+  Vector<PositionIndex> als_i;          // als_i[i]: smallest j aligned to i, 0 if there is none
+  Vector<al_struct> als_j;              // als_j[j]: neighbours of j among the positions aligned to the same i
+  PositionIndex l,m;
+
+  // Empty alignment.  l and m are set to 0 so that get_l(), get_m(), valid()
+  // and the comparison operators never read indeterminate values.
+  alignment()
+    : l(0), m(0)
+  {}
+
+  // Alignment for sizes _l and _m in which every position j=1.._m is
+  // aligned to i=0.
+  alignment(PositionIndex _l, PositionIndex _m)
+    : a(_m+1, (PositionIndex)0),
+      positionSum(_l+1, (PositionIndex)0), f(_l+1, (PositionIndex)0), als_i(_l+1,0),als_j(_m+1),l(_l), m(_m)
+  {
+    f[0]=m;
+    // Chain 1 -> 2 -> ... -> m as the list of positions aligned to 0.
+    for(PositionIndex j=1;j<=m;j++)
+      {
+        if( j>1 )
+          als_j[j].prev= j-1;
+        if( j<m )
+          als_j[j].next= j+1;
+      }
+    als_i[0]=1;
+  }
+  PositionIndex get_l()const
+  {return l;}
+  PositionIndex get_m()const
+  {return m;}
+
+  // Aligns position j to i.
+  void doMove(int i,int j)
+  {
+    set(j,i);
+  }
+
+  // Exchanges the alignments of positions j1 and j2.
+  void doSwap(int j1,int j2)
+  {
+    int aj1=a[j1],aj2=a[j2];
+    set(j1,aj2);
+    set(j2,aj1);
+  }
+
+  // Aligns position j to aj and updates fertilities, position sums and the
+  // linked lists accordingly.
+  void set(PositionIndex j, PositionIndex aj)
+  {
+    // Validate the arguments before a[j] is read.
+    massert(j>0);massert(j<a.size());massert(aj<f.size());
+    PositionIndex old_aj=a[j];
+    massert(old_aj<f.size());massert(f[old_aj]>0);
+    positionSum[old_aj]-=j;
+    // unlink j from the list of its old i
+    PositionIndex prev=als_j[j].prev;
+    PositionIndex next=als_j[j].next;
+    if( next )
+      als_j[next].prev=prev;
+    if( prev )
+      als_j[prev].next=next;
+    else
+      als_i[old_aj]=next;
+
+    // find the new position: first list element of aj that is not below j
+    PositionIndex lfd=als_i[aj],llfd=0;
+    while( lfd && lfd<j )
+      lfd = als_j[llfd=lfd].next;
+
+    // insert j between llfd and lfd
+    als_j[j].prev=llfd;
+    als_j[j].next=lfd;
+    if( llfd )
+      als_j[llfd].next=j;
+    else
+      als_i[aj]=j;
+    if( lfd )
+      als_j[lfd].prev=j;
+
+    f[old_aj]--;
+    positionSum[aj]+=j;
+    f[aj]++;
+    a[j]=aj;
+  }
+  const Vector<PositionIndex>& getAlignment() const
+  {return a ;}
+
+  // The i that position j is aligned to.
+  PositionIndex get_al(PositionIndex j)const
+  {
+    massert(j<a.size());
+    return a[j];
+  }
+  // Same as get_al(j).
+  PositionIndex operator()(PositionIndex j)const
+  {
+    massert(j<a.size());
+    return a[j];
+  }
+  // Fertility of i: the number of positions j aligned to it.
+  PositionIndex fert(PositionIndex i)const
+  {
+    massert(i<f.size());
+    return f[i];
+  }
+  // Smallest j aligned to i (0 if none), read from the maintained list head.
+  PositionIndex get_head(PositionIndex i)const
+  {
+    massert( als_i[i]==_get_head(i) );
+    return als_i[i];
+  }
+  // Average of the positions aligned to i, rounded up; 0 for i==0.
+  // i must have a non-zero fertility.
+  PositionIndex get_center(PositionIndex i)const
+  {
+    if( i==0 )return 0;
+    massert(((positionSum[i]+f[i]-1)/f[i]==_get_center(i)));
+    return (positionSum[i]+f[i]-1)/f[i];
+  }
+  // Reference implementation of get_head(): scans a[] for the first j
+  // aligned to i.
+  PositionIndex _get_head(PositionIndex i)const
+  {
+    if( fert(i)==0 )return 0;
+    for(PositionIndex j=1;j<=m;j++)
+      if( a[j]==i )
+        return j;
+    return 0;
+  }
+  // Reference implementation of get_center(): recomputes the position sum
+  // from a[].
+  PositionIndex _get_center(PositionIndex i)const
+  {
+    if( i==0 )return 0;
+    massert(fert(i));
+    PositionIndex sum=0;
+    for(PositionIndex j=1;j<=m;j++)
+      if( a[j]==i )
+        sum+=j;
+    return (sum+fert(i)-1)/fert(i);
+  }
+  // Nearest k<i with non-zero fertility; 0 when there is none or i==0.
+  PositionIndex prev_cept(PositionIndex i)const
+  {
+    if( i==0 )return 0;
+    PositionIndex k=i-1;
+    while(k&&fert(k)==0)
+      k--;
+    return k;
+  }
+  // Nearest k>i with non-zero fertility; l+1 when there is none.
+  PositionIndex next_cept(PositionIndex i)const
+  {
+    PositionIndex k=i+1;
+    while(k<l+1&&fert(k)==0)
+      k++;
+    return k;
+  }
+  // Previous position aligned to the same i as j; 0 when j is the first one.
+  PositionIndex prev_in_cept(PositionIndex j)const
+  {
+    //PositionIndex k=j-1;
+    //while(k&&a[k]!=a[j])
+    //k--;
+    //assert( als_j[j].prev==k );
+    //assert(k);
+    //return k;
+    massert(als_j[j].prev==0||a[als_j[j].prev]==a[j]);
+    return als_j[j].prev;
+  }
+  friend ostream &operator<<(ostream&out, const alignment&a);
+  // Two alignments are equal when a(j) agrees for every j=1..m.
+  friend bool operator==(const alignment&a, const alignment&b)
+  {
+    massert(a.a.size()==b.a.size());
+    for(PositionIndex j=1;j<=a.get_m();j++)
+      if(a(j)!=b(j))
+        return 0;
+    return 1;
+  }
+  // Lexicographic order over a(1)..a(m).
+  friend bool operator<(const alignment&x, const alignment&y)
+  {
+    massert(x.get_m()==y.get_m());
+    for(PositionIndex j=1;j<=x.get_m();j++)
+      if( x(j)<y(j) )
+        return 1;
+      else if( y(j)<x(j) )
+        return 0;
+    return 0;
+  }
+  // Number of positions j at which x and y are aligned differently.
+  friend int differences(const alignment&x, const alignment&y){
+    int count=0;
+    massert(x.get_m()==y.get_m());
+    for(PositionIndex j=1;j<=x.get_m();j++)
+      count += (x(j)!=y(j));
+    return count;
+  }
+  // Rejects alignments in which more than half of the m positions are
+  // aligned to i=0, or in which some i>=1 reaches MAX_FERTILITY.
+  bool valid()const
+  {
+    if( 2*f[0]>m )
+      return 0;
+    for(unsigned int i=1;i<=l;i++)
+      if( f[i]>=MAX_FERTILITY )
+        return 0;
+    return 1;
+  }
+  friend class transpair_model5;
+};
+#endif
diff --git a/GIZA++-v2/collCounts.cpp b/GIZA++-v2/collCounts.cpp
new file mode 100644
index 0000000..6e6ef69
--- /dev/null
+++ b/GIZA++-v2/collCounts.cpp
@@ -0,0 +1,293 @@
+/*
+
+Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "alignment.h"
+#include "transpair_model3.h"
+#include <map>
+#include "collCounts.h"
+#include "MoveSwapMatrix.h"
+#include "D5Tables.h"
+#include "transpair_model5.h"
+#include "transpair_modelhmm.h"
+#include "Parameter.h"
+
+extern float COUNTINCREASE_CUTOFF_AL;
+// unifies collectCountsOverAlignments and findAlignmentNeighborhood FJO-20/07/99
+// Accumulates weighted counts for the center alignment held in msc and for
+// every alignment reachable from it by one move or one swap.
+//   msc         - move/swap matrix; read through msc(j), fert(), cmove(),
+//                 cswap(), isDelMove(), isDelSwap(), isCenterDeleted()
+//   ascore      - weight of the center alignment
+//   dtcount     - (i,j) link counts, incremented in place
+//   ncount      - (i,fertility) counts, incremented in place
+//   p1count,
+//   p0count     - incremented in place from the fertility of position 0
+//   total_count - incremented by the total weight of this neighborhood
+// Returns the number of alignments that contributed.
+template<class TRANSPAIR>
+int collectCountsOverNeighborhood(const MoveSwapMatrix<TRANSPAIR>&msc,LogProb ascore,Array2<LogProb,Vector<LogProb> >&dtcount,Array2<LogProb,Vector<LogProb> >&ncount,LogProb&p1count,LogProb&p0count,LogProb&total_count)
+{
+  int nAl=0;
+  const PositionIndex l=msc.get_l(),m=msc.get_m();
+  Array2<LogProb,Vector<LogProb> > cmove(l+1,m+1),cswap(l+1,m+1);
+  Vector<LogProb> negmove(m+1),negswap(m+1),plus1fert(l+1),minus1fert(l+1);
+  LogProb total_move,total_swap;
+  // the center alignment itself is booked together with the moves
+  if( msc.isCenterDeleted()==0 )
+    {
+      total_move+=ascore;
+      nAl++;
+    }
+  // every admissible move of position j to a different i
+  for(PositionIndex j=1;j<=m;j++)
+    for(PositionIndex i=0;i<=l;i++)
+      if( msc(j)!=i && !msc.isDelMove(i,j) )
+        {
+          LogProb newscore=ascore*msc.cmove(i,j);
+          total_move+=newscore;
+          nAl++;
+          cmove(i,j)+=newscore;
+          negmove[j]+=newscore;
+          plus1fert[i]+=newscore;
+          minus1fert[msc(j)]+=newscore;
+        }
+  // every admissible swap of two positions with different links
+  for(PositionIndex j1=1;j1<=m;j1++)
+    for(PositionIndex j2=j1+1;j2<=m;j2++)
+      if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) )
+        {
+          LogProb newscore=ascore*msc.cswap(j1,j2);
+          total_swap+=newscore;
+          nAl++;
+          cswap(msc(j1),j2)+=newscore;
+          cswap(msc(j2),j1)+=newscore;
+          negswap[j1]+=newscore;
+          negswap[j2]+=newscore;
+        }
+  // Keep this neighborhood's total in a local: the formulas below need the
+  // weight of this neighborhood only, not whatever the caller has already
+  // accumulated in total_count.
+  const LogProb neighborhood_total(total_move+total_swap);
+  total_count+=neighborhood_total;
+  for(PositionIndex j=1;j<=m;j++)
+    for(PositionIndex i=0;i<=l;i++)
+      dtcount(i,j) += ((i==msc(j)) ? (neighborhood_total-(negmove[j]+negswap[j])) : (cswap(i,j)+cmove(i,j)));
+  for(PositionIndex i=1;i<=l;i++)
+    {
+      LogProb temp=minus1fert[i]+plus1fert[i];
+      if( msc.fert(i)<MAX_FERTILITY )
+        ncount(i,msc.fert(i))+=neighborhood_total-temp;
+      if(msc.fert(i)>0&&msc.fert(i)-1<MAX_FERTILITY)
+        ncount(i,msc.fert(i)-1)+=minus1fert[i];
+      else
+        if( minus1fert[i]!=0.0 )
+          cerr << "ERROR: M1Fa: " << minus1fert[i] << ' ' << i << ' ' << msc.fert(i)<< endl;
+      if(msc.fert(i)+1<MAX_FERTILITY)
+        ncount(i,msc.fert(i)+1)+=plus1fert[i];
+    }
+  LogProb temp=minus1fert[0]+plus1fert[0];
+  p1count += (neighborhood_total-temp)*(LogProb)msc.fert(0);
+  p0count += (neighborhood_total-temp)*(LogProb)(m-2*msc.fert(0));
+  if( msc.fert(0)>0 )
+    {
+      p1count += (minus1fert[0])*(LogProb)(msc.fert(0)-1);
+      p0count += (minus1fert[0])*(LogProb)(m-2*(msc.fert(0)-1));
+    }
+  else
+    if( minus1fert[0]!=0.0 )
+      cerr << "ERROR: M1Fb: " << minus1fert[0] << endl;
+  // signed arithmetic so that m-2*(fert(0)+1) cannot wrap around
+  if(int(m)-2*(int(msc.fert(0))+1)>=0)
+    {
+      p1count += (plus1fert[0])*(LogProb)(msc.fert(0)+1);
+      p0count += (plus1fert[0])*(LogProb)(m-2*(msc.fert(0)+1));
+    }
+  msc.check();
+  return nAl;
+}
+
+// Overload taking an untyped void* in the model-table slot: it ignores all
+// of its arguments, collects nothing and reports a sum of 0.0.
+// NOTE(review): presumably chosen when the caller has no d4/d5 table to
+// fill - confirm against the call sites.
+template<class TRANSPAIR>
+double collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&,LogProb,void*)
+{
+ return 0.0;
+}
+
+// Adds normalized_ascore to one d4Table count per position j of msc that is
+// linked to a non-zero index: the "first" count when j is the head of its
+// cept, the "bigger" count otherwise. Positions linked to 0 add nothing.
+template<class TRANSPAIR>
+void _collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&Mmsc,const alignment&msc,const TRANSPAIR&ef,LogProb normalized_ascore,d4model*d4Table)
+{
+  Mmsc.check();
+  const PositionIndex m=msc.get_m(),l=msc.get_l();
+  for(PositionIndex j=1;j<=m;++j)
+    {
+      if( msc(j)==0 )
+        continue;
+      if( msc.get_head(msc(j))!=j )
+        {
+          // j is a later word of its cept: counted relative to its predecessor
+          //massert( &d4Table->getCountRef_bigger(j,msc.prev_in_cept(j),0,d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m) == ef.getCountSecond(j,msc.prev_in_cept(j) ));
+          d4Table->getCountRef_bigger(j,msc.prev_in_cept(j),0,d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m)+=normalized_ascore;
+          continue;
+        }
+      // j is the head of its cept: counted relative to the previous cept's center
+      int ep=msc.prev_cept(msc(j));
+      //massert( &d4Table->getCountRef_first(j,msc.get_center(ep),d4Table->ewordclasses.getClass(ef.get_es(ep)),d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m) == ef.getCountFirst(ep,j,msc.get_center(ep)));
+      d4Table->getCountRef_first(j,msc.get_center(ep),d4Table->ewordclasses.getClass(ef.get_es(ep)),d4Table->fwordclasses.getClass(ef.get_fs(j)),l,m)+=normalized_ascore;
+    }
+}
+
+// d5 variant: first forwards to the d4 overload with the embedded table
+// d5Table->d4m, then adds normalized_ascore to the d5 counts while walking
+// each cept's position list (msc.als_i / msc.als_j[].next) for i in 1..l.
+// vac marks positions already visited; vac_all starts at m and is
+// decremented once per visited position.
+template<class TRANSPAIR>
+void _collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&Mmsc,const alignment&msc,const TRANSPAIR&ef,LogProb normalized_ascore,d5model*d5Table)
+{
+ Mmsc.check();
+ _collectCountsOverNeighborhoodForSophisticatedModels(Mmsc,msc,ef,normalized_ascore,&d5Table->d4m);
+ Mmsc.check();
+ const PositionIndex m=msc.get_m(),l=msc.get_l();
+ PositionIndex prev_cept=0;
+ PositionIndex vac_all=m;
+ Vector<char> vac(m+1,0);
+ for(PositionIndex i=1;i<=l;i++)
+ {
+ PositionIndex cur_j=msc.als_i[i];
+ PositionIndex prev_j=0;
+ // k counts the positions of cept i handled so far
+ PositionIndex k=0;
+ if(cur_j) { // process first word of cept
+ k++;
+ d5Table->getCountRef_first(vacancies(vac,cur_j),vacancies(vac,msc.get_center(prev_cept)),
+ d5Table->fwordclasses.getClass(ef.get_fs(cur_j)),l,m,vac_all-msc.fert(i)+k)+=normalized_ascore;
+ vac_all--;
+ assert(vac[cur_j]==0);
+ vac[cur_j]=1;
+ Mmsc.check();
+ prev_j=cur_j;
+ cur_j=msc.als_j[cur_j].next;
+ }
+ while(cur_j) { // process following words of cept
+ k++;
+ // read the predecessor's vacancy value before cur_j is marked as visited
+ int vprev=vacancies(vac,prev_j);
+ d5Table->getCountRef_bigger(vacancies(vac,cur_j),vprev,d5Table->fwordclasses.getClass(ef.get_fs(cur_j)),l,m,vac_all-vprev/*was gone*/-msc.fert(i)+k)+=normalized_ascore;
+ vac_all--;
+ vac[cur_j]=1;
+ Mmsc.check();
+ prev_j=cur_j;
+ cur_j=msc.als_j[cur_j].next;
+ }
+ assert(k==msc.fert(i));
+ // only a cept with at least one position becomes the new "previous cept"
+ if( k )
+ prev_cept=i;
+ }
+ // whatever was never visited must be exactly the positions linked to 0
+ assert(vac_all==msc.fert(0));
+}
+
+extern int NumberOfAlignmentsInSophisticatedCountCollection;
+
+// Collects d4/d5 counts for the center alignment of msc and for each of its
+// move/swap neighbors whose weight exceeds COUNTINCREASE_CUTOFF_AL.
+// A scratch copy x of the alignment is changed for each neighbor, handed to
+// the matching _collectCounts... overload, and restored afterwards.
+//   msc               - move/swap matrix around the center alignment
+//   normalized_ascore - weight of the center alignment
+//   d5Table           - model table passed through to the overload for MODEL
+// Returns the sum of the weights that were actually collected; each
+// collected alignment also increments
+// NumberOfAlignmentsInSophisticatedCountCollection.
+template<class TRANSPAIR,class MODEL>
+double collectCountsOverNeighborhoodForSophisticatedModels(const MoveSwapMatrix<TRANSPAIR>&msc,LogProb normalized_ascore,MODEL*d5Table)
+{
+ const PositionIndex m=msc.get_m(),l=msc.get_l();
+ alignment x(msc);
+ double sum=0;
+ msc.check();
+ // the center alignment is collected without a cutoff test
+ if( !msc.isCenterDeleted() )
+ {
+ _collectCountsOverNeighborhoodForSophisticatedModels<TRANSPAIR>(msc,x,msc.get_ef(),normalized_ascore,d5Table);
+ NumberOfAlignmentsInSophisticatedCountCollection++;
+ sum+=normalized_ascore;
+ }
+ msc.check();
+ // moves: relink position j to i, collect, then restore the old link
+ for(WordIndex j=1;j<=m;j++)for(WordIndex i=0;i<=l;i++)
+ {
+ WordIndex old=x(j);
+ if( i!=old&& !msc.isDelMove(i,j) )
+ {
+ msc.check();
+ double c=msc.cmove(i,j)*normalized_ascore;
+ if(c > COUNTINCREASE_CUTOFF_AL )
+ {
+ x.set(j,i);
+ _collectCountsOverNeighborhoodForSophisticatedModels<TRANSPAIR>(msc,x,msc.get_ef(),c,d5Table);
+ NumberOfAlignmentsInSophisticatedCountCollection++;
+ x.set(j,old);
+ sum+=c;
+ }
+ msc.check();
+ }
+ }
+ // swaps: exchange the links of j1 and j2, collect, then restore both
+ for(PositionIndex j1=1;j1<=m;j1++)
+ for(PositionIndex j2=j1+1;j2<=m;j2++)
+ if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) )
+ {
+ double c=msc.cswap(j1,j2)*normalized_ascore;
+ msc.check();
+ if(c > COUNTINCREASE_CUTOFF_AL )
+ {
+ int old1=msc(j1),old2=msc(j2);
+ x.set(j1,old2);
+ x.set(j2,old1);
+ _collectCountsOverNeighborhoodForSophisticatedModels<TRANSPAIR>(msc,x,msc.get_ef(),c,d5Table);
+ NumberOfAlignmentsInSophisticatedCountCollection++;
+ x.set(j1,old1);
+ x.set(j2,old2);
+ sum+=c;
+ }
+ msc.check();
+ }
+ msc.check();
+ return sum;
+}
+
+// Driver over a set of (move/swap matrix, weight) pairs for one sentence pair.
+// It sums the neighborhood counts of every entry of smsc, stores the
+// unscaled total in _total, and - when addCounts is set - adds the counts,
+// divided by all_total (the total divided by count), to the count tables.
+//   smsc        - matrices with their alignment weights
+//   es, fs      - word sequences; l and m are their sizes minus one
+//   tTable, aCountTable, dCountTable, nCountTable - tables updated in place
+//   p1count, p0count - incremented in place
+//   _total      - receives the total weight before scaling
+//   count       - divisor applied to the total to obtain the scaling factor
+//   addCounts   - when false nothing is written to the tables
+//   d4Table     - optional table for the sophisticated-model counts
+// Returns the number of alignments visited.
+// NOTE(review): all_total is used as a divisor without a zero check;
+// confirm that callers never pass count==0 or an empty neighborhood.
+template<class TRANSPAIR,class MODEL>
+int collectCountsOverNeighborhood(const Vector<pair<MoveSwapMatrix<TRANSPAIR>*,LogProb> >&smsc,Vector<WordIndex>&es,Vector<WordIndex>&fs,tmodel<COUNT,PROB>&tTable,amodel<COUNT>&aCountTable,amodel<COUNT>&dCountTable,nmodel<COUNT>&nCountTable,double&p1count,double&p0count,LogProb&_total,float count,bool addCounts,MODEL*d4Table)
+{
+ int nAl=0;
+ const PositionIndex l=es.size()-1,m=fs.size()-1;
+ Array2<LogProb,Vector<LogProb> > dtcount(l+1,m+1),ncount(l+1,MAX_FERTILITY+1);
+ LogProb p0=0,p1=0,all_total=0;
+ for(unsigned int i=0;i<smsc.size();++i)
+ {
+ // a fresh zero accumulator is handed to every per-matrix call
+ LogProb this_total=0;
+ nAl+=collectCountsOverNeighborhood(*smsc[i].first,smsc[i].second,dtcount,ncount,p1,p0,this_total);
+ all_total+=this_total;
+ }
+ _total=all_total;
+ // from here on all_total is the scaling divisor, not the raw total
+ all_total/=(double)count;
+ double sum2=0;
+ if( addCounts && d4Table )
+ {
+ for(unsigned int i=0;i<smsc.size();++i)
+ {
+ //for(WordIndex j=1;j<=m;j++)for(WordIndex ii=0;ii<=l;ii++)
+ // (*smsc[i].first).cmove(ii,j);
+ sum2+=collectCountsOverNeighborhoodForSophisticatedModels(*smsc[i].first,smsc[i].second/all_total,d4Table);
+ }
+ // the collected weights are expected to add up to count within 0.05
+ if(!(fabs(count-sum2)<0.05))
+ cerr << "WARNING: DIFFERENT SUMS: (" << count << ") (" << sum2 << ")\n";
+ }
+ if( addCounts )
+ {
+ for(PositionIndex i=0;i<=l;i++)
+ {
+ for(PositionIndex j=1;j<=m;j++)
+ {
+ LogProb ijadd=dtcount(i,j)/all_total;
+ // scaled link counts at or below the cutoff are dropped
+ if( ijadd>COUNTINCREASE_CUTOFF_AL )
+ {
+ tTable.incCount(es[i],fs[j],ijadd);
+ dCountTable.getRef(j,i,l,m)+=ijadd;
+ aCountTable.getRef(i,j,l,m)+=ijadd;
+ }
+ }
+ // fertility counts are written for i>0 only
+ if( i>0 )
+ for(PositionIndex n=0;n<MAX_FERTILITY;n++)
+ nCountTable.getRef(es[i],n)+=ncount(i,n)/all_total;
+ }
+ p0count+=p0/all_total;
+ p1count+=p1/all_total;
+ }
+ return nAl;
+}
+
+
+
+
+
+
+
+
+
diff --git a/GIZA++-v2/collCounts.h b/GIZA++-v2/collCounts.h
new file mode 100644
index 0000000..9a0529b
--- /dev/null
+++ b/GIZA++-v2/collCounts.h
@@ -0,0 +1,80 @@
+/*
+
+Copyright (C) 1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef collCounts_h_defined
+#define collCounts_h_defined
+#include "alignment.h"
+#include "transpair_model3.h"
+#include <map>
+#include "MoveSwapMatrix.h"
+#include "D4Tables.h"
+#include "transpair_model4.h"
+
+// Plain record with a type tag and two short operands a and b.
+// NOTE(review): presumably one move or swap step between two alignments
+// (see makeOneMoveSwap) - the meaning of the type values is not visible here.
+class OneMoveSwap
+{
+ public:
+  short type;
+  short a,b;
+  // Builds a record from an explicit type and both operands.
+  OneMoveSwap(short _type,short _a,short _b)
+    : type(_type),a(_a),b(_b)
+  {}
+  // Default record: every field is zero. a and b are initialized as well,
+  // because the comparison operators read them and would otherwise see
+  // indeterminate values.
+  OneMoveSwap()
+    : type(0),a(0),b(0){}
+};
+
+// Strict ordering by type, then by a, then by b.
+inline bool operator<(const OneMoveSwap&a,const OneMoveSwap&b)
+{
+  if( a.type!=b.type )
+    return a.type<b.type;
+  if( a.a!=b.a )
+    return a.a<b.a;
+  return a.b<b.b;
+}
+
+// Two records are equal when type, a and b all match.
+inline bool operator==(const OneMoveSwap&a,const OneMoveSwap&b)
+{
+  if( a.type!=b.type )
+    return false;
+  if( a.a!=b.a )
+    return false;
+  return a.b==b.b;
+}
+
+// Prints a record as "(type,a,b)" and hands the stream back for chaining.
+inline ostream&operator<<(ostream&out,const OneMoveSwap&o)
+{
+  out << '(' << o.type;
+  out << "," << o.a;
+  out << "," << o.b;
+  out << ")";
+  return out;
+}
+
+// Prints every element of the set, each followed by one blank, to the
+// stream that was passed in (previously the elements went to cout no
+// matter which stream the caller supplied).
+inline ostream &operator<<(ostream &out,const set<OneMoveSwap>&s)
+{
+  for(set<OneMoveSwap>::const_iterator i=s.begin();i!=s.end();++i)
+    out << *i << ' ';
+  return out;
+}
+
+// NOTE(review): defined elsewhere; presumably fills oms with the moves and
+// swaps that lead from alignment a to alignment b - confirm at the definition.
+bool makeOneMoveSwap(const alignment&a,const alignment&b,set<OneMoveSwap>&oms);
+
+// Count collection over the move/swap neighborhoods listed in smsc.
+// The count tables, p1count, p0count and _total are passed by non-const
+// reference; d4Table may be omitted and then defaults to a null pointer.
+// Returns an int.
+template<class TRANSPAIR,class MODEL>
+int collectCountsOverNeighborhood(const Vector<pair<MoveSwapMatrix<TRANSPAIR>*,LogProb> >&smsc,
+ Vector<WordIndex>&es,
+ Vector<WordIndex>&fs,tmodel<COUNT,PROB>&tTable,
+ amodel<COUNT>&aCountTable,amodel<COUNT>&dCountTable,
+ nmodel<COUNT>&nCountTable,double&p1count,double&p0count,
+ LogProb&_total,float count,bool addCounts,MODEL*d4Table=0);
+
+#endif
diff --git a/GIZA++-v2/defs.h b/GIZA++-v2/defs.h
new file mode 100644
index 0000000..e94addd
--- /dev/null
+++ b/GIZA++-v2/defs.h
@@ -0,0 +1,78 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _defs_h
+#define _defs_h 1
+#include <string>
+#include <math.h>
+#include <limits.h>
+
+const int TRANSFER_SIMPLE=1;
+const int TRANSFER=0;
+
+const unsigned int MAX_SENTENCE_LENGTH_ALLOWED=101;
+const int TRAIN_BUFFER_SIZE= 50000;
+// Index types. The WORDINDEX_WITH_4_BYTE switch is commented out, so the
+// unsigned int variants below are always the ones in effect; the unsigned
+// short alternative is kept only as disabled text.
+//#ifdef WORDINDEX_WITH_4_BYTE
+typedef unsigned int WordIndex;
+const unsigned int MAX_VOCAB_SIZE=UINT_MAX;
+typedef unsigned int PositionIndex;
+//#else
+//typedef unsigned short WordIndex;
+//const unsigned int MAX_VOCAB_SIZE=USHRT_MAX;
+//typedef unsigned short PositionIndex;
+//#endif
+// Declared here only; the definition and its value live in another file.
+extern WordIndex MAX_FERTILITY;
+
+const int MAX_W=457979;
+extern double LAMBDA; // Lambda that is used to scale cross_entropy factor
+
+typedef float PROB ;
+typedef float COUNT ;
+
+// Thin wrapper around one double. Despite the name, the members visible
+// here store and combine the value directly; no logarithm is applied.
+// It is constructible from double, float, int and WordIndex, and converts
+// back to double implicitly.
+class LogProb {
+ private:
+  double x ;
+ public:
+  LogProb():x(0){}
+  LogProb(double y):x(y){}
+  LogProb(float y):x(y){}
+  LogProb(int y):x(y){}
+  LogProb(WordIndex y):x(y){}
+  operator double() const {return x;}
+  // The compound assignments return *this by reference, so a chained
+  // update such as (a+=b)+=c modifies a itself rather than a temporary copy.
+  LogProb& operator *= (double y) { x *= y ; return *this;}
+  LogProb& operator *= (LogProb y) { x *= y.x ; return *this;}
+  LogProb& operator /= (double y) { x /= y ; return *this;}
+  LogProb& operator /= (LogProb y) { x /= y.x ; return *this;}
+  LogProb& operator += (double y) { x += y ; return *this;}
+  LogProb& operator += (LogProb y) { x += y.x ; return *this;}
+};
+
+// Integer tags 1..8, one per PARLEV_* name.
+// NOTE(review): presumably the level/category under which a command-line
+// parameter is registered (see Parameter.h) - confirm at the point of use.
+const int PARLEV_ITER=1;
+const int PARLEV_OPTHEUR=2;
+const int PARLEV_OUTPUT=3;
+const int PARLEV_SMOOTH=4;
+const int PARLEV_EM=5;
+const int PARLEV_MODELS=6;
+const int PARLEV_SPECIAL=7;
+const int PARLEV_INPUT=8;
+
+#endif
+
diff --git a/GIZA++-v2/dependencies b/GIZA++-v2/dependencies
new file mode 100644
index 0000000..682ff2d
--- /dev/null
+++ b/GIZA++-v2/dependencies
@@ -0,0 +1,635 @@
+#Automatically generated dependency list
+optimized/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h
+optimized/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
+ ATables.h Array4.h TTables.h Globals.h alignment.h
+optimized/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
+optimized/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
+optimized/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h
+optimized/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
+ FlexArray.h
+optimized/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
+optimized/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
+ HMMTables.cpp
+optimized/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
+ Pointer.h
+optimized/logprob.o: logprob.cpp logprob.h
+optimized/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
+ transpair_model4.h transpair_model5.h
+optimized/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
+ Dictionary.h utility.h Parameter.h Pointer.h
+optimized/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
+ Pointer.h
+optimized/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h utility.h
+optimized/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
+optimized/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
+ transpair_model5.h Parameter.h Pointer.h
+optimized/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h utility.h
+optimized/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
+ Pointer.h collCounts.cpp
+optimized/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
+optimized/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
+optimized/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
+optimized/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
+ Array2.h Pointer.h Globals.h defs.h Vector.h
+optimized/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
+ Pointer.h
+optimized/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h Globals.h
+optimized/plain2snt.o: plain2snt.cpp
+optimized/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
+ Pointer.h
+optimized/snt2cooc.o: snt2cooc.cpp
+optimized/snt2plain.o: snt2plain.cpp
+optimized/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
+ transpair_model1.h
+optimized/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
+ Pointer.h
+optimized/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
+ transpair_model1.h Parameter.h Pointer.h
+optimized/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
+optimized/utility.o: utility.cpp mymath.h
+optimized/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h
+#Automatically generated dependency list
+debug/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h
+debug/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
+ ATables.h Array4.h TTables.h Globals.h alignment.h
+debug/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
+debug/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
+debug/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h
+debug/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
+ FlexArray.h
+debug/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
+debug/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
+ HMMTables.cpp
+debug/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
+ Pointer.h
+debug/logprob.o: logprob.cpp logprob.h
+debug/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
+ transpair_model4.h transpair_model5.h
+debug/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
+ Dictionary.h utility.h Parameter.h Pointer.h
+debug/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
+ Pointer.h
+debug/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h utility.h
+debug/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
+debug/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
+ transpair_model5.h Parameter.h Pointer.h
+debug/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h utility.h
+debug/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
+ Pointer.h collCounts.cpp
+debug/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
+debug/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
+debug/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
+debug/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
+ Array2.h Pointer.h Globals.h defs.h Vector.h
+debug/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
+ Pointer.h
+debug/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h Globals.h
+debug/plain2snt.o: plain2snt.cpp
+debug/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
+ Pointer.h
+debug/snt2cooc.o: snt2cooc.cpp
+debug/snt2plain.o: snt2plain.cpp
+debug/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
+ transpair_model1.h
+debug/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
+ Pointer.h
+debug/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
+ transpair_model1.h Parameter.h Pointer.h
+debug/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
+debug/utility.o: utility.cpp mymath.h
+debug/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h
+#Automatically generated dependency list
+vdebug/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h
+vdebug/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
+ ATables.h Array4.h TTables.h Globals.h alignment.h
+vdebug/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
+vdebug/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
+vdebug/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h
+vdebug/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
+ FlexArray.h
+vdebug/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
+vdebug/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
+ HMMTables.cpp
+vdebug/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
+ Pointer.h
+vdebug/logprob.o: logprob.cpp logprob.h
+vdebug/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
+ transpair_model4.h transpair_model5.h
+vdebug/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
+ Dictionary.h utility.h Parameter.h Pointer.h
+vdebug/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
+ Pointer.h
+vdebug/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h utility.h
+vdebug/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
+vdebug/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
+ transpair_model5.h Parameter.h Pointer.h
+vdebug/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h utility.h
+vdebug/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
+ Pointer.h collCounts.cpp
+vdebug/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
+vdebug/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
+vdebug/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
+vdebug/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
+ Array2.h Pointer.h Globals.h defs.h Vector.h
+vdebug/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
+ Pointer.h
+vdebug/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h Globals.h
+vdebug/plain2snt.o: plain2snt.cpp
+vdebug/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
+ Pointer.h
+vdebug/snt2cooc.o: snt2cooc.cpp
+vdebug/snt2plain.o: snt2plain.cpp
+vdebug/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
+ transpair_model1.h
+vdebug/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
+ Pointer.h
+vdebug/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
+ transpair_model1.h Parameter.h Pointer.h
+vdebug/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
+vdebug/utility.o: utility.cpp mymath.h
+vdebug/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h
+#Automatically generated dependency list
+norm/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h
+norm/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
+ ATables.h Array4.h TTables.h Globals.h alignment.h
+norm/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
+norm/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
+norm/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h
+norm/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
+ FlexArray.h
+norm/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
+norm/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
+ HMMTables.cpp
+norm/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
+ Pointer.h
+norm/logprob.o: logprob.cpp logprob.h
+norm/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
+ transpair_model4.h transpair_model5.h
+norm/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
+ Dictionary.h utility.h Parameter.h Pointer.h
+norm/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
+ Pointer.h
+norm/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h utility.h
+norm/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
+norm/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
+ transpair_model5.h Parameter.h Pointer.h
+norm/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h utility.h
+norm/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
+ Pointer.h collCounts.cpp
+norm/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
+norm/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
+norm/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
+norm/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
+ Array2.h Pointer.h Globals.h defs.h Vector.h
+norm/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
+ Pointer.h
+norm/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h Globals.h
+norm/plain2snt.o: plain2snt.cpp
+norm/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
+ Pointer.h
+norm/snt2cooc.o: snt2cooc.cpp
+norm/snt2plain.o: snt2plain.cpp
+norm/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
+ transpair_model1.h
+norm/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
+ Pointer.h
+norm/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
+ transpair_model1.h Parameter.h Pointer.h
+norm/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
+norm/utility.o: utility.cpp mymath.h
+norm/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h
+#Automatically generated dependency list
+profile/alignment.o: alignment.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h
+profile/AlignTables.o: AlignTables.cpp AlignTables.h defs.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h transpair_model1.h NTables.h vocab.h \
+ ATables.h Array4.h TTables.h Globals.h alignment.h
+profile/ATables.o: ATables.cpp ATables.h defs.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h Array4.h Globals.h Parameter.h Pointer.h
+profile/collCounts.o: collCounts.cpp alignment.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ collCounts.h MoveSwapMatrix.h D4Tables.h WordClasses.h \
+ transpair_model4.h D5Tables.h transpair_model5.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h Parameter.h Pointer.h
+profile/Dictionary.o: Dictionary.cpp Dictionary.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h
+profile/ForwardBackward.o: ForwardBackward.cpp ForwardBackward.h myassert.h \
+ Array.h Vector.h mystl.h mymath.h Array2.h Globals.h defs.h HMMTables.h \
+ FlexArray.h
+profile/getSentence.o: getSentence.cpp getSentence.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h vocab.h Globals.h Parameter.h Pointer.h
+profile/hmm.o: hmm.cpp hmm.h Vector.h mystl.h myassert.h mymath.h Array2.h \
+ TTables.h defs.h vocab.h Globals.h ATables.h Array4.h getSentence.h \
+ model2.h model1.h Perplexity.h Dictionary.h WordClasses.h HMMTables.h \
+ FlexArray.h Array.h ForwardBackward.h utility.h Parameter.h Pointer.h \
+ HMMTables.cpp
+profile/HMMTables.o: HMMTables.cpp HMMTables.h FlexArray.h Array.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h Globals.h defs.h Parameter.h \
+ Pointer.h
+profile/logprob.o: logprob.cpp logprob.h
+profile/main.o: main.cpp getSentence.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h defs.h vocab.h Globals.h TTables.h model1.h Perplexity.h \
+ Dictionary.h model2.h ATables.h Array4.h model3.h MoveSwapMatrix.h \
+ alignment.h transpair_model3.h NTables.h transpair_model2.h \
+ transpair_model1.h transpair_modelhmm.h ForwardBackward.h Array.h hmm.h \
+ WordClasses.h HMMTables.h FlexArray.h D4Tables.h AlignTables.h \
+ file_spec.h utility.h Parameter.h Pointer.h D5Tables.h \
+ transpair_model4.h transpair_model5.h
+profile/model1.o: model1.cpp model1.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h vocab.h defs.h TTables.h Globals.h getSentence.h Perplexity.h \
+ Dictionary.h utility.h Parameter.h Pointer.h
+profile/model2.o: model2.cpp model2.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h TTables.h defs.h vocab.h Globals.h ATables.h Array4.h \
+ getSentence.h model1.h Perplexity.h Dictionary.h utility.h Parameter.h \
+ Pointer.h
+profile/model2to3.o: model2to3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h utility.h
+profile/model345-peg.o: model345-peg.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h collCounts.h transpair_model4.h
+profile/model3.o: model3.cpp model3.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h MoveSwapMatrix.h alignment.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h transpair_modelhmm.h ForwardBackward.h \
+ Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h D4Tables.h \
+ AlignTables.h collCounts.h transpair_model4.h utility.h D5Tables.h \
+ transpair_model5.h Parameter.h Pointer.h
+profile/model3_viterbi.o: model3_viterbi.cpp model3.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h MoveSwapMatrix.h alignment.h defs.h \
+ transpair_model3.h NTables.h vocab.h ATables.h Array4.h TTables.h \
+ Globals.h transpair_model2.h transpair_model1.h getSentence.h model2.h \
+ model1.h Perplexity.h Dictionary.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h WordClasses.h HMMTables.h FlexArray.h \
+ D4Tables.h AlignTables.h utility.h
+profile/model3_viterbi_with_tricks.o: model3_viterbi_with_tricks.cpp mystl.h \
+ myassert.h mymath.h Array2.h model3.h Vector.h MoveSwapMatrix.h \
+ alignment.h defs.h transpair_model3.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h transpair_model2.h transpair_model1.h \
+ getSentence.h model2.h model1.h Perplexity.h Dictionary.h \
+ transpair_modelhmm.h ForwardBackward.h Array.h hmm.h WordClasses.h \
+ HMMTables.h FlexArray.h D4Tables.h AlignTables.h collCounts.h \
+ transpair_model4.h utility.h D5Tables.h transpair_model5.h Parameter.h \
+ Pointer.h collCounts.cpp
+profile/MoveSwapMatrix.o: MoveSwapMatrix.cpp MoveSwapMatrix.h alignment.h Vector.h \
+ mystl.h myassert.h mymath.h Array2.h defs.h transpair_model3.h \
+ NTables.h vocab.h ATables.h Array4.h TTables.h Globals.h \
+ transpair_model2.h transpair_model1.h transpair_model4.h D4Tables.h \
+ WordClasses.h transpair_model5.h D5Tables.h transpair_modelhmm.h \
+ ForwardBackward.h Array.h hmm.h getSentence.h model2.h model1.h \
+ Perplexity.h Dictionary.h HMMTables.h FlexArray.h
+profile/myassert.o: myassert.cpp mystl.h myassert.h mymath.h Array2.h
+profile/NTables.o: NTables.cpp NTables.h Array2.h mystl.h myassert.h mymath.h \
+ Vector.h defs.h vocab.h Parameter.h Pointer.h Globals.h
+profile/Parameter.o: Parameter.cpp Parameter.h mystl.h myassert.h mymath.h \
+ Array2.h Pointer.h Globals.h defs.h Vector.h
+profile/parse.o: parse.cpp defs.h utility.h Perplexity.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h TTables.h vocab.h getSentence.h \
+ D4Tables.h WordClasses.h D5Tables.h ATables.h Array4.h Parameter.h \
+ Pointer.h
+profile/Perplexity.o: Perplexity.cpp Perplexity.h Vector.h mystl.h myassert.h \
+ mymath.h Array2.h defs.h Globals.h
+profile/plain2snt.o: plain2snt.cpp
+profile/reports.o: reports.cpp defs.h vocab.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h Perplexity.h Globals.h getSentence.h TTables.h Parameter.h \
+ Pointer.h
+profile/snt2cooc.o: snt2cooc.cpp
+profile/snt2plain.o: snt2plain.cpp
+profile/transpair_model3.o: transpair_model3.cpp transpair_model3.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h transpair_model2.h \
+ transpair_model1.h
+profile/transpair_model4.o: transpair_model4.cpp transpair_model4.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D4Tables.h WordClasses.h \
+ transpair_model3.h transpair_model2.h transpair_model1.h Parameter.h \
+ Pointer.h
+profile/transpair_model5.o: transpair_model5.cpp transpair_model5.h Array2.h \
+ mystl.h myassert.h mymath.h defs.h Vector.h NTables.h vocab.h ATables.h \
+ Array4.h TTables.h Globals.h alignment.h D5Tables.h D4Tables.h \
+ WordClasses.h transpair_model4.h transpair_model3.h transpair_model2.h \
+ transpair_model1.h Parameter.h Pointer.h
+profile/TTables.o: TTables.cpp TTables.h defs.h vocab.h Vector.h mystl.h \
+ myassert.h mymath.h Array2.h Globals.h Parameter.h Pointer.h
+profile/utility.o: utility.cpp mymath.h
+profile/vocab.o: vocab.cpp vocab.h defs.h Vector.h mystl.h myassert.h mymath.h \
+ Array2.h
diff --git a/GIZA++-v2/file_spec.h b/GIZA++-v2/file_spec.h
new file mode 100644
index 0000000..8fc7236
--- /dev/null
+++ b/GIZA++-v2/file_spec.h
@@ -0,0 +1,59 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef FILE_SPEC_H
+#define FILE_SPEC_H
+
+#include <time.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/* This function returns a string, locally called file_spec. This
+ string is the concatenation of the date and time of execution
+ and the user who is performing the execution */
+/* Originally implemented in C by Yaser Al-Onaizan;
+ edits for C++ and formatting by Noah A. Smith, 9 July 1999 */
+
+/* Builds the default file-spec string "<date>.<time>.<user>".
+ *
+ * <date> is "<tm_year>-<MM>-<DD>", where tm_year is the raw struct tm field
+ * (years since 1900, e.g. "107" for 2007); <time> is "HHMMSS" in local time;
+ * <user> is the value of the USER environment variable, or "unknown" when
+ * that variable is not set.
+ *
+ * Returns a NUL-terminated string allocated with malloc(); the caller owns
+ * it and is responsible for free()ing it. Returns NULL if the allocation
+ * fails.
+ */
+char *Get_File_Spec (){
+  struct tm *local;
+  struct tm fallback;
+  time_t t;
+  const char *user;
+  /* Sized generously: from the year 2000 on tm_year has three digits, so the
+     stamp is 17 characters plus the terminator and does not fit in a
+     17-byte buffer. */
+  char time_stmp[80];
+  char *file_spec = 0;
+
+  t = time(NULL);
+  local = localtime(&t);
+  if (local == NULL) {
+    /* localtime() can fail; use an all-zero broken-down time rather than
+       dereferencing a null pointer. */
+    memset(&fallback, 0, sizeof(fallback));
+    local = &fallback;
+  }
+
+  /* snprintf bounds the write even for unexpected field values. */
+  snprintf(time_stmp, sizeof(time_stmp), "%02d-%02d-%02d.%02d%02d%02d.",
+           local->tm_year, (local->tm_mon + 1), local->tm_mday,
+           local->tm_hour, local->tm_min, local->tm_sec);
+
+  /* USER is not guaranteed to be set (e.g. cron jobs, some containers). */
+  user = getenv("USER");
+  if (user == NULL)
+    user = "unknown";
+
+  file_spec = (char *)malloc(strlen(time_stmp) + strlen(user) + 1);
+  if (file_spec == NULL)
+    return NULL;
+  strcpy(file_spec, time_stmp);
+  strcat(file_spec, user);
+  return file_spec;
+}
+
+#endif
diff --git a/GIZA++-v2/getSentence.cpp b/GIZA++-v2/getSentence.cpp
new file mode 100644
index 0000000..78aafcf
--- /dev/null
+++ b/GIZA++-v2/getSentence.cpp
@@ -0,0 +1,340 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/* --------------------------------------------------------------------------*
+ * *
+ * Module : getSentence *
+ * *
+ * Method Definitions File: getSentence.cpp *
+ * *
+ * Objective: Defines classes and methods for handling I/O for the parallel *
+ * corpus. *
+ *****************************************************************************/
+
+
+#include "getSentence.h"
+#include <iostream>
+#include <strstream>
+#include "Parameter.h"
+#include "errno.h"
+
+int PrintedTooLong=0;
+
+/* -------------- Method Definitions for Class sentenceHandler --------------*/
+
+GLOBAL_PARAMETER(double,ManlexMAX_MULTIPLICITY,"manlexMAX_MULTIPLICITY","",PARLEV_EM,20.0);
+GLOBAL_PARAMETER(double,Manlexfactor1,"manlexfactor1","",PARLEV_EM,0.0);
+GLOBAL_PARAMETER(double,Manlexfactor2,"manlexfactor2","",PARLEV_EM,0.0);
+
+sentenceHandler::sentenceHandler(const char* filename, vcbList* elist,
+                                 vcbList* flist) : realCount(0)
+  // Constructor: opens the corpus file `filename` (terminating the program if
+  // it cannot be opened) and resets all read-position state to zero.
+  // When both vocabulary lists are supplied, the corpus is read through once
+  // with getNextSentence() to count the sentence pairs. If any pair has a
+  // negative occurrence count, a per-pair weight vector (realCount) with one
+  // entry of 1.0 per pair is allocated; otherwise realCount stays null.
+{
+  readflag = false;
+  allInMemory = false;
+  inputFilename = filename;
+  inputFile = new ifstream(filename);
+  if (!(*inputFile)) {
+    cerr << "\nERROR:(a) Cannot open " << filename;
+    exit(1);
+  }
+
+  // Nothing has been read or buffered yet.
+  pair_no = 0;
+  currentSentence = 0;
+  noSentInBuffer = 0;
+  totalPairs1 = 0;
+  totalPairs2 = 0;
+  Buffer.clear();
+
+  bool sawNegativeCount = false;
+  if (elist && flist) {
+    cout << "Calculating vocabulary frequencies from corpus " << filename << '\n';
+    sentPair scanned;
+    while (getNextSentence(scanned, elist, flist)) {
+      ++totalPairs1;
+      // NOTE: this value might change during training
+      // for words from the manual dictionary, yet this is ignored!
+      totalPairs2 += scanned.realCount;
+      if (scanned.noOcc < 0)
+        sawNegativeCount = true;
+    }
+  }
+
+  if (!sawNegativeCount) {
+    realCount = 0;
+    return;
+  }
+  cerr << "WARNING: corpus contains negative occurrency frequencies => these are interpreted as entries of a manual dictionary.\n";
+  realCount = new Vector<double>(totalPairs1, 1.0);
+}
+
+void sentenceHandler::rewind()
+  // Resets the handler so that reading starts again at the first sentence
+  // pair. The in-memory buffer is kept only when the whole corpus is cached
+  // and the buffer begins with pair number 1; otherwise it is discarded.
+  // Unless the corpus is fully cached, the input file is re-opened.
+{
+  currentSentence = 0;
+  readflag = false;
+
+  const bool bufferHoldsCorpusStart =
+    allInMemory && Buffer.size() >= 1 &&
+    Buffer[currentSentence].sentenceNo == 1;
+  if (!bufferHoldsCorpusStart) {
+    // The buffer does not already hold the first chunk of pairs: drop it.
+    if (Buffer.size() > 0)
+      cerr << ' ' << Buffer[currentSentence].sentenceNo << '\n';
+    Buffer.clear();
+    noSentInBuffer = 0;
+    pair_no = 0;
+  }
+
+  if (allInMemory)
+    return;
+
+  // Start over from the beginning of the corpus file.
+  delete inputFile;
+  inputFile = new ifstream(inputFilename);
+  if (!(*inputFile))
+    cerr << "\nERROR:(b) Cannot open " << inputFilename << " " << (int)errno;
+}
+
+
+bool sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist)
+  // Copies the next sentence pair into `sent` and returns true, or returns
+  // false when no further pair is available.
+  // When the buffer is exhausted and the corpus is not fully cached, up to
+  // TRAIN_BUFFER_SIZE pairs are read from the file into the buffer. While
+  // refilling, and only if both vocabulary lists are given, the frequency of
+  // every word id is increased by the pair's realCount; a word id that is not
+  // below uniqTokens() of its list terminates the program.
+  // If a read attempt finds nothing to buffer, readflag is set; the following
+  // call then rewinds the handler and returns false once more.
+{
+  sentPair fresh;
+  if (readflag) {
+    cerr << "Attempting to read from the end of corpus, rewinding\n";
+    rewind();
+    return false;
+  }
+
+  const bool bufferExhausted = (currentSentence >= noSentInBuffer);
+  if (bufferExhausted && allInMemory)
+    return false;
+
+  if (bufferExhausted) {
+    /* no more sentences in buffer: refill it from the corpus file */
+    noSentInBuffer = 0;
+    currentSentence = 0;
+    Buffer.clear();
+    cout << "Reading more sentence pairs into memory ... \n";
+    while ((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(fresh)) {
+      // Target side too long relative to the source side: warn and cut both
+      // sentences down to the shorter of the two lengths.
+      if ((fresh.fSent.size()-1) > (MAX_FERTILITY-1) * (fresh.eSent.size()-1)) {
+        cerr << "WARNING: The following sentence pair has source/target sentence length ration more than\n"<<
+          "the maximum allowed limit for a source word fertility\n"<<
+          " source length = " << fresh.eSent.size()-1 << " target length = " << fresh.fSent.size()-1 <<
+          " ratio " << double(fresh.fSent.size()-1)/ (fresh.eSent.size()-1) << " ferility limit : " <<
+          MAX_FERTILITY-1 << '\n';
+        cerr << "Shortening sentence \n";
+        cerr << fresh;
+        fresh.eSent.resize(min(fresh.eSent.size(), fresh.fSent.size()));
+        fresh.fSent.resize(min(fresh.eSent.size(), fresh.fSent.size()));
+      }
+      Buffer.push_back(fresh);
+
+      if (elist && flist) {
+        if (elist->size() > 0) {
+          // Source side: position 0 is included in the count.
+          for (WordIndex e = 0; e < fresh.eSent.size(); ++e) {
+            if (fresh.eSent[e] >= elist->uniqTokens()) {
+              if (PrintedTooLong++ < 100)
+                cerr << "ERROR: source word " << fresh.eSent[e] << " is not in the vocabulary list \n";
+              exit(-1);
+            }
+            elist->incFreq(fresh.eSent[e], fresh.realCount);
+          }
+        }
+        if (flist->size() > 0) {
+          // Target side: position 0 is skipped.
+          for (WordIndex f = 1; f < fresh.fSent.size(); ++f) {
+            if (fresh.fSent[f] >= flist->uniqTokens()) {
+              cerr << "ERROR: target word " << fresh.fSent[f] << " is not in the vocabulary list \n";
+              exit(-1);
+            }
+            flist->incFreq(fresh.fSent[f], fresh.realCount);
+          }
+        }
+      }
+      ++noSentInBuffer;
+    }
+
+    if (inputFile->eof()) {
+      // End of file reached with the buffer starting at pair 1: the whole
+      // corpus is now held in memory.
+      allInMemory = (Buffer.size() >= 1 &&
+                     Buffer[currentSentence].sentenceNo == 1);
+      if (allInMemory)
+        cout << "Corpus fits in memory, corpus has: " << Buffer.size() <<
+          " sentence pairs.\n";
+    }
+  }
+
+  if (noSentInBuffer <= 0) {
+    readflag = true;
+    return false;
+  }
+
+  sent = Buffer[currentSentence++];
+  if (!(sent.noOcc < 0 && realCount))
+    return true;
+
+  // Negative occurrence count with a weight vector present: pick the weight.
+  if (Manlexfactor1 && sent.noOcc == -1.0)
+    sent.realCount = Manlexfactor1;
+  else if (Manlexfactor2 && sent.noOcc == -2.0)
+    sent.realCount = Manlexfactor2;
+  else
+    sent.realCount = (*realCount)[sent.getSentenceNo()-1];
+  return true;
+}
+bool sentenceHandler::readNextSentence(sentPair& sent)
+  /* Reads the next sentence pair from the corpus file into `sent`.
+     A pair is stored as three consecutive lines: the occurrence count of the
+     pair, the source sentence, and the target sentence. Sentences are
+     space-separated token ids; id 0 is inserted at position 0 of both
+     sentences, and each sentence is truncated once it holds
+     MAX_SENTENCE_LENGTH entries.
+     The pair's weight (realCount) equals the occurrence count when that is
+     non-negative. For a negative count it is 1.0 if no weight vector exists;
+     otherwise Manlexfactor1 (count -1, factor non-zero), Manlexfactor2
+     (count -2, factor non-zero), or the stored weight of this pair.
+     Returns false, with `sent` emptied, if any of the three lines could not
+     be read; otherwise numbers the pair and returns true. */
+{
+  string line;
+  bool fail(false);
+
+  sent.clear();
+
+  // Line 1: occurrence count, from which the weight is derived.
+  if (getline(*inputFile, line)) {
+    istrstream buffer(line.c_str());
+    buffer >> sent.noOcc;
+    if (sent.noOcc < 0) {
+      if (realCount) {
+        if (Manlexfactor1 && sent.noOcc == -1.0)
+          sent.realCount = Manlexfactor1;
+        else if (Manlexfactor2 && sent.noOcc == -2.0)
+          sent.realCount = Manlexfactor2;
+        else
+          sent.realCount = (*realCount)[pair_no];
+      }
+      else
+        sent.realCount = 1.0;
+    }
+    else
+      sent.realCount = sent.noOcc;
+  }
+  else {
+    fail = true;
+  }
+
+  // Line 2: source sentence.
+  if (getline(*inputFile, line)) {
+    istrstream buffer(line.c_str());
+    WordIndex w;              // token id currently being read
+    sent.eSent.push_back(0);  // null word (id 0) at the start of the sentence
+    while (buffer >> w) {
+      if (sent.eSent.size() < MAX_SENTENCE_LENGTH)
+        sent.eSent.push_back(w);
+      else {
+        if (PrintedTooLong++ < 100)
+          cerr << "{WARNING:(a)truncated sentence "<<pair_no<<"}";
+        break;
+      }
+    }
+  }
+  else {
+    fail = true;
+  }
+
+  // Line 3: target sentence.
+  if (getline(*inputFile, line)) {
+    istrstream buffer(line.c_str());
+    WordIndex w;              // token id currently being read
+    sent.fSent.push_back(0);  // 0 is inserted for program uniformity
+    while (buffer >> w) {
+      if (sent.fSent.size() < MAX_SENTENCE_LENGTH)
+        sent.fSent.push_back(w);
+      else {
+        if (PrintedTooLong++ < 100)
+          cerr << "{WARNING:(b)truncated sentence "<<pair_no<<"}";
+        break;
+      }
+    }
+  }
+  else {
+    fail = true;
+  }
+
+  if (fail) {
+    sent.eSent.clear();
+    sent.fSent.clear();
+    sent.sentenceNo = 0;
+    sent.noOcc = 0;
+    sent.realCount = 0;
+    return(false);
+  }
+
+  // Number the pair before reporting on it, so that the message below names
+  // the offending pair rather than a stale sentence number.
+  sent.sentenceNo = ++pair_no;
+  if (sent.eSent.size() == 1 || sent.fSent.size() == 1)
+    cerr << "ERROR: Forbidden zero sentence length " << sent.sentenceNo << endl;
+  if (pair_no % 100000 == 0)
+    cout << "[sent:" << sent.sentenceNo << "]"<< '\n';
+  return true;
+}
+
+// Searches lambda on the grid 1.0, 1.33, ... (below ManlexMAX_MULTIPLICITY)
+// for the value whose sum over vd[i]*exp(lambda*vd[i])/(exp(lambda*vd[i])-1)
+// is closest to 1, and returns that grid value. Returns 1.0 when the grid is
+// empty.
+double optimize_lambda(Vector<double>&vd)
+{
+  // deviation[k] is |sum - 1| at the k-th grid point.
+  Vector<double> deviation;
+  for (double candidate = 1.0; candidate < ManlexMAX_MULTIPLICITY; candidate += 0.33) {
+    double total = 0.0;
+    for (unsigned int k = 0; k < vd.size(); ++k)
+      total += vd[k]*exp(candidate*vd[k])/(exp(candidate*vd[k])-1.0);
+    deviation.push_back(fabs(total-1.0));
+  }
+
+  // Map the position of the smallest deviation back to its lambda value.
+  const double bestIndex =
+    double(min_element(deviation.begin(), deviation.end()) - deviation.begin());
+  const double lam = bestIndex*0.33+1.0;
+  if (!(lam < 1.0))
+    return lam;
+
+  cerr << "ERROR: lambda is smaller than one: " << lam << endl;
+  for (unsigned int k = 0; k < vd.size(); ++k)
+    cerr << vd[k] << ' ';
+  cerr << endl;
+  return lam;
+}
+
+void sentenceHandler::setProbOfSentence(const sentPair&s,double d)
+  // Records exp(d) for a sentence pair whose occurrence count is <= 0; does
+  // nothing when no per-pair weight vector exists.
+  // Pairs are collected while they share the same source sentence and arrive
+  // with increasing sentence numbers. When that run ends, the weights of the
+  // collected pairs are recomputed from their recorded values using
+  // optimize_lambda(), and a new run is started with `s`.
+{
+  if (realCount == 0 || !(s.noOcc <= 0))
+    return;
+
+  const double prob = exp(d);
+
+  const bool runEnded =
+    oldPairs.size() > 0 &&
+    (oldPairs.back().get_eSent() != s.get_eSent() ||
+     oldPairs.back().getSentenceNo() >= s.getSentenceNo());
+  if (runEnded) {
+    const double lambda = optimize_lambda(oldProbs);
+    for (unsigned int k = 0; k < oldPairs.size(); ++k) {
+      // Very small recorded values get weight 1.0 instead of the formula.
+      (*realCount)[oldPairs[k].getSentenceNo()-1] =
+        (oldProbs[k] < 1e-5)
+          ? 1.0
+          : lambda*oldProbs[k]/(1-exp(-lambda*oldProbs[k]));
+    }
+    oldPairs.clear();
+    oldProbs.clear();
+  }
+
+  oldPairs.push_back(s);
+  oldProbs.push_back(prob);
+}
+
+/* ------------- End of Method Definition of Class sentenceHandler ----------*/
+
+
+
+
+
+
diff --git a/GIZA++-v2/getSentence.h b/GIZA++-v2/getSentence.h
new file mode 100644
index 0000000..246ae1c
--- /dev/null
+++ b/GIZA++-v2/getSentence.h
@@ -0,0 +1,123 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/* --------------------------------------------------------------------------*
+ * *
+ * Module : getSentence *
+ * *
+ * Prototypes File: getSentence.h *
+ * *
+ * Objective: Defines classes and methods for handling I/O for the parallel *
+ * corpus. *
+ *****************************************************************************/
+
+
+
+
+
+#ifndef _sentenceHandler_h
+#define _sentenceHandler_h 1
+
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include "Vector.h"
+#include "defs.h"
+#include "vocab.h"
+#include "Globals.h"
+/*----------------------- Class Prototype Definition ------------------------*
+ Class Name: sentenceHandler
+ Objective: This class is defined to handle training sentence pairs from the
+ parallel corpus. Each pair has: a target sentence, called here French; a
+ source sentence, called here English; and a count denoting
+ the number of times this pair occurred in the training corpus. Both source and
+ target sentences are represented as integer vectors (variable-size arrays);
+ each entry is a numeric value which is the token id for the particular token
+ in the sentence.
+
+ *---------------------------------------------------------------------------*/
+
+ class sentPair{ // one sentence pair of the parallel corpus plus its counts
+  public:
+   int sentenceNo ;            // number of this pair (printed as "Sent No")
+   float noOcc;                // occurrence count (printed as "No. Occurrences")
+   float realCount;            // count actually used (printed as "Used No. Occurrences")
+   Vector<WordIndex> eSent ;   // English token ids
+   Vector<WordIndex> fSent;    // French token ids
+
+  public:
+   sentPair() : sentenceNo(0), noOcc(0), realCount(0) {} // scalars start at zero instead of being left uninitialized
+   void clear(){ eSent.clear(); fSent.clear(); noOcc=0; realCount=0; sentenceNo=0; } // back to the empty state
+   const Vector<WordIndex>&get_eSent()const
+   { return eSent; }
+   const Vector<WordIndex>&get_fSent()const
+   { return fSent; }
+   int getSentenceNo()const
+   { return sentenceNo; }
+   double getCount()const      // realCount widened to double
+   { return realCount; }
+ };
+
+ inline ostream&operator<<(ostream&of,const sentPair&s)
+ {
+   of << "Sent No: " << s.sentenceNo << " , No. Occurrences: " << s.noOcc << '\n';
+   if( s.noOcc!=s.realCount )
+     of << " Used No. Occurrences: " << s.realCount << '\n';
+   // English ids are written from index 0, French ids from index 1.
+   for(unsigned int e=0; e < s.eSent.size(); ++e)
+     of << s.eSent[e] << ' ';
+   of << '\n';
+   for(unsigned int f=1; f < s.fSent.size(); ++f)
+     of << s.fSent[f] << ' ';
+   of << '\n';
+   return of;
+ }
+
+ class sentenceHandler{ // reader for the parallel corpus; also buffers per-sentence probabilities (setProbOfSentence)
+ public:
+ const char * inputFilename; // parallel corpus file name, similar for all
+ // sentence pair objects
+ ifstream *inputFile; // parallel corpus file handler
+ Vector<sentPair> Buffer;
+ int noSentInBuffer ;
+ int currentSentence ;
+ int totalPairs1 ;
+ double totalPairs2;
+ bool readflag ; // true if you reach the end of file
+ bool allInMemory ;
+ int pair_no ; // running pair counter; pre-incremented to number each pair as it is read
+ Vector<double> *realCount; // optional count table indexed by sentenceNo-1; setProbOfSentence does nothing when it is null
+
+ Vector<sentPair> oldPairs; // pairs buffered by setProbOfSentence until their group is flushed
+ Vector<double> oldProbs; // exp(d) recorded for each buffered pair, parallel to oldPairs
+ sentenceHandler(const char* filename, vcbList* elist=0, vcbList* flist=0);
+ void rewind();
+ bool getNextSentence(sentPair&, vcbList* = 0, vcbList* = 0); // defined in the definition file; per the original note, reads the next sentence pair from the memory buffer
+ int getTotalNoPairs1()const {return totalPairs1;};
+ double getTotalNoPairs2()const {return totalPairs2;};
+ // readNextSentence below is likewise defined in the definition file
+ bool readNextSentence(sentPair&); // defined in the definition file
+ void setProbOfSentence(const sentPair&s,double d); // buffers s with exp(d); rewrites realCount entries when a group of pairs ends
+ };
+
+#endif
+
diff --git a/GIZA++-v2/hmm.cpp b/GIZA++-v2/hmm.cpp
new file mode 100644
index 0000000..fc4284c
--- /dev/null
+++ b/GIZA++-v2/hmm.cpp
@@ -0,0 +1,405 @@
+/*
+
+Copyright (C) 1998,1999,2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "hmm.h"
+#include "Globals.h"
+#include "utility.h"
+#include "HMMTables.h"
+#include "ForwardBackward.h"
+#include "Parameter.h"
+
+ #define CLASSIFY(i,empty,ianf) bool empty=((i)>=l); unsigned int ianf=((i)%l); // declares empty (state index i lies in the upper half, i>=l) and ianf (i folded into [0,l)); needs a local l at the expansion site; i is parenthesized so expression arguments expand correctly
+ #define CLASSIFY2(i,ianf) unsigned int ianf=((i)%l); // same folding as CLASSIFY, without the empty flag
+
+
+ short PredictionInAlignments=0; // read below: ==2 turns on DependencyOfJ, ==0 turns on DependencyOfPrevAJ
+ short UniformEntryExit=3; // bit flags read in makeHMMNetwork: &1 fills alphainit via getAlphaInit, &2 fills betainit via getBetaInit
+ short HMMTrainingSpecialFlags=0; // &1: em_loop calls HMMViterbi (with gamma) instead of HMMRealViterbi
+
+ GLOBAL_PARAMETER2(int,ModelH_Dump_Freq,"HMM DUMP FREQUENCY","th","dump frequency of HMM",PARLEV_OUTPUT,0);
+
+ GLOBAL_PARAMETER(short,CompareAlDeps,"emAlignmentDependencies",
+ "lextrain: dependencies in the HMM alignment model. "
+ " &1: sentence length; &2: previous class; &4: previous position; "
+ " &8: French position; &16: French class"
+ ,PARLEV_MODELS,2);
+ GLOBAL_PARAMETER(double,GLOBALProbabilityForEmpty,"emProbForEmpty",
+ "f-b-trn: probability for empty word",PARLEV_MODELS,0.4);
+ GLOBAL_PARAMETER(short,SmoothHMM,"emSmoothHMM",
+ "f-b-trn: smooth HMM model &1: modified counts; &2:perform smoothing with -emAlSmooth",PARLEV_SPECIAL,2);
+ GLOBAL_PARAMETER(double,HMMAlignmentModelSmoothFactor,"emAlSmooth",
+ "f-b-trn: smoothing factor for HMM alignment model (can be ignored by -emSmoothHMM)",PARLEV_SMOOTH,0.2);
+
+
+/*template<class T>
+void smooth_standard(T*a,T*b,double p)
+{
+ int n=b-a;
+ if( n==0 )
+ return;
+ double pp=p/n;
+ for(T*i=a;i!=b;++i)
+ *i = (1.0-p)*(*i)+pp;
+}*/
+
+
+ hmm::hmm(model2& m) // copy-constructs the model2 base from m
+ : model2(m),counts(GLOBALProbabilityForEmpty,ewordclasses,fwordclasses), // counts and probs are built from the same three arguments
+ probs(GLOBALProbabilityForEmpty,ewordclasses,fwordclasses)
+ { }
+
+ void hmm::initialize_table_uniformly(sentenceHandler&){} // no-op: the body is empty and the handler is ignored
+
+ int hmm::em_with_tricks(int noIterations) // runs noIterations HMM training iterations; returns the iteration with the lowest errorsAL()
+ {
+ double minErrors=1.0;int minIter=0; // lowest errorsAL() seen so far and the iteration that produced it
+ string modelName="Hmm",shortModelName="hmm";
+ int dumpFreq=ModelH_Dump_Freq;
+ time_t it_st, st, it_fn, fn;
+ string tfile, afile,afileh, number, alignfile, test_alignfile;
+ int pair_no = 0; // NOTE(review): only ever reset to 0 in this function
+ bool dump_files = false ;
+ ofstream of2 ; // NOTE(review): never opened or written in this function
+ st = time(NULL) ;
+ sHandler1.rewind();
+ cout << "\n==========================================================\n";
+ cout << modelName << " Training Started at: " << ctime(&st);
+ for(int it=1; it <= noIterations ; it++){
+ pair_no = 0;
+ it_st = time(NULL) ;
+ cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n';
+ dump_files = (dumpFreq != 0) && ((it % dumpFreq) == 0) && !NODUMPS; // dump every dumpFreq-th iteration unless NODUMPS is set
+ number = "";
+ int n = it;
+ do{ // render the iteration number as decimal text for the file names
+ number.insert((size_t)0, 1, (char)(n % 10 + '0'));
+ } while((n /= 10) > 0);
+ tfile = Prefix + ".t" + shortModelName + "." + number ;
+ afile = Prefix + ".a" + shortModelName + "." + number ;
+ afileh = Prefix + ".h" + shortModelName + "." + number ;
+ alignfile = Prefix + ".A" + shortModelName + "." + number ;
+ test_alignfile = Prefix + ".tst.A" + shortModelName + "." + number ;
+ counts=HMMTables<int,WordClasses>(GLOBALProbabilityForEmpty,ewordclasses,fwordclasses); // start every iteration from fresh HMM counts
+ aCountTable.clear();
+ initAL();
+ em_loop(perp, sHandler1, dump_files , alignfile.c_str(), trainViterbiPerp, false,it==1,it); // doInit is true only for the first iteration
+ if( errorsAL()<minErrors )
+ {
+ minErrors=errorsAL();
+ minIter=it;
+ }
+ if (testPerp && testHandler)
+ em_loop(*testPerp, *testHandler, dump_files, test_alignfile.c_str(), *testViterbiPerp, true,it==1,it); // test pass: em_loop skips count collection when test is true
+ if (dump_files&&OutputInAachenFormat==1)
+ tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
+ tTable.normalizeTable(Elist, Flist);
+ aCountTable.normalize(aTable);
+ probs=counts; // the collected counts become the tables used by the next iteration
+ cout << modelName << ": ("<<it<<") TRAIN CROSS-ENTROPY " << perp.cross_entropy()
+ << " PERPLEXITY " << perp.perplexity() << '\n';
+ if (testPerp && testHandler)
+ cout << modelName << ": ("<<it<<") TEST CROSS-ENTROPY " << (*testPerp).cross_entropy()
+ << " PERPLEXITY " << (*testPerp).perplexity()
+ << '\n';
+ cout << modelName << ": ("<<it<<") VITERBI TRAIN CROSS-ENTROPY " << trainViterbiPerp.cross_entropy()
+ << " PERPLEXITY " << trainViterbiPerp.perplexity() << '\n';
+ if (testPerp && testHandler)
+ cout << modelName << ": ("<<it<<") VITERBI TEST CROSS-ENTROPY " << testViterbiPerp->cross_entropy()
+ << " PERPLEXITY " << testViterbiPerp->perplexity()
+ << '\n';
+ if (dump_files){
+ if( OutputInAachenFormat==0)
+ tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
+ ofstream afilestream(afileh.c_str());
+ probs.writeJumps(afilestream);
+ aCountTable.printTable(afile.c_str());
+ }
+ it_fn = time(NULL) ;
+ cout << "\n" << modelName << " Iteration: " << it<< " took: " <<
+ difftime(it_fn, it_st) << " seconds\n";
+ } // end of iterations
+ fn = time(NULL) ;
+ cout << endl << "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n";
+ //cout << "tTable contains " << tTable.getHash().bucket_count()
+ // << " buckets and " << tTable.getHash().size() << " entries." ;
+ cout << "==========================================================\n";
+ return minIter; // stays 0 when no iteration had errorsAL() below 1.0
+ }
+
+/*template<class T>
+T normalize_if_possible_with_increment(T*a,T*b,int increment)
+{
+ T sum=0;
+ for(T*i=a;i!=b;i+=increment)
+ sum+=*i;
+ if( sum )
+ for(T*i=a;i!=b;i+=increment)
+ *i/=sum;
+ else
+ {
+ T factor=increment/(b-a);
+ for(T*i=a;i!=b;i+=increment)
+ *i=factor;
+ }
+ return sum;
+}*/
+
+ void hmm::load_table(const char* aname){ // not implemented: prints a message and aborts
+ cout << "Hmm: loading a table not implemented.\n";
+ abort();
+ ifstream anamefile(aname); // unreachable: abort() above does not return
+ probs.readJumps(anamefile); // unreachable as well
+ }
+
+ HMMNetwork *hmm::makeHMMNetwork(const Vector<WordIndex>& es,const Vector<WordIndex>&fs,bool doInit)const // builds a new HMMNetwork for (es,fs); allocated with new, em_loop deletes it
+ {
+ unsigned int i,j;
+ unsigned int l = es.size() - 1; // English positions, not counting es[0]
+ unsigned int m = fs.size() - 1; // French positions, not counting fs[0]
+ unsigned int I=2*l,J=m; // I states: 0..l-1 are English words, l..2l-1 their empty-word copies (see CLASSIFY)
+ int IJ=I*J;
+ bool DependencyOfJ=(CompareAlDeps&(16|8))||(PredictionInAlignments==2);
+ bool DependencyOfPrevAJ=(CompareAlDeps&(2|4))||(PredictionInAlignments==0);
+ HMMNetwork *net = new HMMNetwork(I,J);
+ fill(net->alphainit.begin(),net->alphainit.end(),0.0);
+ fill(net->betainit.begin(),net->betainit.end(),0.0);
+ for(j=1;j<=m;j++)
+ {
+ for(i=1;i<=l;i++)
+ net->n(i-1,j-1)=tTable.getProb(es[i], fs[j]) ; // lexical probability for the pair (es[i], fs[j])
+ double emptyContribution=0;
+ emptyContribution=tTable.getProb(es[0],fs[j]) ; // probability for (es[0], fs[j]), shared by all l empty-word states
+ for(i=1;i<=l;i++)
+ net->n(i+l-1,j-1)=emptyContribution;
+ net->finalMultiply*=max(normalize_if_possible_with_increment(&net->n(0,j-1),&net->n(0,j-1)+IJ,J),double(1e-12)); // presumably the helper returns the pre-normalization sum (as the commented-out copy in this file does) — TODO confirm; floored at 1e-12
+ }
+ if( DependencyOfJ )
+ net->e.resize(m-1); // NOTE(review): m-1 wraps around (unsigned) when m==0 — confirm empty French sentences cannot reach here
+ else
+ net->e.resize(J>1); // one transition matrix when J>1, none otherwise
+ for(j=0;j<net->e.size();j++)
+ {
+ int frenchClass=fwordclasses.getClass(fs[1+min(int(m)-1,int(j)+1)]);
+ net->e[j].resize(I,I,0);
+ for(unsigned int i1=0;i1<I;++i1) {
+ Array<double> al(l);
+ CLASSIFY2(i1,i1real);
+ for(unsigned int i2=0;i2<l;i2++)
+ al[i2]=probs.getAlProb(i1real,i2,l,m,ewordclasses.getClass(es[1+i1real]),frenchClass
+ ,j+1);
+ normalize_if_possible(conv<double>(al.begin()),conv<double>(al.end()));
+ if( SmoothHMM&2 )
+ smooth_standard(conv<double>(al.begin()),conv<double>(al.end()),HMMAlignmentModelSmoothFactor);
+ for(unsigned int i2=0;i2<I;i2++) {
+ CLASSIFY(i2,empty_i2,i2real);
+ net->e[j](i1,i2) = al[i2real];
+
+ if( empty_i2 ) // the else below pairs with the inner if (i1real!=i2real)
+ if(i1real!=i2real)
+ {
+ net->e[j](i1,i2)=0;
+ }
+ else
+ {
+ net->e[j](i1,i2)=doInit?al[0]:(probs.getProbabilityForEmpty()); // make first HMM iteration like IBM-1
+ }
+ }
+ normalize_if_possible(&net->e[j](i1,0),&net->e[j](i1,0)+I);
+ }
+ }
+ if( doInit )
+ {
+ for(unsigned int i=0;i<I;++i)
+ {
+ net->alphainit[i]=net->betainit[i]=(i<I/2)?1:(2.0/I); // NOTE(review): the betainit value stored here is overwritten on the next line
+ net->betainit[i]=1.0;
+ }
+ }
+ else
+ {
+ if( DependencyOfPrevAJ==0 )
+ {
+ for(i=0;i<I;i++)
+ {
+ CLASSIFY2(i,ireal);
+ net->alphainit[i]=probs.getAlProb(-1,ireal,l,m,0,fwordclasses.getClass(fs[1+0]),0);
+ }
+ }
+ else
+ {
+ if( UniformEntryExit&2 )probs.getBetaInit(I,net->betainit);
+ if( UniformEntryExit&1 )probs.getAlphaInit(I,net->alphainit);
+ }
+ }
+ massert( net->alphainit.size()==I );massert( net->betainit.size()==I );
+ normalize_if_possible(conv<double>(net->alphainit.begin()),conv<double>(net->alphainit.end()));
+ normalize_if_possible(conv<double>(net->betainit.begin()),conv<double>(net->betainit.end()));
+ transform(net->betainit.begin(),net->betainit.end(),net->betainit.begin(),bind1st(multiplies<double>(),2*l)); // multiply every betainit entry by 2*l
+ return net;
+ }
+extern float MINCOUNTINCREASE;
+
+ void hmm::em_loop(Perplexity& perp, sentenceHandler& sHandler1, // one pass over the corpus: forward-backward per pair, count collection (unless test), Viterbi alignment, perplexity bookkeeping
+ bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp,
+ bool test,bool doInit,int
+ ) // the unnamed int is declared as iter in hmm.h and is not used here
+ {
+ WordIndex i, j, l, m ;
+ double cross_entropy;
+ int pair_no=0 ; // incremented once per pair; not read otherwise in this function
+ perp.clear();
+ viterbi_perp.clear();
+ ofstream of2;
+ // for each sentence pair in the corpus
+ if (dump_alignment||FEWDUMPS )
+ of2.open(alignfile);
+ sentPair sent ;
+ sHandler1.rewind();
+ while(sHandler1.getNextSentence(sent)){
+ const Vector<WordIndex>& es = sent.get_eSent();
+ const Vector<WordIndex>& fs = sent.get_fSent();
+ const float so = sent.getCount(); // getCount() returns double; stored as float here
+ l = es.size() - 1;
+ m = fs.size() - 1;
+ cross_entropy = log(1.0); // i.e. starts at 0
+ Vector<WordIndex> viterbi_alignment(fs.size());
+
+ unsigned int I=2*l,J=m;
+ bool DependencyOfJ=(CompareAlDeps&(16|8))||(PredictionInAlignments==2);
+ bool DependencyOfPrevAJ=(CompareAlDeps&(2|4))||(PredictionInAlignments==0);
+ HMMNetwork *net=makeHMMNetwork(es,fs,doInit);
+ Array<double> gamma;
+ Array<Array2<double> > epsilon(DependencyOfJ?(m-1):1);
+ double trainProb;
+ trainProb=ForwardBackwardTraining(*net,gamma,epsilon);
+ if( !test ) // counts are collected only when this is not a test pass
+ {
+ double *gp=conv<double>(gamma.begin());
+ for(unsigned int i2=0;i2<J;i2++)for(unsigned int i1=0;i1<I;++i1,++gp) // gamma is walked with the state index i1 fastest
+ if( *gp>MINCOUNTINCREASE )
+ {
+ COUNT add= *gp*so;
+ if( i1>=l ) // states l..2l-1 are counted against es[0]
+ {
+ tTable.incCount(es[0],fs[1+i2],add);
+ aCountTable.getRef(0,i2+1,l,m)+=add;
+ }
+ else
+ {
+ tTable.incCount(es[1+i1],fs[1+i2],add);
+ aCountTable.getRef(1+i1,1+i2,l,m)+=add;
+ }
+ }
+ double p0c=0.0,np0c=0.0; // mass accumulated for empty-word states (p0c) and for the others (np0c)
+ for(unsigned int jj=0;jj<epsilon.size();jj++)
+ {
+ int frenchClass=fwordclasses.getClass(fs[1+min(int(m)-1,int(jj)+1)]);
+ double *ep=epsilon[jj].begin();
+ if( ep )
+ {
+ //for(i=0;i<I;i++)
+ // normalize_if_possible_with_increment(ep+i,ep+i+I*I,I);
+ // for(i=0;i<I*I;++i)
+ // ep[i] *= I;
+ //if( DependencyOfJ )
+ // if( J-1 )
+ // for(i=0;i<I*I;++i)
+ // ep[i] /= (J-1);
+ double mult=1.0;
+ mult*=l;
+ //if( DependencyOfJ && J-1)
+ // mult/=(J-1);
+ for(i=0;i<I;i++)
+ {
+ for(unsigned int i_bef=0;i_bef<I;i_bef++,ep++)
+ {
+ CLASSIFY(i,i_empty,ireal);
+ CLASSIFY2(i_bef,i_befreal);
+ if( i_empty )
+ p0c+=*ep * mult;
+ else
+ {
+ counts.addAlCount(i_befreal,ireal,l,m,ewordclasses.getClass(es[1+i_befreal]),
+ frenchClass ,jj+1,*ep * mult,0.0);
+ np0c+=*ep * mult;
+ }
+ massert( &epsilon[jj](i,i_bef)== ep);
+ }
+ }
+ }
+ }
+ double *gp1=conv<double>(gamma.begin()),*gp2=conv<double>(gamma.end())-I; // gp1 walks the first I gamma entries, gp2 the last I
+ Array<double>&ai=counts.doGetAlphaInit(I);
+ Array<double>&bi=counts.doGetBetaInit(I);
+ int firstFrenchClass=(fs.size()>1)?(fwordclasses.getClass(fs[1+0])):0;
+ for(i=0;i<I;i++,gp1++,gp2++)
+ {
+ CLASSIFY(i,i_empty,ireal);
+ ai[i]+= *gp1;
+ bi[i]+= *gp2;
+ if( DependencyOfPrevAJ==0 )
+ {
+ if( i_empty )
+ p0c+=*gp1;
+ else
+ {
+ counts.addAlCount(-1,ireal,l,m,0,firstFrenchClass,0,*gp1,0.0);
+ np0c+=*gp1;
+ }
+ }
+ }
+ if( Verbose )
+ cout << "l: " << l << "m: " << m << " p0c: " << p0c << " np0c: " << np0c << endl;
+ }
+ cross_entropy+=log(max(trainProb,1e-100))+log(max(net->finalMultiply,1e-100)); // both factors are floored at 1e-100 before the log
+ Array<int>vit;
+ double viterbi_score=1.0;
+ if( (HMMTrainingSpecialFlags&1) )
+ HMMViterbi(*net,gamma,vit);
+ else
+ viterbi_score=HMMRealViterbi(*net,vit);
+ for(j=1;j<=m;j++)
+ {
+ viterbi_alignment[j]=vit[j-1]+1; // shift the state index by one; values above l are mapped to 0 below
+ if( viterbi_alignment[j]>l)
+ viterbi_alignment[j]=0;
+ }
+ sHandler1.setProbOfSentence(sent,cross_entropy);
+ perp.addFactor(cross_entropy, so, l, m,1);
+ viterbi_perp.addFactor(log(viterbi_score)+log(max(net->finalMultiply,1e-100)), so, l, m,1);
+ if( Verbose )
+ cout << "Viterbi-perp: " << log(viterbi_score) << ' ' << log(max(net->finalMultiply,1e-100)) << ' ' << viterbi_score << ' ' << net->finalMultiply << ' ' << *net << "gamma: " << gamma << endl;
+ delete net;net=0; // the network returned by makeHMMNetwork is released here
+ if (dump_alignment||(FEWDUMPS&&sent.getSentenceNo()<1000) )
+ printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.getSentenceNo(), viterbi_score);
+ addAL(viterbi_alignment,sent.getSentenceNo(),l);
+ pair_no++;
+ } /* of while */
+ sHandler1.rewind();
+ perp.record("HMM");
+ viterbi_perp.record("HMM");
+ errorReportAL(cout,"HMM");
+ }
+
+#include "HMMTables.cpp"
+template class HMMTables<int,WordClasses>;
+
diff --git a/GIZA++-v2/hmm.h b/GIZA++-v2/hmm.h
new file mode 100644
index 0000000..d4f3301
--- /dev/null
+++ b/GIZA++-v2/hmm.h
@@ -0,0 +1,88 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _hmm_h
+#define _hmm_h 1
+
+#include <assert.h>
+
+#include <iostream>
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <set>
+#include "Vector.h"
+#include <utility>
+
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+#else
+#include <hash_map>
+#endif
+#include <fstream.h>
+#include <math.h>
+#include <time.h>
+
+#include "TTables.h"
+#include "ATables.h"
+#include "getSentence.h"
+#include "defs.h"
+#include "model2.h"
+#include "Perplexity.h"
+#include "vocab.h"
+#include "WordClasses.h"
+#include "HMMTables.h"
+#include "ForwardBackward.h"
+
+ class hmm : public model2 // HMM alignment model built on top of model2
+ {
+ private:
+ WordClasses ewordclasses; // English word classes, filled by makeWordClasses
+ WordClasses fwordclasses; // French word classes, filled by makeWordClasses
+ HMMTables<int,WordClasses> counts,probs; // counts: filled in em_loop; probs: copied from counts in em_with_tricks
+ public:
+ template<class MAPPER>
+ void makeWordClasses(const MAPPER&m1,const MAPPER&m2,string efile,string ffile) // reads both word-class files; a file that cannot be opened is reported on cerr and skipped
+ {
+ ifstream estrm(efile.c_str()),fstrm(ffile.c_str());
+ if( !estrm )
+ {
+ cerr << "ERROR: can not read " << efile << endl;
+ }
+ else
+ ewordclasses.read(estrm,m1);
+ if( !fstrm )
+ cerr << "ERROR: can not read " << ffile << endl;
+ else
+ fwordclasses.read(fstrm,m2);
+ }
+ hmm(model2&m2);
+ void initialize_table_uniformly(sentenceHandler&);
+ int em_with_tricks(int); // argument: number of iterations
+ void load_table(const char* aname);
+ void em_loop(Perplexity& perp, sentenceHandler& sHandler1, bool dump_files,
+ const char* alignfile, Perplexity&, bool test,bool doInit,int iter);
+ HMMNetwork *makeHMMNetwork(const Vector<WordIndex>& es,const Vector<WordIndex>&fs,bool doInit)const; // returns a network allocated with new
+ friend class model3;
+ };
+
+#endif
diff --git a/GIZA++-v2/logprob.cpp b/GIZA++-v2/logprob.cpp
new file mode 100644
index 0000000..8134b77
--- /dev/null
+++ b/GIZA++-v2/logprob.cpp
@@ -0,0 +1,161 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#pragma implementation
+
+// Routines to perform integer exponential arithmetic.
+// A number x is represented as n, where x = b**n.
+// It is assumed that b > 1, something like b = 1.001;
+
+#include "logprob.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <iostream>
+#include <fstream>
+#include <string>
+ double *LogProb::ntof = NULL; // Tables will be initialized
+ int *LogProb::addtbl = NULL; // in Initialize function.
+ int *LogProb::subtbl = NULL; //
+
+ const int LogProb::max_2byte_integer = 32767;
+ const int LogProb::min_2byte_integer = -32768;
+ const double LogProb::b = 1.001; // a logarithm basis
+ const double LogProb::logb2 = log(b); // natural logarithm of b; dividing log(x) by it gives the base-b exponent
+ //const int LogProb::nmax = round(78.0E0 * log(1.0E1) / logb2);
+ const int LogProb::nmax = round(300.0E0 * log(1.0E1) / logb2); // base-b exponent of 1e300
+ const int LogProb::nmin = -nmax;
+ const int LogProb::tblbnd = round(log((b-1.0E0)/2.0E0)/logb2); // base-b exponent of (b-1)/2; negative, sizes addtbl and subtbl
+ const int LogProb::zeron = round(pow(-2, 23)); // -2^23: the exponent value that stands for zero
+ const int LogProb::onen = 0; // exponent 0 stands for one
+ const int LogProb::infn = onen - zeron;
+
+ const int LogProb::initialized = LogProb::Initialize(); // calling Initialize() here builds the tables when this constant is initialized
+ const LogProb LogProb::zero(0);
+ const LogProb LogProb::one(1);
+ const LogProb LogProb::minus2(1e-2);
+ const LogProb LogProb::minus4(1e-4);
+ const LogProb LogProb::minus6(1e-6);
+ const LogProb LogProb::minus8(1e-8);
+ const LogProb LogProb::minus10(1e-10);
+ const LogProb LogProb::minus12(1e-12);
+ const LogProb LogProb::minus14(1e-14);
+ const LogProb LogProb::minus16(1e-16);
+
+// static table initialization function
+ int LogProb::Initialize() // allocates and fills ntof, addtbl and subtbl; always returns 1
+ {
+   int nbytes = sizeof(double)*(nmax-nmin+1) + 2*sizeof(int)*(-tblbnd+1); // ntof plus BOTH int tables (addtbl and subtbl)
+   cerr << nbytes << " bytes used for LogProb tables (C++ version)\n";
+   ntof = new double[nmax-nmin+1];
+   addtbl = new int[-tblbnd+1];
+   subtbl = new int[-tblbnd+1];
+   // (disabled) code that loaded the tables from a file instead of computing them:
+   // char filename[257];
+   // string filename ;
+   // ifstream ifs;
+   // ifs.open(filename.c_str());
+   // if (!ifs)
+   // {
+   int i;
+   cerr << "Building integer logs conversion tables\n";
+   ntof[0] = 0 ; // slot returned by operator double() for logr <= nmin: exactly zero
+   // ntof[n-nmin] = b**n for n in (nmin, nmax]
+   for (i=nmin+1; i<=nmax; ++i)
+   {
+     double x = i;
+     ntof[i-nmin] = exp(x*logb2);
+
+   }
+   for (i=tblbnd; i<=0; ++i) // addtbl[d-tblbnd] = round(log_b(1 + b**d)) for d in [tblbnd, 0]
+   {
+     double x = 1.0 + pow(b, i);
+     addtbl[i-tblbnd] = round(log(x)/logb2);
+   }
+   double sqrtb = exp(0.5*logb2);
+   for (i=0; i<=-tblbnd; ++i) // subtbl[d] = round(log_b(sqrt(b)*b**d - 1)) for d in [0, -tblbnd]
+   {
+     double x = sqrtb * pow(b, i) - 1.0;
+     subtbl[i] = round(log(x)/logb2);
+   }
+   // if (toolsRoot)
+   // {
+   // ofstream ofs(filename.c_str());
+   // if (!ofs)
+   // cerr << "Could not write LogProb data to " << filename << endl;
+   // else
+   // {
+   // ofs.write((const char *)ntof, sizeof(double) * (nmax-nmin+1));
+   // ofs.write((const char *)addtbl, sizeof(int) * (-tblbnd+1));
+   // ofs.write((const char *)subtbl, sizeof(int) * (-tblbnd+1));
+   // }
+   // }
+   // }
+   // else
+   // {
+   // ifs.read((char *)ntof, sizeof(double) * (nmax - nmin + 1));
+   // ifs.read((char *)addtbl, sizeof(int) * (-tblbnd+1));
+   // ifs.read((char *)subtbl, sizeof(int) * (-tblbnd+1));
+   // }
+   return 1;
+ }
+
+ void LogProb::FreeTables() // releases the three tables; safe to call more than once
+ {
+   delete [] addtbl; addtbl = NULL; // reset so a repeated call deletes a null pointer (a no-op) instead of freeing twice
+   delete [] subtbl; subtbl = NULL;
+   delete [] ntof;   ntof = NULL;
+ }
+
+//---------------------------------------------------------------------------
+ // Arithmetic operators
+//---------------------------------------------------------------------------
+
+
+// Subtract two logarithm numbers. Use the following method:
+// b**n - b**m = b**m( b**(n-m) - 1 ), assuming n >= m.
+ LogProb& LogProb::operator-=(const LogProb &subs)
+ {
+   // Subtracting zero leaves the value untouched.
+   if (subs.logr == zeron)
+     return *this;
+   const int gap = logr - subs.logr; // exponent difference n-m
+   // A negative gap means the subtrahend is the larger value: warn, then fall into the zero case.
+   if (gap < 0)
+     cerr << "WARNING(logprob): Invalid arguments to nsub" <<(*this)<< " " << subs << endl;
+   if (gap <= 0)
+   {
+     // Equal or smaller minuend: the result is stored as zero.
+     logr = zeron;
+     return *this;
+   }
+   // Within the table range the difference is looked up; beyond it the value stays as it is.
+   if (gap <= -tblbnd)
+     logr = subs.logr + subtbl[gap];
+   return *this;
+ }
+
+
+
+
+
+
+
+
diff --git a/GIZA++-v2/logprob.h b/GIZA++-v2/logprob.h
new file mode 100644
index 0000000..feaf7cb
--- /dev/null
+++ b/GIZA++-v2/logprob.h
@@ -0,0 +1,222 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _LOGPROB_H
+#define _LOGPROB_H
+
+#pragma interface
+
+// Routines to perform integer exponential arithmetic.
+// A number x is represented as n, where x = b**n
+// It is assumed that b > 1, something like b = 1.001
+
+#include <iostream>
+#include <math.h>
+#include <algorithm>
+
+//#define MAX(A,B) ((A) > (B) ? (A) : (B))
+//#define MIN(A,B) ((A) > (B) ? (B) : (A))
+
+
+ class LogProb { // a number stored as an integer exponent logr, standing for b**logr
+ public:
+ // mj for cross entropy
+ double base2() const { // logr converted to a base-2 logarithm
+ return (logr * logb2 / log(2));
+ }
+
+ // Constructors
+ LogProb() : logr(zeron) {} // default value is zero
+ LogProb(const LogProb &obj) : logr(obj.logr) {}
+ LogProb(double x) : logr(x == 0.0 ? zeron : round(log(x)/logb2)) {} // 0.0 maps to zeron, anything else to the rounded base-b logarithm
+ // destructor
+ ~LogProb() {} // default destructor
+
+ operator double() const // converts logr to (double) b**logr
+ {
+ if (logr < nmin) return ntof[0]; // below the table: first entry
+ if (logr > nmax) return ntof[nmax-nmin]; // above the table: last entry
+ return ntof[logr-nmin];
+ }
+
+ LogProb &operator=(const LogProb &obj) { logr = obj.logr; return *this; }
+ int operator!() const { return logr == zeron; } // true exactly when the value is zero
+
+ // iostream friend specifications
+ friend ostream& operator<<(ostream& os, const LogProb &obj);
+ friend istream& operator>>(istream& is, LogProb &obj);
+ friend ostream& operator<<=(ostream& os, const LogProb &obj);
+ friend istream& operator>>=(istream& is, LogProb &obj);
+
+ // arithmetic operators
+ LogProb &operator+=(const LogProb &add) // logr2 = logb ( b**logr2 + b**logr1 )
+ // Add two numbers represented as logarithms. Use the following method:
+ // b**n + b**m = b**n(1 + b**(m-n)), assuming n >= m.
+ {
+ if (add.logr == zeron)
+ return *this;
+ if (logr == zeron)
+ {
+ logr = add.logr;
+ return *this;
+ }
+ int a = add.logr - logr;
+ if (a > 0) // make logr the larger exponent and a the non-positive difference
+ {
+ a = -a;
+ logr = add.logr;
+ }
+ if (a < tblbnd) // difference outside the table range: keep the larger exponent unchanged
+ return *this;
+ logr += addtbl[a-tblbnd];
+ return *this;
+ }
+
+ LogProb &operator-=(const LogProb &); // logr2 = logb ( b**logr2 - b**logr1 )
+ LogProb operator*(const LogProb &mul) const // logr3 = logr2 + logr1
+ {
+ LogProb result; // start out with result == 0
+ if ((logr != zeron) && (mul.logr != zeron))
+ result.logr = std::max(logr+mul.logr, zeron);
+ return result;
+ }
+ LogProb operator*(double x) const // logr3 = logr2 + logr1
+ {
+ return (*this)*(LogProb)x;
+ }
+ LogProb operator^(const int i) const // logr2 = logr1 * i
+ {
+ LogProb result; // start out with result == 0
+ // if ((logr != zeron) && (mul.logr != zeron))
+ result.logr = logr * i ; // NOTE(review): no zeron check here, unlike operator* — confirm zero operands are intended to be scaled
+ return result;
+ }
+ LogProb &operator*=(const LogProb &mul) // logr2 += logr1
+ {
+ if ((logr == zeron) || (mul.logr == zeron))
+ logr = zeron;
+ else
+ logr = std::max(logr+mul.logr, zeron);
+ return *this;
+ }
+ LogProb operator/(const LogProb &div) const // logr3 = logr2 -logr1
+ {
+ LogProb result;
+ if (logr != zeron)
+ result.logr = std::max(logr - div.logr, zeron); // NOTE(review): div.logr == zeron is not special-cased
+ return result;
+ }
+ LogProb &operator/=(const LogProb &div) // logr2 -= logr1
+ {
+ if (logr != zeron)
+ logr = std::max(logr - div.logr, zeron);
+ return *this;
+ }
+ LogProb operator+(const LogProb &l) const // logr3 = logb ( b**logr2 + b**logr1 )
+ { LogProb result(*this); result += l; return result; }
+ LogProb operator-(const LogProb &l) const // logr3 = logb ( b**logr2 - b**logr1 )
+ { LogProb result(*this); result -= l; return result; }
+ LogProb power(const int n) const // logr2 = logr1 * int
+ { LogProb result(*this); result.logr *= n; return result; }
+
+ // Conditional operators
+ int operator<(const LogProb &obj) const { return logr < obj.logr; }
+ int operator<=(const LogProb &obj) const { return logr <= obj.logr; }
+ int operator>(const LogProb &obj) const { return logr > obj.logr; }
+ int operator>=(const LogProb &obj) const { return logr >= obj.logr; }
+ int operator==(const LogProb &obj) const { return logr == obj.logr; }
+ int operator!=(const LogProb &obj) const { return logr != obj.logr; }
+ int operator<(double d) const { return ((double)*this) < d; }
+ int operator<=(double d) const { return ((double)*this) <= d; }
+ int operator>(double d) const { return ((double)*this) > d; }
+ int operator>=(double d) const { return ((double)*this) >= d; }
+ int operator==(double d) const { return ((double)*this) == d; }
+ int operator!=(double d) const { return ((double)*this) != d; }
+
+
+ LogProb &SetZero() { logr = zeron; return *this; } // representation of 0,
+ LogProb &SetOne() { logr = onen; return *this; } // 1, and
+ LogProb &SetInf() { logr = infn; return *this; } // inf in logarithm domain
+
+ private:
+ int logr; // a representation of logarithm
+ // static constants
+ static const int initialized; // initialization flag
+ static const double b;
+ static const double logb2;
+ static const int nmin, nmax;
+ static const int tblbnd;
+ static const int zeron, onen, infn; // zero, one, and inf in log domain
+ static const int max_2byte_integer, min_2byte_integer;
+
+ // Arithmetic computation Tables
+ static double *ntof;
+ static int *addtbl;
+ static int *subtbl;
+
+ static int Initialize();
+
+ public:
+ static void FreeTables();
+ // constants for initializing LogProbs to 0 or 1
+ static const LogProb zero;
+ static const LogProb one;
+ static const LogProb minus2;
+ static const LogProb minus4;
+ static const LogProb minus6;
+ static const LogProb minus8;
+ static const LogProb minus10;
+ static const LogProb minus12;
+ static const LogProb minus14;
+ static const LogProb minus16;
+ };
+
+// iostream friend operators
+ inline ostream &operator<<(ostream& os, const LogProb &obj)
+ {
+   return os << static_cast<double>(obj); // printed in the linear domain, b**logr
+ }
+
+ inline istream &operator>>(istream& is, LogProb &obj) // reads a linear-domain double and converts it
+ {
+   double d = 0.0;  // never left indeterminate, even if extraction fails
+   if (is >> d)     // assign only on success; a failed read leaves obj unchanged
+     obj = d;
+   return is;       // the stream state tells the caller whether the read worked
+ }
+
+ inline ostream &operator<<=(ostream& os, const LogProb &obj) // binary output
+ {
+   // Emit the raw bytes of the integer exponent; write() hands back the same stream.
+   return os.write(reinterpret_cast<const char *>(&obj.logr), sizeof(obj.logr));
+ }
+
+ inline istream &operator>>=(istream& is, LogProb &obj) // binary input
+ {
+   // Fill the integer exponent from raw bytes; read() hands back the same stream.
+   return is.read(reinterpret_cast<char *>(&obj.logr), sizeof(obj.logr));
+ }
+
+#endif
+
+
+
+
diff --git a/GIZA++-v2/main.cpp b/GIZA++-v2/main.cpp
new file mode 100644
index 0000000..6032ebe
--- /dev/null
+++ b/GIZA++-v2/main.cpp
@@ -0,0 +1,718 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+#include <strstream>
+#include "getSentence.h"
+#include "TTables.h"
+#include "model1.h"
+#include "model2.h"
+#include "model3.h"
+#include "hmm.h"
+#include "file_spec.h"
+#include "defs.h"
+#include "vocab.h"
+#include "Perplexity.h"
+#include "Dictionary.h"
+#include "utility.h"
+#include "Parameter.h"
+#include "myassert.h"
+#include "D4Tables.h"
+#include "D5Tables.h"
+#include "transpair_model4.h"
+#include "transpair_model5.h"
+
+// Values passed as the last argument of the Model 2 and HMM iteration
+// parameters declared below.
+#define ITER_M2 0
+#define ITER_MH 5
+
+// One iteration-count parameter per alignment model.
+// NOTE(review): GLOBAL_PARAMETER3 is defined outside this file; the trailing
+// argument looks like the default value -- confirm in Parameter.h.
+GLOBAL_PARAMETER3(int,Model1_Iterations,"Model1_Iterations","NO. ITERATIONS MODEL 1","m1","number of iterations for Model 1",PARLEV_ITER,5);
+GLOBAL_PARAMETER3(int,Model2_Iterations,"Model2_Iterations","NO. ITERATIONS MODEL 2","m2","number of iterations for Model 2",PARLEV_ITER,ITER_M2);
+// NOTE(review): the string arguments on the next line are ordered differently
+// from the sibling lines ("mh" appears second and fourth, the description
+// third) -- verify against the macro's parameter order before relying on it.
+GLOBAL_PARAMETER3(int,HMM_Iterations,"HMM_Iterations","mh","number of iterations for HMM alignment model","mh", PARLEV_ITER,ITER_MH);
+GLOBAL_PARAMETER3(int,Model3_Iterations,"Model3_Iterations","NO. ITERATIONS MODEL 3","m3","number of iterations for Model 3",PARLEV_ITER,5);
+GLOBAL_PARAMETER3(int,Model4_Iterations,"Model4_Iterations","NO. ITERATIONS MODEL 4","m4","number of iterations for Model 4",PARLEV_ITER,5);
+GLOBAL_PARAMETER3(int,Model5_Iterations,"Model5_Iterations","NO. ITERATIONS MODEL 5","m5","number of iterations for Model 5",PARLEV_ITER,0);
+GLOBAL_PARAMETER3(int,Model6_Iterations,"Model6_Iterations","NO. ITERATIONS MODEL 6","m6","number of iterations for Model 6",PARLEV_ITER,0);
+
+
+// Parameters registered at level PARLEV_OPTHEUR (printed by printGIZAPars
+// under the "heuristics ... for efficient training" heading).
+GLOBAL_PARAMETER(float, PROB_SMOOTH,"probSmooth","probability smoothing (floor) value ",PARLEV_OPTHEUR,1e-7);
+GLOBAL_PARAMETER(float, MINCOUNTINCREASE,"minCountIncrease","minimal count increase",PARLEV_OPTHEUR,1e-7);
+
+// Output-related switches (level PARLEV_OUTPUT). main() opens the log file
+// only when Log is set; Transfer_Dump_Freq is consulted in StartTraining().
+GLOBAL_PARAMETER2(int,Transfer_Dump_Freq,"TRANSFER DUMP FREQUENCY","t2to3","output: dump of transfer from Model 2 to 3",PARLEV_OUTPUT,0);
+GLOBAL_PARAMETER2(bool,Verbose,"verbose","v","0: not verbose; 1: verbose",PARLEV_OUTPUT,0);
+GLOBAL_PARAMETER(bool,Log,"log","0: no logfile; 1: logfile",PARLEV_OUTPUT,0);
+
+
+// EM-related parameters (level PARLEV_EM).
+GLOBAL_PARAMETER(double,P0,"p0","fixed value for parameter p_0 in IBM-3/4 (if negative then it is determined in training)",PARLEV_EM,-1.0);
+GLOBAL_PARAMETER(double,M5P0,"m5p0","fixed value for parameter p_0 in IBM-5 (if negative then it is determined in training)",PARLEV_EM,-1.0);
+GLOBAL_PARAMETER3(bool,Peg,"pegging","p","DO PEGGING? (Y/N)","0: no pegging; 1: do pegging",PARLEV_EM,0);
+
+// Obsolete option: main() only prints a warning when this is non-zero.
+GLOBAL_PARAMETER(short,OldADBACKOFF,"adbackoff","",-1,0);
+// initGlobals() resets this to MAX_SENTENCE_LENGTH_ALLOWED before the command
+// line is parsed; main() reports an error if the parsed value exceeds it.
+GLOBAL_PARAMETER2(unsigned int,MAX_SENTENCE_LENGTH,"ml","MAX SENTENCE LENGTH","maximum sentence length",0,MAX_SENTENCE_LENGTH_ALLOWED);
+
+
+// Model-shaping parameter (level PARLEV_MODELS); see the description string.
+GLOBAL_PARAMETER(short, DeficientDistortionForEmptyWord,"DeficientDistortionForEmptyWord","0: IBM-3/IBM-4 as described in (Brown et al. 1993); 1: distortion model of empty word is deficient; 2: distoriton model of empty word is deficient (differently); setting this parameter also helps to avoid that during IBM-3 and IBM-4 training too many words are aligned with the empty word",PARLEV_MODELS,0);
+// Plain globals: unlike their neighbours they are not declared through a
+// GLOBAL_PARAMETER macro in this file.
+short OutputInAachenFormat=0;
+bool Transfer=TRANSFER; // TRANSFER is defined outside this file
+bool Transfer2to3=0; // forces the Model 2 -> 3 transfer step in StartTraining()
+short NoEmptyWord=0;
+bool FEWDUMPS=0; // when set, printAllTables() skips the inverse t-table dumps
+GLOBAL_PARAMETER(bool,ONLYALDUMPS,"ONLYALDUMPS","1: do not write any files",PARLEV_OUTPUT,0);
+GLOBAL_PARAMETER(short,CompactAlignmentFormat,"CompactAlignmentFormat","0: detailled alignment format, 1: compact alignment format ",PARLEV_OUTPUT,0);
+// initGlobals() sets NODUMPS to false before the command line is parsed.
+GLOBAL_PARAMETER2(bool,NODUMPS,"NODUMPS","NO FILE DUMPS? (Y/N)","1: do not write any files",PARLEV_OUTPUT,0);
+
+GLOBAL_PARAMETER(WordIndex,MAX_FERTILITY,"MAX_FERTILITY","maximal fertility for fertility models",PARLEV_EM,10);
+
+// Reference alignments, one link map per sentence; this is the container type
+// that ReadAlignment() fills.
+Vector<map< pair<int,int>,char > > ReferenceAlignment;
+
+
+// When true, StartTraining() loads the dictionary from dictionary_Filename;
+// otherwise it constructs the Dictionary from an empty file name.
+bool useDict = false ;
+// Only consulted when BINARY_SEARCH_FOR_TTABLE is defined.
+string CoocurrenceFile;
+// File names and prefixes; most are bound to command-line parameters in
+// main(), and Prefix/LogFilename get their defaults in initGlobals().
+string Prefix, LogFilename, OPath, Usage,
+ SourceVocabFilename, TargetVocabFilename, CorpusFilename,
+ TestCorpusFilename, t_Filename, a_Filename, p0_Filename, d_Filename,
+ n_Filename, dictionary_Filename;
+
+// Log stream; opened in main() on LogFilename when the Log parameter is set.
+ofstream logmsg ;
+// Returns the decimal representation of n.
+// Non-negative values produce exactly the digits they always did; negative
+// values (which previously yielded a single garbage character because
+// n % 10 is negative) are now rendered with a leading '-'.
+const string str2Num(int n){
+  const bool negative = n < 0;
+  // Work on the unsigned magnitude so that the most negative int does not
+  // overflow when its sign is flipped.
+  unsigned int magnitude = negative ? 0u - (unsigned int)n : (unsigned int)n;
+  string number;
+  do{
+    // prepend the least significant remaining digit
+    number.insert((size_t)0, 1, (char)(magnitude % 10 + '0'));
+  } while((magnitude /= 10) > 0);
+  if( negative )
+    number.insert((size_t)0, 1, '-');
+  return number;
+}
+
+
+// Initial value only: StartTraining() overwrites LAMBDA with a ratio computed
+// from the training vocabulary totals and prints it as the
+// "lambda for PP calculation".
+double LAMBDA=1.09;
+// Corpus handlers; both are allocated with new in StartTraining()
+// (testCorpus only when a test corpus file name is given).
+sentenceHandler *testCorpus=0,*corpus=0;
+// Perplexity accumulators handed to the models and to the perplexity report.
+Perplexity trainPerp, testPerp, trainViterbiPerp, testViterbiPerp ;
+
+// When non-empty, StartTraining() loads previously written tables with this
+// prefix and scores the corpus instead of training.
+string ReadTablePrefix;
+
+
+// Writes all global parameters to `out`, one group per parameter level.
+// Each group consists of a heading with its underline, the parameters that
+// printPars() emits for that level, and a terminating empty line.
+void printGIZAPars(ostream&out)
+{
+  struct Section
+  {
+    const char *heading; // heading text including the underline
+    int level;           // level passed to printPars()
+  };
+  const Section sections[] = {
+    { "general parameters:\n"
+      "-------------------\n", 0 },
+    { "No. of iterations:\n-"
+      "------------------\n", PARLEV_ITER },
+    { "parameter for various heuristics in GIZA++ for efficient training:\n"
+      "------------------------------------------------------------------\n", PARLEV_OPTHEUR },
+    { "parameters for describing the type and amount of output:\n"
+      "-----------------------------------------------------------\n", PARLEV_OUTPUT },
+    { "parameters describing input files:\n"
+      "----------------------------------\n", PARLEV_INPUT },
+    { "smoothing parameters:\n"
+      "---------------------\n", PARLEV_SMOOTH },
+    { "parameters modifying the models:\n"
+      "--------------------------------\n", PARLEV_MODELS },
+    { "parameters modifying the EM-algorithm:\n"
+      "--------------------------------------\n", PARLEV_EM }
+  };
+  const size_t sectionCount = sizeof(sections)/sizeof(sections[0]);
+
+  for(size_t k=0;k<sectionCount;++k)
+  {
+    out << sections[k].heading;
+    printPars(out,getGlobalParSet(),sections[k].level);
+    out << '\n';
+  }
+}
+
+// Strips the path information from a file name.
+// Returns a pointer to the character after the last '/' in `fullpath`, or
+// `fullpath` itself when it contains no '/'. The result points into the
+// caller's buffer; nothing is copied.
+const char*stripPath(const char*fullpath)
+{
+  // strrchr copes with the empty string; the previous hand-written backwards
+  // scan started at fullpath+strlen(fullpath)-1, which lies before the buffer
+  // (and was dereferenced) when the string is empty.
+  const char *lastSlash = strrchr(fullpath, '/');
+  return lastSlash ? lastSlash+1 : fullpath;
+}
+
+
+// Writes a configuration-file template for the Rewrite Decoder to
+// "<Prefix>.Decoder.config". The template names the table files of this
+// training run (always the ".t3/.n3/.d3/.p0_3/.fe0_3" variants), the vocabulary
+// files and the word-class files. Exits with status 1 if the file cannot be
+// opened for writing.
+void printDecoderConfigFile()
+{
+  const string configName = Prefix + ".Decoder.config" ;
+  cerr << "writing decoder configuration file to " << configName.c_str() <<'\n';
+  ofstream decoder(configName.c_str());
+  if(!decoder){
+    cerr << "\nCannot write to " << configName <<'\n';
+    exit(1);
+  }
+
+  // --- language model section ---
+  decoder << "# Template for Configuration File for the Rewrite Decoder\n# Syntax:\n"
+          << "# <Variable> = <value>\n# '#' is the comment character\n"
+          << "#================================================================\n"
+          << "#================================================================\n"
+          << "# LANGUAGE MODEL FILE\n# The full path and file name of the language model file:\n"
+          << "LanguageModelFile =\n";
+
+  // --- translation model section ---
+  decoder << "#================================================================\n"
+          << "#================================================================\n"
+          << "# TRANSLATION MODEL FILES\n# The directory where the translation model tables as created\n"
+          << "# by Giza are located:\n#\n"
+          << "# Notes: - All translation model \"source\" files are assumed to be in\n"
+          << "# TM_RawDataDir, the binaries will be put in TM_BinDataDir\n"
+          << "#\n# - Attention: RELATIVE PATH NAMES DO NOT WORK!!!\n"
+          << "#\n# - Absolute paths (file name starts with /) will override\n"
+          << "# the default directory.\n\n";
+
+  // Directory part of Prefix (everything up to and including the last '/');
+  // when Prefix has no '/', find_last_of yields npos and npos+1 wraps to 0,
+  // giving an empty string that is replaced by ".".
+  const string::size_type lastSlash = Prefix.find_last_of("/");
+  string path = Prefix.substr(0, lastSlash+1);
+  if( path.empty() )
+    path=".";
+  decoder << "TM_RawDataDir = " << path << '\n'
+          << "TM_BinDataDir = " << path << '\n' << '\n';
+  decoder << "# file names of the TM tables\n# Notes:\n"
+          << "# 1. TTable and InversTTable are expected to use word IDs not\n"
+          << "# strings (Giza produces both, whereby the *.actual.* files\n"
+          << "# use strings and are THE WRONG CHOICE.\n"
+          << "# 2. FZeroWords, on the other hand, is a simple list of strings\n"
+          << "# with one word per line. This file is typically edited\n"
+          << "# manually. Hoeever, this one listed here is generated by GIZA\n\n";
+
+  // Highest model with a positive iteration count. Only the commented-out
+  // template text at the end of this function refers to it.
+  int lastmodel = 1 ;
+  if (Model2_Iterations>0) lastmodel = 2 ;
+  if (Model3_Iterations>0) lastmodel = 3 ;
+  if (Model4_Iterations>0) lastmodel = 4 ;
+  if (Model5_Iterations>0) lastmodel = 5 ;
+  string lastModelName = str2Num(lastmodel);
+
+  // The table names are fixed to model "3" rather than lastModelName.
+  const string tableModel = "3";
+
+  const string tTableName = Prefix + ".t" + tableModel + ".final";
+  decoder << "TTable = " << stripPath(tTableName.c_str()) << '\n';
+  const string inverseTTableName = Prefix + ".ti.final" ;
+  decoder << "InverseTTable = " << stripPath(inverseTTableName.c_str()) << '\n';
+  const string nTableName = Prefix + ".n" + tableModel + ".final";
+  decoder << "NTable = " << stripPath(nTableName.c_str()) << '\n';
+  const string d3TableName = Prefix + ".d" + tableModel + ".final";
+  decoder << "D3Table = " << stripPath(d3TableName.c_str()) << '\n';
+  const string d4TableName = Prefix + ".D4.final";
+  decoder << "D4Table = " << stripPath(d4TableName.c_str()) << '\n';
+  const string pZeroName = Prefix + ".p0_" + tableModel + ".final";
+  decoder << "PZero = " << stripPath(pZeroName.c_str()) << '\n';
+
+  decoder << "Source.vcb = " << SourceVocabFilename << '\n';
+  decoder << "Target.vcb = " << TargetVocabFilename << '\n';
+  decoder << "Source.classes = " << SourceVocabFilename+".classes" << '\n';
+  decoder << "Target.classes = " << TargetVocabFilename + ".classes" <<'\n';
+
+  const string fZeroWordsName = Prefix + ".fe0_" + tableModel + ".final";
+  decoder << "FZeroWords = " <<stripPath(fZeroWordsName.c_str()) << '\n' ;
+
+  /* decoder << "# Translation Parameters\n"
+  << "# Note: TranslationModel and LanguageModelMode must have NUMBERS as\n"
+  << "# values, not words\n"
+  << "# CORRECT: LanguageModelMode = 2\n"
+  << "# WRONG: LanguageModelMode = bigrams # WRONG, WRONG, WRONG!!!\n";
+  decoder << "TMWeight = 0.6 # weight of TM for calculating alignment probability\n";
+  decoder << "TranslationModel = "<<lastmodel<<" # which model to use (3 or 4)\n";
+  decoder << "LanguageModelMode = 2 # (2 (bigrams) or 3 (trigrams)\n\n";
+  decoder << "# Output Options\n"
+  << "TellWhatYouAreDoing = TRUE # print diagnostic messages to stderr\n"
+  << "PrintOriginal = TRUE # repeat original sentence in the output\n"
+  << "TopTranslations = 3 # number of n best translations to be returned\n"
+  << "PrintProbabilities = TRUE # give the probabilities for the translations\n\n";
+
+  decoder << "# LOGGING OPTIONS\n"
+  << "LogFile = - # empty means: no log, dash means: STDOUT\n"
+  << "LogLM = true # log language model lookups\n"
+  << "LogTM = true # log translation model lookups\n";
+  */
+}
+
+
+// Opens `filename` for writing, announces on stdout which vocabulary list
+// (`what`) is written there, and dumps `list` into it. Exits with status 1 if
+// the file cannot be opened.
+static void dumpVocabList(vcbList& list, const string& filename, const char* what)
+{
+  ofstream out(filename.c_str());
+  cout << "Writing " << what << " vocabulary list to : " << filename << '\n';
+  if(!out){
+    cerr << "\nERROR: Cannot write to " << filename <<'\n';
+    exit(1);
+  }
+  list.printVocabList(out) ;
+}
+
+// Writes the final output files of a training run, all named after Prefix:
+//   .ti.final / .actual.ti.final  inverse t-tables (skipped when FEWDUMPS is set)
+//   .perp                         perplexity report
+//   .trn.src.vcb / .trn.trg.vcb   training vocabulary lists
+//   .tst.src.vcb / .tst.trg.vcb   test vocabulary lists
+//   .Decoder.config               via printDecoderConfigFile()
+// and, when a test corpus is loaded, the overlap report.
+// Exits with status 1 if the perplexity or a vocabulary file cannot be opened.
+// The progress messages now name the list actually being written; previously
+// all four vocabulary files were announced as "source vocabulary list".
+void printAllTables(vcbList& eTrainVcbList, vcbList& eTestVcbList,
+                    vcbList& fTrainVcbList, vcbList& fTestVcbList, model1& m1)
+{
+  cerr << "writing Final tables to Disk \n";
+  if( !FEWDUMPS )
+  {
+    // inverse table keyed by word ids
+    string t_inv_file = Prefix + ".ti.final" ;
+    m1.getTTable().printProbTableInverse(t_inv_file.c_str(), m1.getEnglishVocabList(),
+                                         m1.getFrenchVocabList(),
+                                         m1.getETotalWCount(),
+                                         m1.getFTotalWCount());
+    // same table, written with the extra `true` flag to the ".actual" file
+    t_inv_file = Prefix + ".actual.ti.final" ;
+    m1.getTTable().printProbTableInverse(t_inv_file.c_str(),
+                                         eTrainVcbList.getVocabList(),
+                                         fTrainVcbList.getVocabList(),
+                                         m1.getETotalWCount(),
+                                         m1.getFTotalWCount(), true);
+  }
+
+  {
+    string perp_filename = Prefix + ".perp" ;
+    ofstream of_perp(perp_filename.c_str());
+
+    cout << "Writing PERPLEXITY report to: " << perp_filename << '\n';
+    if(!of_perp){
+      cerr << "\nERROR: Cannot write to " << perp_filename <<'\n';
+      exit(1);
+    }
+
+    // Without a test corpus the number of test pairs is reported as 0.
+    if (testCorpus)
+      generatePerplexityReport(trainPerp, testPerp, trainViterbiPerp,
+                               testViterbiPerp, of_perp, (*corpus).getTotalNoPairs1(),
+                               (*testCorpus).getTotalNoPairs1(),
+                               true);
+    else
+      generatePerplexityReport(trainPerp, testPerp, trainViterbiPerp, testViterbiPerp,
+                               of_perp, (*corpus).getTotalNoPairs1(), 0, true);
+  }
+
+  // training vocabulary lists
+  dumpVocabList(eTrainVcbList, Prefix + ".trn.src.vcb", "source");
+  dumpVocabList(fTrainVcbList, Prefix + ".trn.trg.vcb", "target");
+
+  // test vocabulary lists
+  dumpVocabList(eTestVcbList, Prefix + ".tst.src.vcb", "test source");
+  dumpVocabList(fTestVcbList, Prefix + ".tst.trg.vcb", "test target");
+
+  printDecoderConfigFile();
+  if (testCorpus)
+    printOverlapReport(m1.getTTable(), *testCorpus, eTrainVcbList,
+                       fTrainVcbList, eTestVcbList, fTestVcbList);
+}
+
+// Reads the links of one sentence of a reference alignment from `is` into `s`.
+//
+// Expected token stream: an optional "SENT:" marker, the sentence number, then
+// any number of "<type> <n1> <n2>" triples, up to the next "SENT:" marker or
+// the end of the input. <type> is expected to start with 'S' or 'P'
+// (checked with massert).
+//
+// number: if -1 it receives the sentence number that was read; otherwise the
+//         number read must equal it.
+// A link is stored under key (n1,n2) unless an entry other than 'P' is already
+// present, i.e. an existing 'P' entry may be overwritten, anything else is kept.
+//
+// Returns 0 if no token could be read or the sentence number does not match,
+// 1 otherwise. A triple whose two numbers cannot be read ends the sentence and
+// is ignored (it used to be inserted with whatever the failed extraction left
+// in n1/n2).
+bool readNextSent(istream&is,map< pair<int,int>,char >&s,int&number)
+{
+  string x;
+  if( !(is >> x) ) return 0;
+  if( x=="SENT:" ) is >> x;
+  int n=atoi(x.c_str());
+  if( number==-1 )
+    number=n;
+  else
+    if( number!=n )
+    {
+      cerr << "ERROR: readNextSent: DIFFERENT NUMBERS: " << number << " " << n << '\n';
+      return 0;
+    }
+  while( is >> x )
+  {
+    // the marker of the following sentence terminates this one
+    if( x=="SENT:" )
+      return 1;
+    int n1,n2;
+    if( !(is >> n1 >> n2) )
+      break; // truncated or malformed link: do not store it
+    map< pair<int,int>,char >::const_iterator i=s.find(pair<int,int>(n1,n2));
+    if( i==s.end()||i->second=='P' )
+      s[pair<int,int>(n1,n2)]=x[0];
+    massert(x[0]=='S'||x[0]=='P');
+  }
+  return 1;
+}
+
+// Removes all links from x. Always returns true so that the call can be used
+// as the first operand of a loop condition.
+bool emptySent(map< pair<int,int>,char >&x)
+{
+  x.clear();
+  return true;
+}
+
+// Loads a reference alignment file.
+// x: name of the file to read.
+// a: receives one link map per sentence; any previous content is discarded.
+// Sentences are expected to be numbered 0,1,2,... in file order; reading stops
+// at the first sentence readNextSent() rejects. Prints the number of sentences
+// read to stdout.
+void ReadAlignment(const string&x,Vector<map< pair<int,int>,char > >&a)
+{
+  ifstream infile(x.c_str());
+  a.clear();
+  map< pair<int,int>,char > links;
+  int expectedNo=0;
+  for(;;)
+  {
+    // start every sentence with an empty link map
+    emptySent(links);
+    if( !readNextSent(infile,links,expectedNo) )
+      break;
+    if( int(a.size())!=expectedNo )
+      cerr << "ERROR: ReadAlignment: " << a.size() << " " << expectedNo << '\n';
+    a.push_back(links);
+    ++expectedNo;
+  }
+  cout << "Read: " << a.size() << " sentences in reference alignment." << '\n';
+}
+
+
+// Sets the start-up defaults that main() establishes before parsing the
+// command line: file dumps enabled, maximum sentence length at its allowed
+// limit, output prefix taken from Get_File_Spec() and the log file named
+// "<Prefix>.log".
+void initGlobals(void)
+{
+  MAX_SENTENCE_LENGTH = MAX_SENTENCE_LENGTH_ALLOWED ;
+  NODUMPS = false ;
+
+  // LogFilename is derived from Prefix, so Prefix has to be set first.
+  Prefix = Get_File_Spec();
+  LogFilename= Prefix + ".log";
+}
+
+// Copies the links of a reference alignment into the alignment object x.
+// For each key (a,b) of `reference`, x.set(a+1,b+1) is called, i.e. the
+// 0-based pair is shifted to 1-based positions. Links with a+1 > x.get_m() or
+// b+1 > x.get_l() are reported on stderr and skipped; a position that already
+// holds a non-zero value is reported but overwritten anyway.
+// NOTE(review): ErrorsInAlignment() bounds the first key component by l and
+// the second by the test vector, whereas here the first is bounded by m --
+// confirm which orientation the reference file uses.
+void convert(const map< pair<int,int>,char >&reference,alignment&x)
+{
+  typedef map< pair<int,int>,char >::const_iterator LinkIter;
+  const int l=x.get_l();
+  const int m=x.get_m();
+  for(LinkIter link=reference.begin();link!=reference.end();++link)
+  {
+    const int first=link->first.first;
+    const int second=link->first.second;
+    if( first+1>m )
+    {
+      cerr << "ERROR m to big: " << first << " " << second+1 << " " << l << " " << m << " is wrong.\n";
+      continue;
+    }
+    if( second+1>l )
+    {
+      cerr << "ERROR l to big: " << first << " " << second+1 << " " << l << " " << m << " is wrong.\n";
+      continue;
+    }
+    if( x(first+1)!=0 )
+      cerr << "ERROR: position " << first+1 << " already set\n";
+    x.set(first+1,second+1);
+  }
+}
+// Compares a computed alignment against a reference alignment and updates the
+// running error statistics.
+//
+// reference: reference links keyed by (i-1, j-1) with value 'S' or 'P'.
+// test:      computed alignment; test[j] is the position aligned to j,
+//            0 meaning "not aligned". Index 0 is not inspected.
+// l:         upper bound for the first component (+1) of a reference link.
+// missing, toomuch, eventsMissing, eventsToomuch: accumulators that are only
+//            ever incremented here, so callers can sum over many sentences.
+// pair_no:   sentence number, used in diagnostics only.
+//
+// Returns (toomuch+missing)/(eventsToomuch+eventsMissing) computed from the
+// accumulators after this sentence, or 1.0 if no event has been counted.
+// The quotient is now computed in floating point; it used to be an integer
+// division whose result was truncated before conversion to double.
+double ErrorsInAlignment(const map< pair<int,int>,char >&reference,const Vector<WordIndex>&test,int l,int&missing,int&toomuch,int&eventsMissing,int&eventsToomuch,int pair_no)
+{
+  int err=0;
+  // Pass 1: every link proposed by `test` that is absent from the reference
+  // counts as "too much".
+  for(unsigned int j=1;j<test.size();j++)
+  {
+    if( test[j]>0 )
+    {
+      map< pair<int,int>,char >::const_iterator i=reference.find(make_pair(test[j]-1,j-1));
+      if( i==reference.end() )
+      {
+        toomuch++;
+        err++;
+      }
+      else
+      {
+        if( !(i->second=='S' || i->second=='P'))
+          cerr << "ERROR: wrong symbol in reference alignment '" << i->second << ' ' << int(i->second) << " no:" << pair_no<< "'\n";
+      }
+      // counted for every proposed link, whether or not it was correct
+      eventsToomuch++;
+    }
+  }
+  // Pass 2: every 'S' link of the reference that `test` does not reproduce
+  // counts as "missing".
+  for(map< pair<int,int>,char >::const_iterator i=reference.begin();i!=reference.end();++i)
+  {
+    if( i->second=='S' )
+    {
+      unsigned int J=i->first.second+1;
+      unsigned int I=i->first.first+1;
+      if( int(J)>=int(test.size())||int(I)>int(l)||int(J)<1||int(I)<1 )
+        cerr << "ERROR: alignment outside of range in reference alignment" << J << " " << test.size() << " (" << I << " " << l << ") no:" << pair_no << '\n';
+      else
+      {
+        if(test[J]!=I)
+        {
+          missing++;
+          err++;
+        }
+      }
+      // counted even when the link was out of range
+      eventsMissing++;
+    }
+  }
+  if( Verbose )
+    cout << err << " errors in sentence\n";
+  const int events=eventsToomuch+eventsMissing;
+  if( events )
+    return double(toomuch+missing)/events;
+  else
+    return 1.0;
+}
+
+
+// Set by StartTraining() to the addresses of its local source and target
+// training vocabulary lists.
+// NOTE(review): those are local objects, so the pointers are only meaningful
+// while StartTraining() is running.
+vcbList *globeTrainVcbList,*globfTrainVcbList;
+
+// Runs the complete training (or scoring) pipeline.
+//
+// Steps: write the parameter set to "<Prefix>.gizacfg", read the vocabulary
+// files, open the training corpus (and the test corpus if one is named),
+// compute LAMBDA, construct the model objects, and then either
+//   (a) if ReadTablePrefix is non-empty: load previously written tables and
+//       print IBM-3 and IBM-4 probabilities for every corpus sentence, or
+//   (b) otherwise: run Model 1, Model 2, HMM, the Model 2->3 transfer and the
+//       Model 3-6 training as selected by the iteration parameters, and
+//       finally write the result files.
+//
+// result: receives the value returned by the last em_with_tricks()/viterbi()
+//         call that ran (0 if none ran).
+// Returns the errorsAL() value of the last model that ran (0.0 if none ran).
+double StartTraining(int&result)
+{
+ double errors=0.0;
+ vcbList eTrainVcbList, fTrainVcbList;
+ // publish the local vocabulary lists through the global pointers
+ globeTrainVcbList=&eTrainVcbList;
+ globfTrainVcbList=&fTrainVcbList;
+
+
+ // record the effective configuration of this run
+ string repFilename = Prefix + ".gizacfg" ;
+ ofstream of2(repFilename.c_str());
+ writeParameters(of2,getGlobalParSet(),-1) ;
+
+ cout << "reading vocabulary files \n";
+ eTrainVcbList.setName(SourceVocabFilename.c_str());
+ fTrainVcbList.setName(TargetVocabFilename.c_str());
+ eTrainVcbList.readVocabList();
+ fTrainVcbList.readVocabList();
+ cout << "Source vocabulary list has " << eTrainVcbList.uniqTokens() << " unique tokens \n";
+ cout << "Target vocabulary list has " << fTrainVcbList.uniqTokens() << " unique tokens \n";
+
+ // the test vocabulary lists start out as copies of the training lists
+ vcbList eTestVcbList(eTrainVcbList) ;
+ vcbList fTestVcbList(fTrainVcbList) ;
+
+ // NOTE(review): corpus (and testCorpus below) are allocated with new and not
+ // deleted anywhere in this function.
+ corpus = new sentenceHandler(CorpusFilename.c_str(), &eTrainVcbList, &fTrainVcbList);
+
+ // "NONE" is accepted as a way of saying "no test corpus"
+ if (TestCorpusFilename == "NONE")
+ TestCorpusFilename = "";
+
+ if (TestCorpusFilename != ""){
+ cout << "Test corpus will be read from: " << TestCorpusFilename << '\n';
+ testCorpus= new sentenceHandler(TestCorpusFilename.c_str(),
+ &eTestVcbList, &fTestVcbList);
+ cout << " Test total # sentence pairs : " <<(*testCorpus).getTotalNoPairs1()<<" weighted:"<<(*testCorpus).getTotalNoPairs2() <<'\n';
+
+ cout << "Size of the source portion of test corpus: " << eTestVcbList.totalVocab() << " tokens\n";
+ cout << "Size of the target portion of test corpus: " << fTestVcbList.totalVocab() << " tokens \n";
+ cout << "In source portion of the test corpus, only " << eTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
+ cout << "In target portion of the test corpus, only " << fTestVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
+ cout << "ratio (target/source) : " << double(fTestVcbList.totalVocab()) /
+ eTestVcbList.totalVocab() << '\n';
+ }
+
+ cout << " Train total # sentence pairs (weighted): " << corpus->getTotalNoPairs2() << '\n';
+ cout << "Size of source portion of the training corpus: " << eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2() << " tokens\n";
+ cout << "Size of the target portion of the training corpus: " << fTrainVcbList.totalVocab() << " tokens \n";
+ cout << "In source portion of the training corpus, only " << eTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
+ cout << "In target portion of the training corpus, only " << fTrainVcbList.uniqTokensInCorpus() << " unique tokens appeared\n";
+ // LAMBDA = target tokens / (source tokens - number of sentence pairs)
+ cout << "lambda for PP calculation in IBM-1,IBM-2,HMM:= " << double(fTrainVcbList.totalVocab()) << "/(" << eTrainVcbList.totalVocab() << "-" << corpus->getTotalNoPairs2() << ")=";
+ LAMBDA = double(fTrainVcbList.totalVocab()) / (eTrainVcbList.totalVocab()-corpus->getTotalNoPairs2());
+ cout << "= " << LAMBDA << '\n';
+ // load dictionary
+ // NOTE(review): the Dictionary is allocated with new and not deleted in this
+ // function.
+ Dictionary *dictionary;
+ if (useDict) dictionary = new Dictionary(dictionary_Filename.c_str());
+ else dictionary = new Dictionary("");
+ int minIter=0;
+#ifdef BINARY_SEARCH_FOR_TTABLE
+ // this build variant cannot construct the t-table without a cooccurrence file
+ if( CoocurrenceFile.length()==0 )
+ {
+ cerr << "ERROR: NO COOCURRENCE FILE GIVEN!\n";
+ abort();
+ }
+ //ifstream coocs(CoocurrenceFile.c_str());
+ tmodel<COUNT, PROB> tTable(CoocurrenceFile);
+#else
+ tmodel<COUNT, PROB> tTable;
+#endif
+
+ // Model objects are chained: m2 is built from m1, the HMM and m3 from m2.
+ model1 m1(CorpusFilename.c_str(), eTrainVcbList, fTrainVcbList,tTable,trainPerp,
+ *corpus,&testPerp, testCorpus,
+ trainViterbiPerp, &testViterbiPerp);
+ amodel<PROB> aTable(false);
+ amodel<COUNT> aCountTable(false);
+ model2 m2(m1,aTable,aCountTable);
+ hmm h(m2);
+ model3 m3(m2);
+ if(ReadTablePrefix.length() )
+ {
+ // --- scoring mode: load "<ReadTablePrefix>.*.final" tables, no training ---
+ string number = "final";
+ // NOTE(review): "afilennfile" is never used; it looks like a missing comma
+ // between "afile" and "nfile", both of which are declared again at the end.
+ string tfile,afilennfile,dfile,d4file,p0file,afile,nfile; //d5file
+ tfile = ReadTablePrefix + ".t3." + number ;
+ afile = ReadTablePrefix + ".a3." + number ;
+ nfile = ReadTablePrefix + ".n3." + number ;
+ dfile = ReadTablePrefix + ".d3." + number ;
+ d4file = ReadTablePrefix + ".d4." + number ;
+ //d5file = ReadTablePrefix + ".d5." + number ;
+ p0file = ReadTablePrefix + ".p0_3." + number ;
+ tTable.readProbTable(tfile.c_str());
+ aTable.readTable(afile.c_str());
+ m3.dTable.readTable(dfile.c_str());
+ m3.nTable.readNTable(nfile.c_str());
+ sentPair sent ;
+ // NOTE(review): the result of reading p0 is not checked.
+ double p0;
+ ifstream p0f(p0file.c_str());
+ p0f >> p0;
+ d4model d4m(MAX_SENTENCE_LENGTH);
+ d4m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
+ d4m.readProbTable(d4file.c_str());
+ //d5model d5m(d4m);
+ //d5m.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
+ //d5m.readProbTable(d5file.c_str());
+ makeSetCommand("model4smoothfactor","0.0",getGlobalParSet(),2);
+ //makeSetCommand("model5smoothfactor","0.0",getGlobalParSet(),2);
+ if( corpus||testCorpus )
+ {
+ // prefer the training corpus, fall back to the test corpus
+ sentenceHandler *x=corpus;
+ if(x==0)
+ x=testCorpus;
+ cout << "Text corpus exists.\n";
+ x->rewind();
+ while(x&&x->getNextSentence(sent)){
+ Vector<WordIndex>& es = sent.eSent;
+ Vector<WordIndex>& fs = sent.fSent;
+ int l=es.size()-1;
+ int m=fs.size()-1;
+ transpair_model4 tm4(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d4m);
+ // The alignment is used exactly as constructed: the call that would fill
+ // it from ReferenceAlignment is commented out below.
+ alignment al(l,m);
+ cout << "I use the alignment " << sent.sentenceNo-1 << '\n';
+ //convert(ReferenceAlignment[sent.sentenceNo-1],al);
+ transpair_model3 tm3(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,0);
+ double p=tm3.prob_of_target_and_alignment_given_source(al,1);
+ cout << "Sentence " << sent.sentenceNo << " has IBM-3 prob " << p << '\n';
+ p=tm4.prob_of_target_and_alignment_given_source(al,3,1);
+ cout << "Sentence " << sent.sentenceNo << " has IBM-4 prob " << p << '\n';
+ //transpair_model5 tm5(es,fs,m1.tTable,m2.aTable,m3.dTable,m3.nTable,1-p0,p0,&d5m);
+ //p=tm5.prob_of_target_and_alignment_given_source(al,3,1);
+ //cout << "Sentence " << sent.sentenceNo << " has IBM-5 prob " << p << '\n';
+ }
+ }
+ else
+ {
+ cout << "No corpus exists.\n";
+ }
+ }
+ else
+ {
+ // --- training mode ---
+ // initialize model1
+ bool seedModel1 = false ;
+ if(Model1_Iterations > 0){
+ // optionally seed Model 1 from an existing t-table file
+ if (t_Filename != "NONE" && t_Filename != ""){
+ seedModel1 = true ;
+ m1.load_table(t_Filename.c_str());
+ }
+ minIter=m1.em_with_tricks(Model1_Iterations,seedModel1,*dictionary, useDict);
+ errors=m1.errorsAL();
+ }
+
+ {
+ if(Model2_Iterations > 0){
+ m2.initialize_table_uniformly(*corpus);
+ minIter=m2.em_with_tricks(Model2_Iterations);
+ errors=m2.errorsAL();
+ }
+ if(HMM_Iterations > 0){
+ cout << "NOTE: I am doing iterations with the HMM model!\n";
+ h.makeWordClasses(m1.Elist,m1.Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
+ h.initialize_table_uniformly(*corpus);
+ minIter=h.em_with_tricks(HMM_Iterations);
+ errors=h.errorsAL();
+ }
+
+ // The Model 2 -> Model 3 transfer runs when explicitly requested or when
+ // no HMM iterations were done.
+ if(Transfer2to3||HMM_Iterations==0){
+ if( HMM_Iterations>0 )
+ cout << "WARNING: transfor is not needed, as results are overwritten bei transfer from HMM.\n";
+ string test_alignfile = Prefix +".tst.A2to3";
+ if (testCorpus)
+ m2.em_loop(testPerp, *testCorpus,Transfer_Dump_Freq==1&&!NODUMPS,test_alignfile.c_str(), testViterbiPerp, true);
+ if (testCorpus)
+ cout << "\nTransfer: TEST CROSS-ENTROPY " << testPerp.cross_entropy() << " PERPLEXITY " << testPerp.perplexity() << "\n\n";
+ if (Transfer == TRANSFER_SIMPLE)
+ m3.transferSimple(*corpus, Transfer_Dump_Freq==1&&!NODUMPS,trainPerp, trainViterbiPerp);
+ else
+ m3.transfer(*corpus, Transfer_Dump_Freq==1&&!NODUMPS, trainPerp, trainViterbiPerp);
+ errors=m3.errorsAL();
+ }
+
+ // hand the trained HMM to Model 3 when HMM iterations were run
+ if( HMM_Iterations>0 )
+ m3.setHMM(&h);
+ if(Model3_Iterations > 0 || Model4_Iterations > 0 || Model5_Iterations || Model6_Iterations
+ )
+ {
+ minIter=m3.viterbi(Model3_Iterations,Model4_Iterations,Model5_Iterations,Model6_Iterations);
+ errors=m3.errorsAL();
+ }
+ // result files are skipped only when NODUMPS is set and FEWDUMPS is not
+ if (FEWDUMPS||!NODUMPS)
+ {
+ printAllTables(eTrainVcbList,eTestVcbList,fTrainVcbList,fTestVcbList,m1 );
+ }
+ }
+ }
+ result=minIter;
+ return errors;
+}
+
+// Registers a string-valued parameter called `name` in the global parameter
+// set and binds it to `value`.
+static void addStringParameter(const char*name,const char*description,string&value,int level)
+{
+  getGlobalParSet().insert(new Parameter<string>(name,ParameterChangedFlag,description,value,level));
+}
+
+// Program entry point: registers the file-name parameters, parses the command
+// line, prints the effective parameters, runs StartTraining() and reports the
+// elapsed time. Exits with status 1 (after printing the help text) when no
+// argument is given; otherwise returns 0.
+int main(int argc, char* argv[])
+{
+#ifdef BINARY_SEARCH_FOR_TTABLE
+  addStringParameter("CoocurrenceFile","",CoocurrenceFile,PARLEV_SPECIAL);
+#endif
+  addStringParameter("ReadTablePrefix","optimized",ReadTablePrefix,-1);
+  // Each file name is reachable under a short name and a long name.
+  addStringParameter("S","source vocabulary file name",SourceVocabFilename,PARLEV_INPUT);
+  addStringParameter("SOURCE VOCABULARY FILE","source vocabulary file name",SourceVocabFilename,-1);
+  addStringParameter("T","target vocabulary file name",TargetVocabFilename,PARLEV_INPUT);
+  addStringParameter("TARGET VOCABULARY FILE","target vocabulary file name",TargetVocabFilename,-1);
+  addStringParameter("C","training corpus file name",CorpusFilename,PARLEV_INPUT);
+  addStringParameter("CORPUS FILE","training corpus file name",CorpusFilename,-1);
+  addStringParameter("TC","test corpus file name",TestCorpusFilename,PARLEV_INPUT);
+  addStringParameter("TEST CORPUS FILE","test corpus file name",TestCorpusFilename,-1);
+  addStringParameter("d","dictionary file name",dictionary_Filename,PARLEV_INPUT);
+  addStringParameter("DICTIONARY","dictionary file name",dictionary_Filename,-1);
+  addStringParameter("l","log file name",LogFilename,PARLEV_OUTPUT);
+  addStringParameter("LOG FILE","log file name",LogFilename,-1);
+
+  addStringParameter("o","output file prefix",Prefix,PARLEV_OUTPUT);
+  addStringParameter("OUTPUT FILE PREFIX","output file prefix",Prefix,-1);
+  addStringParameter("OUTPUT PATH","output path",OPath,PARLEV_OUTPUT);
+
+  const time_t startTime = time(NULL); // starting time
+
+  Usage = string(argv[0]) + " <config_file> [options]\n";
+  if(argc < 2)
+  {
+    printHelp();
+    exit(1);
+  }
+
+  // defaults first, then whatever the command line overrides
+  initGlobals() ;
+  parseArguments(argc, argv);
+
+  if (Log)
+    logmsg.open(LogFilename.c_str(), ios::out);
+
+  printGIZAPars(cout);
+  if( OldADBACKOFF!=0 )
+    cerr << "WARNING: Parameter -adBackOff does not exist further; use CompactADTable instead.\n";
+  // only reported; training is started regardless
+  if( MAX_SENTENCE_LENGTH > MAX_SENTENCE_LENGTH_ALLOWED )
+    cerr << "ERROR: MAX_SENTENCE_LENGTH is too big " << MAX_SENTENCE_LENGTH << " > " << MAX_SENTENCE_LENGTH_ALLOWED << '\n';
+
+  // Neither the returned error figure nor the iteration result is used here.
+  int minIterations=-1;
+  StartTraining(minIterations);
+
+  time_t finishTime = time(NULL); // finish time
+  cout << '\n' << "Entire Training took: " << difftime(finishTime, startTime) << " seconds\n";
+  cout << "Program Finished at: "<< ctime(&finishTime) << '\n';
+  cout << "==========================================================\n";
+  return 0;
+}
+
diff --git a/GIZA++-v2/model1.cpp b/GIZA++-v2/model1.cpp
new file mode 100644
index 0000000..b1b6d92
--- /dev/null
+++ b/GIZA++-v2/model1.cpp
@@ -0,0 +1,283 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "model1.h"
+#include "Globals.h"
+#include "utility.h"
+#include "Parameter.h"
+
+extern short NoEmptyWord;
+extern int VerboseSentence;
+
+GLOBAL_PARAMETER2(int,Model1_Dump_Freq,"MODEL 1 DUMP FREQUENCY","t1","dump frequency of Model 1",PARLEV_OUTPUT,0); // model1::em_with_tricks dumps on iterations divisible by this value; 0 disables the dumps
+int NumberOfVALIalignments=100; // model1::addAL feeds pairs numbered <= this value to the VALI counters and the others to the TEST counters
+
+// Builds a Model 1 trainer over the given vocabularies and shared t-table.
+// References and pointers are stored as given, not copied; _testPerp and
+// _testHandler may be null (em_with_tricks checks them before use).
+model1::model1(const char* efname, vcbList& evcblist, vcbList& fvcblist, tmodel<COUNT, PROB>& _tTable,
+               Perplexity& _perp, sentenceHandler& _sHandler1, Perplexity* _testPerp, sentenceHandler* _testHandler,
+               Perplexity& _trainViterbiPerp, Perplexity* _testViterbiPerp)
+  : report_info(_perp, _sHandler1, _testPerp, _testHandler, _trainViterbiPerp, _testViterbiPerp),
+    efFilename(efname), Elist(evcblist), Flist(fvcblist),
+    eTotalWCount(Elist.totalVocab()), fTotalWCount(Flist.totalVocab()),
+    noEnglishWords(Elist.size()), noFrenchWords(Flist.size()), tTable(_tTable),
+    evlist(Elist.getVocabList()), fvlist(Flist.getVocabList())
+{ initAL(); } // zero the AL* counters so copies of this object and early reports never read indeterminate ints
+
+// Seeds tTable: for every sentence pair, each (English word, French word) pair
+// is inserted with count 0 and probability 1/|es|. French position 0 is skipped.
+void model1::initialize_table_uniformly(sentenceHandler& sHandler1){
+  cout << "Initialize tTable\n";
+
+  sentPair current;
+  for(sHandler1.rewind(); sHandler1.getNextSentence(current); ){
+    Vector<WordIndex>& eWords = current.eSent;
+    Vector<WordIndex>& fWords = current.fSent;
+    PROB share = 1.0/eWords.size();
+    for(WordIndex ePos = 0; ePos < eWords.size(); ePos++){
+      for(WordIndex fPos = 1; fPos < fWords.size(); fPos++)
+        tTable.insert(eWords[ePos], fWords[fPos], 0, share);
+    }
+  }
+}
+
+
+// Runs `noIterations` EM passes of Model 1. Each pass resets the AL counters,
+// trains on sHandler1 (and scores the test corpus when all three test handles
+// are non-null), renormalises tTable and prints cross-entropy/perplexity.
+// seedModel1, dictionary and useDict are forwarded unchanged to em_loop().
+// Returns the first iteration with the lowest errorsAL(), or 0 if no iteration
+// scored below the initial 1.0.
+int model1::em_with_tricks(int noIterations, bool seedModel1, Dictionary& dictionary, bool useDict)
+{
+  double minErrors = 1.0;
+  int minIter = 0;
+  const string modelName = "Model1", shortModelName = "1";
+  // Fix: the test pass dereferences testViterbiPerp, so require it to be non-null as well.
+  const bool haveTest = testPerp && testHandler && testViterbiPerp;
+  const time_t st = time(NULL);
+  sHandler1.rewind();
+  cout << "==========================================================\n";
+  cout << modelName << " Training Started at: "<< ctime(&st) << "\n";
+  for(int it = 1; it <= noIterations; it++){
+    const time_t it_st = time(NULL);
+    cout << "-----------\n" << modelName << ": Iteration " << it << '\n';
+    const bool dump_files = (Model1_Dump_Freq != 0) && ((it % Model1_Dump_Freq) == 0) && !NODUMPS ;
+    // Decimal rendering of `it`, used as the suffix of this iteration's file names.
+    string number;
+    int n = it;
+    do{
+      number.insert((size_t)0, 1, (char)(n % 10 + '0'));
+    } while((n /= 10) > 0);
+    const string tfile = Prefix + ".t" + shortModelName + "." + number ;
+    const string alignfile = Prefix + ".A" + shortModelName + "." + number ;
+    const string test_alignfile = Prefix +".tst.A" + shortModelName + "." + number ;
+    initAL();
+    em_loop(it,perp, sHandler1, seedModel1, dump_files, alignfile.c_str(), dictionary, useDict, trainViterbiPerp);
+    if (haveTest) // calculate test perplexity
+      em_loop(it,*testPerp, *testHandler, seedModel1, dump_files, test_alignfile.c_str(), dictionary, useDict, *testViterbiPerp, true);
+    // Sampled after both passes, so the test pass (which also calls addAL) contributes.
+    const double errs = errorsAL();
+    if( errs < minErrors ){
+      minErrors = errs;
+      minIter = it;
+    }
+    // Aachen format: the count table is written before normalizeTable();
+    // otherwise the probability table is written after it (below).
+    if( dump_files && OutputInAachenFormat==1 )
+      tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
+    tTable.normalizeTable(Elist, Flist);
+    cout << modelName << ": ("<<it<<") TRAIN CROSS-ENTROPY " << perp.cross_entropy()
+         << " PERPLEXITY " << perp.perplexity() << '\n';
+    if (haveTest)
+      cout << modelName << ": ("<<it<<") TEST CROSS-ENTROPY " << testPerp->cross_entropy()
+           << " PERPLEXITY " << testPerp->perplexity() << '\n';
+    cout << modelName << ": ("<<it<<") VITERBI TRAIN CROSS-ENTROPY " << trainViterbiPerp.cross_entropy()
+         << " PERPLEXITY " << trainViterbiPerp.perplexity() << '\n';
+    if (haveTest)
+      cout << modelName << ": ("<<it<<") VITERBI TEST CROSS-ENTROPY " << testViterbiPerp->cross_entropy()
+           << " PERPLEXITY " << testViterbiPerp->perplexity() << '\n';
+    if( dump_files && OutputInAachenFormat==0 )
+      tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
+    const time_t it_fn = time(NULL);
+    cout << "Model 1 Iteration: " << it<< " took: " << difftime(it_fn, it_st) << " seconds\n";
+  }
+  const time_t fn = time(NULL) ;
+  cout << "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n";
+  return minIter;
+}
+
+// Loads a previously trained t table from the file `tname` into tTable.
+// Use this to reuse the results of an earlier t training run without
+// doing any new training. A progress line is written to cout first.
+// (NAS, 7/11/99)
+void model1::load_table(const char* tname)
+{
+  cout << "Model1: loading t table \n" ;
+  tTable.readProbTable(tname);
+}
+
+
+extern float MINCOUNTINCREASE;
+void model1::em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, // one EM pass of Model 1 (iteration `it`) over every pair in sHandler1
+ bool dump_alignment, const char* alignfile, Dictionary& dict, bool useDict, Perplexity& viterbi_perp, bool test) // test==true: score only, no counts are collected
+{
+ WordIndex i, j, l, m ; // i / j: position in es / fs; l / m: last index of es / fs
+ double cross_entropy;
+ int pair_no=0 ; // number of pairs processed so far in this pass; only compared with VerboseSentence
+ perp.clear();
+ viterbi_perp.clear();
+ ofstream of2;
+ // open the alignment dump file when this pass dumps alignments or FEWDUMPS is set
+ if (dump_alignment||FEWDUMPS)
+ of2.open(alignfile);
+ PROB uniform = 1.0/noFrenchWords ; // t-value used for every word pair on an unseeded first iteration
+ sentPair sent ;
+ sHandler1.rewind();
+ while(sHandler1.getNextSentence(sent)){ // for each sentence pair in the corpus
+ Vector<WordIndex>& es = sent.eSent;
+ Vector<WordIndex>& fs = sent.fSent;
+ const float so = sent.getCount(); // weight applied to every count collected from this pair
+ l = es.size() - 1;
+ m = fs.size() - 1;
+ cross_entropy = log(1.0); // running sum of log(denom) over the French positions
+ Vector<WordIndex> viterbi_alignment(fs.size());
+ double viterbi_score = 1 ; // running product of the best per-position scores (not divided by denom)
+ // dictionary flags; runtime-sized arrays (a compiler extension in C++), written and read only when it == 1 && useDict
+ bool eindict[l + 1]; // eindict[i]: es[i] is in the dictionary with some fs[j] of this pair
+ bool findict[m + 1]; // findict[j]: fs[j] is in the dictionary with some es[i] of this pair
+ bool indict[m + 1][l + 1]; // indict[j][i]: the pair (fs[j], es[i]) is in the dictionary
+ if(it == 1 && useDict){
+ for(unsigned int dummy = 0; dummy <= l; dummy++) eindict[dummy] = false;
+ for(unsigned int dummy = 0; dummy <= m; dummy++){
+ findict[dummy] = false;
+ for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++)
+ indict[dummy][dummy2] = false;
+ }
+ for(j = 0; j <= m; j++)
+ for(i = 0; i <= l; i++)
+ if(dict.indict(fs[j], es[i])){
+ eindict[i] = findict[j] = indict[j][i] = true;
+ }
+ }
+ // main loop over the French positions 1..m (position 0 is not scored)
+ for(j=1; j <= m; j++){
+ // sPtrCache[i] holds the t-table entry for (es[i], fs[j]); it stays 0 when no lookup is done or getPtr returns 0
+ Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0); // cache pointers to table
+ LpPair<COUNT,PROB> **sPtrCachePtr;
+ // scoring step: denom sums the (floored) t-values over all English positions
+ PROB denom = 0.0;
+ WordIndex best_i = 0 ; // i for which fj is best mapped to ei
+ PROB word_best_score = 0 ; // score for the best mapping of fj
+ if (it == 1 && !seedModel1){ // unseeded first pass: no table lookups, best_i stays 0
+ denom = uniform * es.size() ;
+ word_best_score = uniform ;
+ }
+ else
+ for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
+ PROB e(0.0) ;
+ (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]) ;
+ if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) // stored values are floored at PROB_SMOOTH
+ e = (*((*sPtrCachePtr))).prob;
+ else e = PROB_SMOOTH ;
+ denom += e ;
+ if (e > word_best_score){
+ word_best_score = e ;
+ best_i = i ;
+ } }
+ viterbi_alignment[j] = best_i ;
+ viterbi_score *= word_best_score ; /// denom ;
+ if (denom == 0){
+ if (test)
+ cerr << "WARNING: denom is zero (TEST)\n";
+ else
+ cerr << "WARNING: denom is zero (TRAIN)\n";
+ }
+ cross_entropy += log(denom) ; // executed even when denom == 0 (only a warning is printed above)
+ if (!test){
+ if(denom > 0){
+ COUNT val = COUNT(so) / (COUNT) double(denom) ; // count contributed per unit of t-value: so / denom
+ /* this if loop implements a constraint on counting:
+ count(es[i], fs[j]) is implemented if and only if
+ es[i] and fs[j] occur together in the dictionary,
+ OR
+ es[i] does not occur in the dictionary with any fs[x] and
+ fs[j] does not occur in the dictionary with any es[y]
+ */
+ if(it == 1 && useDict){ // note: this branch checks neither NoEmptyWord nor VerboseSentence
+ for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
+ if(indict[j][i] || (!findict[j] && !eindict[i])){
+ PROB e(0.0) ;
+ if (it == 1 && !seedModel1)
+ e = uniform ;
+ else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
+ e = (*((*sPtrCachePtr))).prob;
+ else e = PROB_SMOOTH ;
+ COUNT x=e*val;
+ if( it==1||x>MINCOUNTINCREASE ) // the if/else below is nested inside this condition
+ if ((*sPtrCachePtr) != 0)
+ (*((*sPtrCachePtr))).count += x;
+ else
+ tTable.incCount(es[i], fs[j], x);
+ } /* end of if */
+ } /* end of for i */
+ } /* end of it == 1 */
+ // Old code:
+ else{
+ for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){
+ //for(i=0; i <= l; i++) {
+ PROB e(0.0) ;
+ if (it == 1 && !seedModel1)
+ e = uniform ;
+ else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH)
+ e = (*((*sPtrCachePtr))).prob;
+ else e = PROB_SMOOTH ;
+ //if( !(i==0) )
+ //cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl;
+ COUNT x=e*val;
+ if( pair_no==VerboseSentence ) // trace the counts of the one pair selected by VerboseSentence
+ cout << i << "(" << evlist[es[i]].word << ")," << j << "(" << fvlist[fs[j]].word << ")=" << x << endl;
+ if( it==1||x>MINCOUNTINCREASE ) // after iteration 1, increments not above MINCOUNTINCREASE are dropped
+ if( NoEmptyWord==0 || i!=0 ) // with NoEmptyWord set, English position 0 collects no counts
+ if ((*sPtrCachePtr) != 0)
+ (*((*sPtrCachePtr))).count += x;
+ else
+ tTable.incCount(es[i], fs[j], x);
+ } /* end of for i */
+ } // end of else
+ } // end of if (denom > 0)
+ }// if (!test)
+ } // end of for (j) ;
+ sHandler1.setProbOfSentence(sent,cross_entropy);
+ //cerr << sent << "CE: " << cross_entropy << " " << so << endl;
+ perp.addFactor(cross_entropy-m*log(l+1.0), so, l, m,1); // m*log(l+1.0) is subtracted from the log score here and on the next line
+ viterbi_perp.addFactor(log(viterbi_score)-m*log(l+1.0), so, l, m,1);
+ if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000)) // FEWDUMPS alone dumps only pairs with sentenceNo < 1000
+ printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
+ addAL(viterbi_alignment,sent.sentenceNo,l);
+ pair_no++;
+ } /* of while */
+ sHandler1.rewind();
+ perp.record("Model1");
+ viterbi_perp.record("Model1");
+ errorReportAL(cout, "IBM-1");
+}
diff --git a/GIZA++-v2/model1.h b/GIZA++-v2/model1.h
new file mode 100644
index 0000000..ef16a29
--- /dev/null
+++ b/GIZA++-v2/model1.h
@@ -0,0 +1,164 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _model1_h
+#define _model1_h 1
+
+#include <assert.h>
+
+#include <iostream>
+#include <strstream>
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <set>
+#include <utility>
+
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+#else
+#include <hash_map>
+#endif
+#include <time.h>
+#include <fstream.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "Vector.h"
+#include "vocab.h"
+#include "TTables.h"
+#include "getSentence.h"
+#include "Perplexity.h"
+#include "vocab.h"
+#include "Dictionary.h"
+
+extern int NumberOfVALIalignments;
+
+// Bundle of perplexity trackers and corpus handlers shared by the model
+// classes. Only references and pointers are held; the test-side members are pointers.
+class report_info{
+ protected:
+  Perplexity& perp;
+  sentenceHandler& sHandler1;
+  Perplexity* testPerp;
+  sentenceHandler* testHandler;
+  Perplexity& trainViterbiPerp;
+  Perplexity* testViterbiPerp;
+  // Protected constructor: stores each argument in the member of the same name.
+  report_info(Perplexity& _perp, sentenceHandler& _sHandler1,
+              Perplexity* _testPerp, sentenceHandler* _testHandler,
+              Perplexity& _trainViterbiPerp, Perplexity* _testViterbiPerp)
+    : perp(_perp), sHandler1(_sHandler1), testPerp(_testPerp), testHandler(_testHandler),
+      trainViterbiPerp(_trainViterbiPerp), testViterbiPerp(_testViterbiPerp) {}
+};
+
+// IBM Model 1 trainer. Keeps references to the two vocabularies and the shared t-table, and
+// accumulates alignment-error (AL*) counters in three groups: overall, VALI and TEST (see addAL).
+class model1 : public report_info{
+public:
+  string efFilename;
+  vcbList& Elist ;
+  vcbList& Flist ;
+  double eTotalWCount ; // size of source corpus in number of words
+  double fTotalWCount ; // size of target corpus in number of words
+  int noEnglishWords;
+  int noFrenchWords;
+  tmodel<COUNT, PROB>&tTable;
+  Vector<WordEntry>& evlist ;
+  Vector<WordEntry>& fvlist ;
+public:
+  // *missing / *toomuch are numerators, *eventsMissing / *eventsToomuch denominators of the rates below.
+  int ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch;
+  int ALmissingVALI,ALtoomuchVALI,ALeventsMissingVALI,ALeventsToomuchVALI;
+  int ALmissingTEST,ALtoomuchTEST,ALeventsMissingTEST,ALeventsToomuchTEST;
+  model1 (const char* efname, vcbList& evcblist, vcbList& fvcblist,tmodel<COUNT, PROB>&_tTable,Perplexity& _perp,
+          sentenceHandler& _sHandler1, Perplexity* _testPerp, sentenceHandler* _testHandler,
+          Perplexity& _trainViterbiPerp, Perplexity* _testViterbiPerp);
+  void initialize_table_uniformly(sentenceHandler& sHandler1);
+  int em_with_tricks(int noIterations,
+                     bool seedModel1, Dictionary& dictionary, bool useDict);
+  void load_table(const char* tname);
+  void readVocabFile(const char* fname, Vector<WordEntry>& vlist, int& vsize, int& total);
+  inline const Vector<WordEntry>& getEnglishVocabList(void)const {return Elist.getVocabList();}
+  inline const Vector<WordEntry>& getFrenchVocabList(void)const {return Flist.getVocabList();}
+  inline const double getETotalWCount(void) const {return eTotalWCount;}
+  inline const double getFTotalWCount(void) const {return fTotalWCount;}
+  inline const int getNoEnglishWords(void) const {return noEnglishWords;}
+  inline const int getNoFrenchWords(void) const {return noFrenchWords;}
+  inline tmodel<COUNT, PROB>& getTTable(void) {return tTable;}
+  inline string& getEFFilename(void) {return efFilename;}
+ private:
+  void em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, bool , const char*, Dictionary& dictionary, bool useDict,
+               Perplexity& viterbiperp, bool=false);
+  // Prints the report line of one counter group; prints nothing when the group has no events.
+  static void reportALGroup(ostream& out, const char* label, const string& m,
+                            int missing, int toomuch, int eventsMissing, int eventsToomuch)
+  {
+    if( eventsMissing+eventsToomuch )
+      out << label << m << "): "
+          << 100.0*(missing+toomuch)/double(eventsMissing+eventsToomuch)
+          << " recall: " << 100.0*(1.0-missing/double(eventsMissing))
+          << " precision: " << 100.0*(1.0-toomuch/double(eventsToomuch))
+          << " (missing:" << missing << "/" << eventsMissing << " " << toomuch
+          << " " << eventsToomuch << ")\n";
+  }
+  friend class model2;
+  friend class hmm;
+ public:
+  // Scores viterbi_alignment against ReferenceAlignment[pair_no-1] and adds the result to the
+  // overall counters plus either the VALI group (pair_no <= NumberOfVALIalignments) or the
+  // TEST group. Pairs that have no reference alignment are ignored.
+  void addAL(const Vector<WordIndex>& viterbi_alignment,int pair_no,int l)
+  {
+    // Fix: also reject pair_no < 1, which previously indexed ReferenceAlignment at -1 or below.
+    if( pair_no<1 || pair_no>int(ReferenceAlignment.size()) )
+      return;
+    ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch,pair_no);
+    if( pair_no<=NumberOfVALIalignments )
+      ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissingVALI,ALtoomuchVALI,ALeventsMissingVALI,ALeventsToomuchVALI,pair_no);
+    else
+      ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissingTEST,ALtoomuchTEST,ALeventsMissingTEST,ALeventsToomuchTEST,pair_no);
+  }
+  // Resets all twelve AL* counters to zero.
+  void initAL()
+  {
+    ALmissing=ALtoomuch=ALeventsMissing=ALeventsToomuch=0;
+    ALmissingVALI=ALtoomuchVALI=ALeventsMissingVALI=ALeventsToomuchVALI=0;
+    ALmissingTEST=ALtoomuchTEST=ALeventsMissingTEST=ALeventsToomuchTEST=0;
+  }
+  // Error rate of the VALI group: (missing + toomuch) / (eventsMissing + eventsToomuch),
+  // or 0.0 when the group has no events.
+  double errorsAL()const
+  {
+    const int events = ALeventsMissingVALI+ALeventsToomuchVALI;
+    return events ? (ALmissingVALI+ALtoomuchVALI)/double(events) : 0.0;
+  }
+  // Writes the overall, VALI and TEST report lines (those with events) to `out`; `m` is printed as the model label.
+  void errorReportAL(ostream&out,string m)const
+  {
+    reportALGroup(out, "alignmentErrors (", m, ALmissing, ALtoomuch, ALeventsMissing, ALeventsToomuch);
+    reportALGroup(out, "alignmentErrors VALI (", m, ALmissingVALI, ALtoomuchVALI, ALeventsMissingVALI, ALeventsToomuchVALI);
+    reportALGroup(out, "alignmentErrors TEST(", m, ALmissingTEST, ALtoomuchTEST, ALeventsMissingTEST, ALeventsToomuchTEST);
+  }
+};
+
+#endif
diff --git a/GIZA++-v2/model2.cpp b/GIZA++-v2/model2.cpp
new file mode 100644
index 0000000..945b91e
--- /dev/null
+++ b/GIZA++-v2/model2.cpp
@@ -0,0 +1,232 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "model2.h"
+#include "Globals.h"
+#include "utility.h"
+#include "Parameter.h"
+#include "defs.h"
+
+extern short NoEmptyWord;
+
+
+GLOBAL_PARAMETER2(int,Model2_Dump_Freq,"MODEL 2 DUMP FREQUENCY","t2","dump frequency of Model 2",PARLEV_OUTPUT,0); // model2::em_with_tricks dumps on iterations divisible by this value; 0 disables the dumps
+
+// Copy-constructs the model1 base from `m` and binds both alignment tables by reference.
+model2::model2(model1& m,amodel<PROB>&_aTable,amodel<COUNT>&_aCountTable)
+  : model1(m), aTable(_aTable), aCountTable(_aCountTable) {}
+
+// Initialises aTable uniformly (run this before running em_with_tricks).
+// For every sentence pair with lengths (l, m) whose probe entry
+// aTable.getValue(l,m,l,m) is still <= PROB_SMOOTH, every entry (i, j, l, m)
+// with i in 0..l and j in 1..m is set to 1/(l+1).
+void model2::initialize_table_uniformly(sentenceHandler& sHandler1){
+  sentPair sent ;
+  sHandler1.rewind();
+  while(sHandler1.getNextSentence(sent)){
+    WordIndex l = sent.eSent.size() - 1;
+    WordIndex m = sent.fSent.size() - 1;
+    // Skip pairs with m < 1 and length pairs whose probe entry is already above PROB_SMOOTH.
+    if( m < 1 || !(aTable.getValue(l,m,l,m) <= PROB_SMOOTH) )
+      continue;
+    PROB uniform_val = 1.0 / (l+1) ;
+    for(WordIndex j=1; j <= m; j++){
+      for(WordIndex i=0; i <= l; i++)
+        aTable.setValue(i,j, l, m, uniform_val);
+    }
+  }
+}
+
+// Runs `noIterations` EM passes of Model 2. Each pass clears aCountTable and the
+// AL counters, trains on sHandler1 (and scores the test corpus when all three
+// test handles are non-null), then renormalises tTable and aTable and prints
+// cross-entropy/perplexity. aCountTable is cleared again after the last pass.
+// Returns the first iteration with the lowest training errorsAL(), or 0 if no
+// iteration scored below the initial 1.0.
+int model2::em_with_tricks(int noIterations)
+{
+  double minErrors = 1.0;
+  int minIter = 0;
+  const string modelName = "Model2", shortModelName = "2";
+  // Fix: the test pass dereferences testViterbiPerp, so require it to be non-null as well.
+  const bool haveTest = testPerp && testHandler && testViterbiPerp;
+  const time_t st = time(NULL) ;
+  sHandler1.rewind();
+  cout << "\n==========================================================\n";
+  cout << modelName << " Training Started at: " << ctime(&st) << " iter: " << noIterations << "\n";
+  for(int it=1; it <= noIterations ; it++){
+    const time_t it_st = time(NULL) ;
+    cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n';
+    const bool dump_files = (Model2_Dump_Freq != 0) && ((it % Model2_Dump_Freq) == 0) && !NODUMPS;
+    // Decimal rendering of `it`, used as the suffix of this iteration's file names.
+    string number;
+    int n = it;
+    do{
+      number.insert((size_t)0, 1, (char)(n % 10 + '0'));
+    } while((n /= 10) > 0);
+    const string tfile = Prefix + ".t" + shortModelName + "." + number ;
+    const string afile = Prefix + ".a" + shortModelName + "." + number ;
+    const string alignfile = Prefix + ".A" + shortModelName + "." + number ;
+    const string test_alignfile = Prefix + ".tst.A" + shortModelName + "." + number ;
+    aCountTable.clear();
+    initAL();
+    em_loop(perp, sHandler1, dump_files, alignfile.c_str(), trainViterbiPerp, false);
+    // Sampled before the test pass, so only the training pass contributes.
+    const double errs = errorsAL();
+    if( errs < minErrors ){
+      minErrors = errs;
+      minIter = it;
+    }
+    if (haveTest)
+      em_loop(*testPerp, *testHandler, dump_files, test_alignfile.c_str(), *testViterbiPerp, true);
+    if (dump_files&&OutputInAachenFormat==1)
+      tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
+    tTable.normalizeTable(Elist, Flist);
+    aCountTable.normalize(aTable);
+    cout << modelName << ": ("<<it<<") TRAIN CROSS-ENTROPY " << perp.cross_entropy()
+         << " PERPLEXITY " << perp.perplexity() << '\n';
+    if (haveTest)
+      cout << modelName << ": ("<<it<<") TEST CROSS-ENTROPY " << testPerp->cross_entropy()
+           << " PERPLEXITY " << testPerp->perplexity() << '\n';
+    cout << modelName << ": ("<<it<<") VITERBI TRAIN CROSS-ENTROPY " << trainViterbiPerp.cross_entropy()
+         << " PERPLEXITY " << trainViterbiPerp.perplexity() << '\n';
+    if (haveTest)
+      cout << modelName << ": ("<<it<<") VITERBI TEST CROSS-ENTROPY " << testViterbiPerp->cross_entropy()
+           << " PERPLEXITY " << testViterbiPerp->perplexity() << '\n';
+    if (dump_files){
+      if(OutputInAachenFormat==0)
+        tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
+      aCountTable.printTable(afile.c_str());
+    }
+    const time_t it_fn = time(NULL) ;
+    cout << modelName << " Iteration: " << it<< " took: " << difftime(it_fn, it_st) << " seconds\n";
+  } // end of iterations
+  aCountTable.clear();
+  const time_t fn = time(NULL) ;
+  cout << endl << "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n";
+  cout << "==========================================================\n";
+  return minIter;
+}
+
+// Loads a previously trained a table from the file `aname` into aTable.
+// Use this to reuse the results of an earlier a training run without
+// doing any new training. A progress line is written to cout first.
+// (NAS, 7/11/99)
+void model2::load_table(const char* aname)
+{
+  cout << "Model2: loading a table \n";
+  aTable.readTable(aname);
+}
+
+
+// One pass of Model 2 over every sentence pair in sHandler1.
+// For each French position j, each English position i is scored as the t-table
+// probability of (es[i], fs[j]) (floored at PROB_SMOOTH) times
+// aTable.getValue(i,j,l,m). The scores feed perp / viterbi_perp and, unless
+// `test` is set, are turned into counts for tTable and aCountTable.
+// Viterbi alignments are written to `alignfile` when dump_alignment is set
+// (or, with FEWDUMPS, for pairs whose sentenceNo is below 1000).
+// Ends by rewinding sHandler1, recording both perplexities and printing the AL report.
+void model2::em_loop(Perplexity& perp, sentenceHandler& sHandler1,
+                     bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp,
+                     bool test)
+{
+  massert( aTable.is_distortion==0 );
+  massert( aCountTable.is_distortion==0 );
+  perp.clear();
+  viterbi_perp.clear();
+  ofstream of2;
+  if (dump_alignment||FEWDUMPS )
+    of2.open(alignfile);
+  sentPair sent ;
+  sHandler1.rewind();
+  while(sHandler1.getNextSentence(sent)){
+    Vector<WordIndex>& es = sent.eSent;
+    Vector<WordIndex>& fs = sent.fSent;
+    const float so = sent.getCount();
+    WordIndex l = es.size() - 1;
+    WordIndex m = fs.size() - 1;
+    double cross_entropy = log(1.0);
+    Vector<WordIndex> viterbi_alignment(fs.size());
+    double viterbi_score = 1;
+    for(WordIndex j=1; j <= m; j++){
+      // sPtrCache[i]: t-table entry for (es[i], fs[j]) as returned by getPtr (may be 0).
+      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0);
+      PROB denom = 0.0;
+      PROB word_best_score = 0;
+      WordIndex best_i = 0 ; // i for which fj is best mapped to ei
+      // Scoring: accumulate denom and track the best-scoring English position.
+      for(WordIndex i=0; i <= l; i++){
+        sPtrCache[i] = tTable.getPtr(es[i], fs[j]) ;
+        PROB e;
+        if (sPtrCache[i] != 0 &&(*(sPtrCache[i])).prob > PROB_SMOOTH )
+          e = (*(sPtrCache[i])).prob * aTable.getValue(i,j, l, m) ;
+        else
+          e = PROB_SMOOTH * aTable.getValue(i,j, l, m);
+        denom += e ;
+        if (e > word_best_score){
+          word_best_score = e ;
+          best_i = i ;
+        }
+      }
+      viterbi_alignment[j] = best_i ;
+      viterbi_score *= word_best_score; ///denom ;
+      cross_entropy += log(denom) ;
+      if (denom == 0)
+        cerr << (test ? "WARNING: denom is zero (TEST)\n" : "WARNING: denom is zero (TRAIN)\n");
+      // Count collection (training only): position i receives so * e_i / denom.
+      if (!test && denom > 0){
+        COUNT val = COUNT(so) / (COUNT) double(denom) ;
+        for(WordIndex i=0; i <= l; i++){
+          PROB e(0.0);
+          if (sPtrCache[i] != 0 && (*(sPtrCache[i])).prob > PROB_SMOOTH)
+            e = (*(sPtrCache[i])).prob ;
+          else e = PROB_SMOOTH ;
+          e *= aTable.getValue(i,j, l, m);
+          COUNT temp = COUNT(e) * val ;
+          // With NoEmptyWord set, position 0 gets no t-count (the a-count below is still added).
+          if( NoEmptyWord==0 || i!=0 ){
+            if (sPtrCache[i] != 0)
+              (*(sPtrCache[i])).count += temp ;
+            else
+              tTable.incCount(es[i], fs[j], temp);
+          }
+          aCountTable.getRef(i,j, l, m)+= temp ;
+        } /* end of for i */
+      } // end of if (!test && denom > 0)
+    } // end of for (j) ;
+    sHandler1.setProbOfSentence(sent,cross_entropy);
+    perp.addFactor(cross_entropy, so, l, m,1);
+    viterbi_perp.addFactor(log(viterbi_score), so, l, m,1);
+    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000) )
+      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
+    addAL(viterbi_alignment,sent.sentenceNo,l);
+  } /* of while */
+  sHandler1.rewind();
+  perp.record("Model2");
+  viterbi_perp.record("Model2");
+  errorReportAL(cout,"IBM-2");
+}
+
+
+
+
+
diff --git a/GIZA++-v2/model2.h b/GIZA++-v2/model2.h
new file mode 100644
index 0000000..8d31b99
--- /dev/null
+++ b/GIZA++-v2/model2.h
@@ -0,0 +1,70 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _model2_h
+#define _model2_h 1
+
+#include <assert.h>
+
+#include <iostream>
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <set>
+#include "Vector.h"
+#include <utility>
+
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+#else
+#include <hash_map>
+#endif
+
+#include <fstream.h>
+#include <math.h>
+#include <time.h>
+
+#include "TTables.h"
+#include "ATables.h"
+#include "getSentence.h"
+#include "defs.h"
+#include "model1.h"
+#include "Perplexity.h"
+#include "vocab.h"
+
+// Model 2 trainer: a model1 plus an alignment table and its count table, both held by reference.
+class model2 : public model1
+{
+ public:
+  amodel<PROB>& aTable;
+  amodel<COUNT>& aCountTable;
+  model2(model1& m1, amodel<PROB>&, amodel<COUNT>&);
+  void initialize_table_uniformly(sentenceHandler&);
+  int em_with_tricks(int);
+  void load_table(const char* aname);
+  inline amodel<PROB>& getATable(void) { return aTable; }
+  inline amodel<COUNT>& getACountTable(void) { return aCountTable; }
+  void em_loop(Perplexity& perp, sentenceHandler& sHandler1, bool dump_files, const char* alignfile, Perplexity&, bool test);
+  friend class model3;
+};
+
+#endif
diff --git a/GIZA++-v2/model2to3.cpp b/GIZA++-v2/model2to3.cpp
new file mode 100644
index 0000000..22cbf50
--- /dev/null
+++ b/GIZA++-v2/model2to3.cpp
@@ -0,0 +1,398 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "model3.h"
+#include "utility.h"
+#include "Globals.h"
+
+#define _MAX_FERTILITY 10
+
+// Walks a sequence of integer partitions of n (starting from the single part
+// n) and returns the sum, over every partition visited, of
+//     prod_i  alpha[part[i]][source_pos]^mult[i] / mult[i]!
+// where part[1..num_parts] are the distinct part sizes of the current
+// partition and mult[] their multiplicities.  For n == 0 the only term is 1.
+// A negative total is clamped to 0 before returning.
+double get_sum_of_partitions(int n, int source_pos, double alpha[_MAX_FERTILITY][MAX_SENTENCE_LENGTH_ALLOWED])
+{
+  int part[_MAX_FERTILITY], mult[_MAX_FERTILITY];
+  for (WordIndex idx = 0; idx < _MAX_FERTILITY; ++idx)
+    part[idx] = mult[idx] = 0;
+
+  // First partition: n taken as one single part.
+  part[1] = n;
+  mult[1] = 1;
+  WordIndex num_parts = 1;
+
+  double sum = 0;
+  for (;;) {
+    // Contribution of the current partition.
+    double prod = 1.0;
+    if (n != 0) {
+      for (WordIndex idx = 1; idx <= num_parts; ++idx)
+        prod *= pow(alpha[part[idx]][source_pos], mult[idx]) / factorial(mult[idx]);
+    }
+    sum += prod;
+
+    // A single part of size <= 1 means there is nothing left to split.
+    const bool exhausted = !((part[num_parts] > 1) || (num_parts > 1));
+    if (exhausted)
+      break;
+
+    // Step to the next partition: pick the entry to shrink (skipping a
+    // trailing run of 1s) and the amount s that has to be redistributed.
+    WordIndex k;
+    int s;
+    if (part[num_parts] == 1) {
+      k = num_parts - 1;
+      s = part[k] + mult[num_parts];
+    } else {
+      k = num_parts;
+      s = part[k];
+    }
+
+    const int w = part[k] - 1;   // new, smaller part size
+    const int u = s / w;         // how many parts of size w fit into s
+    const int v = s % w;         // remainder, becomes one extra part
+
+    mult[k] -= 1;
+    WordIndex k1;
+    if (mult[k] == 0)
+      k1 = k;                    // slot k is now free, reuse it
+    else
+      k1 = k + 1;
+    mult[k1] = u;
+    part[k1] = w;
+
+    num_parts = k1;
+    if (v != 0) {
+      mult[k1+1] = 1;
+      part[k1+1] = v;
+      num_parts = k1 + 1;
+    }
+  }
+
+  if (sum < 0)
+    sum = 0;
+  return sum;
+}
+
+// Model 2 -> Model 3 transfer: one pass over the corpus that collects
+// posterior-weighted counts from the current t and a tables and turns them
+// into t (optional), a, d and n tables, then hard-wires p0/p1.
+//
+//  sHandler1   sentence source; rewound before and after the pass
+//  perp        receives the per-sentence cross entropy (log of the summed scores)
+//  trainVPerp  receives the log of the per-sentence Viterbi score
+//  simple      true: skip fertility counting and fill nTable with fixed values
+//              false: estimate fertility counts via get_sum_of_partitions()
+//  dump_files  write the Viterbi alignments and the resulting tables to
+//              Prefix + ".A2to3" / ".t2to3" / ".a2to3" / ".n2to3" / ".d2to3" / ".p0_2to3"
+//  updateT     also accumulate t counts and renormalize tTable
+void model3::estimate_t_a_d(sentenceHandler& sHandler1, Perplexity& perp, Perplexity& trainVPerp,
+ bool simple, bool dump_files,bool updateT)
+{
+ string tfile, nfile, dfile, p0file, afile, alignfile;
+ WordIndex i, j, l, m, max_fertility_here, k ;
+ PROB val, temp_mult[MAX_SENTENCE_LENGTH_ALLOWED][MAX_SENTENCE_LENGTH_ALLOWED];
+ double cross_entropy;
+ // NOTE(review): alpha has _MAX_FERTILITY (10) rows, but below k runs up to
+ // min(m+1, MAX_FERTILITY)-1.  If MAX_FERTILITY can be configured above 10
+ // this would index past the array -- confirm against its definition.
+ double beta, sum,
+ alpha[_MAX_FERTILITY][MAX_SENTENCE_LENGTH_ALLOWED];
+ double total, temp, r ;
+
+ // Reset all accumulators that this pass fills.
+ dCountTable.clear();
+ aCountTable.clear();
+ initAL();
+ nCountTable.clear() ;
+ if (simple)
+ nTable.clear();
+ perp.clear() ;
+ trainVPerp.clear() ;
+ ofstream of2;
+ if (dump_files){
+ alignfile = Prefix +".A2to3";
+ of2.open(alignfile.c_str());
+ }
+ if (simple) cerr <<"Using simple estimation for fertilties\n";
+ sHandler1.rewind() ;
+ sentPair sent ;
+ while(sHandler1.getNextSentence(sent)){
+ Vector<WordIndex>& es = sent.eSent;
+ Vector<WordIndex>& fs = sent.fSent;
+ const float count = sent.getCount();
+ Vector<WordIndex> viterbi_alignment(fs.size());
+ // Positions run 0..l on the e side and 1..m on the f side.
+ l = es.size() - 1;
+ m = fs.size() - 1;
+ cross_entropy = log(1.0);
+ double viterbi_score = 1 ;
+ PROB word_best_score ; // score for the best mapping of fj
+ for(j = 1 ; j <= m ; j++){
+ word_best_score = 0 ; // score for the best mapping of fj
+ Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0);
+ total = 0 ;
+ WordIndex best_i = 0 ;
+ // Score every e position i for f position j as t-prob * a-prob; t-probs
+ // that are missing or not above PROB_SMOOTH are replaced by PROB_SMOOTH.
+ for(i = 0; i <= l ; i++){
+ sPtrCache[i] = tTable.getPtr(es[i], fs[j]) ;
+ if (sPtrCache[i] != 0 && (*(sPtrCache[i])).prob > PROB_SMOOTH) // if valid pointer
+ temp_mult[i][j]= (*(sPtrCache[i])).prob * aTable.getValue(i, j, l, m) ;
+ else
+ temp_mult[i][j] = PROB_SMOOTH * aTable.getValue(i, j, l, m) ;
+ total += temp_mult[i][j] ;
+ if (temp_mult[i][j] > word_best_score){
+ word_best_score = temp_mult[i][j] ;
+ best_i = i ;
+ }
+ } // end of for (i)
+ viterbi_alignment[j] = best_i ;
+ viterbi_score *= word_best_score ; /// total ;
+ // Note: log(total) is added before the zero check below, so a zero total
+ // still contributes log(0) to cross_entropy.
+ cross_entropy += log(total) ;
+ if (total == 0){
+ cerr << "WARNING: total is zero (TRAIN)\n";
+ viterbi_score = 0 ;
+ }
+ if (total > 0){
+ // Normalize the column into posteriors and accumulate weighted counts.
+ for(i = 0; i <= l ; i++){
+ temp_mult[i][j] /= total ;
+ if (temp_mult[i][j] == 1) // smooth to prevent underflow
+ temp_mult[i][j] = 0.99 ;
+ else if (temp_mult[i][j] == 0)
+ temp_mult[i][j] = PROB_SMOOTH ;
+ val = temp_mult[i][j] * PROB(count) ;
+ if ( val > PROB_SMOOTH) {
+ if( updateT )
+ {
+ if (sPtrCache[i] != 0)
+ (*(sPtrCache[i])).count += val ;
+ else
+ tTable.incCount(es[i], fs[j], val);
+ }
+ aCountTable.getRef(i, j, l, m)+=val;
+ // No d count for i == 0 (presumably the empty/NULL word position).
+ if (0 != i)
+ dCountTable.getRef(j, i, l, m)+=val;
+ }
+ } // for (i = ..)
+ } // for (if total ...)
+ } // end of for (j ...)
+ if (dump_files)
+ printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
+ addAL(viterbi_alignment,sent.sentenceNo,l);
+ if (!simple){
+ // Fertility counts: for each e position i build alpha[k][i] from the
+ // (clamped) posterior odds, then combine them per fertility value k
+ // with get_sum_of_partitions().
+ max_fertility_here = min(WordIndex(m+1), MAX_FERTILITY);
+ for (i = 1; i <= l ; i++) {
+ for ( k = 1; k < max_fertility_here; k++){
+ beta = 0 ;
+ alpha[k][i] = 0 ;
+ for (j = 1 ; j <= m ; j++){
+ temp = temp_mult[i][j];
+ if (temp > 0.95) temp = 0.95; // smooth to prevent under/over flow
+ else if (temp < 0.05) temp = 0.05;
+ beta += pow(temp/(1.0-temp), (double) k);
+ }
+ // Alternating sign: (-1)^(k+1) * beta / k.
+ alpha[k][i] = beta * pow((double) -1, (double) (k+1)) / (double) k ;
+ }
+ }
+ for (i = 1; i <= l ; i++){
+ // r = product over j of (1 - posterior); uses the unclamped values.
+ r = 1;
+ for (j = 1 ; j <= m ; j++)
+ r *= (1 - temp_mult[i][j]);
+ for (k = 0 ; k < max_fertility_here ; k++){
+ sum = get_sum_of_partitions(k, i, alpha);
+ temp = r * sum * count;
+ nCountTable.getRef(es[i], k)+=temp;
+ } // end of for (k ..)
+ } // end of for (i == ..)
+ } // end of if (!simple)
+ perp.addFactor(cross_entropy, count, l, m,1);
+ trainVPerp.addFactor(log(viterbi_score), count, l, m,1);
+ } // end of while
+ sHandler1.rewind();
+ cerr << "Normalizing t, a, d, n count tables now ... " ;
+ // In Aachen format the t counts are dumped before normalization.
+ if( dump_files && OutputInAachenFormat==1 )
+ {
+ tfile = Prefix + ".t2to3" ;
+ tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
+ }
+ if( updateT )
+ tTable.normalizeTable(Elist, Flist);
+ aCountTable.normalize(aTable);
+ dCountTable.normalize(dTable);
+ if (!simple)
+ nCountTable.normalize(nTable,&Elist.getVocabList());
+ else {
+ // Simple mode: fixed fertility distribution 0.2 / 0.65 / 0.1 / 0.04 for
+ // fertilities 0..3, with 0.01 spread evenly over fertilities >= 4.
+ // Note that the "PROB val" line and the k-loop belong to the
+ // "if (2 < MAX_FERTILITY)" block, not to "if (3 < MAX_FERTILITY)".
+ for (i = 0 ; i< Elist.uniqTokens() ; i++){
+ if (0 < MAX_FERTILITY){
+ nTable.getRef(i,0)=PROB(0.2);
+ if (1 < MAX_FERTILITY){
+ nTable.getRef(i,1)=PROB(0.65);
+ if (2 < MAX_FERTILITY){
+ nTable.getRef(i,2)=PROB(0.1);
+ if (3 < MAX_FERTILITY)
+ nTable.getRef(i,3)=PROB(0.04);
+ PROB val = 0.01/(MAX_FERTILITY-4);
+ for (k = 4 ; k < MAX_FERTILITY ; k++)
+ nTable.getRef(i, k)=val;
+ }
+ }
+ }
+ }
+ } // end of else (!simple)
+ // p0/p1 are hard-wired here rather than estimated.
+ p0 = 0.95;
+ p1 = 0.05;
+ if (dump_files){
+ tfile = Prefix + ".t2to3" ;
+ afile = Prefix + ".a2to3" ;
+ nfile = Prefix + ".n2to3" ;
+ dfile = Prefix + ".d2to3" ;
+ p0file = Prefix + ".p0_2to3" ;
+
+ if( OutputInAachenFormat==0 )
+ tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
+ aTable.printTable(afile.c_str());
+ dTable.printTable(dfile.c_str());
+ // NOTE(review): this dumps nCountTable, while model3::em and
+ // model3::viterbi dump nTable; in simple mode nCountTable was only
+ // cleared above.  Confirm whether nTable was intended here.
+ nCountTable.printNTable(Elist.uniqTokens(), nfile.c_str(), Elist.getVocabList(),OutputInAachenFormat);
+ ofstream of(p0file.c_str());
+ of << p0;
+ of.close();
+ }
+ errorReportAL(cerr,"IBM-2");
+ // Record both perplexities under a label that depends on the mode.
+ if(simple)
+ {
+ perp.record("T2To3");
+ trainVPerp.record("T2To3");
+ }
+ else
+ {
+ perp.record("ST2To3");
+ trainVPerp.record("ST2To3");
+ }
+}
+
+// Simple Model 2 -> Model 3 transfer: runs estimate_t_a_d() in "simple" mode
+// (fixed n and p values, d derived from the a-based counts) and reports the
+// start/finish times and the training perplexity on cerr.
+void model3::transferSimple(sentenceHandler& sHandler1, bool dump_files,
+                            Perplexity& perp, Perplexity& trainVPerp, bool updateT)
+{
+  time_t started = time(NULL);
+  cerr << "==========================================================\n"
+       << "\nTransfer started at: " << ctime(&started) << '\n'
+       << "Simple tranfer of Model2 --> Model3 (i.e. estimating initial parameters of Model3 from Model2 tables)\n";
+
+  // The t and a tables are inherited from the earlier models; nothing is copied here.
+  estimate_t_a_d(sHandler1, perp, trainVPerp, /*simple=*/true, dump_files, updateT);
+
+  time_t finished = time(NULL);
+  cerr << "\nTransfer: TRAIN CROSS-ENTROPY " << perp.cross_entropy()
+       << " PERPLEXITY " << perp.perplexity() << '\n';
+  cerr << "\nTransfer took: " << difftime(finished, started) << " seconds\n";
+  cerr << "\nTransfer Finished at: " << ctime(&finished) << '\n';
+  cerr << "==========================================================\n";
+}
+
+
+// Model 2 -> Model 3 transfer entry point.
+//
+// When Transfer == TRANSFER_SIMPLE the work is delegated to transferSimple();
+// otherwise estimate_t_a_d() is run in full (non-simple) mode so that the
+// fertility table is estimated from the Model 2 posteriors.
+//
+// Fix: the full transfer used to be an unconditional block following the
+// `if`, so in simple mode both transfers ran back to back and the second one
+// overwrote the fertility table produced by the first.  The two paths are
+// now mutually exclusive.
+//
+//  sHandler1   sentence source handed on to estimate_t_a_d()
+//  dump_files  whether estimate_t_a_d() writes its tables to disk
+//  perp        training perplexity accumulator (reported on cerr)
+//  trainVPerp  Viterbi perplexity accumulator
+//  updateT     whether the t table is re-estimated during the transfer
+void model3::transfer(sentenceHandler& sHandler1,bool dump_files, Perplexity& perp, Perplexity& trainVPerp,bool updateT)
+{
+ if (Transfer == TRANSFER_SIMPLE)
+ transferSimple(sHandler1,dump_files,perp, trainVPerp,updateT);
+ else
+ {
+ time_t st, fn ;
+
+ st = time(NULL);
+ cerr << "==========================================================\n";
+ cerr << "\nTransfer started at: "<< ctime(&st) << '\n';
+ cerr << "Transfering Model2 --> Model3 (i.e. estimating initial parameters of Model3 from Model2 tables)\n";
+
+ p1_count = p0_count = 0 ;
+
+ estimate_t_a_d(sHandler1, perp, trainVPerp, false, dump_files,updateT);
+
+
+
+ /* Below is a made-up stab at transferring t & a probs to p0/p1.
+ (Method not documented in IBM paper).
+ It seems to give p0 = .96, which may be right for Model 2, or may not.
+ I'm commenting it out for now and hardwiring p0 = .90 as above. -Kevin
+ (Reviewer note: estimate_t_a_d() in this file currently sets p0 = 0.95,
+ not .90.)
+
+ // compute p0, p1 counts
+ Vector<LogProb> nm(Elist.uniqTokens(),0.0);
+
+ for(i=0; i < Elist.uniqTokens(); i++){
+ for(k=1; k < MAX_FERTILITY; k++){
+ nm[i] += nTable.getValue(i, k) * (LogProb) k;
+ }
+ }
+
+ LogProb mprime;
+ // sentenceHandler sHandler1(efFilename.c_str());
+ // sentPair sent ;
+
+ while(sHandler1.getNextSentence(sent)){
+ Vector<WordIndex>& es = sent.eSent;
+ Vector<WordIndex>& fs = sent.fSent;
+ const float count = sent.noOccurrences;
+
+ l = es.size() - 1;
+ m = fs.size() - 1;
+ mprime = 0 ;
+ for (i = 1; i <= l ; i++){
+ mprime += nm[es[i]] ;
+ }
+ mprime = LogProb((int((double) mprime + 0.5))); // round mprime to nearest integer
+ if ((mprime < m) && (2 * mprime >= m)) {
+ // cerr << "updating both p0_count and p1_count, mprime: " << mprime <<
+ // "m = " << m << "\n";
+ p1_count += (m - (double) mprime) * count ;
+ p0_count += (2 * (double) mprime - m) * count ;
+ // cerr << "p0_count = "<<p0_count << " , p1_count = " << p1_count << endl ;
+ }
+ else {
+ // p1_count += 0 ;
+ // cerr << "updating only p0_count, mprime: " << mprime <<
+ // "m = " << m << "\n";
+ p0_count += double(m * count) ;
+ // cerr << "p0_count = "<<p0_count << " , p1_count = " << p1_count << endl ;
+ }
+ }
+
+ // normalize p1, p0
+
+ cerr << "p0_count = "<<p0_count << " , p1_count = " << p1_count << endl ;
+ p1 = p1_count / (p1_count + p0_count ) ;
+ p0 = 1 - p1;
+ cerr << "p0 = "<<p0 << " , p1 = " << p1 << endl ;
+ // Smooth p0 probability to avoid getting zero probability.
+ if (0 == p0){
+ p0 = (LogProb) SMOOTH_THRESHOLD ;
+ p1 = p1 - (LogProb) SMOOTH_THRESHOLD ;
+ }
+ if (0 == p1){
+ p1 = (LogProb) SMOOTH_THRESHOLD ;
+ p0 = p0 - (LogProb) SMOOTH_THRESHOLD ;
+ }
+ */
+
+ fn = time(NULL) ;
+ cerr << "\nTransfer: TRAIN CROSS-ENTROPY " << perp.cross_entropy()
+ << " PERPLEXITY " << perp.perplexity() << '\n';
+ // cerr << "tTable contains " << tTable.getHash().bucket_count()
+ // << " buckets and " << tTable.getHash().size() << " entries." ;
+ cerr << "\nTransfer took: " << difftime(fn, st) << " seconds\n";
+ cerr << "\nTransfer Finished at: "<< ctime(&fn) << endl;
+ cerr << "==========================================================\n";
+
+ }
+
+}
diff --git a/GIZA++-v2/model3.cpp b/GIZA++-v2/model3.cpp
new file mode 100644
index 0000000..1fe0216
--- /dev/null
+++ b/GIZA++-v2/model3.cpp
@@ -0,0 +1,511 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "model3.h"
+#include "collCounts.h"
+#include "Globals.h"
+#include "utility.h"
+#include "D5Tables.h"
+#include "transpair_model5.h"
+#include "transpair_modelhmm.h"
+#include "Parameter.h"
+
+// Selects the viterbi_loop_with_tricks<> code path in model3::viterbi();
+// without it the #else branch calling viterbi_loop() would be compiled.
+#define TRICKY_IBM3_TRAINING
+
+// Dependency bit masks for the Model 4 / Model 5 distortion tables.  Per the
+// help text: bits 1/2/4/8 select l, m, F, E for d_{=1}, and bits
+// 16/32/64/128 select l, m, F, E for d_{>1}.  The trailing numbers are
+// presumably the default values (76 = 4|8|64, 68 = 4|64) -- confirm against
+// the GLOBAL_PARAMETER macro in Parameter.h.
+GLOBAL_PARAMETER(int,M4_Dependencies,"depm4","d_{=1}: &1:l, &2:m, &4:F, &8:E, d_{>1}&16:l, &32:m, &64:F, &128:E",PARLEV_MODELS,76);
+GLOBAL_PARAMETER(int,M5_Dependencies,"depm5","d_{=1}: &1:l, &2:m, &4:F, &8:E, d_{>1}&16:l, &32:m, &64:F, &128:E",PARLEV_MODELS,68);
+// Dump frequency for Model 3/4/5 iterations; model3::viterbi() treats 0 as
+// "no periodic dumps" (the final iteration is still dumped unless NODUMPS).
+GLOBAL_PARAMETER4(int,Model3_Dump_Freq,"MODEL 345 DUMP FREQUENCY","MODEL 3 DUMP FREQUENCY","t3","t345","dump frequency of Model 3/4/5",PARLEV_OUTPUT,0);
+
+
+// Defined in another translation unit; not referenced in the part of this
+// file visible to the reviewer.
+extern int Transfer_Dump_Freq;
+
+// Builds a Model 3 on top of an existing Model 2.
+//
+// The distortion tables are constructed with `true`, the fertility tables are
+// sized (number of English words + 1) x MAX_FERTILITY, and no HMM is attached
+// (h == 0) until setHMM() is called.
+//
+// p0/p1 and their count accumulators are now zero-initialized: they used to
+// be left indeterminate, and load_tables() prints p0 and p1 even when the p0
+// file cannot be opened.  Initializers follow the member declaration order.
+model3::model3(model2& m2) :
+ model2(m2),dTable(true), dCountTable(true),
+ p0(0.0), p1(0.0), p0_count(0.0), p1_count(0.0),
+ nTable(m2.getNoEnglishWords()+1, MAX_FERTILITY),
+ nCountTable(m2.getNoEnglishWords()+1, MAX_FERTILITY),h(0)
+{}
+
+// Loads the fertility (n) and distortion (d) tables from the given files and
+// reads p0 from p0file, setting p1 = 1 - p0.  If p0file cannot be opened a
+// message is written to cerr and p0/p1 keep their current values.  The
+// resulting p0/p1 are echoed on cout in either case.
+void model3::load_tables(const char *nfile, const char *dfile, const char *p0file){
+  cout << "Model3: loading n, d, p0 tables \n";
+
+  nTable.readNTable(nfile);
+  dTable.readTable(dfile);
+
+  ifstream p0_stream(p0file);
+  if (p0_stream) {
+    cout << "Reading p0 value from " << p0file << "\n";
+    p0_stream >> p0;
+    p0_stream.close();
+    p1 = 1 - p0;
+  }
+  else {
+    cerr << "Can not open: " << p0file << '\n';
+  }
+
+  cout << "p0 is: " << p0 << " p1:" << p1 << '\n';
+}
+
+// Destructor: explicitly clears the distortion and fertility probability and
+// count tables owned by this model.  The hmm pointer h is not touched here.
+model3::~model3()
+{
+ dTable.clear();
+ dCountTable.clear();
+ nTable.clear();
+ nCountTable.clear();
+}
+
+
+// Exact EM training for Model 3 by brute-force enumeration of alignments.
+//
+// For each of noIterations passes over sHandler1, every one of the (l+1)^m
+// alignments of a sentence pair is generated; those with
+// 2 * Fert[0] <= m contribute aprob/all_prob * count to the t, d and n count
+// tables and to the p0/p1 counts.  After each pass the tables are normalized
+// and written to Prefix + ".t3." / ".d3." / ".n3." / ".p0_3." + iteration.
+//
+// Change versus the previous version: the number of alignments,
+// pow(l+1, m), is loop-invariant and is now computed once per sentence
+// instead of on every iteration of the (exponentially long) alignment loop.
+//
+// NOTE(review): the alignment index x is a WordIndex; if (l+1)^m exceeds its
+// range the loop condition can never become false.  Only usable for very
+// short sentences.
+// NOTE(review): aCountTable is normalized below although this function
+// neither clears nor increments it, and afile is computed but never written.
+void model3::em(int noIterations, sentenceHandler& sHandler1)
+{
+
+ LogProb all_prob, aprob, temp ;
+ WordIndex i, j, l, m ;
+ time_t it_st, st, it_fn, fn ;
+ string tfile, dfile, nfile, p0file, afile, number;
+
+ st = time(NULL) ;
+ if (Log)
+ logmsg << "\n" << "Starting Model3: Training";
+ cout << "\n" << "Starting Model3: Training";
+ // sentenceHandler sHandler1(efFilename.c_str());
+ sHandler1.rewind();
+ for(int it=1; it <= noIterations; it++){
+ it_st = time(NULL) ;
+ if (Log)
+ logmsg << "\n" << "Model3: Iteration " << it;
+ cout << "\n" << "Model3: Iteration " << it;
+
+ // set up the names of the files where the tables will be printed
+ // (number = decimal representation of the iteration counter)
+ int n = it;
+ number = "";
+ do{
+ //mj changed next line
+ number.insert((size_t) 0, 1, (char)(n % 10 + '0'));
+ } while((n /= 10) > 0);
+ tfile = Prefix + ".t3." + number ;
+ afile = Prefix + ".a3." + number ;
+ nfile = Prefix + ".n3." + number ;
+ dfile = Prefix + ".d3." + number ;
+ p0file = Prefix + ".p0_3." + number ;
+ // tCountTable.clear();
+ dCountTable.clear();
+ nCountTable.clear();
+ p0_count = p1_count = 0 ;
+ all_prob = 0 ;
+ sentPair sent ;
+ while(sHandler1.getNextSentence(sent)){
+ Vector<WordIndex>& es = sent.eSent;
+ Vector<WordIndex>& fs = sent.fSent;
+ const float count = sent.getCount();
+ if ((sent.sentenceNo % 1000) == 0)
+ cout <<sent.sentenceNo << '\n';
+ Vector<WordIndex> A(fs.size(),/*-1*/0);
+ Vector<WordIndex> Fert(es.size(),0);
+ LogProb lcount=(LogProb)count;
+ l = es.size()-1;
+ m = fs.size()-1;
+ WordIndex x, y ;
+ all_prob = prob_of_target_given_source(tTable, fs, es);
+ // Only reported; the division by all_prob below is still performed.
+ if (all_prob == 0)
+ cout << "\n" <<"all_prob = 0";
+
+ // Total number of alignments, (l+1)^m; invariant for this sentence pair.
+ const double num_alignments = pow(l+1.0, double(m));
+ for ( x = 0 ; x < num_alignments ; x++){ // For all possible alignments A
+ // Decode x as an m-digit number in base (l+1): digit j is A[j].
+ y = x ;
+ for (j = 1 ; j <= m ; j++){
+ A[j] = y % (l+1) ;
+ y /= (l+1) ;
+ }
+ // Fertility of each e position under this alignment.
+ for(i = 0 ; i <= l ; i++)
+ Fert[i] = 0 ;
+ for (j = 1 ; j <= m ; j++)
+ Fert[A[j]]++;
+ if (2 * Fert[0] <= m){ /* consider alignments that has Fert[0] less than
+ half the number of words in French sentence */
+ aprob = prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es);
+ temp = aprob/all_prob ;
+ LogProb templcount = temp*lcount;
+
+ for (j = 1 ; j <= m ; j++){
+ tTable.incCount(es[A[j]], fs[j], templcount);
+ if (0 != A[j])
+ dCountTable.getRef(j, A[j], l, m)+=templcount;
+ }
+ for(i = 0 ; i <= l ; i++)
+ {
+ nCountTable.getRef(es[i], Fert[i])+=templcount;
+ //cout << "AFTER INC2: " << templcount << " " << nCountTable.getRef(es[i], Fert[i]) << '\n';
+ }
+ p1_count += double(temp) * (Fert[0] * count) ;
+ p0_count += double(temp) * ((m - 2 * Fert[0]) * count) ;
+ }
+ } /* of looping over all alignments */
+ } /* of sentence pair E, F */
+ sHandler1.rewind();
+
+ // normalize tables
+ if( OutputInAachenFormat==1 )
+ tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
+ tTable.normalizeTable(Elist, Flist);
+ aCountTable.normalize(aTable);
+ dCountTable.normalize(dTable);
+ nCountTable.normalize(nTable,&Elist.getVocabList());
+
+ // normalize p1 & p0
+
+ if (p1_count + p0_count != 0){
+ p1 = p1_count / ( p1_count + p0_count ) ;
+ p0 = 1 - p1 ;
+ }
+ else {
+ p1 = p0 = 0 ;
+ }
+ // print tables
+ if( OutputInAachenFormat==0 )
+ tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
+ dTable.printTable(dfile.c_str());
+ nTable.printNTable(Elist.uniqTokens(), nfile.c_str(), Elist.getVocabList(),OutputInAachenFormat);
+ ofstream of(p0file.c_str());
+ of << p0;
+ of.close();
+ it_fn = time(NULL) ;
+ cout << "\n" << "Model3 Iteration "<<it<<" took: " << difftime(it_fn, it_st) << " seconds\n";
+
+ } /* of iterations */
+ fn = time(NULL) ;
+ cout << "\n" << "Entire Model3 Training took: " << difftime(fn, st) << " seconds\n";
+}
+
+
+
+
+
+
+
+//-----------------------------------------------------------------------
+
+/*
+void simpleModel3Test()
+{
+ PositionIndex l=6;
+ PositionIndex m=8;
+ alignment al(l,m);
+ al.set(1,1);
+ al.set(2,2);
+ al.set(3,3);
+ al.set(4,2);
+ al.set(5,0);
+ al.set(6,6);
+ al.set(7,3);
+ al.set(8,4);
+ cout << al;
+ PositionIndex prev_cept=0;
+ PositionIndex vac_all=m;
+ Vector<char> vac(m+1,0);
+ for(PositionIndex i=1;i<=l;i++)
+ {
+ PositionIndex cur_j=al.als_i[i];
+ cout << "LOOP: " << i << " " << cur_j << '\n';
+ PositionIndex prev_j=0;
+ PositionIndex k=0;
+ if(cur_j) { // process first word of cept
+ k++;
+ vac_all--;
+ assert(vac[cur_j]==0);
+ vac[cur_j]=1;
+ for(unsigned int q=0;q<vac.size();q++)cout << (vac[q]?'1':'0') << ' ';
+ cout << '\n';
+ cout << i << " " << cur_j << ": d1(" << vacancies(vac,cur_j) << "|" << vacancies(vac,al.get_center(prev_cept)) << "," << vac_all << "+" << -al.fert(i)<< "+" << +k << ")\n" << '\n';
+ prev_j=cur_j;
+ cur_j=al.als_j[cur_j].next;
+ }
+ while(cur_j) { // process following words of cept
+ k++;
+ vac_all--;
+ vac[cur_j]=1;
+ int vprev=vacancies(vac,prev_j);
+ cout << "PREV: " << prev_j << '\n';
+ for(unsigned int q=0;q<vac.size();q++)cout << (vac[q]?'1':'0') << ' ';
+ cout << '\n';
+ cout << i << " " << cur_j << ": d>1(" << vacancies(vac,cur_j) << "-" << vprev << "|" << vac_all<< "+" << -al.fert(i)<< "+" << +k << ")\n" << '\n';
+ prev_j=cur_j;
+ cur_j=al.als_j[cur_j].next;
+ }
+ assert(k==al.fert(i));
+ if( k )
+ prev_cept=i;
+ }
+ assert(vac_all==al.fert(0));
+}
+*/
+
+extern short DoViterbiTraining;
+
+// Runs the Model 3 / 4 / 5 Viterbi training schedule.
+//
+// A schedule string is built: its first character is 'H' when an HMM is
+// attached (h != 0) and '3' otherwise, followed by one '3', '4', '5' or '6'
+// per requested iteration of the respective model.  Iteration `it` trains
+// from model trainingString[it-1] to model trainingString[it]; equal
+// characters are a plain "ModelN" iteration, different ones a "TxToy"
+// transfer iteration.
+//
+// After every iteration the count tables are normalized, p0/p1 are updated
+// (or forced to the global P0 when P0 != -1), perplexities are printed and,
+// when dump_files is set, the tables are written to Prefix + ".<kind>." +
+// iteration number (or "final" for the last iteration).
+//
+// Returns the iteration index with the lowest errorsAL(), reduced by
+// noIterationsModel3 when Model 4 or 5 iterations were requested and further
+// by noIterationsModel4 when Model 5 iterations were requested.
+//
+// sHandler1, perp, trainViterbiPerp, testPerp, testViterbiPerp and
+// testHandler are not locals; presumably inherited members -- see model1.h.
+//
+// NOTE(review): '6' characters are appended to the schedule, but the
+// switch on toModel has no case '6', so any Model 6 iteration reaches
+// abort().
+int model3::viterbi(int noIterationsModel3, int noIterationsModel4,int noIterationsModel5,int noIterationsModel6)
+{
+ double minErrors=1.0;int minIter=0;
+ d4model d4m(MAX_SENTENCE_LENGTH);
+ d4m.makeWordClasses(Elist,Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
+ d5model d5m(d4m);
+ d5m.makeWordClasses(Elist,Flist,SourceVocabFilename+".classes",TargetVocabFilename+".classes");
+ time_t it_st, st, it_fn, fn;
+ bool dump_files = false ;
+ string tfile, tfile_actual, dfile, afile, nfile, nfile_actual, p0file, alignfile, number, test_alignfile, d4file,d5file,zeroFertFile;
+ st = time(NULL);
+ sHandler1.rewind();
+ if (testPerp && testHandler)
+ (*testHandler).rewind();
+ // Build the training schedule, one character per model state.
+ string trainingString;
+ trainingString+=(h?'H':'3');
+ for(int i=0;i<noIterationsModel3;++i) trainingString+='3';
+ for(int i=0;i<noIterationsModel4;++i) trainingString+='4';
+ for(int i=0;i<noIterationsModel5;++i) trainingString+='5';
+ for(int i=0;i<noIterationsModel6;++i) trainingString+='6';
+
+ cout << "\n==========================================================\n";
+ cout << "Starting "<<trainingString<<": Viterbi Training";
+ if (Log){
+ logmsg << "\n==========================================================\n";
+ logmsg << "Starting "<<trainingString<<": Viterbi Training";
+ }
+ cout << "\n "<<trainingString<<" Training Started at: "<< ctime(&st) << '\n';
+ for(unsigned int it=1; it < trainingString.length(); it++){
+ // The last schedule step is the "final" iteration: it is always dumped
+ // (unless NODUMPS) and its files get the suffix "final".
+ bool final=0;
+ if( it==trainingString.length()-1 )
+ final=1;
+ string modelName;
+ char fromModel=trainingString[it-1],toModel=trainingString[it];
+ if(fromModel==toModel)
+ modelName=string("Model")+fromModel;
+ else
+ modelName=string("T")+fromModel+"To"+toModel;
+ it_st = time(NULL);
+ cout <<"\n---------------------\n"<<modelName<<": Iteration " << it<<'\n';
+ if (Log)
+ logmsg <<"\n---------------------\n"<<modelName<<": Iteration " << it<<'\n';
+ dump_files = (final || ((Model3_Dump_Freq != 0) && ((it % Model3_Dump_Freq) == 0))) && !NODUMPS ;
+ string d4file2;
+ {
+ // set up the names of the files where the tables will be printed
+ int n = it;
+ number = "";
+ do{
+ //mj changed next line
+ number.insert((size_t) 0, 1, (char)(n % 10 + '0'));
+ } while((n /= 10) > 0);
+ if( final )
+ number="final";
+ tfile = Prefix + ".t3." + number ;
+ tfile_actual = Prefix + ".actual.t3." + number ;
+ afile = Prefix + ".a3." + number ;
+ nfile = Prefix + ".n3." + number ;
+ nfile_actual = Prefix + ".actual.n3." + number ;
+ dfile = Prefix + ".d3." + number ;
+ d4file = Prefix + ".d4." + number ;
+ d4file2 = Prefix + ".D4." + number ;
+ d5file = Prefix + ".d5." + number ;
+ alignfile = Prefix + ".A3." + number ;
+ test_alignfile = Prefix + ".tst.A3." + number ;
+ p0file = Prefix + ".p0_3." + number ;
+ }
+ // clear count tables
+ // tCountTable.clear();
+ dCountTable.clear();
+ aCountTable.clear();
+ initAL();
+ nCountTable.clear();
+ d4m.clear();
+ p0_count = p1_count = 0 ;
+
+#ifdef TRICKY_IBM3_TRAINING
+
+// Shared argument lists for the training and the test invocation of
+// viterbi_loop_with_tricks<>; they differ in the perplexity objects, the
+// sentence handler, the alignment file and the bool after the file name
+// (true for training, false for test).
+#define TRAIN_ARGS perp, trainViterbiPerp, sHandler1, dump_files, alignfile.c_str(), true, modelName,final
+#define TEST_ARGS *testPerp, *testViterbiPerp, *testHandler, dump_files, test_alignfile.c_str(),false, modelName,final
+
+
+ // Dispatch on (toModel, fromModel): the template arguments pick the
+ // transpair type of the source model and the table types passed as the
+ // last two arguments.  Unsupported combinations abort().
+ switch( toModel )
+ {
+ case '3':
+ switch(fromModel )
+ {
+ case 'H':
+ viterbi_loop_with_tricks <transpair_modelhmm,const hmm>(TRAIN_ARGS,h,(void*)0);
+ if (testPerp && testHandler)
+ viterbi_loop_with_tricks<transpair_modelhmm,const hmm>(TEST_ARGS, h,(void*)0);
+ break;
+ case '3':
+ viterbi_loop_with_tricks<transpair_model3>( TRAIN_ARGS, (void*)0,(void*)0);
+ if (testPerp && testHandler)
+ viterbi_loop_with_tricks<transpair_model3>( TEST_ARGS, (void*)0,(void*)0);
+ break;
+ default: abort();
+ }
+ break;
+ case '4':
+ {
+ switch(fromModel)
+ {
+ case 'H':
+ viterbi_loop_with_tricks <transpair_modelhmm,const hmm,d4model>(TRAIN_ARGS,h,&d4m);
+ if (testPerp && testHandler)
+ viterbi_loop_with_tricks<transpair_modelhmm,const hmm,d4model>(TEST_ARGS, h,&d4m);
+ break;
+ case '3':
+ viterbi_loop_with_tricks<transpair_model3, void,d4model>(TRAIN_ARGS, (void*)0,&d4m);
+ if (testPerp && testHandler)
+ viterbi_loop_with_tricks<transpair_model3, void,d4model>( TEST_ARGS , (void*)0,&d4m);
+ break;
+ case '4':
+ viterbi_loop_with_tricks<transpair_model4, d4model,d4model>(TRAIN_ARGS , &d4m,&d4m);
+ if (testPerp && testHandler)
+ viterbi_loop_with_tricks<transpair_model4, d4model,d4model>( TEST_ARGS, &d4m,&d4m);
+ break;
+ default: abort();
+ }
+ d4m.normalizeTable();
+ if( dump_files )
+ d4m.printProbTable(d4file.c_str(),d4file2.c_str());
+ }
+ break;
+ case '5':
+ {
+ switch(fromModel)
+ {
+ case 'H':
+ viterbi_loop_with_tricks <transpair_modelhmm,const hmm,d5model>(TRAIN_ARGS,h,&d5m);
+ if (testPerp && testHandler)
+ viterbi_loop_with_tricks<transpair_modelhmm,const hmm,d5model>(TEST_ARGS, h,&d5m);
+ break;
+ case '3':
+ viterbi_loop_with_tricks<transpair_model3, void,d5model>(TRAIN_ARGS, (void*)0,&d5m);
+ if (testPerp && testHandler)
+ viterbi_loop_with_tricks<transpair_model3, void,d5model>( TEST_ARGS , (void*)0,&d5m);
+ break;
+ case '4':
+ viterbi_loop_with_tricks<transpair_model4, d4model,d5model>(TRAIN_ARGS, &d4m,&d5m);
+ if (testPerp && testHandler)
+ viterbi_loop_with_tricks<transpair_model4, d4model,d5model>( TEST_ARGS, &d4m,&d5m);
+ break;
+ case '5':
+ viterbi_loop_with_tricks<transpair_model5, d5model, d5model>(TRAIN_ARGS, &d5m,&d5m);
+ if (testPerp && testHandler)
+ viterbi_loop_with_tricks<transpair_model5, d5model, d5model>( TEST_ARGS, &d5m,&d5m);
+ break;
+ default: abort();
+ }
+ // Both the d4 tables reached through d5m and the d5 tables themselves
+ // are normalized after a Model 5 step.
+ d5m.d4m.normalizeTable();
+ if( dump_files )
+ d5m.d4m.printProbTable(d4file.c_str(),d4file2.c_str());
+ d5m.normalizeTable();
+ if( dump_files )
+ {
+ ofstream d5output(d5file.c_str());
+ d5output << d5m;
+ }
+ }
+ break;
+ default: abort();
+ }
+
+#else
+ // NOTE(review): dead while TRICKY_IBM3_TRAINING is defined at the top of
+ // this file; `model` is not declared anywhere in this function, so this
+ // branch would presumably not compile if enabled.
+ viterbi_loop(perp, trainViterbiPerp, sHandler1, dump_files,
+ alignfile.c_str(), true, model);
+ if (testPerp && testHandler)
+ viterbi_loop(*testPerp, *testViterbiPerp, *testHandler,
+ dump_files, test_alignfile.c_str(), false, model);
+
+#endif
+ // Track the iteration with the lowest alignment error so far.
+ if( errorsAL()<minErrors )
+ {
+ minErrors=errorsAL();
+ minIter=it;
+ }
+
+ // now normalize count tables
+ if( dump_files&&OutputInAachenFormat==1 )
+ tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
+ tTable.normalizeTable(Elist, Flist);
+ aCountTable.normalize(aTable);
+ dCountTable.normalize(dTable);
+ nCountTable.normalize(nTable,&Elist.getVocabList());
+
+ // cout << "tTable contains " <<
+ // tTable.getHash().bucket_count() << " buckets and "<<
+ //tTable.getHash().size() << " entries.\n";
+
+ // normalize p1 & p0
+
+ // NOTE(review): the second value printed here is p1_count although the
+ // message labels it "p1".
+ cout << "p0_count is " << p0_count << " and p1 is " << p1_count << "; ";
+ // A global P0 other than -1.0 overrides the estimated value.
+ if(P0!=-1.0)
+ {
+ p0 = P0;
+ p1 = 1-P0;
+ }
+ else {
+ if (p1_count + p0_count != 0){
+ p1 = p1_count / ( p1_count + p0_count ) ;
+ p0 = 1 - p1 ;
+ }
+ else {
+ p1 = p0 = 0 ;
+ cerr << "ERROR: p0_count+p1_count is zero!!!\n";
+ }
+ }
+ cout << "p0 is " << p0 << " p1: " << p1 << '\n';
+
+ cout << modelName<<": TRAIN CROSS-ENTROPY " << perp.cross_entropy()
+ << " PERPLEXITY " << perp.perplexity() << '\n';
+ if (testPerp && testHandler)
+ cout << modelName << ":("<<it<<" TEST CROSS-ENTROPY " << (*testPerp).cross_entropy()
+ << " PERPLEXITY " << (*testPerp).perplexity() << " sum: " << (*testPerp).getSum()<<
+ " wc: " << (*testPerp).word_count() << '\n';
+ cout << modelName << ": ("<<it<<") TRAIN VITERBI CROSS-ENTROPY " << trainViterbiPerp.cross_entropy()
+ << " PERPLEXITY " << trainViterbiPerp.perplexity() << '\n';
+ if (testPerp && testHandler)
+ cout << modelName << ": ("<<it<<")TEST VITERBI CROSS-ENTROPY " << (*testViterbiPerp).cross_entropy()
+ << " PERPLEXITY " << (*testViterbiPerp).perplexity() << " Sum: " << (*testViterbiPerp).getSum() <<
+ " wc: " << (*testViterbiPerp).word_count() << '\n';
+ if (dump_files)
+ {
+ if( OutputInAachenFormat==0 )
+ tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
+ aTable.printTable(afile.c_str());
+ dTable.printTable(dfile.c_str());
+ nTable.printNTable(Elist.uniqTokens(), nfile.c_str(), Elist.getVocabList(), OutputInAachenFormat);
+ ofstream of(p0file.c_str());
+ of << p0;
+ of.close();
+ }
+
+ it_fn = time(NULL) ;
+ cout << "\n" << modelName << " Viterbi Iteration : "<<it<< " took: " <<
+ difftime(it_fn, it_st) << " seconds\n";
+ } /* of iterations */
+ fn = time(NULL);
+ cout << trainingString <<" Training Finished at: " << ctime(&fn) << "\n";
+ cout << "\n" << "Entire Viterbi "<<trainingString<<" Training took: " << difftime(fn, st) << " seconds\n";
+ cout << "==========================================================\n";
+ // Shift the best-iteration index as described in the header comment;
+ // noIterationsModel6 plays no part in this adjustment.
+ if( noIterationsModel4||noIterationsModel5 )
+ minIter-=noIterationsModel3;
+ if( noIterationsModel5 )
+ minIter-=noIterationsModel4;
+ return minIter;
+}
+
+
+
+
diff --git a/GIZA++-v2/model3.h b/GIZA++-v2/model3.h
new file mode 100644
index 0000000..a7db406
--- /dev/null
+++ b/GIZA++-v2/model3.h
@@ -0,0 +1,138 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _model3_h
+#define _model3_h 1
+#include <assert.h>
+#include <iostream>
+#include <algorithm>
+#include <functional>
+#include <map>
+#include <set>
+#include "Vector.h"
+#include <utility>
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+#else
+#include <hash_map>
+#endif
+
+
+#include <time.h>
+#include <fstream>
+#include <math.h>
+#include "MoveSwapMatrix.h"
+#include "TTables.h"
+#include "ATables.h"
+#include "NTables.h"
+#include "getSentence.h"
+#include "defs.h"
+#include "model2.h"
+#include "Perplexity.h"
+#include "transpair_model3.h"
+#include "transpair_modelhmm.h"
+#include "alignment.h"
+#include "vocab.h"
+#include "D4Tables.h"
+#include "AlignTables.h"
+
+// IBM Model 3 (also the driver for Model 4/5 Viterbi training): extends
+// model2 with distortion (d) and fertility (n) tables, the p0/p1 parameters
+// and an optional HMM used as the starting model.
+class model3 : public model2
+{
+public:
+ // Distortion probabilities and their count accumulator.
+ amodel<PROB> dTable;
+ amodel<COUNT> dCountTable;
+
+ // p0/p1 parameters (p1 is maintained as 1 - p0 wherever both are
+ // re-estimated) and the counts they are estimated from.
+ PROB p0,p1;
+ double p0_count, p1_count ;
+
+ // Fertility probabilities and their count accumulator.
+ nmodel<PROB> nTable;
+ nmodel<COUNT> nCountTable;
+ // Optional HMM; not owned here as far as this header shows (null by default).
+ hmm*h;
+
+public:
+ void setHMM(hmm*_h){h=_h;}
+ model3(model2& m2);
+ ~model3();
+ // methods
+ // Model 2 -> Model 3 transfer (full or simple variant).
+ void transfer(sentenceHandler&, bool, Perplexity&, Perplexity&,bool updateT=1);
+ void transferSimple(sentenceHandler&, bool, Perplexity&, Perplexity&,bool updateT=1);
+ // Reads the n and d tables and the p0 value from the given files.
+ void load_tables(const char *nfile, const char *dfile, const char *p0file);
+
+ // Exhaustive EM training over all alignments.
+ void em(int, sentenceHandler&);
+ // Viterbi training; arguments are the iteration counts for Model 3, 4, 5, 6.
+ int viterbi(int, int, int,int);
+
+private:
+ LogProb prob_of_special(Vector<WordIndex>&,
+ Vector<WordIndex>&,
+ tmodel<COUNT, PROB>&,
+ Vector<WordIndex>&,
+ Vector<WordIndex>&);
+
+ LogProb prob_of_target_and_alignment_given_source(Vector<WordIndex>&,
+ Vector<WordIndex>&,
+ tmodel<COUNT, PROB>&,
+ Vector<WordIndex>&,
+ Vector<WordIndex>&);
+ LogProb prob_of_target_given_source(tmodel<COUNT, PROB>&,
+ Vector<WordIndex>&,
+ Vector<WordIndex>&);
+
+ LogProb scoreOfMove(Vector<WordIndex>&, Vector<WordIndex>&,
+ Vector<WordIndex>&, Vector<WordIndex>&,
+ tmodel<COUNT, PROB>&, WordIndex, WordIndex);
+
+ LogProb scoreOfSwap(Vector<WordIndex>&, Vector<WordIndex>&,
+ Vector<WordIndex>&, tmodel<COUNT, PROB>&, int, int);
+
+ void hillClimb(Vector<WordIndex>&, Vector<WordIndex>&,
+ Vector<WordIndex>&, Vector<WordIndex>&,
+ LogProb&, tmodel<COUNT, PROB>&, int, int);
+
+ void findBestAlignment(Vector<WordIndex>&, Vector<WordIndex>&,
+ Vector<WordIndex>&, Vector<WordIndex>&,
+ LogProb&,int , int);
+
+
+ void findAlignmentsNeighborhood( Vector<WordIndex>&,
+ Vector<WordIndex>&,
+ LogProb&align_total_count,
+ alignmodel&neighborhood,
+ int, int);
+ void collectCountsOverAlignement(const Vector<WordIndex>& es,
+ const Vector<WordIndex>& fs,
+ const Vector<WordIndex>&,
+ LogProb , float count);
+ LogProb viterbi_model2(const transpair_model3&ef, alignment&output, int pair_no,int i_peg = -1 , int j_peg = -1 )const;
+ LogProb _viterbi_model2(const transpair_model2&ef, alignment&output, int i_peg = -1 , int j_peg = -1 )const;
+ LogProb viterbi_model2(const transpair_modelhmm&ef, alignment&output, int pair_no,int i_peg = -1 , int j_peg = -1 )const;
+
+ private:
+ // Shared worker behind transfer()/transferSimple().  The second Perplexity
+ // parameter was previously also named `perp`; two parameters with the same
+ // name in one declaration are rejected by current compilers, so it now
+ // carries the name used by the definition.
+ void estimate_t_a_d(sentenceHandler& sHandler1, Perplexity& perp, Perplexity& trainVPerp,bool simple, bool dump_files,bool updateT);
+ void viterbi_loop(Perplexity&, Perplexity&, sentenceHandler&, bool, const char*,bool,string model);
+
+ // Generic Viterbi training loop; MODEL_TYPE is the transpair type of the
+ // model trained from, A and B are the types of the two table pointers
+ // passed as the last arguments.
+ template<class MODEL_TYPE, class A,class B>
+ void viterbi_loop_with_tricks(Perplexity&, Perplexity&, sentenceHandler&,
+ bool, const char*, bool, string model, bool final,A*d4m,B*d5m);
+
+};
+
+#endif
diff --git a/GIZA++-v2/model345-peg.cpp b/GIZA++-v2/model345-peg.cpp
new file mode 100644
index 0000000..8c1bde6
--- /dev/null
+++ b/GIZA++-v2/model345-peg.cpp
@@ -0,0 +1,191 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "model3.h"
+#include "collCounts.h"
+
+bool makeOneMoveSwap(const alignment&x,const alignment&y,set<OneMoveSwap>&soms) // Inserts into soms the OneMoveSwap records derived from the positions where x and y differ; returns 1 when they differ in 1..4 positions, 0 otherwise.
+{
+ OneMoveSwap oms; // scratch record, refilled and inserted repeatedly; the meaning of the type codes 1-4 is defined with OneMoveSwap (not visible here)
+ oms.type=0;
+ int count=0; // number of differing positions found so far
+ Vector<int> positions(4); // the differing positions j, in increasing order
+ assert(x.get_m()==y.get_m());
+ for(PositionIndex j=1;j<=x.get_m();j++)
+ if(x(j)!=y(j))
+ {
+ if(count==4) // a fifth difference was found: give up
+ return 0;
+ positions[count]=j;
+ count++;
+ }
+ assert(count>0);
+ if(count==1)
+ { // exactly one position differs
+ oms.type=1;
+ oms.a=positions[0];
+ oms.b=y(positions[0]);
+ soms.insert(oms);
+ for(unsigned int j=1;j<=x.get_m();++j)
+ { // every other position that y links to the same value as the differing one
+ if( int(j)!=positions[0]&&y(j)==y(positions[0]))
+ {
+ oms.type=3;
+ oms.a=j;
+ oms.b=x(positions[0]);
+ soms.insert(oms);
+ }
+ }
+ for(unsigned int j=1;j<=x.get_m();++j)
+ { // every other position that x links to the same value as the differing one
+ if( int(j)!=positions[0]&&x(j)==x(positions[0]))
+ {
+ oms.type=2;
+ oms.a=positions[0];
+ oms.b=j;
+ if( oms.b<oms.a)swap(oms.b,oms.a); // type-2 records are stored with a<=b
+ soms.insert(oms);
+ }
+ }
+ return 1;
+ }
+ else if(count==2)
+ { // two positions differ
+ if(x(positions[0])==y(positions[1]) && x(positions[1])==y(positions[0]))
+ { // the two positions have exactly exchanged their values
+ oms.type=4;
+ oms.a=positions[0];
+ oms.b=positions[1];
+ soms.insert(oms);
+ for(unsigned int j=1;j<=x.get_m();++j)
+ { // pair each position sharing a y-value with one differing position with the other differing position
+ if( int(j)!=positions[0]&&y(j)==y(positions[0]))
+ {
+ oms.type=2;oms.a=j;oms.b=positions[1];if( oms.b<oms.a)swap(oms.b,oms.a);soms.insert(oms);
+ }
+ if( int(j)!=positions[1]&&y(j)==y(positions[1]))
+ {
+ oms.type=2;oms.a=j;oms.b=positions[0];if( oms.b<oms.a)swap(oms.b,oms.a);soms.insert(oms);
+ }
+ }
+ }
+ else if(x(positions[0])==y(positions[1]) )
+ { // only the value of the first position reappears at the second one
+ oms.type=3;
+ oms.a=positions[0];
+ oms.b=x(positions[1]);
+ soms.insert(oms);
+ oms.type=2;
+ oms.a=positions[0];
+ oms.b=positions[1]; // positions[] is increasing, so a<b already holds
+ soms.insert(oms);
+ }
+ else if( x(positions[1])==y(positions[0]) )
+ { // only the value of the second position reappears at the first one
+ oms.type=3;
+ oms.a=positions[1];
+ oms.b=x(positions[0]);
+ soms.insert(oms);
+ oms.type=2;
+ oms.a=positions[0];
+ oms.b=positions[1];
+ soms.insert(oms);
+ }
+ oms.type=3; // inserted in every count==2 case: one type-3 record per differing position, carrying its x value
+ oms.a=positions[0];
+ oms.b=x(positions[0]);
+ soms.insert(oms);
+ oms.a=positions[1];
+ oms.b=x(positions[1]);
+ soms.insert(oms);
+ return 1;
+ }
+ else if( count==3 )
+ { // three differences and three different numbers
+ Vector<int> xx(3),yy(3); // the values of x and of y at the three differing positions
+ xx[0]=x(positions[0]);xx[1]=x(positions[1]);xx[2]=x(positions[2]);
+ yy[0]=y(positions[0]);yy[1]=y(positions[1]);yy[2]=y(positions[2]);
+ sort(xx.begin(),xx.end());
+ sort(yy.begin(),yy.end());
+ if(xx==yy) // same values on both sides, only rearranged
+ {
+ oms.type=2;oms.a=positions[0];oms.b=positions[1];soms.insert(oms);
+ oms.type=2;oms.a=positions[0];oms.b=positions[2];soms.insert(oms);
+ oms.type=2;oms.a=positions[1];oms.b=positions[2];soms.insert(oms);
+ }
+ else
+ {
+ // otherwise: for each pair that exactly exchanged its values, record the pair (type 2) and the remaining position (type 3)
+ if( x(positions[0])==y(positions[1])&&x(positions[1])==y(positions[0]) )
+ {
+ oms.type=2;oms.a=positions[0];oms.b=positions[1];
+ if( oms.b<oms.a) swap(oms.b,oms.a);
+ soms.insert(oms);
+ oms.type=3;oms.a=positions[2];oms.b=x(positions[2]);soms.insert(oms);
+ }
+ if( x(positions[2])==y(positions[1])&&x(positions[1])==y(positions[2]) )
+ {
+ oms.type=2;oms.a=positions[2];oms.b=positions[1];
+ if( oms.b<oms.a) swap(oms.b,oms.a);
+ soms.insert(oms);
+ oms.type=3;oms.a=positions[0];oms.b=x(positions[0]);soms.insert(oms);
+ }
+ if( x(positions[0])==y(positions[2])&&x(positions[2])==y(positions[0]) )
+ {
+ oms.type=2;oms.a=positions[0];oms.b=positions[2];
+ if( oms.b<oms.a) swap(oms.b,oms.a);
+ soms.insert(oms);
+ oms.type=3;oms.a=positions[1];oms.b=x(positions[1]);soms.insert(oms);
+ }
+ }
+ return 1;
+ }
+ else if(count==4)
+ { // four positions differ
+ Vector<int> xx(4),yy(4); // the values of x and of y at the four differing positions
+ for(int i=0;i<4;++i)
+ {
+ xx[i]=x(positions[i]);
+ yy[i]=y(positions[i]);
+ }
+ sort(xx.begin(),xx.end());
+ sort(yy.begin(),yy.end());
+ if(xx==yy) // records are produced only when both sides hold the same values
+ {
+ oms.type=2;
+ for(int j1=0;j1<4;j1++)
+ for(int j2=j1+1;j2<4;j2++)
+ { // keep the pairs with distinct values that exactly exchanged them
+ if(x(positions[j1])!=x(positions[j2])&&
+ x(positions[j1])==y(positions[j2])&&
+ x(positions[j2])==y(positions[j1]))
+ {
+ oms.type=2;oms.a=positions[j1];oms.b=positions[j2];
+ soms.insert(oms);
+ }
+ }
+ }
+ return 1;
+ }
+ else // only reachable with count==0, i.e. when the assert above is compiled out
+ return 0;
+}
diff --git a/GIZA++-v2/model3_viterbi.cpp b/GIZA++-v2/model3_viterbi.cpp
new file mode 100644
index 0000000..bf1e7ab
--- /dev/null
+++ b/GIZA++-v2/model3_viterbi.cpp
@@ -0,0 +1,656 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "model3.h"
+#include "utility.h"
+#include "Globals.h"
+
+
+LogProb model3::prob_of_target_and_alignment_given_source(Vector<WordIndex>& A, // A[j]: index into es that fs[j] is linked to
+ Vector<WordIndex>& Fert, // Fert[i]: fertility of es[i] under A
+ tmodel<COUNT, PROB>& tTable,
+ Vector<WordIndex>& fs,
+ Vector<WordIndex>& es)
+{ // Returns the score of alignment A as a product of p1, nTable, tTable and dTable terms; returns 0 as soon as the running product reaches 0.
+ LogProb total = 1.0 ;
+ LogProb temp = 0.0 ;
+ const LogProb zero = 0.0 ;
+ WordIndex l = es.size()-1, m = fs.size()-1; // index 0 of es and fs is not counted in l and m
+ WordIndex i, j ;
+
+ total *= pow(double(1-p1), m-2.0 * Fert[0]) * pow(double(p1), double(Fert[0])); // (1-p1)^(m-2*Fert[0]) * p1^Fert[0]
+ if (total == 0)
+ return(zero);
+ for (i = 1 ; i <= Fert[0] ; i++){ // loop calculates (m-Fert[0]) choose Fert[0]
+ total *= double(m - Fert[0] - i + 1) / i ;
+ if (total == 0)
+ return(zero);
+ }
+ for (i = 1 ; i <= l ; i++){ // this loop calculates fertilities term
+ total *= double(nTable.getValue(es[i], Fert[i])) * (LogProb) factorial(Fert[i]);
+ if (total == 0)
+ return(zero);
+ }
+ for (j = 1 ; j <= m ; j++){ // translation term for every j, distortion term only for links to a position other than 0
+ // temp = tTable.getValue(es[A[j]], fs[j]) ;
+ temp = double(tTable.getProb(es[A[j]], fs[j])) ;
+ total *= temp ;
+ if (0 != A[j])
+ total *= double(dTable.getValue(j, A[j], l, m));
+ if (total == 0)
+ return(zero);
+ }
+ return(total);
+}
+
+LogProb model3::prob_of_target_given_source(tmodel<COUNT, PROB>& tTable,
+ Vector<WordIndex>& fs,
+ Vector<WordIndex>& es)
+{
+ // Sums prob_of_target_and_alignment_given_source over every alignment A with 2*Fert[0] <= m, by brute-force enumeration.
+ WordIndex x, y ;
+ LogProb total = 0 ;
+ // l and m do not count index 0 of es and fs
+ WordIndex l = es.size()-1, m = fs.size()-1;
+ Vector<WordIndex> A(fs.size(),/*-1*/0);
+ Vector<WordIndex> Fert(es.size(),0);
+ WordIndex i,j ;
+ const double num_alignments = pow(l+1.0, double(m)); // (l+1)^m, hoisted: it was re-evaluated on every loop iteration
+ for ( x = 0 ; x < num_alignments ; x++){ // For all possible alignments A. NOTE(review): x is a WordIndex; if (l+1)^m exceeds its range the counter would wrap, so this is only usable for very short sentences
+ y = x ;
+ // decode x as an m-digit number in base l+1: digit j is A[j]
+ for (j = 1 ; j <= m ; j++){
+ A[j] = y % (l+1) ;
+ y /= (l+1) ;
+ }
+ // reset the fertilities (indices 0..l)
+ for(i = 0 ; i <= l ; i++)
+ Fert[i] = 0 ;
+ // recount the fertilities implied by A
+ for (j = 1 ; j <= m ; j++)
+ Fert[A[j]]++;
+ // only alignments with 2 * Fert[0] <= m contribute
+ if (2 * Fert[0] <= m){ /* consider alignments whose Fert[0] is at most
+ half the length of the french sentence */
+ total += prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es);
+ }
+ }
+ return(total);
+}
+
+
+LogProb model3::scoreOfMove(Vector<WordIndex>& es,
+ Vector<WordIndex>& fs,
+ Vector<WordIndex>& A,
+ Vector<WordIndex>& Fert,
+ tmodel<COUNT, PROB>& tTable,
+ WordIndex j,
+ WordIndex i)
+ // Returns the factor by which the alignment score would be multiplied if
+ // fs[j] were re-linked from es[A[j]] to es[i]. The move is only scored,
+ // not taken: there are no side effects on the alignment A nor on its
+ // fertilities Fert.
+ // Reading the returned scaling factor:
+ // 1: the score of the new alignment equals the score of the
+ // original alignment.
+ // 0.5: the new score is half the score of the original alignment.
+ // 2.0: the new score will be twice as much.
+ //
+{
+ // LogProb score;
+ LogProb change ;
+ WordIndex m, l ;
+
+ m = fs.size() - 1; // index 0 of fs and es is not counted in m and l
+ l = es.size() - 1;
+
+
+ if (A[j] == i)
+ // moving to the current position changes nothing
+ return(1) ;
+ else if (A[j] == 0){ // a move from position zero to something else
+ change = double(p0*p0)/p1 *
+ (double((Fert[0]*(m-Fert[0]+1))) / ((m-2*Fert[0]+1)*(m-2*Fert[0]+2))) *
+ (Fert[i]+1) *
+ double(nTable.getValue(es[i], Fert[i]+1)) /
+ double(nTable.getValue(es[i], Fert[i])) *
+ double(tTable.getProb(es[i], fs[j])) /
+ double(tTable.getProb(es[A[j]], fs[j])) *
+ double(dTable.getValue(j, i, l, m)); // only the new link has a distortion term here
+ }
+ else if (i == 0){ // a move to position zero
+ change=
+ ((double(p1) / (p0*p0)) *
+ (double((m-2*Fert[0])*(m-2*Fert[0]-1))/((Fert[0]+1)*(m-Fert[0]))) *
+ (double(1)/Fert[A[j]]) *
+ double(nTable.getValue(es[A[j]], Fert[A[j]]-1)) /
+ double(nTable.getValue(es[A[j]], Fert[A[j]]))*
+ double(tTable.getProb(es[i], fs[j])) /
+ double(tTable.getProb(es[A[j]], fs[j])) *
+ 1.0 / double(dTable.getValue(j, A[j], l, m))); // only the old link has a distortion term here
+ }
+ else{ // a move that does not involve position zero
+ change =
+ ((double(Fert[i]+1)/Fert[A[j]]) *
+ double(nTable.getValue(es[A[j]], Fert[A[j]]-1)) /
+ double(nTable.getValue(es[A[j]], Fert[A[j]])) *
+ double(nTable.getValue(es[i], Fert[i]+1)) /
+ double(nTable.getValue(es[i], Fert[i])) *
+ double(tTable.getProb(es[i], fs[j]))/
+ double(tTable.getProb(es[A[j]], fs[j])) *
+ double(dTable.getValue(j, i, l, m))/
+ double(dTable.getValue(j, A[j], l, m)));
+ }
+ return(change);
+}
+
+
+LogProb model3::scoreOfSwap(Vector<WordIndex>& es,
+ Vector<WordIndex>& fs,
+ Vector<WordIndex>& A,
+ tmodel<COUNT, PROB>& tTable,
+ int j1,
+ int j2)
+ // Returns the factor by which the alignment score would be multiplied if
+ // the links of positions j1 and j2 were exchanged.
+ // No side effects here: none of the parameters passed is changed
+ // (i.e. the alignment A is not really changed).
+ // Reading the returned scaling factor:
+ // 1: the score of the new alignment equals the score of the
+ // original alignment.
+ // 0.5: the new score is half the score of the original alignment.
+ // 2.0: the new score will be twice as much.
+ //
+{
+ LogProb score ;
+ WordIndex i1, i2, m, l ;
+
+ m = fs.size() - 1 ; // index 0 of fs and es is not counted in m and l
+ l = es.size() - 1 ;
+ if (j1 == j2 || A[j1] == A[j2]) // same position or same link on both sides: the swap changes nothing, return ratio 1
+ return(1);
+ else {
+ i1 = A[j1] ;
+ i2 = A[j2] ;
+ score = // ratio of the translation terms after and before the swap
+ double(tTable.getProb(es[i2], fs[j1]))/double(tTable.getProb(es[i1], fs[j1])) *
+ double(tTable.getProb(es[i1], fs[j2]))/double(tTable.getProb(es[i2], fs[j2]));
+ if (i1 != 0){ // distortion ratio for i1; skipped when i1 is position 0
+ score *= double(dTable.getValue(j2, i1, l, m))/double(dTable.getValue(j1, i1, l, m));
+ }
+ if (i2 != 0){ // distortion ratio for i2; skipped when i2 is position 0
+ score *= double(dTable.getValue(j1, i2, l, m))/double(dTable.getValue(j2, i2, l, m));
+ }
+ return(score);
+ }
+}
+
+
+
+void model3::hillClimb(Vector<WordIndex>& es,
+ Vector<WordIndex>& fs,
+ Vector<WordIndex>& A,
+ Vector<WordIndex>& Fert,
+ LogProb& best_score,
+ tmodel<COUNT, PROB>& tTable,
+ int = -1, // unnamed because it is not used in this function (presumably i_peg, given the neighbouring parameter)
+ int j_peg = -1)
+ // Hill climbing starting from the given alignment A.
+ // A and Fert are updated in place; best_score is recomputed for the final A.
+ // Position j_peg is never moved or swapped; j_peg == -1 means no pegging.
+{
+ WordIndex i, j, l, m, j1, old_i;
+ LogProb change ;
+ bool local_minima; // true when the current level found no improving swap or move
+ int level = 0 ; // number of passes of the do-loop
+ LogProb best_change_so_far, best_change ;
+ Vector<WordIndex> A_so_far; // best neighbour found in the current level
+ Vector<WordIndex> Fert_so_far; // fertilities matching A_so_far
+
+ l = es.size() - 1;
+ m = fs.size() - 1;
+ if (Log)
+ logmsg << "\nStarting hill climbing with original score: " << best_score <<"\n";
+ best_change = 1 ; // overall scaling factor (i.e. from the beginning of the climb)
+ do {
+ best_change_so_far = 1 ; // best scaling factor of this level of hill climb
+ local_minima = true ;
+ for (j = 1 ; j <= m ; j++){
+ if (int(j) != j_peg){ // make sure not to change the pegged link
+ for (j1 = j + 1 ; j1 <= m; j1++){
+ // for all possible swaps
+ // skip swaps between positions that share a link, and swaps involving the pegged position
+ if ((A[j] != A[j1]) && (int(j1) != j_peg)){
+ // score the swap without applying it
+ change = scoreOfSwap(es, fs, A, tTable, j, j1);
+ if (change > best_change_so_far){ // if better alignment found, keep it
+ local_minima = false ;
+ best_change_so_far = change ;
+ A_so_far = A ;
+ Fert_so_far = Fert ; // a swap leaves the fertilities unchanged
+ old_i = A_so_far[j] ;
+ A_so_far[j] = A_so_far[j1] ;
+ A_so_far[j1] = old_i ;
+ } // end of if (change > best_change_so_far)
+ } // end of if (A[j] != A[j1] ..)
+ } // of for (j1 = j+1 ....)
+ // moves: try every position 0..l as the new link of j
+ for (i = 0 ; i <= l ; i++){ // all possible moves
+ if (i != A[j]){ // make sure not to move to same position
+ if (i != 0 || (m >= 2 * (Fert[0]+1))){ // if moving to NULL word
+ // (pos 0), make sure not to violate the fertility restriction
+ // i.e. NULL can not take more than half the target words
+ // score the move without applying it
+ change = scoreOfMove(es, fs, A, Fert, tTable, j, i);
+ if (change > best_change_so_far){ // if better alignment found, keep it
+ best_change_so_far = change ;
+ local_minima = false ;
+ A_so_far = A ;
+ Fert_so_far = Fert ;
+ old_i = A_so_far[j] ;
+ A_so_far[j] = i ;
+ Fert_so_far[old_i]-- ;
+ Fert_so_far[i]++ ;
+ } // end of if (change > best_change_so_far)
+ } // end of if ((i!=0) ...
+ } // end of if (i != A[j] )
+ } // end of for (i = 0 ; ....)
+ } // end of if(j != j_peg)
+ } // end of for (j = 1 ; ...)
+ level++;
+ if (!local_minima){
+ if (best_change_so_far > 1){ // if current change is improving
+ A = A_so_far ;
+ Fert = Fert_so_far ;
+ best_change *= best_change_so_far ;
+ }
+ else{
+ local_minima = true ;
+ }
+ } // end of if(!local_minima)
+ if (Log)
+ logmsg << "." ;
+ if (level> 15) // long climbs are also reported on cerr
+ cerr << "." ;
+ } while (local_minima == false);
+ if (Log)
+ logmsg << "\n" << "Hill Climb Level: " << level << " score: scaling old: " <<(best_score*best_change) ;
+ if (level > 15)
+ cerr << "\nHill Climb Level: " << level << " score: scaling old: " <<(best_score*best_change) ;
+ best_score = prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es); // recomputed from scratch rather than taken from best_score*best_change
+ if (Log)
+ logmsg << " using new calc: " << best_score << '\n';
+ if (level>15)
+ cerr << " using new calc: " << best_score << '\n';
+}
+
+
+void model3::findBestAlignment(Vector<WordIndex>& es,
+ Vector<WordIndex>& fs,
+ Vector<WordIndex>& A,
+ Vector<WordIndex>& Fert,
+ LogProb& best_score,
+ /*tmodel<COUNT, PROB>& tTable,
+ amodel<PROB>& aTable, */
+ int i_peg = -1 ,
+ int j_peg = -1 )
+ // Builds in A a Model2-style alignment for the given sentence pair: every position j takes the
+ // i with the largest tTable.getProb * aTable.getValue among those the fertility limits allow.
+ // Its fertility info is written to Fert and its score is returned in best_score (not in A).
+ // Pegging (forcing A[j_peg] = i_peg) is performed only when both i_peg and j_peg differ from -1.
+{
+ WordIndex i, j, l, m, best_i=0;
+ LogProb temp, score, ss; // ss accumulates the product of the per-position best scores
+
+ l = es.size() - 1;
+ m = fs.size() - 1;
+ for (i=0 ; i <= l ; i++)
+ Fert[i] = 0 ;
+ ss = 1 ;
+ if ((j_peg != -1) && (i_peg != -1)){ // if you're doing pegging
+ A[j_peg] = i_peg ;
+ Fert[i_peg] = 1 ;
+ ss *= double(tTable.getProb(es[i_peg], fs[j_peg])) *
+ double(aTable.getValue(i_peg, j_peg, l, m));
+ }
+ for (j = 1 ; j <= m ; j++){
+ if (int(j) != j_peg){ // the pegged position keeps its forced link
+ score = 0 ;
+ for (i = 0 ; i <= l ; i++){
+ // first make sure that connecting target word at pos j to source word
+ // at pos i will not lead to a violation on Fertility restrictions
+ // (e.g. maximum fertility for a word, max fertility for NULL word, etc)
+ if ((Fert[i]+1 < MAX_FERTILITY) && ((i == 0 && (m >= 2*(Fert[0]+1)))
+ || (i != 0))){
+ temp = double(tTable.getProb(es[i], fs[j])) *
+ double(aTable.getValue(i, j, l, m));
+ if (temp > score ){
+ best_i = i ;
+ score = temp ;
+ } // end of if (temp > score)
+ } // end of if (((i == 0 ...)
+ } // end of for (i= 0 ...)
+ if (score == 0){ // no candidate had a positive score: dump the candidates; A[j] and Fert are left untouched for this j
+ cerr << "WARNING: In searching for model2 best alignment\n " ;
+ cerr << "Nothing was set for target token " << fs[j] <<
+ "at position j: " << j << "\n";
+ for (i = 0 ; i <= l ; i++){
+ cerr << "i: " << i << "ttable("<<es[i]<<", "<<fs[j]<<") = " <<
+ tTable.getProb(es[i], fs[j]) << " atable(" << i<<", "<<j<<", "<<
+ l<<", "<<m<<") = "<< aTable.getValue(i, j, l, m) << " product " <<
+ double(tTable.getProb(es[i], fs[j])) *
+ double(aTable.getValue(i, j, l, m)) << '\n';
+ if ((Fert[i]+1 < MAX_FERTILITY) && ((i == 0 && (m >= 2*(Fert[0]+1)))
+ || (i != 0)))
+ cerr <<"Passed fertility condition \n";
+ else
+ cerr <<"Failed fertility condition \n";
+ }
+
+ } // end of if (score == 0)
+ else {
+ Fert[best_i]++ ;
+ A[j] = best_i ;
+ }
+ ss *= score ;
+ } // end of if (j != j_peg)
+ } // end of for (j == 1 ; ...)
+ if (ss <= 0){
+ cerr << "WARNING: Model2 viterbi alignment has zero score for sentence pair:\n" ;
+ printSentencePair(es, fs, cerr);
+ }
+ best_score = prob_of_target_and_alignment_given_source(A, Fert, tTable, fs, es); // the returned score is not ss, which is only logged below
+ if (Log)
+ logmsg << "finding best alignment : score : " << ss <<"p(f, a/e) = "<< best_score<<"\n";
+}
+
+void model3::collectCountsOverAlignement(const Vector<WordIndex>& es, // Adds score*count to the t, d, a, n and p0/p1 count accumulators for every event of alignment A.
+ const Vector<WordIndex>& fs,
+ const Vector<WordIndex>& A,
+ LogProb score,
+ float count)
+{
+ WordIndex j,i,l,m ;
+ Vector<WordIndex> Fert(es.size(),0); // fertilities of A, recounted locally below
+ l = es.size() - 1 ;
+ m = fs.size() - 1 ;
+ score *= LogProb(count);
+ COUNT temp = COUNT(score) ; // the increment applied to every counter: score * count
+ for (i=0 ; i <= l ; i++)
+ Fert[i] = 0 ;
+ for (j = 1 ; j <= m ; j++){
+ Fert[A[j]]++;
+ tTable.incCount(es[A[j]], fs[j], temp);
+ // tCountTable.getRef(es[A[j]], fs[j])+=score;
+ if (A[j]) // the distortion count is skipped for links to position 0
+ dCountTable.getRef(j, A[j], l, m)+= temp ;
+ aCountTable.getRef(A[j], j, l, m)+= temp ; // not guarded by the if above: updated for every j
+ }
+ for(i = 0 ; i <= l ; i++)
+ nCountTable.getRef(es[i], Fert[i])+= temp ;
+ // p1_count += score * (LogProb) (Fert[0]) ;
+ // p0_count += score * (LogProb) ((m - 2 * Fert[0])) ;
+ p1_count += temp * (Fert[0]) ;
+ p0_count += temp * ((m - 2 * Fert[0])) ; // NOTE(review): m and Fert are WordIndex; if that type is unsigned this relies on 2*Fert[0] <= m -- confirm callers guarantee it
+}
+
+
+
+void model3::findAlignmentsNeighborhood(Vector<WordIndex>& es,
+ Vector<WordIndex>& fs,
+ LogProb&align_total_count,
+ alignmodel&neighborhood,
+ int i_peg = -1,
+ int j_peg = -1
+ )
+ // Finds a best alignment (findBestAlignment followed by hillClimb) and inserts it and all its one-swap / one-move
+ // neighbours into neighborhood, adding the score of each newly inserted one to align_total_count. If i_peg == -1 and j_peg == -1, no pegging is done.
+{
+ LogProb best_score,score;
+ WordIndex i,j,l,m,old_i,j1;
+ Vector<WordIndex> A(fs.size(),0);
+ Vector<WordIndex> Fert(es.size(),0);
+ time_t it_st; // assigned below but not read anywhere in this function
+
+ best_score = 0 ;
+ l = es.size() - 1;
+ m = fs.size() - 1;
+ findBestAlignment(es, fs, A, Fert, best_score, /*tTable, aTable,*/ i_peg, j_peg);
+ if (best_score == 0){
+ cerr << "WARNING: viterbi alignment score is zero for the following pair\n";
+ printSentencePair(es, fs, cerr);
+ }
+ hillClimb(es, fs, A, Fert, best_score, tTable, i_peg, j_peg);
+ if (best_score <= 0){
+ cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n";
+ printSentencePair(es, fs, cerr);
+ if(Log){
+ logmsg << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n";
+ printSentencePair(es, fs, logmsg);
+ }
+ }
+ else { // best_score > 0
+ // the centre alignment is inserted only if it respects the position-0 fertility limit
+ if (2*Fert[0] <= m ){
+ /* consider alignments whose Fert[0] is at most
+ half the number of words in French sentence */
+ if (neighborhood.insert(A, best_score)){
+ align_total_count += best_score ;
+ }
+ }
+ else { // else part is added for debugging / Yaser
+ cerr << "WARNING:Best Alignment found violates Fertility requiremnets !!\n" ;
+ for (i = 0 ; i <= l ; i++)
+ cerr << "Fert["<<i<<"] = "<< Fert[i] << "\n";
+ for (j = 1 ; j <= m ; j++){
+ cerr << "A["<<j<<"] = "<< A[j] <<"\n";
+ }
+ cerr << "Condition violated : 2 * Fert[0] <= m " << 2*Fert[0] <<"?"<<
+ m << "\n";
+ } // end of added code for debugging // Yaser
+ it_st = time(NULL) ;
+
+ // Now add all neighbors of the best alignment to the collection
+ for (j = 1 ; j <= m ; j++){
+ for (j1 = j + 1 ; j1 <= m; j1++){ // all possible swaps
+ if (A[j] != A[j1]){// skip pairs that share a link: swapping them changes nothing
+ // score of the swapped alignment = best_score times the swap's scaling factor
+ score = best_score * scoreOfSwap(es, fs, A, tTable, j, j1);
+ // ADD A and its score to list of alig. to collect counts over
+ if (2 * Fert[0] <= m && score > 0){
+ /* consider alignments whose Fert[0] is at most
+ half the number of words in French sentence */
+ old_i = A[j] ; // apply the swap temporarily
+ A[j] = A[j1] ;
+ A[j1] = old_i ;
+ if (neighborhood.insert(A, score)){
+ align_total_count += score ;
+ }
+ // restore original alignment
+ old_i = A[j] ;
+ A[j] = A[j1] ;
+ A[j1] = old_i ;
+ }
+ }
+ }
+ for (i = 0 ; i <= l ; i++){ // all possible moves
+ if (i != A[j]){ // make sure not to move to same position
+ if ((Fert[i]+1 < MAX_FERTILITY) &&
+ ((i == 0 && (m >= 2*(Fert[0]+1))) || (i != 0))){
+ // consider legal alignments only
+ score = best_score * scoreOfMove(es, fs, A, Fert, tTable, j, i);
+ // ADD A and its score to list of alig. to collect counts over
+ if (score > 0){
+ old_i = A[j] ; // apply the move temporarily
+ A[j] = i ;
+ Fert[old_i]-- ;
+ Fert[i]++ ;
+ // add the moved alignment to the list of alignments here
+ if (neighborhood.insert(A, score)){
+ align_total_count += score ;
+ }
+ // now restore alignment and fertilities to previous values
+ A[j] = old_i ;
+ Fert[old_i]++ ;
+ Fert[i]-- ;
+ } // end of if (score > 0)
+ } // end of if (i == 0 ...)
+ } // end of if (i != A[j])
+ }// end of for(i = 0 ; ...)
+ }// end of for (j = 1 ; ...)
+ } // end of else (best_score > 0)
+}
+
+void model3::viterbi_loop(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, // One pass over the corpus: builds an alignment neighbourhood per sentence pair, optionally collects counts over it, and records both perplexities.
+ bool dump_files, const char* alignfile, // dump_files: write the best alignment of every pair to alignfile
+ bool collect_counts, string model ) // model: label passed to errorReportAL and Perplexity::record
+{
+ WordIndex i, j, l, m ;
+ ofstream of2 ; // alignment output file, opened only when dump_files is set
+ int pair_no;
+ LogProb temp;
+
+ if (dump_files)
+ of2.open(alignfile);
+ pair_no = 0 ; // sentence pair number
+ // for each sentence pair in the corpus
+ perp.clear() ; // clears cross_entrop & perplexity
+ viterbiPerp.clear();
+ sentPair sent ;
+ while(sHandler1.getNextSentence(sent)){
+ Vector<WordIndex>& es = sent.eSent;
+ Vector<WordIndex>& fs = sent.fSent;
+ const float count = sent.getCount();
+ if ((sent.sentenceNo % 1000) == 0) // progress indicator
+ cerr <<sent.sentenceNo << '\n';
+ time_t sent_s = time(NULL) ; // start time of this pair, for the timing message at the end of the loop body
+ pair_no++ ;
+ l = es.size() - 1 ;
+ m = fs.size() - 1 ;
+ if (Log){
+ logmsg << "Processing sentence pair:\n\t";
+ printSentencePair(es, fs, logmsg);
+ for (i = 0 ; i <= l ; i++)
+ logmsg << Elist.getVocabList()[es[i]].word << " ";
+ logmsg << "\n\t";
+ for (j = 1 ; j <= m ; j++)
+ logmsg << Flist.getVocabList()[fs[j]].word << " ";
+ logmsg << "\n";
+ }
+
+ LogProb align_total_count=0; // sum of the scores of all alignments put into neighborhood
+ // LogProb best_score;
+
+ Vector<WordIndex> viterbi_alignment;
+ LogProb viterbi_score ;
+ alignmodel neighborhood;
+ neighborhood.clear();
+ align_total_count = 0;
+ findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count, p0_count,*/ es, fs, align_total_count, neighborhood) ;
+ if (Peg){ // pegging: add one more neighbourhood per (i, j) link whose t, a and d values all exceed PROB_SMOOTH
+ for (i = 0 ; i <= l ; i++)
+ for (j = 1 ; j <= m ; j++){
+ if ( (tTable.getProb(es[i], fs[j]) > PROB_SMOOTH) &&
+ (aTable.getValue(i, j, l, m) > PROB_SMOOTH) &&
+ (dTable.getValue(j, i, l, m) > PROB_SMOOTH))
+ findAlignmentsNeighborhood(/*tTable, aTable,*/ /*p1_count,
+ p0_count, */ es, fs, align_total_count, neighborhood, i, j);
+ }
+ }
+ // Now Collect counts over saved neighborhoods
+ viterbi_score = 0 ;
+ if (Verbose)
+ cerr << "\nCollecting counts over found alignments, total prob: "
+ << align_total_count << "\n";
+ if (Log)
+ logmsg << "\nCollecting counts over found alignments, total prob: "
+ << align_total_count << "\n";
+ hash_map<Vector<WordIndex>, LogProb, hashmyalignment, equal_to_myalignment >::iterator align ;
+ int acount = 0 ; // number of alignments counts were collected over
+ if (align_total_count == 0 ){
+ cerr << " WARNINIG: For the following sentence pair : \n";
+ printSentencePair(es, fs, cerr);
+ cerr << "The collection of alignments found have 0 probability!!\n";
+ cerr << "No counts will be collected of it \n";
+ if (Log){
+ logmsg << "The collection of alignments found have 0 probability!!\n";
+ logmsg << "No counts will be collected of it \n";
+ }
+ }
+ else {
+ if (collect_counts) {
+ for(align = neighborhood.begin(); align != neighborhood.end(); align++){
+ temp = (*align).second/align_total_count ; // score normalized over the neighbourhood
+ collectCountsOverAlignement(/*tTable, aCountTable, */es, fs, /*p1_count,
+ p0_count ,*/ ((*align).first), temp , count);
+ acount++;
+ if (viterbi_score < temp){ // remember the highest-scoring alignment of the neighbourhood
+ viterbi_alignment = ((*align).first);
+ viterbi_score = temp;
+ }
+ }
+ } // end of if (collect_counts)
+ perp.addFactor(log(double(align_total_count)), count, l, m,0);
+ viterbiPerp.addFactor(log(double(viterbi_score)), count, l, m,0); // NOTE(review): viterbi_score is still 0 here when collect_counts is false, so this passes log(0) -- confirm that is intended
+
+ if (Verbose){
+ cerr << "Collected counts over "<<acount <<" (of "
+ << pow(double(l+1), double(m)) <<") differnet alignments\n"; // fixed: the alignment space is (l+1)^m (m positions, l+1 choices each), not m^(l+1)
+ cerr << "Bucket count of alignments hash: "<<
+ neighborhood.getHash().bucket_count()<< ", size " <<
+ neighborhood.getHash().size() << "\n";
+ }
+ if (Log){
+ logmsg << "Collected counts over "<<acount <<" (of "
+ << pow(double(l+1), double(m)) <<") differnet alignments\n"; // fixed as above: (l+1)^m
+ logmsg << "Bucket count of alignments hash: "<<
+ neighborhood.getHash().bucket_count()<< "\n";
+ }
+ } // end of else
+ // write best alignment (viterbi) for this sentence pair to alignment file
+ if (collect_counts){
+ if (viterbi_score <= 0){
+ cerr << "Viterbi Alignment for this pair have score zero!!\n";
+ of2 << "\n\n"; // written even when dump_files is false, in which case of2 was never opened
+ }
+ else {
+ if (dump_files)
+ printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, pair_no, viterbi_score);
+ addAL(viterbi_alignment,sent.sentenceNo,l);
+ }
+ } // end of if (collect_counts)
+ double period = difftime(time(NULL), sent_s);
+ if (Log)
+ logmsg << "processing this sentence pair ("<<l+1<<"x"<<m<<") : "<<
+ (l+1)*m << " took : " << period << " seconds\n";
+ if (Verbose)
+ cerr << "processing this sentence pair took : " << period
+ << " seconds\n";
+
+ } /* of sentence pair E, F */
+ sHandler1.rewind(); // leave the handler at the start of the corpus
+ errorReportAL(cerr,model);
+ perp.record(model);
+ viterbiPerp.record(model);
+ if (dump_files)
+ of2.close();
+
+}
diff --git a/GIZA++-v2/model3_viterbi_with_tricks.cpp b/GIZA++-v2/model3_viterbi_with_tricks.cpp
new file mode 100644
index 0000000..1bfb07f
--- /dev/null
+++ b/GIZA++-v2/model3_viterbi_with_tricks.cpp
@@ -0,0 +1,690 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "mystl.h"
+#include "model3.h"
+#include "collCounts.h"
+#include "utility.h"
+#include "Globals.h"
+#include "D5Tables.h"
+#include "transpair_model5.h"
+#include "transpair_modelhmm.h"
+#include "myassert.h"
+#include "Parameter.h"
+
+ GLOBAL_PARAMETER(float,PrintN,"nbestalignments","for printing the n best alignments",PARLEV_OUTPUT,0);
+
+ const short LogHillClimb=0,LogPeg=0; // debug switches for the hill-climbing / pegging trace output in this file; both off
+ const short UseHMMViterbiAlignmentIfPossible=1; // viterbi_model2 takes the HMM path when this is set and `h` is non-null
+ short DoViterbiTraining=0;
+
+ GLOBAL_PARAMETER(int,VerboseSentence,"VerboseSentence","number of sentence for which a lot of information should be printed (negative: no output)",PARLEV_OUTPUT,-10);
+ GLOBAL_PARAMETER(double,PEGGED_CUTOFF,"PEGGED_CUTOFF","relative cutoff probability for alignment-centers in pegging",PARLEV_OPTHEUR,3e-2);
+ GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF_AL,"COUNTINCREASE CUTOFF AL","countCutoffAl","Counts increment cutoff threshold for alignments in training of fertility models",PARLEV_OPTHEUR,1e-5);
+
+ int SentNr; // set to sent.sentenceNo for each pair processed by viterbi_loop_with_tricks
+ bool UseLinkCache=1; /// optimization for pegging
+ int NumberOfAlignmentsInSophisticatedCountCollection; // reset at the start of viterbi_loop_with_tricks and reported there as a per-pair average
+
+ extern bool ONLYALDUMPS;
+
+ int PrintHillClimbWarning=0; // counter used by hillClimb_std to cap how many hill-climbing warnings are printed
+ int PrintZeroScoreWarning=0; // counter used by viterbi_loop_with_tricks to cap how many zero-score warnings are printed
+
+
+ LogProb model3::viterbi_model2(const transpair_modelhmm&ef, alignment&output, int
+ #ifdef STORE_HMM_ALIGNMENTS
+ pair_no
+ #endif
+ , int i_peg , int j_peg )const // the third parameter is only named (pair_no) when STORE_HMM_ALIGNMENTS is defined
+ { // HMM path: fills `output` from HMMRealViterbi and returns its result times ef.net->finalMultiply; falls back to _viterbi_model2 when `output` is not valid
+ static Vector<pair<alignment,LogProb> > viterbis; // per-pair result cache; only read/written when STORE_HMM_ALIGNMENTS is defined
+ Vector<int>vit; // filled by HMMRealViterbi; read below as vit[j-1] for target position j
+ int m=ef.get_m();
+ int l=ef.get_l();
+ double ret=0.0;
+ //#define STORE_HMM_ALIGNMENTS
+ #ifdef STORE_HMM_ALIGNMENTS
+ if( i_peg==-1 && j_peg==-1 && viterbis.size()>pair_no ) // unpegged request that is already cached for this pair
+ {
+ output=viterbis[pair_no].first;
+ ret=viterbis[pair_no].second;
+ massert( ret==HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply );
+ }
+ else
+ {
+ ret=HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply;
+ for(int j=1;j<=m;j++)
+ {
+ if( vit[j-1]+1>l ) // values above l are stored as position 0 (presumably the empty word -- TODO confirm)
+ output.set(j,0);
+ else
+ output.set(j,vit[j-1]+1);
+ massert( (j==j_peg&&int(output(j))==i_peg) || j_peg!=j);
+ }
+ if( i_peg==-1 && j_peg==-1 ) // only unpegged results are cached
+ {
+ iassert(viterbis.size()==pair_no);
+ viterbis.push_back(make_pair(output,ret));
+ }
+ }
+ #else
+ ret=HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply; // peg indices are passed shifted down by one
+ for(int j=1;j<=m;j++)
+ {
+ if( vit[j-1]+1>l ) // values above l are stored as position 0 (presumably the empty word -- TODO confirm)
+ output.set(j,0);
+ else
+ output.set(j,vit[j-1]+1); // shift the HMM value back up by one
+ massert( (j==j_peg&&int(output(j))==i_peg) || j_peg!=j);
+ }
+ #endif
+ massert( j_peg==-1 || int(output(j_peg))==i_peg );
+ if( j_peg!=-1 )
+ massert(int(output(j_peg))==i_peg);
+ if( output.valid() )
+ return ret;
+ else
+ {
+ return _viterbi_model2(ef,output,i_peg,j_peg); // HMM result rejected by alignment::valid(): recompute with the greedy search
+ }
+ }
+
+ LogProb model3::_viterbi_model2(const transpair_model2&ef, alignment&output, int i_peg, int j_peg)const
+ { // Greedy search: every target position j (except j_peg) is linked to the source position i with the largest get_t(i,j)*get_a(i,j) that passes the fertility limits; returns the product of the chosen scores
+ WordIndex best_i=0;
+ LogProb ss=1; // running product of the per-position best scores
+ PositionIndex l = ef.get_l(), m=ef.get_m();
+ Vector<WordIndex> Fert(l+1, (WordIndex)0); // number of target positions assigned to each source position so far
+ if ((j_peg != -1) && (i_peg != -1)) // a pegged link is fixed first and contributes its own score
+ {
+ output.set(j_peg, i_peg);
+ ss *= ef.get_t(i_peg, j_peg) * ef.get_a(i_peg, j_peg);
+ if( ss==0 )
+ cerr << "WARNING: already starting is zero: " << ef.get_t(i_peg, j_peg) << " " << ef.get_a(i_peg, j_peg) << '\n';
+ }
+ else
+ ss=1;
+ for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg)
+ {
+ LogProb score = 0 ;
+ for (PositionIndex i = 0 ; i <= l ; i++)
+ {
+ if( Fert[i]+1<MAX_FERTILITY && (i != 0 || m>=(2 * (Fert[0] + 1)))) // fertility cap for every i; position 0 additionally needs m >= 2*(Fert[0]+1)
+ {
+ LogProb temp = ef.get_t(i, j) * ef.get_a(i, j);
+ if (temp > score )
+ {
+ best_i = i ;
+ score = temp ;
+ }
+ }
+ }
+ if (score == 0){ // no admissible i had a non-zero score: dump the candidates and leave position j unset
+ cerr << "WARNING: In searching for model2 best alignment\n";
+ cerr << "Nothing was set for target token at position j: " << j << "\n";
+ for (PositionIndex i = 0 ; i <= l ; i++){
+ cerr << "i: " << i << "ttable("<<i<<", "<<j<<") = " <<
+ ef.get_t(i, j) << " atable(" << i<<", "<<j<<", "<<
+ l<<", "<<m<<") = "<< ef.get_a(i, j) << " product " <<
+ ef.get_t(i, j) * ef.get_a(i, j) ;
+ if ((Fert[i]+1 < MAX_FERTILITY) && ((i == 0 && (m >= 2*(Fert[0]+1)))
+ || (i != 0)))
+ cerr <<"Passed fertility condition \n";
+ else
+ cerr <<"Failed fertility condition \n";
+ }
+ }
+ else
+ {
+ output.set(j, best_i);
+ Fert[best_i]++;
+ }
+ ss *= score; // a zero score for any position makes the whole product zero
+ }
+ if (ss <= 0){ // diagnostics only: repeats the search without the fertility limits and prints each per-position best
+ //cerr << ef;
+ cerr << "WARNING: Model2 viterbi alignment has zero score.\n" ;
+ cerr << "Here are the different elements that made this alignment probability zero \n";
+ cerr << "Source length " << l << " target length " << m << '\n';
+ LogProb gg=1 ; // for debugging only .....
+ for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg){
+ LogProb score = 0 ;
+ LogProb a = 0, t =0 ;
+ for (PositionIndex i = 0 ; i <= l ; i++){
+ // if( Debug_Fert[i]+1<MAX_FERTILITY && (i != 0 || m>=(2 * (Debug_Fert[0] + 1)))){
+ LogProb temp = ef.get_t(i, j) * ef.get_a(i, j);
+ if (temp > score ){
+ score = temp ;
+ best_i = i ;
+ a = ef.get_a(i, j);
+ t = ef.get_t(i, j) ;
+ }
+ // }
+ }
+ gg *= score ;
+ cerr << "best: fs[" << j << "] "<< j <<" : es[" << best_i << "] " <<
+ best_i << " , a: " << ef.get_a(best_i, j) << " t: " << t << " score " << score << " product : " << gg << " ss " <<
+ ss << '\n';
+ }
+ for(PositionIndex i = 0 ; i <= l ; i++)
+ cerr << "Fert["<<i<<"] selected " << Fert[i] << '\n';
+ }
+ massert(output.valid());
+ return ss;
+ }
+ // Start alignment for a Model-3 sentence pair.
+ // Uses the greedy _viterbi_model2 search unless `h` is non-null and
+ // UseHMMViterbiAlignmentIfPossible is set; in that case a transpair_modelhmm is built
+ // from ef.E / ef.F and the HMM overload of viterbi_model2 does the work.
+ // Returns the score reported by whichever search ran; `output` receives the alignment.
+ LogProb model3::viterbi_model2(const transpair_model3&ef, alignment&output, int pair_no,int i_peg , int j_peg )const
+ {
+   const bool useHmm = h && UseHMMViterbiAlignmentIfPossible;
+   if( !useHmm )
+     return _viterbi_model2(ef,output,i_peg,j_peg);
+
+   transpair_modelhmm hmmPair(ef.E,ef.F,tTable,aTable,dTable,nTable,0.0,0.0,h);
+   const LogProb score=viterbi_model2(hmmPair,output,pair_no,i_peg,j_peg);
+   massert(output.valid());
+   return score;
+ }
+
+ int HillClimbingSteps=0; // total number of hill-climbing rounds; reset per pass by viterbi_loop_with_tricks
+
+ // Greedy hill climbing that pre-ranks candidates with a simpler model.
+ // Each round builds a MoveSwapMatrix over TRANSPAIR::simpler_transpair_model from the
+ // current alignment, scores every admissible swap and move with it, sorts the candidates
+ // by that score (best first) and applies the first one whose gain under msc2 itself
+ // exceeds 1.0001. The search ends when a round applies nothing or after more than 50 rounds.
+ //   msc2   alignment being improved in place
+ //   j_peg  target position that must not be touched (-1: none)
+ // Returns msc2.get_ef().prob_of_target_and_alignment_given_source(msc2) for the final alignment.
+ template<class TRANSPAIR>
+ LogProb greedyClimb_WithIBM3Scoring(MoveSwapMatrix<TRANSPAIR>&msc2,int j_peg=-1)
+ {
+ PositionIndex l = msc2.get_l(), m=msc2.get_m();
+ int changed=0;
+ int iter=0;
+ bool hereVERB=0; // switched on after 30 rounds to trace the applied operations
+ do
+ {
+ MoveSwapMatrix<typename TRANSPAIR::simpler_transpair_model> msc_IBM3(msc2.get_ef(),alignment(msc2));
+ vector<pair<double,OneMoveSwap> > msvec; // (negated simple-model gain, operation); ascending sort puts the largest gain first
+ for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg)
+ {
+ WordIndex aj=msc2(j);
+ for (PositionIndex j1 = j + 1 ; j1 <= m; j1++)
+ if((aj != msc2(j1)) && (int(j1) != j_peg))
+ msvec.push_back(pair<double,OneMoveSwap>(-msc_IBM3.cswap(j,j1),OneMoveSwap(1,j,j1)));
+ for (PositionIndex i = 0 ; i <= l ; i++)
+ if(i != aj &&(i != 0 || (m >= 2 * (msc2.fert(0)+1))) && msc2.fert(i)+1<MAX_FERTILITY)
+ msvec.push_back(pair<double,OneMoveSwap>(-msc_IBM3.cmove(i,j),OneMoveSwap(2,i,j)));
+ }
+ sort(msvec.begin(),msvec.end());
+ HillClimbingSteps++;
+ int iused=-1; // index into msvec of the operation applied this round, -1 if none
+ changed=0;
+ for(unsigned int i=0;i<msvec.size()&&changed==0;++i)
+ {
+ LogProb csts;
+ const OneMoveSwap &oms=msvec[i].second;
+ if( oms.type==1&&(csts=msc2.cswap(oms.a,oms.b))>1.0001 )
+ {
+ if( hereVERB==1 )
+ cerr << "SWAP: " << csts << '\n';
+ msc2.doSwap(oms.a,oms.b);
+ changed=1;
+ iused=i;
+ break;
+ }
+ if( oms.type==2&&(csts=msc2.cmove(oms.a,oms.b))>1.0001 )
+ {
+ if( hereVERB==1 )
+ cerr << "MOVE: " << csts << '\n';
+ msc2.doMove(oms.a,oms.b);
+ changed=1;
+ iused=i;
+ break;
+ }
+ }
+ if( ++iter>30 )
+ {
+ //msc2.ef.verboseTP=1;
+ hereVERB=1;
+ cerr << "ERROR: more than 30 iterations in hill-climbing: " << iused;
+ // iused stays -1 when this round applied nothing; msvec must not be indexed with it.
+ if( iused>=0 )
+ cerr << " improvement: " << msvec[iused].first << " value:" << msvec[iused].second;
+ cerr << '\n' << msc2 << '\n';
+ // Dump at most the 20 best-ranked candidates; there may be fewer than 20.
+ for(unsigned int a=0;a<20&&a<msvec.size();++a)
+ cout << a << ' ' << msvec[a].first << ' ' << msvec[a].second << '\n';
+ //cerr << msvec << '\n';
+ }
+ if( iter>50 )
+ break;
+ } while(changed);
+ return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2);
+ }
+
+ // First-improvement hill climbing on msc2.
+ // Delegates to greedyClimb_WithIBM3Scoring when msc2.get_ef().greedyHillClimbing()==1.
+ // Otherwise it sweeps all target positions repeatedly, applying every swap or move whose
+ // gain (cswap / cmove) is above 1.0 as soon as it is found, until a full sweep changes nothing.
+ //   j_peg  target position that is left untouched (-1: none)
+ // Returns prob_of_target_and_alignment_given_source for the final alignment.
+ template<class TRANSPAIR>
+ LogProb greedyClimb(MoveSwapMatrix<TRANSPAIR>&msc2, int j_peg = -1)
+ {
+   if( msc2.get_ef().greedyHillClimbing()==1 )
+     return greedyClimb_WithIBM3Scoring(msc2,j_peg);
+
+   PositionIndex l = msc2.get_l(), m=msc2.get_m();
+   bool improved=false;
+   do
+   {
+     HillClimbingSteps++;
+     improved=false;
+     for (PositionIndex j = 1 ; j <= m ; j++)
+     {
+       if (int(j) == j_peg)
+         continue;
+       // Read once per j: the checks below keep using this value even after a swap or move of j.
+       WordIndex aj=msc2(j);
+       for (PositionIndex j1 = j + 1 ; j1 <= m; j1++)
+       {
+         if( (aj != msc2(j1)) && (int(j1) != j_peg) )
+           if( msc2.cswap(j, j1) > 1.0 )
+           {
+             msc2.doSwap(j, j1);
+             improved=true;
+           }
+       }
+       for (PositionIndex i = 0 ; i <= l ; i++)
+       {
+         if( i != aj && (i != 0 || (m >= 2 * (msc2.fert(0)+1))) )
+           if( msc2.fert(i)+1<MAX_FERTILITY && msc2.cmove(i, j)>1.0 )
+           {
+             msc2.doMove(i, j);
+             improved=true;
+           }
+       }
+     }
+   } while (improved);
+   return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2);
+ }
+
+ // Best-improvement hill climbing on msc2.
+ // Delegates to greedyClimb_WithIBM3Scoring when msc2.isLazy(). Otherwise each round
+ // evaluates every admissible swap and move, remembers the one with the largest gain
+ // above 1.00001 and applies only that one. The loop ends when a round finds no such
+ // operation, or after more than 60 rounds.
+ //   (unnamed int)  second parameter is accepted but not used
+ //   j_peg          target position that is left untouched (-1: none)
+ // Returns prob_of_target_and_alignment_given_source for the final alignment.
+ template<class TRANSPAIR>
+ LogProb hillClimb_std(MoveSwapMatrix<TRANSPAIR>&msc2, int= -1,int j_peg = -1)
+ {
+ if( msc2.isLazy() )
+ return greedyClimb_WithIBM3Scoring(msc2,j_peg);
+ if( LogHillClimb>1 )
+ cout << msc2 << '\n';
+ PositionIndex l = msc2.get_l(), m=msc2.get_m();
+ int changes=0; // number of rounds executed so far
+ int best_change_type=-1, best_change_v1=-1, best_change_v2=-1; // type: 0 none, 1 swap (v1=j, v2=j1), 2 move (v1=j, v2=i)
+ do
+ {
+ HillClimbingSteps++;
+ LogProb best_change_so_far = 1.00001 ;
+ best_change_type=0;
+ for (PositionIndex j = 1 ; j <= m ; j++)if (int(j) != j_peg)
+ {
+ WordIndex aj=msc2(j);
+ for (PositionIndex j1 = j + 1 ; j1 <= m; j1++)if((aj != msc2(j1)) && (int(j1) != j_peg))
+ {
+ LogProb change = msc2.cswap(j, j1);
+ if (change > best_change_so_far)
+ {
+ best_change_so_far = change ;
+ best_change_type=1;
+ best_change_v1=j;
+ best_change_v2=j1;
+ if( LogHillClimb )
+ cerr << "CLIMB: " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << " " << best_change_so_far << msc2 << '\n';
+ massert(msc2.get_ef().isSubOptimal()==1);
+ }
+ }
+ for (PositionIndex i = 0 ; i <= l ; i++)if(i != aj &&(i != 0 || (m >= 2 * (msc2.fert(0)+1))) && msc2.fert(i)+1<MAX_FERTILITY)
+ {
+ LogProb change = msc2.cmove(i, j);
+ if (change > best_change_so_far)
+ {
+ best_change_so_far = change ;
+ best_change_type=2;
+ best_change_v1=j;
+ best_change_v2=i;
+ if( LogHillClimb )
+ cerr << "CLIMB: " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << " " << best_change_so_far << msc2 << '\n';
+ massert(msc2.get_ef().isSubOptimal()==1);
+ }
+ }
+ }
+ if (best_change_type==1)
+ {
+ msc2.doSwap(best_change_v1, best_change_v2);
+ if( LogHillClimb )
+ cerr << "SW-CLIMB-DONE: " << j_peg << msc2 << '\n';
+ }
+ if (best_change_type==2)
+ {
+ msc2.doMove(best_change_v2, best_change_v1); // doMove takes (i, j); v2 holds i and v1 holds j
+ if( LogHillClimb )
+ cerr << "MO-CLIMB-DONE: " << j_peg << msc2 << '\n';
+ }
+ changes++;
+ if( changes>40 )
+ {
+ // Print the first 1000 warnings, then one notice that the rest are suppressed.
+ // The counter is advanced only on the branches that print, so the notice is
+ // reached exactly once (a post-increment in the test would step past 1000).
+ if( PrintHillClimbWarning<1000 )
+ {
+ ++PrintHillClimbWarning;
+ cerr << "WARNING: already " << changes << " iterations in hillclimb: " << best_change_so_far << " " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << '\n';
+ }
+ else if (PrintHillClimbWarning==1000)
+ {
+ ++PrintHillClimbWarning;
+ cerr << "ERROR: too many hill climbing warnings => I do not print more.\n";
+ }
+ }
+ if(changes>60 )
+ {
+ cerr << msc2 << '\n';
+ break;
+ }
+ } while (best_change_type);
+ return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2);
+ }
+
+ // Appends `msc` with its score to setOfGoodCenters.
+ // Before appending, makeOneMoveSwap is run against every existing center to collect a
+ // set of OneMoveSwap entries; for each entry the matching center / move / swap is
+ // marked deleted in `msc` (delCenter, delMove, delSwap). An entry whose type is not
+ // 1..4 aborts the program. Always returns true.
+ template<class MODEL_TYPE>
+ bool extendCenterList(Vector<pair<MoveSwapMatrix<MODEL_TYPE>*,LogProb> >&setOfGoodCenters,MoveSwapMatrix<MODEL_TYPE> *msc,double peggedAlignmentScore)
+ {
+   const unsigned int srcLen=msc->get_ef().get_l();
+   set<OneMoveSwap> covered;
+   for(unsigned int c=0;c<setOfGoodCenters.size();c++)
+     makeOneMoveSwap(*setOfGoodCenters[c].first,*msc,covered);
+
+   typedef set<OneMoveSwap>::const_iterator CoveredIter;
+   for(CoveredIter op=covered.begin();op!=covered.end();++op)
+   {
+     const bool isType1=(op->type==1);
+     const bool isType4=(op->type==4);
+     if( isType1||isType4 )
+       msc->delCenter();
+     if( isType1 )
+     {
+       // drop every move of position op->a to a source position it is not already linked to
+       for(unsigned int src=0;src<=srcLen;++src)
+         if( (*msc)(op->a)!=src )
+           msc->delMove(src,op->a);
+       continue;
+     }
+     if( op->type==2||isType4 )
+       msc->delSwap(op->a,op->b);
+     else if( op->type==3 )
+       msc->delMove(op->b,op->a);
+     else
+       abort();
+   }
+   setOfGoodCenters.push_back(make_pair(msc,peggedAlignmentScore));
+   return true;
+ }
+
+ bool OldLog=0;
+ short OldLogPeg=0,OldLogHillClimb=0;
+
+ // One scored n-best candidate: index `s` plus two integers `a`, `b` and a score `v`.
+ class Als
+ {
+ public:
+   int s,a,b;
+   double v;
+   Als(int _s,int _a,int _b,double _v)
+   {
+     s=_s;
+     a=_a;
+     b=_b;
+     v=_v;
+   }
+ };
+
+ // Orders candidates by descending score, so a plain sort() puts the best one first.
+ inline bool operator<(const Als&x,const Als&y)
+ {
+   return y.v<x.v;
+ }
+
+ // Runs one pass over all sentence pairs delivered by sHandler1 for model type MODEL_TYPE.
+ // Per pair: builds a start alignment with viterbi_model2, hill-climbs it when
+ // ef.isSubOptimal(), optionally adds pegged centers (global Peg), collects counts with
+ // collectCountsOverNeighborhood, feeds perp / viterbiPerp, and writes the best alignment
+ // (and optionally an n-best list) to file.
+ // Parameters:
+ //   perp, viterbiPerp  cleared at entry, updated per pair, recorded under `model` at exit
+ //   sHandler1          sentence source; rewound before returning
+ //   dump_files         enables alignment output to `alignfile` (FEWDUMPS and final&&ONLYALDUMPS do too)
+ //   alignfile          output path; n-best output goes to alignfile+"NBEST"
+ //   collect_counts     forwarded to collectCountsOverNeighborhood
+ //   model              label passed to Perplexity::record and errorReportAL
+ //   final              together with dump_files and PrintN enables the n-best alignment file
+ //   dm_in, dm_out      extra model data handed to the MODEL_TYPE constructor / the count collection
+ template<class MODEL_TYPE, class ADDITIONAL_MODEL_DATA_IN,class ADDITIONAL_MODEL_DATA_OUT>
+ void model3::viterbi_loop_with_tricks(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1,
+ bool dump_files, const char* alignfile,
+ bool collect_counts, string model, bool final,
+ ADDITIONAL_MODEL_DATA_IN*dm_in,
+ ADDITIONAL_MODEL_DATA_OUT*dm_out)
+ {
+ // NOTE(review): writeNBestErrorsFile and of3 (below) are both opened on alignfile+"NBEST";
+ // if both conditions hold they write to the same path -- confirm this is intended.
+ ofstream *writeNBestErrorsFile=0;
+ if( (dump_files||FEWDUMPS)&&PrintN&&ReferenceAlignment.size()>0 )
+ {
+ string x=alignfile+string("NBEST");
+ writeNBestErrorsFile= new ofstream(x.c_str());
+ }
+ ofstream *of3=0;
+ PositionIndex i, j, l, m ;
+ ofstream of2;
+ int pair_no;
+ HillClimbingSteps=0;
+ NumberOfAlignmentsInSophisticatedCountCollection=0;
+ if (dump_files||FEWDUMPS||(final&&(ONLYALDUMPS)) )
+ of2.open(alignfile);
+ if( dump_files&&PrintN&&final )
+ {
+ string x=alignfile+string("NBEST");
+ of3= new ofstream(x.c_str());
+ }
+ pair_no = 0 ; // sentence pair number
+ // for each sentence pair in the corpus
+ perp.clear() ; // clears cross_entrop & perplexity
+ viterbiPerp.clear() ; // clears cross_entrop & perplexity
+ sentPair sent ;
+ int NCenter=0,NHillClimbed=0,NAlignment=0,NTotal=0,NBetterByPegging=0; // pass-wide totals for the statistics printed at the end
+ while(sHandler1.getNextSentence(sent)){
+ if( sent.eSent.size()==1||sent.fSent.size()==1 ) // l or m (size-1) would be 0: skip the pair
+ continue;
+ SentNr=sent.sentenceNo;
+ Vector<WordIndex>& es = sent.eSent;
+ Vector<WordIndex>& fs = sent.fSent;
+ const float count = sent.getCount();
+ if ((sent.sentenceNo % 10000) == 0)
+ cerr <<sent.sentenceNo << '\n';
+ time_t sent_s = time(NULL) ;
+ pair_no++ ;
+ l = es.size() - 1 ;
+ m = fs.size() - 1 ;
+ if (Log){
+ logmsg << "Processing sentence pair:\n\t";
+ printSentencePair(es, fs, logmsg);
+ for (i = 0 ; i <= l ; i++)
+ logmsg << Elist.getVocabList()[es[i]].word << " ";
+ logmsg << "\n\t";
+ for (j = 1 ; j <= m ; j++)
+ logmsg << Flist.getVocabList()[fs[j]].word << " ";
+ logmsg << "\n";
+ }
+
+ LogProb align_total_count=0;
+ alignment viterbi2alignment(l,m);
+ MODEL_TYPE ef(es,fs,tTable,aTable,dTable,nTable,p1,p0,dm_in);
+ viterbi_model2(ef,viterbi2alignment,pair_no-1);
+ // setOfGoodCenters owns the MoveSwapMatrix objects it points to; they are deleted at the
+ // end of each iteration and on the early `continue` below.
+ Vector<pair<MoveSwapMatrix<MODEL_TYPE>*,LogProb> >setOfGoodCenters(1);
+ set<alignment> alignments; // alignments already present as centers, used to reject duplicates
+ MoveSwapMatrix<MODEL_TYPE> *best = (setOfGoodCenters[0].first = new MoveSwapMatrix<MODEL_TYPE>(ef, viterbi2alignment));
+ MoveSwapMatrix<MODEL_TYPE> _viterbi(*best), *viterbi=&_viterbi; // please, don't delete this line (FJO)
+ if (Log)
+ logmsg << "VITERBI: " << alignment(_viterbi);
+ if( ef.isSubOptimal() )
+ setOfGoodCenters[0].second = hillClimb_std(*best);
+ else
+ {
+ setOfGoodCenters[0].second = best->get_ef().prob_of_target_and_alignment_given_source(*best);
+ if( setOfGoodCenters[0].second==0 )
+ {
+ cerr << "PROBLEM: alignment is 0.\n";
+ best->get_ef().prob_of_target_and_alignment_given_source(*best,1);
+ }
+ }
+ int bestAlignment=0; // index into setOfGoodCenters of the best-scoring center so far
+
+
+ for(unsigned int i=0;i<setOfGoodCenters.size();++i)
+ setOfGoodCenters[i].first->check();
+ alignments.insert(*best);
+ if (setOfGoodCenters[bestAlignment].second <= 0){
+ // Print the first 100 warnings, then one notice that the rest are suppressed.
+ // The counter is advanced only on the branches that print, so the notice is
+ // reached exactly once (a post-increment in the test would step past 100).
+ if( PrintZeroScoreWarning<100 )
+ {
+ ++PrintZeroScoreWarning;
+ cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n";
+ cerr << alignment(*setOfGoodCenters[bestAlignment].first) ;
+ printSentencePair(es, fs, cerr);
+ if(Log){
+ logmsg << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n";
+ printSentencePair(es, fs, logmsg);
+ }
+ }
+ else if(PrintZeroScoreWarning==100)
+ {
+ ++PrintZeroScoreWarning;
+ cerr << "ERROR: too many zero score warnings => no additional one will be printed\n";
+ }
+ setOfGoodCenters[bestAlignment].second=1e-300;
+ // The pair is skipped from here on, so release the center(s) allocated above;
+ // the regular clean-up at the end of the loop body is not reached.
+ for(unsigned int c=0;c<setOfGoodCenters.size();c++)
+ delete setOfGoodCenters[c].first;
+ continue;
+ }
+ int nHillClimbed=1,nAlignment=1;
+ bool flagBetterByPegging=0;
+ if ( Peg )
+ {
+ const MoveSwapMatrix<MODEL_TYPE> *useMatrix=viterbi; // it is faster using 'best', ... (FJO)
+ Array2<short, vector<short> > linkCache(l+1, m+1, false); // linkCache(i,j)!=0: link (i,j) already occurs in some examined alignment
+ if(UseLinkCache)for(unsigned int j=1;j<=m;j++)linkCache((*useMatrix)(j), j)=1;
+ for(PositionIndex j=1;j<=m;j++)for(PositionIndex i=0;i<=l;i++)
+ {
+ nAlignment++;
+ // Peg link (i,j) only if it differs from the current link, is not cached, has a
+ // translation probability above PEGGED_CUTOFF times that of the current link, and
+ // (for i==0) the fertility limit of position 0 allows one more word.
+ if( i!=(*useMatrix)(j) && (UseLinkCache==0||linkCache(i,j)==0) &&
+ ef.get_t(i,j)>ef.get_t((*useMatrix)(j),j)*PEGGED_CUTOFF &&
+ (i != 0 || (m >= 2 * (useMatrix->fert(0)+1))))
+ {
+ MoveSwapMatrix<MODEL_TYPE> *BESTPEGGED=0;
+ LogProb peggedAlignmentScore;
+ nHillClimbed++;
+ if( ef.isSubOptimal() )
+ {
+ BESTPEGGED = new MoveSwapMatrix<MODEL_TYPE>(*useMatrix);
+ BESTPEGGED->doMove(i, j);
+ peggedAlignmentScore= hillClimb_std(*BESTPEGGED, i,j);
+ }
+ else
+ {
+ alignment pegAlignment(l,m);
+ peggedAlignmentScore=viterbi_model2(ef,pegAlignment,pair_no-1,i,j);
+ BESTPEGGED = new MoveSwapMatrix<MODEL_TYPE>(ef,pegAlignment);
+ massert( pegAlignment(j)==i );
+ }
+ if(UseLinkCache)
+ for(unsigned int j=1;j<=m;j++)
+ linkCache((*BESTPEGGED)(j), j)=1;
+ if( peggedAlignmentScore>setOfGoodCenters[bestAlignment].second*(LogProb)PEGGED_CUTOFF && alignments.count(*BESTPEGGED)==0 )
+ {
+ if(extendCenterList(setOfGoodCenters,BESTPEGGED,peggedAlignmentScore)) // on success BESTPEGGED is owned by setOfGoodCenters
+ {
+ alignments.insert(*BESTPEGGED);
+ if( peggedAlignmentScore>1.00001*setOfGoodCenters[bestAlignment].second )
+ {
+ if( LogPeg )
+ {
+ cerr << "found better alignment by pegging " << pair_no << " " << peggedAlignmentScore/setOfGoodCenters[bestAlignment].second << '\n';
+ cerr << "NEW BEST: " << alignment(*BESTPEGGED);
+ cerr << "OLD : " << alignment(*setOfGoodCenters[bestAlignment].first);
+ }
+ flagBetterByPegging=1;
+ bestAlignment=alignments.size()-1; // alignments and setOfGoodCenters grow in step, so this is the index of the center just added
+ }
+ }
+ assert( differences(*BESTPEGGED, *best)!=0 );
+ BESTPEGGED=0; }
+ else
+ delete BESTPEGGED;
+ }
+ }
+ } // end of if(Peg)
+ NBetterByPegging+=flagBetterByPegging;
+ for(unsigned int i=0;i<setOfGoodCenters.size();++i)
+ setOfGoodCenters[i].first->check();
+ if( LogPeg>1 )
+ cout << "PEGGED: " << setOfGoodCenters.size() << " HILLCLIMBED:" << nHillClimbed << " TOTAL:" << nAlignment << " alignments." << '\n';
+ int alTotal=collectCountsOverNeighborhood(setOfGoodCenters,es, fs, tTable, aCountTable,
+ dCountTable, nCountTable, p1_count, p0_count,
+ align_total_count, count, collect_counts, dm_out);
+ if( LogPeg>1 )
+ {
+ cout << "ALL: " << alTotal << " from " << pow(float(l+1),float(m)) << '\n';
+ massert(alTotal<=pow(double(l+1),double(m)));
+ }
+ NCenter+=setOfGoodCenters.size();NHillClimbed+=nHillClimbed;NAlignment+=nAlignment;NTotal+=alTotal;
+ perp.addFactor(log(double(align_total_count)), count, l, m,0);
+ viterbiPerp.addFactor(log(double(setOfGoodCenters[bestAlignment].second)), count, l, m,0);
+ massert(log(double(setOfGoodCenters[bestAlignment].second)) <= log(double(align_total_count)));
+ if (dump_files||(FEWDUMPS&&sent.sentenceNo<1000)||(final&&(ONLYALDUMPS)) )
+ printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, (setOfGoodCenters[bestAlignment].first)->getAlignment(), pair_no,
+ setOfGoodCenters[bestAlignment].second);
+ for(unsigned int i=0;i<setOfGoodCenters.size();++i)
+ setOfGoodCenters[i].first->check();
+ if( of3||(writeNBestErrorsFile&&pair_no<int(ReferenceAlignment.size())) )
+ {
+ // Build the n-best list: every center plus each of its non-deleted single moves and swaps.
+ // In an Als entry, a==0&&b==0 means the center itself, a<=0&&b<=0 a swap of (-a,-b),
+ // anything else a move (a,b).
+ vector<Als> als;
+ for(unsigned int s=0;s<setOfGoodCenters.size();++s)
+ {
+ const MoveSwapMatrix<MODEL_TYPE>&msc= *setOfGoodCenters[s].first;
+ msc.check();
+ double normalized_ascore=setOfGoodCenters[s].second;
+ if( !msc.isCenterDeleted() )
+ als.push_back( Als(s,0,0,normalized_ascore) );
+
+ for(WordIndex j=1;j<=m;j++)
+ for(WordIndex i=0;i<=l;i++)
+ if( i!=msc(j)&& !msc.isDelMove(i,j) )
+ als.push_back( Als(s,i,j,msc.cmove(i,j)*normalized_ascore));
+ for(PositionIndex j1=1;j1<=m;j1++)
+ for(PositionIndex j2=j1+1;j2<=m;j2++)
+ if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) )
+ als.push_back( Als(s,-j1,-j2,msc.cswap(j1,j2)*normalized_ascore));
+ }
+ sort(als.begin(),als.end()); // operator< on Als sorts by descending score
+ double sum=0,sum2=0;
+ for(unsigned int i=0;i<als.size();++i)
+ sum+=als[i].v;
+ for(unsigned int i=0;i<min((unsigned int)als.size(),(unsigned int)PrintN);++i)
+ {
+ alignment x=*setOfGoodCenters[als[i].s].first;
+ if( !(als[i].a==0 && als[i].b==0) )
+ {
+ if( als[i].a<=0&&als[i].b<=0 )
+ x.doSwap(-als[i].a,-als[i].b);
+ else
+ x.doMove(als[i].a,als[i].b);
+ }
+ if( of3&&i<PrintN )
+ printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(),*of3,x.getAlignment(), pair_no,
+ als[i].v/sum*count);
+ sum2+=als[i].v;
+ if( writeNBestErrorsFile )
+ {
+ if( pair_no<int(ReferenceAlignment.size()) )
+ {
+ int ALmissing=0,ALtoomuch=0,ALeventsMissing=0,ALeventsToomuch=0;
+ vector<double> scores;
+ ErrorsInAlignment(ReferenceAlignment[pair_no-1],x.getAlignment(),l,ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch,pair_no);
+ ef.computeScores(x,scores);
+ *writeNBestErrorsFile << ALmissing+ALtoomuch << ' ';
+ for(unsigned int i=0;i<scores.size();++i)
+ *writeNBestErrorsFile << ((scores[i]>0.0)?(-log(scores[i])):1.0e6) << ' ';
+ *writeNBestErrorsFile << '\n';
+ }
+ }
+ }
+ if( writeNBestErrorsFile )
+ *writeNBestErrorsFile << '\n';
+ }
+ addAL((setOfGoodCenters[bestAlignment].first)->getAlignment(),sent.sentenceNo,l);
+ if (Log)
+ logmsg << "processing this sentence pair ("<<l+1<<"x"<<m<<") : "<<
+ (l+1)*m << " prob : " << align_total_count << " " << (setOfGoodCenters[bestAlignment].second) << alignment(*setOfGoodCenters[bestAlignment].first) << " \n";
+ for(unsigned int i=0;i<setOfGoodCenters.size();i++)
+ delete setOfGoodCenters[i].first;
+ double period = difftime(time(NULL), sent_s);
+ if (Verbose)
+ cerr << "processing this sentence pair took : " << period
+ << " seconds\n";
+
+ } /* of sentence pair E, F */
+ sHandler1.rewind();
+ perp.record(model);
+ errorReportAL(cerr,model);
+ viterbiPerp.record(model);
+ if (dump_files||FEWDUMPS||(final&&(ONLYALDUMPS)) )
+ of2.close();
+ delete of3;
+ delete writeNBestErrorsFile;
+ double FSent=pair_no; // number of pairs that were not skipped for size; the averages below divide by it
+ cout << "#centers(pre/hillclimbed/real): " << NAlignment/FSent << " " << NHillClimbed/FSent << " " << NCenter/FSent << " #al: " << NTotal/FSent << " #alsophisticatedcountcollection: " << NumberOfAlignmentsInSophisticatedCountCollection/FSent << " #hcsteps: " << HillClimbingSteps/FSent << '\n';
+ cout << "#peggingImprovements: " << NBetterByPegging/FSent << '\n';
+ }
+
+
+
+ #include "collCounts.cpp" // a .cpp file is included here; presumably it supplies collectCountsOverNeighborhood used above -- TODO confirm
+ #define INSTANTIATE(A,B,C) template \
+ void model3::viterbi_loop_with_tricks<A,B,C>(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, \
+ bool dump_files, const char* alignfile,bool collect_counts, string, bool final,\
+ B*d4m,C*d5m); // expands to an explicit instantiation of viterbi_loop_with_tricks<A,B,C>
+
+ INSTANTIATE(transpair_model3, void, void); // one explicit instantiation per (MODEL_TYPE, dm_in type, dm_out type) combination
+ INSTANTIATE(transpair_modelhmm, const hmm, void);
+ INSTANTIATE(transpair_modelhmm, const hmm, d4model);
+ INSTANTIATE(transpair_modelhmm, const hmm, d5model);
+ INSTANTIATE(transpair_model3, void,d4model);
+ INSTANTIATE(transpair_model3, void,d5model);
+ INSTANTIATE(transpair_model4, d4model,d4model);
+ INSTANTIATE(transpair_model4, d4model,d5model);
+ INSTANTIATE(transpair_model5, d5model,d5model);
diff --git a/GIZA++-v2/myassert.cpp b/GIZA++-v2/myassert.cpp
new file mode 100644
index 0000000..2d49be8
--- /dev/null
+++ b/GIZA++-v2/myassert.cpp
@@ -0,0 +1,20 @@
+#include "mystl.h"
+#include <iostream>
+#include "myassert.h"
+
+ #ifndef STANDARD_ASSERT
+ // Reports a failed assertion: the same line is written first to cerr, then to cout.
+ // Only reports; it does not terminate the program.
+ void myerror(int line,const char *file,const char *expression)
+ {
+   ostream *const sinks[2]={&cerr,&cout};
+   for(int k=0;k<2;++k)
+     *sinks[k] << "(general.h):Assertion failed: '" << expression << "' ::: b "
+               << file << ":" << line << endl;
+ }
+
+ // Reports a failed iassert() expression on cerr. Only reports; it does not terminate the program.
+ void imyerror(int line,const char *file,const char *expression)
+ {
+   cerr << "Error: '" << expression;
+   cerr << "' ::: in Source " << file;
+   cerr << ":" << line << endl;
+ }
+
+ #endif
+
diff --git a/GIZA++-v2/myassert.h b/GIZA++-v2/myassert.h
new file mode 100644
index 0000000..b648fdd
--- /dev/null
+++ b/GIZA++-v2/myassert.h
@@ -0,0 +1,20 @@
+ #ifndef MY_ASSERT_DEFINED
+ #define MY_ASSERT_DEFINED
+ void myerror(int line,const char *file,const char *expression); // reporting helpers; the definitions in myassert.cpp print a message and return
+ void imyerror(int line,const char *file,const char *expression);
+
+ #define iassert(expression) do {if (!(expression)) {imyerror(__LINE__,__FILE__,#expression);}} while (0) // always active: reports a false expression through imyerror
+
+ #
+ #define massert(expr) do {} while(0) // disabled: expands to an empty statement, expr is never evaluated
+
+ #define vassert(expr) do {} while(0) // disabled in the same way as massert
+
+ #include <cassert>
+
+ #endif
+
+
+
+
+
diff --git a/GIZA++-v2/mymath.h b/GIZA++-v2/mymath.h
new file mode 100644
index 0000000..f8ad926
--- /dev/null
+++ b/GIZA++-v2/mymath.h
@@ -0,0 +1,9 @@
+/* ---------------------------------------------------------------- */
+/* Copyright 1998 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
+/* Franz Josef Och */
+/* ---------------------------------------------------------------- */
+ #ifndef HEADER_MYMATH_DEFINED
+ #define HEADER_MYMATH_DEFINED
+ // Absolute value of x: negates x when it compares below zero, otherwise returns it unchanged.
+ inline double mfabs(double x)
+ {
+   if( x<0 )
+     return -x;
+   return x;
+ }
+ #include <math.h>
+ #endif
diff --git a/GIZA++-v2/mystl.h b/GIZA++-v2/mystl.h
new file mode 100644
index 0000000..65c5ca1
--- /dev/null
+++ b/GIZA++-v2/mystl.h
@@ -0,0 +1,322 @@
+/* ---------------------------------------------------------------- */
+/* Copyright 1998 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
+/* Franz Josef Och */
+/* ---------------------------------------------------------------- */
+#ifndef MY_STL_H_DEFINED
+#define MY_STL_H_DEFINED
+
+#include <string>
+using namespace std;
+#ifdef USE_STLPORT
+#ifdef __STL_DEBUG
+using namespace _STLD;
+#else
+using namespace _STL;
+#endif
+#endif
+
+#include "myassert.h"
+#include <string>
+#include <utility>
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+#else
+#include <hash_map>
+#endif
+#include <iostream>
+#include "mymath.h"
+#include "Array2.h"
+
+ #define over_string(a,i) for(unsigned int i=0;i<a.length();i++) // declares i and runs it over 0 .. a.length()-1
+ #define over_array(a,i) for(i=(a).low();i<=(a).high();i++) // uses an existing i; ascending from a.low() to a.high() inclusive
+ #define backwards_array(a,i) for(i=(a).high();i>=(a).low();i--) // uses an existing i; descending from a.high() to a.low() inclusive
+ #define over_arr(a,i) for(int i=(a).low();i<=(a).high();i++) // like over_array, but declares i as int
+ #define over_arrMAX(a,i,max) for(int i=(a).low();i<=min((a).high(),max-1);i++) // like over_arr, but stops at min(a.high(), max-1)
+ #define backwards_arr(a,i) for(int i=(a).high();i>=(a).low();i--) // like backwards_array, but declares i as int
+
+ extern double n1mult,n2mult,n3mult;
+
+ // Relative frequency n1/n2 as a double.
+ // iassert reports (without stopping) when n1 is negative or n2 is not positive;
+ // a denominator of exactly 0 is replaced by 1 so the division is always defined.
+ inline double realProb(int n1,int n2)
+ {
+   massert(n1<=n2);
+   iassert(n1>=0&&n2>0);
+   const int denom=(n2==0)?1:n2;
+   return double(n1)/double(denom);
+ }
+
+ // realProb(n1,n2) scaled by n1mult, n2mult or n3mult when n1 is 1, 2 or 3;
+ // for every other n1 the unscaled value is returned.
+ inline double verfProb(int n1,int n2)
+ {
+   const double prob = realProb(n1,n2);
+   switch( n1 )
+   {
+     case 1:  return prob*n1mult;
+     case 2:  return prob*n2mult;
+     case 3:  return prob*n3mult;
+     default: return prob;
+   }
+ }
+
+ // True when y is a prefix of x, i.e. x starts with all characters of y.
+ // An empty y is a prefix of every x.
+ inline bool prefix(const string&x,const string&y)
+ {
+   return y.size()<=x.size() && x.compare(0,y.size(),y)==0;
+ }
+
+
+/*template<class T>
+int lev(const T&s1,const T&s2)
+{
+ Array2<int,vector<int> > a(s1.size()+1,s2.size()+1,1000);
+ Array2<pair<int,int>,vector<pair<int,int> > > back(s1.size()+1,s2.size()+1,pair<int,int>(0,0));
+ for(unsigned int i=0;i<=s1.size();i++)
+ for(unsigned int j=0;j<=s2.size();j++)
+ {
+ if( i==0&&j==0 )
+ a(i,j)=0;
+ else
+ {
+ int aDEL=100,aINS=100,aSUB=100;
+ if(i>0)
+ aDEL=a(i-1,j)+1;
+ if(j>0)
+ aINS=a(i,j-1)+1;
+ if(i>0&&j>0)
+ aSUB=a(i-1,j-1)+ !(s1[i-1]==s2[j-1]);
+ if( aSUB<=aDEL && aSUB<=aINS )
+ {
+ a(i,j)=aSUB;
+ back(i,j)=pair<int,int>(i-1,j-1);
+ }
+ else if( aDEL<=aSUB && aDEL<=aINS )
+ {
+ a(i,j)=aDEL;
+ back(i,j)=pair<int,int>(i-1,j);
+ }
+ else
+ {
+ a(i,j)=aINS;
+ back(i,j)=pair<int,int>(i,j-1);
+ }
+ }
+ }
+ return a(s1.size(),s2.size());
+}
+
+template<class T>
+float rel_lev(const T&s1,const T&s2)
+{
+ if( s1.size()==0 )
+ return s2.size()==0;
+ else
+ return min(1.0,lev(s1,s2)/(double)s1.size());
+}*/
+
+ // Hash of a pair: Hash(first) + 13001*Hash(second).
+ // The combination is done in unsigned arithmetic so that large component hashes wrap
+ // around instead of overflowing a signed int (undefined behaviour). On two's-complement
+ // targets the result is the same value the signed expression produced when it wrapped.
+ template<class V> int Hash(const pair<V,V>&a)
+ {
+   const unsigned int first=static_cast<unsigned int>(Hash(a.first));
+   const unsigned int second=static_cast<unsigned int>(Hash(a.second));
+   return static_cast<int>(first+13001u*second);
+ }
+
+ // Streams a pair as "(first,second)" and returns the stream.
+ template<class T1,class T2>
+ ostream& operator<<(ostream &out,const pair<T1,T2> &ir)
+ {
+   return out << "(" << ir.first << "," << ir.second << ")";
+ }
+
+ // Polynomial string hash: sum = 5*sum + character, over all characters in order.
+ // The accumulator is unsigned so that long strings wrap around instead of overflowing
+ // a signed int (undefined behaviour). Each char is widened through int first, so a
+ // negative char contributes the same value it did in the signed formulation.
+ inline int Hash(const string& s)
+ {
+   unsigned int sum=0;
+   for(string::const_iterator i=s.begin(),end=s.end();i!=end;++i)
+     sum=5u*sum+static_cast<unsigned int>(static_cast<int>(*i));
+   return static_cast<int>(sum);
+ }
+ // Plain triple of three public values a, b, c.
+ // The default constructor leaves the members default-initialised.
+ template<class A,class B,class C>
+ class tri
+ {
+ public:
+   A a;
+   B b;
+   C c;
+   tri() {}
+   tri(const A&_a,const B&_b,const C&_c)
+     : a(_a),b(_b),c(_c) {}
+ };
+
+ // Two triples are equal when all three components compare equal.
+ template<class A,class B,class C>
+ bool operator==(const tri<A,B,C>&x,const tri<A,B,C>&y)
+ {
+   if( !(x.a==y.a) )
+     return false;
+   if( !(x.b==y.b) )
+     return false;
+   return x.c==y.c;
+ }
+
+ // Lexicographic order on (a, b, c); only operator< of the component types is used.
+ template<class A,class B,class C>
+ bool operator<(const tri<A,B,C>&x,const tri<A,B,C>&y)
+ {
+   if( x.a<y.a ) return true;
+   if( y.a<x.a ) return false;
+   if( x.b<y.b ) return true;
+   if( y.b<x.b ) return false;
+   return x.c<y.c;
+ }
+
+ double used_time();
+
+ // Identity hash for ints.
+ // Declared ahead of my_hash on purpose: Hash(t) inside the template is looked up in the
+ // definition context plus argument-dependent lookup, and int has no associated namespace,
+ // so an overload declared after the template is not found by conforming compilers.
+ inline int Hash(int value) { return value; }
+
+ // Hash functor for hash_map: forwards to the Hash() overload for T.
+ template<class T>
+ class my_hash
+ {
+ public:
+   int operator()(const T&t)const {return Hash(t);}
+ };
+
+ #define MY_HASH_BASE hash_map<A,B,my_hash<A> >
+
+ // Hash-table backed associative array with a default value.
+ // Reading a missing key through the const operator[] yields the default without
+ // inserting; the non-const operator[] inserts the default first and returns a
+ // reference to the stored value.
+ // Members inherited from the dependent base MY_HASH_BASE are reached through this->
+ // (and nested types through typename), as two-phase name lookup requires.
+ template<class A,class B>
+ class leda_h_array : public MY_HASH_BASE
+ {
+ private:
+   B init; // value reported for, and inserted under, keys that are not present
+ public:
+   // Value-initialises the default, so a table of a built-in B never hands out an
+   // indeterminate value.
+   leda_h_array() : MY_HASH_BASE(),init() {}
+   // Table whose missing keys read as _init.
+   leda_h_array(const B&_init)
+     : MY_HASH_BASE(),init(_init) {}
+   // True when key a has an entry.
+   bool defined(const A&a) const
+   { return this->find(a)!=this->end(); }
+   // Read-only lookup; never inserts.
+   const B&operator[](const A&a)const
+   {
+     typename MY_HASH_BASE::const_iterator pos=this->find(a);
+     if( pos==this->end() )
+       return init;
+     else
+       return pos->second;
+   }
+   // Read-write lookup; inserts the default value for a missing key.
+   B&operator[](const A&a)
+   {
+     typename MY_HASH_BASE::iterator pos=this->find(a);
+     if( pos==this->end() )
+     {
+       this->insert(typename MY_HASH_BASE::value_type(a,init));
+       pos=this->find(a);
+       iassert(pos!=this->end());
+     }
+     return pos->second;
+   }
+   // The default value this table was constructed with.
+   const B&initValue()const
+   {return init;}
+ };
+
+ #define forall_defined_h(a,b,c,d) for(typename leda_h_array<a,b>::const_iterator __jj__=(d).begin();__jj__!=(d).end()&&((c=__jj__->first),1); ++__jj__)
+ // Prints the table as "h_array{EL:<key> INH:<value>. ...}\n"; entries after the
+ // first are put on a new line followed by one space.
+ template<class T,class U>
+ ostream & operator<<(ostream&out,const leda_h_array<T,U>&w)
+ {
+   T key;
+   bool needSeparator=false;
+   out << "h_array{";
+   forall_defined_h(T,U,key,w)
+   {
+     if( needSeparator )
+       out << "\n ";
+     out << "EL:" << key << " INH:" << w[key] << ".";
+     needSeparator=true;
+   }
+   out << "}\n";
+   return out;
+ }
+
+ // Placeholder extraction operator: reads nothing, leaves the table untouched and
+ // returns the stream as it was.
+ template<class T,class U>
+ istream & operator>>(istream&in,leda_h_array<T,U>& /*table*/)
+ { return in; }
+
+ // Two tables are equal when, for every key stored in either of them, both report an
+ // equal value (a key missing on one side is compared through that side's default).
+ template<class A,class B>
+ bool operator==(const leda_h_array<A,B>&p1,const leda_h_array<A,B>&p2)
+ {
+   A key;
+   forall_defined_h(A,B,key,p1)
+   {
+     if( !( p1[key]==p2[key]) )
+       return false;
+   }
+   forall_defined_h(A,B,key,p2)
+   {
+     if( !( p1[key]==p2[key]) )
+       return false;
+   }
+   return true;
+ }
+
+ // Number of increments needed to get from a to b (the length of the range [a,b)).
+ template<class T>
+ int count_elements(T a,T b)
+ {
+   int steps=0;
+   for(;a!=b;a++)
+     ++steps;
+   return steps;
+ }
+
+ // Normalises every increment-th element of [a,b) so that those elements sum to 1.
+ // If their sum is zero they are all set to the uniform value increment/(b-a), i.e.
+ // one over the number of visited elements. An empty range is left untouched.
+ // Returns the sum of the visited elements before normalisation.
+ // (b-a) is expected to be a multiple of increment; otherwise the loops never meet b.
+ template<class T>
+ T normalize_if_possible_with_increment(T*a,T*b,int increment)
+ {
+   T sum=0;
+   for(T*i=a;i!=b;i+=increment)
+     sum+=*i;
+   if( sum )
+     for(T*i=a;i!=b;i+=increment)
+       *i/=sum;
+   else if( a!=b ) // an empty range would otherwise divide by zero below
+   {
+     // Divide in T, not in integers: increment/(b-a) as an integer division is 0
+     // whenever more than one element is visited.
+     const T factor=T(increment)/T(b-a);
+     for(T*i=a;i!=b;i+=increment)
+       *i=factor;
+   }
+   return sum;
+ }
+
+ // Compares up to n leading elements of two cursor-like sequences.
+ // T must convert to bool (more elements available), dereference to T::value_type and
+ // support pre-increment. Returns 1 at the first position where a's element is less
+ // than b's, -1 where b's is less than a's, and 0 if no difference is found before n
+ // elements were compared or either cursor converts to false.
+ template<class T>
+ inline int m_comp_3way(T a,T b,int n)
+ {
+   for(int compared=0; compared<n && a && b; ++compared)
+   {
+     const typename T::value_type &lhs=*a;
+     const typename T::value_type &rhs=*b;
+     if( lhs<rhs )
+       return 1;
+     if( rhs<lhs )
+       return -1;
+     ++a;
+     ++b;
+   }
+   return 0;
+ }
+
+ // Interpolates every element of [a,b) with the uniform value 1/n (n = b-a):
+ // each element x becomes (1-p)*x + p/n. An empty range is left untouched.
+ template<class T>
+ void smooth_standard(T*a,T*b,double p)
+ {
+   const int n=b-a;
+   if( n!=0 )
+   {
+     const double uniformShare=p/n;
+     for(T*cur=a;cur!=b;++cur)
+       *cur = (1.0-p)*(*cur)+uniformShare;
+   }
+ }
+
+ // conv(): turns a position in a vector, or a plain pointer, into a raw element pointer.
+ // For the iterator overloads T appears only in a nested type and cannot be deduced,
+ // so callers have to name it explicitly, e.g. conv<int>(it).
+ template<class T>
+ const T *conv(typename vector<T>::const_iterator i)
+ {
+   return &*i;
+ }
+ #if __GNUC__>2
+ // Non-const counterpart; only compiled when __GNUC__ is greater than 2.
+ template<class T>
+ T *conv(typename vector<T>::iterator i)
+ {
+   return &*i;
+ }
+ #endif
+
+ /*template<class T>
+ const T *conv(const T*x)
+ {
+ return x;
+ }*/
+ // A raw pointer is returned unchanged.
+ template<class T>
+ T *conv(T*x)
+ {
+   return x;
+ }
+
+#endif
diff --git a/GIZA++-v2/parse.cpp b/GIZA++-v2/parse.cpp
new file mode 100644
index 0000000..c67a9ff
--- /dev/null
+++ b/GIZA++-v2/parse.cpp
@@ -0,0 +1,151 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+/* FJO 01/2001: completely reorganized parameter processing */
+
+#include <strstream>
+#include <string>
+#include <fstream.h>
+#include "defs.h"
+#include "utility.h"
+#include "Globals.h"
+#include "D4Tables.h"
+#include "D5Tables.h"
+#include "ATables.h"
+#include "Parameter.h"
+
+extern bool ONLYALDUMPS;
+
+void parseConfigFile (char * fname )
+  // Reads the configuration file `fname` and applies every setting it
+  // contains.  The settings are global variables that are used all over the
+  // program.  The process exits with status 1 if the file cannot be opened.
+  //
+  // FORMAT:
+  // The character '\n' separates lines.
+  // Blank lines, and lines whose first token (after skipping white space)
+  // starts with "//", are comments and are ignored.
+  // Any other line is an attribute setting.  It is divided into two halves
+  // separated by a stand-alone colon ":".  The first half is the attribute
+  // name: the concatenation of all white-space separated tokens before the
+  // colon, joined by single spaces.  The attribute value is the first token
+  // after the colon (anything after it is ignored).
+  // A line without a usable "name : value" form is re-read as
+  // "<first token> <second token>".
+  // For example, if the configuration file has the following entry:
+  //
+  // NO. ITERATIONS MODEL 2 : 10
+  //
+  // then the attribute is "NO. ITERATIONS MODEL 2" and the attribute value
+  // is "10" (these do not include the quotation marks).
+
+{
+
+  string line, word, attrib, attribval ;
+  ifstream Config_File(fname);
+  if(!Config_File){
+    cerr << "ERROR: Cannot open configuration file " << fname << "!\n" ;
+    exit(1);
+  }
+
+  cout << "The following options are from the config file and will be overwritten by any command line options.\n";
+
+  while(getline(Config_File, line)){
+
+    istrstream buffer(line.c_str());
+    word = attrib = attribval = "" ;
+    buffer >> word ;
+
+    // Empty and whitespace-only lines carry no setting; previously a
+    // whitespace-only line was reported as an unrecognized attribute.
+    if( word.empty() )
+      continue ;
+    // Comment line.  Testing the prefix (rather than equality with "//")
+    // also accepts comments written without a space, e.g. "//note".
+    if( word.compare(0, 2, "//") == 0 )
+      continue ;
+
+    attrib = word ;
+    while((buffer >> word) && (word != ":")){
+      attrib += " " + word ;
+    }
+    if(!(buffer >> attribval))
+      {
+        // No value after a colon: fall back to "<attribute> <value>".
+        istrstream buffer2(line.c_str());
+        buffer2>>attrib;
+        buffer2>>attribval;
+      }
+
+    // This is where (1) the attributes known to the configuration file are
+    // defined and (2) their values are stored.
+
+    if(attrib == "t FILE"){
+      t_Filename = attribval;
+      cout << "\tt file: " << t_Filename << '\n';
+    }
+    else if(attrib == "a FILE"){
+      a_Filename = attribval;
+      cout << "\ta file: " << a_Filename << '\n';
+    }
+    else if(attrib == "d FILE"){
+      d_Filename = attribval;
+      cout << "\td file: " << d_Filename << '\n';
+    }
+    else if(attrib == "n FILE"){
+      n_Filename = attribval;
+      cout << "\tn file: " << n_Filename << '\n';
+    }
+    else if(attrib == "p0 FILE"){
+      p0_Filename = attribval;
+      cout << "\tp0 file: " << p0_Filename << '\n';
+    }
+    // Everything else is handed to makeSetCommand; a false result is
+    // reported as an unrecognized attribute.
+    else if( !makeSetCommand(attrib,attribval,getGlobalParSet(),2) )
+      cerr << "ERROR: Unrecognized attribute :" << attrib << '\n';
+  }
+}
+
+
+// Processes the command line.
+//  - "--h" / "--help" as first argument prints the help text and exits.
+//  - If the first argument does not start with '-', it names a configuration
+//    file that is read before the remaining options.
+//  - "--name" arguments are passed to makeSetCommand as name "-name" with the
+//    value "1"; all other arguments are consumed as "name value" pairs.
+// Afterwards OPath (with a trailing '/') is prepended to Prefix and
+// LogFilename, and the resulting parameters are printed to cout.
+void parseArguments(int argc, char *argv[])
+{
+  // With no arguments argv[1] is a null pointer; show the help instead of
+  // passing it to strcmp below.
+  if( argc<2 ){
+    printHelp();
+    exit(1);
+  }
+
+  int arg = 1;
+
+  if(!strcmp(argv[1], "--h") || !strcmp(argv[1], "--help")){
+    printHelp();
+    exit(0);
+  }
+  if( argv[1][0]=='-' )
+    arg=0;                       // no config file: options start at argv[1]
+  else
+    parseConfigFile(argv[1]);
+  while(++arg<argc){
+    if( strlen(argv[arg])>2 && argv[arg][0]=='-' && argv[arg][1]=='-' )
+      {
+        if( !makeSetCommand(argv[arg]+1,"1",getGlobalParSet(),2))
+          cerr << "WARNING: ignoring unrecognized option: "<< argv[arg] << '\n' ;
+      }
+    else if( arg+1>=argc )
+      // Last argument with no value following it; this used to be dropped
+      // without any message.
+      cerr << "WARNING: ignoring option without a value: "<< argv[arg] << '\n' ;
+    else if( !makeSetCommand(argv[arg],argv[arg+1],getGlobalParSet(),2))
+      cerr << "WARNING: ignoring unrecognized option: "<< argv[arg] << '\n' ;
+    else
+      {
+        arg++;                   // skip the value that was just consumed
+      }
+  }
+  if( OPath.length() )
+    OPath+="/";
+  Prefix = (OPath + Prefix);
+  LogFilename = (OPath + LogFilename);
+  printGIZAPars(cout);
+}
+
diff --git a/GIZA++-v2/plain2snt.cpp b/GIZA++-v2/plain2snt.cpp
new file mode 100644
index 0000000..035d5fc
--- /dev/null
+++ b/GIZA++-v2/plain2snt.cpp
@@ -0,0 +1,115 @@
+#include <iostream>
+#include <string>
+#include <strstream>
+#include <fstream>
+#include <map>
+#include <vector>
+
+using namespace std;
+
+// plain2snt: converts pairs of plain-text files into GIZA++ snt-format.
+// Usage: plain2snt txt1 txt2 [txt3 txt4 -weight w]
+// Writes two vocabulary files (<name>.vcb) and two sentence files
+// (<name1>_<name2>.snt and <name2>_<name1>.snt), where the names are derived
+// from the first two input files.
+int main(int argc,char**argv)
+{
+  vector<double>weights;
+  vector<string>filenames;
+  for(int i=1;i<argc;++i)
+    {
+      if(string(argv[i])=="-weight")
+        {
+          // "-weight" needs a value; without this check a trailing
+          // "-weight" would pass the null argv[argc] to atof.
+          if( i+1>=argc )
+            {
+              cerr << "ERROR: option -weight requires a value.\n";
+              exit(1);
+            }
+          weights.push_back(atof(argv[++i]));
+        }
+      else
+        filenames.push_back(argv[i]);
+    }
+
+  if((filenames.size()%2)==1||filenames.size()==0 )
+    {
+      cerr << argv[0] << " txt1 txt2 [txt3 txt4 -weight w]\n";
+      cerr << " Converts plain text into GIZA++ snt-format.\n";
+      exit(1);
+    }
+  string line1,line2,word;
+  map<string,int> v1,v2;          // word -> frequency
+  map<string,int> id1,id2;        // word -> numeric id
+  vector<string> iid1(2),iid2(2); // id -> word; ids 0 and 1 stay unused placeholders
+
+  string w1(filenames[0]);
+  string w2(filenames[1]);
+
+  // Strip a common ".tok" or ".txt" suffix from both base names.
+  if( w1.length()>4&&w2.length()>4&&((w1.substr(w1.length()-4,w1.length())==".tok" && w2.substr(w2.length()-4,w2.length())==".tok" )||
+                                     (w1.substr(w1.length()-4,w1.length())==".txt" && w2.substr(w2.length()-4,w2.length())==".txt" ) ))
+    {
+      w1=w1.substr(0,w1.length()-4);
+      w2=w2.substr(0,w2.length()-4);
+      cerr << "w1:"<< w1 << " w2:" << w2 << endl;
+    }
+
+
+  string vocab1(w1),vocab2(w2),snt1,snt2;
+  // vocabNx is vocabN without its directory part.  When no '/' is found,
+  // rfind() returns npos and npos+1 wraps around to 0.
+  unsigned int slashpos=vocab1.rfind('/')+1;
+  if( slashpos>=vocab1.length() ) slashpos=0;
+  string vocab1x(vocab1.substr(slashpos,vocab1.length()));
+  cout << vocab1 << " -> " << vocab1x << endl;
+  slashpos=vocab2.rfind('/')+1;
+  if( slashpos>=vocab2.length() ) slashpos=0;
+  string vocab2x(vocab2.substr(slashpos,vocab2.length()));
+  cout << vocab2 << " -> " << vocab2x << endl;
+  snt1=vocab1+"_"+vocab2x+string(".snt");
+  snt2=vocab2+"_"+vocab1x+string(".snt");
+  vocab1+=string(".vcb");
+  vocab2+=string(".vcb");
+
+  ofstream ovocab1(vocab1.c_str()),ovocab2(vocab2.c_str()),osnt1(snt1.c_str()),osnt2(snt2.c_str());
+  for(unsigned int i=0;i<filenames.size();i+=2)
+    {
+      ifstream i1(filenames[i].c_str()),i2(filenames[i+1].c_str());
+      if(!i1)cerr << "WARNING: " << filenames[i] << " cannot be read.\n";
+      if(!i2)cerr << "WARNING: " << filenames[i+1] << " cannot be read.\n";
+      while(getline(i1,line1) && getline(i2,line2) )
+        {
+          vector<string> t1,t2;
+          // Tokenize both lines, counting frequencies and assigning a new id
+          // to every word seen for the first time.
+          istrstream ii1(line1.c_str());
+          while(ii1>>word)
+            {
+              t1.push_back(word);
+              v1[word]++;
+              if( id1.find(word)==id1.end() )
+                {
+                  iid1.push_back(word);
+                  id1[word]=iid1.size()-1;
+                }
+            }
+          istrstream ii2(line2.c_str());
+          while(ii2>>word)
+            {
+              t2.push_back(word);
+              v2[word]++;
+              if( id2.find(word)==id2.end() )
+                {
+                  iid2.push_back(word);
+                  id2[word]=iid2.size()-1;
+                }
+            }
+          // The k-th "-weight" value applies to the k-th file pair; default 1.
+          double w=1.0;
+          if( i/2<weights.size() )
+            w=weights[i/2];
+          if( t1.size()&&t2.size() )
+            {
+              osnt1 << w << "\n";
+              for(unsigned int j=0;j<t1.size();++j)osnt1 << id1[t1[j]] << ' ';
+              osnt1 << '\n';
+              for(unsigned int j=0;j<t2.size();++j)osnt1 << id2[t2[j]] << ' ';
+              osnt1 << '\n';
+
+              osnt2 << w << "\n";
+              for(unsigned int j=0;j<t2.size();++j)osnt2 << id2[t2[j]] << ' ';
+              osnt2 << '\n';
+              for(unsigned int j=0;j<t1.size();++j)osnt2 << id1[t1[j]] << ' ';
+              osnt2 << '\n';
+            }
+          else
+            cerr << "WARNING: filtered out empty sentence (source: " << filenames[i] << " " << t1.size() <<
+              " target: " << filenames[i+1] << " " << t2.size() << ").\n";
+        }
+    }
+
+  // Vocabulary files: "<id> <word> <frequency>" per line, starting at id 2.
+  for(unsigned int i=2;i<iid1.size();++i)
+    ovocab1 << i << ' ' << iid1[i] << ' ' << v1[iid1[i]] << '\n';
+  for(unsigned int i=2;i<iid2.size();++i)
+    ovocab2 << i << ' ' << iid2[i] << ' ' << v2[iid2[i]] << '\n';
+}
diff --git a/GIZA++-v2/reports.cpp b/GIZA++-v2/reports.cpp
new file mode 100644
index 0000000..4d5873a
--- /dev/null
+++ b/GIZA++-v2/reports.cpp
@@ -0,0 +1,211 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include <strstream>
+#include <time.h>
+#include <set>
+#include "defs.h"
+#include "vocab.h"
+#include "Perplexity.h"
+#include "getSentence.h"
+#include "TTables.h"
+#include "Globals.h"
+#include "Parameter.h"
+
+// Prints the usage text and the fixed list of "--" options to cerr, then the
+// current parameter settings to cout via printGIZAPars.
+void printHelp(void)
+{
+  static const char *const optionLines[] = {
+    "\t--v \t\t print verbose message, Warning this is not very descriptive and not systematic.\n",
+    "\t--NODUMPS \t Do not write any files to disk (This will over write dump frequency options).\n",
+    "\t--h[elp]\t\tprint this help\n",
+    "\t--p\t\tUse pegging when generating alignments for Model3 training. (Default NO PEGGING)\n",
+    "\t--st\t\tto use a fixed ditribution for the fertility parameters when tranfering from model 2 to model 3 (Default complicated estimation)\n"
+  };
+  const unsigned int lineCount = sizeof(optionLines)/sizeof(optionLines[0]);
+
+  cerr << "Usage:\n\n" << Usage << '\n';
+  cerr << "Options (these override parameters set in the config file):\n\n";
+  for (unsigned int k = 0 ; k < lineCount ; ++k)
+    cerr << optionLines[k];
+  printGIZAPars(cout);
+}
+
+
+// Writes a tab-separated perplexity table to `of`: one header line, then one
+// row per iteration index holding the train/test sizes, the index, the model
+// id taken from `trainperp`, and the four perplexities (train, test,
+// train-Viterbi, test-Viterbi).  A perplexity that was not recorded for an
+// index is written as "N/A".  The trailing bool parameter is unused.
+void generatePerplexityReport(const Perplexity& trainperp,
+                              const Perplexity& testperp,
+                              const Perplexity& trainVperp,
+                              const Perplexity& testVperp,
+                              ostream& of, int trainsize, int testsize,
+                              bool)
+{
+  // The number of rows is the largest size reported by any of the four objects.
+  const unsigned int m1 = max(trainperp.size(), testperp.size());
+  const unsigned int m2 = max(trainVperp.size(), testVperp.size());
+  const unsigned int m = max(m1,m2);
+  of << "#trnsz\ttstsz\titer\tmodel\ttrn-pp\t\ttest-pp\t\ttrn-vit-pp\t\ttst-vit-pp\n";
+  for (unsigned int i = 0 ; i <m ; i++){
+    // NOTE(review): modelid is indexed without a bounds check; this relies on
+    // trainperp having at least `m` model ids - confirm against Perplexity.
+    of << trainsize << '\t' << testsize << '\t' << i<< '\t' << trainperp.modelid[i] << '\t';
+    if (i < trainperp.perp.size())
+      of << trainperp.perp[i] << "\t\t" ;
+    else
+      of << "N/A\t\t";
+    if (i<testperp.perp.size())
+      of << testperp.perp[i] << "\t\t" ;
+    else
+      of << "N/A\t\t";
+    if (i < trainVperp.perp.size())
+      of << trainVperp.perp[i] << "\t\t" ;
+    else
+      // Two tabs, like every other cell; a single tab here shifted the last
+      // column relative to rows that do have a value.
+      of << "N/A\t\t";
+    if (i< testVperp.perp.size())
+      of << testVperp.perp[i] << '\n' ;
+    else
+      of << "N/A\n";
+  }
+}
+
+void printSentencePair(Vector<WordIndex>& es,
+                       Vector<WordIndex>& fs,
+                       ostream& of)
+
+  // Writes a sentence pair to the given output stream: a line with both
+  // lengths, then the source token ids, then the target token ids.
+  // Token ids are written, not the actual tokens; position 0 of each
+  // sentence is skipped.
+{
+  const WordIndex l = es.size() - 1;
+  const WordIndex m = fs.size() - 1;
+  of << "Source sentence length : " << l << " , target : " << m << "\n";
+
+  for (WordIndex srcPos = 1 ; srcPos <= l ; srcPos++)
+    of << es[srcPos] << ' ';
+  of << "\n";
+
+  for (WordIndex trgPos = 1 ; trgPos <= m ; trgPos++)
+    of << fs[trgPos] << ' ';
+  of << "\n";
+
+}
+
+extern short CompactAlignmentFormat;
+void printAlignToFile(const Vector<WordIndex>& es,
+                      const Vector<WordIndex>& fs,
+                      const Vector<WordEntry>& evlist,
+                      const Vector<WordEntry>& fvlist,
+                      ostream& of2,
+                      const Vector<WordIndex>& viterbi_alignment,
+                      int pair_no, double alignment_score)
+
+  // Prints the given alignment to the alignment stream `of2`.
+  //
+  // With CompactAlignmentFormat set, one line of "<source-1> <target-1>"
+  // index pairs is written, skipping target words whose alignment is 0.
+  //
+  // Otherwise the format is the one recognized by the draw-alignment tool
+  // (each line triple is one sentence pair):
+  // # sentence caption
+  // target_word_1 target_word_2 ..... target_word_m
+  // source_word_1 ({ x y z }) source_word_2 ({ }) .. source_word_n ({w})
+  // where x, y, z, and w are positions of target words that each source word
+  // is connected to.
+
+{
+  // linksOf[i] lists the target positions connected to source position i.
+  Vector<Vector<WordIndex> > linksOf(es.size());
+  const WordIndex l = es.size() - 1;
+  const WordIndex m = fs.size() - 1;
+
+  if( CompactAlignmentFormat )
+    {
+      for (WordIndex j = 1 ; j <= m ; j++)
+        {
+          if( viterbi_alignment[j] )
+            of2 << viterbi_alignment[j]-1 << ' ' << j-1 << ' ';
+        }
+      of2 << '\n';
+      return;
+    }
+
+  of2 << "# Sentence pair (" << pair_no <<") source length " << l << " target length "<< m <<
+    " alignment score : "<< alignment_score << '\n';
+
+  // Target line; the links are collected while the words are written.
+  for (WordIndex j = 1 ; j <= m ; j++)
+    {
+      of2 << fvlist[fs[j]].word << " " ;
+      linksOf[viterbi_alignment[j]].push_back(j);
+    }
+  of2 << '\n';
+
+  // Source line, starting at position 0.
+  for (WordIndex i = 0 ; i <= l ; i++)
+    {
+      of2 << evlist[es[i]].word << " ({ " ;
+      for (WordIndex k = 0 ; k < linksOf[i].size() ; k++)
+        of2 << linksOf[i][k] << " " ;
+      of2 << "}) ";
+    }
+  of2 << '\n';
+}
+
+
+// Writes statistics about how well the test corpus is covered by training:
+//  - <Prefix>.tst.trg.unk / <Prefix>.tst.src.unk list the test vocabulary
+//    entries that have a positive test frequency but no positive training
+//    frequency ("<id> <word> <test frequency>" per line);
+//  - <Prefix>.tst.stats holds the summary counts, including how many distinct
+//    (source, target) cooccurrences of the test corpus have a t-table
+//    probability above PROB_SMOOTH.
+void printOverlapReport(const tmodel<COUNT, PROB>& tTable,
+                        sentenceHandler& testHandler, vcbList& trainEList,
+                        vcbList& trainFList, vcbList& testEList, vcbList& testFList)
+{
+  set<pair<WordIndex, WordIndex> > testCoocur ;
+  sentPair s ;
+  /* string unseenCoocurFile = Prefix + ".tst.unseen.cooc" ;
+     ofstream of_unseenCoocur(unseenCoocurFile.c_str());
+
+     string seenCoocurFile = Prefix + ".tst.seen.cooc" ;
+     ofstream of_seenCoocur(seenCoocurFile.c_str());
+  */
+  testHandler.rewind();
+  int seen_coocur = 0, unseen_coocur = 0, srcUnk = 0, trgUnk = 0 ;
+  // Collect every distinct word pair that cooccurs in a test sentence pair
+  // (position 0 of each sentence is skipped).
+  while(testHandler.getNextSentence(s)){
+    for (WordIndex i = 1 ; i < s.eSent.size() ; i++)
+      for (WordIndex j = 1 ; j < s.fSent.size() ; j++)
+        testCoocur.insert(pair<WordIndex, WordIndex> (s.eSent[i], s.fSent[j])) ;
+  }
+  set<pair<WordIndex, WordIndex> >::const_iterator i ;
+  for (i = testCoocur.begin() ; i != testCoocur.end() ; ++i){
+    if (tTable.getProb((*i).first, (*i).second) > PROB_SMOOTH){
+      seen_coocur ++ ;
+      // of_seenCoocur << (*i).first << ' ' << (*i).second << '\n';
+    }
+    else {
+      unseen_coocur++;
+      // of_unseenCoocur << (*i).first << ' ' << (*i).second << '\n';
+    }
+  }
+
+  string trgUnkFile = Prefix + ".tst.trg.unk" ;
+  ofstream of_trgUnk(trgUnkFile.c_str());
+
+  for (WordIndex i = 0 ; i < testFList.getVocabList().size() && i < testFList.uniqTokens();i++){
+    // An id beyond the end of the training list cannot have been seen in
+    // training; checking the size first avoids reading past that list.
+    const bool unseenInTraining =
+      i >= trainFList.getVocabList().size() || trainFList.getVocabList()[i].freq <= 0 ;
+    if (testFList.getVocabList()[i].freq > 0 && unseenInTraining){
+      of_trgUnk << i << ' ' << testFList.getVocabList()[i].word << ' ' << testFList.getVocabList()[i].freq
+                << '\n';
+      trgUnk++ ;
+    }
+  }
+  string srcUnkFile = Prefix + ".tst.src.unk" ;
+  ofstream of_srcUnk(srcUnkFile.c_str());
+
+  for (WordIndex j = 0 ; j < testEList.getVocabList().size() && j < testEList.uniqTokens();j++){
+    const bool unseenInTraining =
+      j >= trainEList.getVocabList().size() || trainEList.getVocabList()[j].freq <= 0 ;
+    if (testEList.getVocabList()[j].freq > 0 && unseenInTraining){
+      srcUnk++ ;
+      of_srcUnk << j << ' ' << testEList.getVocabList()[j].word << ' ' << testEList.getVocabList()[j].freq
+                << '\n';
+    }
+  }
+  string summaryFile = Prefix + ".tst.stats" ;
+  ofstream of_summary(summaryFile.c_str());
+  of_summary << "\t\t STATISTICS ABOUT TEST CORPUS\n\n";
+  of_summary << "source unique tokens: " << testEList.uniqTokens() << '\n';
+  of_summary << "target unique tokens: " << testFList.uniqTokens() << '\n';
+  of_summary << "unique unseen source tokens: " << srcUnk << '\n';
+  of_summary << "unique unseen target tokens: " << trgUnk << '\n';
+  of_summary << "cooccurrences not found in the final t table: " << unseen_coocur << '\n';
+  of_summary << "cooccurrences found in the final t table: " << seen_coocur << '\n';
+
+}
+
diff --git a/GIZA++-v2/snt2cooc.cpp b/GIZA++-v2/snt2cooc.cpp
new file mode 100644
index 0000000..0a656d2
--- /dev/null
+++ b/GIZA++-v2/snt2cooc.cpp
@@ -0,0 +1,106 @@
+#include <iostream>
+#include <string>
+#include <strstream>
+#include <fstream>
+#include <map>
+#include <vector>
+#include <set>
+
+using namespace std;
+
+// Reads a vocabulary stream into `voc`, mapping the first token of each line
+// (the word id) to the second token (the word).  Id "1" is always mapped to
+// "UNK" first.  Lines with fewer than two tokens are reported on cerr and
+// skipped.
+void readVoc(istream&in,map<string,string>&voc)
+{
+  string line,s1,s2;
+  voc["1"]="UNK";
+  if( !in )cerr <<"Vocabulary does not exist.\n";
+  while(getline(in,line))
+    {
+      istrstream eingabe(line.c_str());
+      if( !(eingabe>>s1>>s2))
+        {
+          cerr << "ERROR in vocabulary '" << line << "'\n";
+          // Do not store the entry: s1/s2 would hold partial input or
+          // values left over from the previous line.
+          continue;
+        }
+      voc[s1]=s2;
+    }
+}
+
+int maxElems=0;
+
+// Prints every stored cooccurrence as "<source id> <target id>", preceded by
+// its count when `counts` is set.
+static void printCooc(const vector<map<int,int> >&vsi,bool counts)
+{
+  for(unsigned int i=0;i<vsi.size();++i)
+    for(map<int,int>::const_iterator j=vsi[i].begin();j!=vsi[i].end();++j)
+      {
+        if( counts )
+          cout << j->second << " " << i << " " << j->first << '\n';
+        else
+          cout << i << " " << j->first << '\n';
+      }
+}
+
+// snt2cooc: reads a GIZA++ snt-file and prints, for every source word id,
+// the target word ids it occurs together with in some sentence pair.  Row 0
+// collects every target word id.
+// Usage: snt2cooc vcb1 vcb2 snt12 [ -counts ]
+int main(int argc,char **argv)
+{
+  if( argc!=4&&argc!=5 )
+    {
+      cerr << "Usage: " << argv[0] << " vcb1 vcb2 snt12 [ -counts ]\n";
+      cerr << "Computes the word cooccurrences of a GIZA++ snt-file.\n";
+      exit(1);
+    }
+  bool counts=0;
+  if( argc==5 )
+    {
+      // The optional flag is argv[4]; argv[5] is the terminating null pointer.
+      // As before, an unexpected fifth argument is reported but still
+      // switches on the counts mode.
+      if(string(argv[4])!="-counts")
+        cerr << "ERROR: wrong option " << argv[4] << endl;
+      counts=1;
+      maxElems=10000000;
+    }
+  ifstream v1(argv[1]),v2(argv[2]),t(argv[3]);
+  map<string,string>voc1,voc2;
+  readVoc(v1,voc1);
+  readVoc(v2,voc2);
+  string line1,line2,line3;
+  vector<map<int,int> > vsi(voc1.size()+1000);
+  int nLine=0;
+  int totalElems=0;
+  // Each sentence pair is three lines: count, source ids, target ids.
+  while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))
+    {
+      istrstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str());
+      double count;
+      string word;
+      eingabe1>>count;
+      vector<int>l1,l2;
+      while(eingabe2>>word)
+        l1.push_back(atoi(word.c_str()));
+      while(eingabe3>>word)
+        l2.push_back(atoi(word.c_str()));
+      if( ((++nLine)%1000)==0 )
+        cerr << "line " << nLine << '\n';
+      // totalElems tracks the number of distinct stored pairs: subtract a
+      // row's size before updating it and add the new size afterwards.
+      totalElems-=vsi[0].size();
+      for(unsigned int j=0;j<l2.size();++j)
+        vsi[0][l2[j]]++;
+      totalElems+=vsi[0].size();
+      for(unsigned int i=0;i<l1.size();++i)
+        {
+          if( l1[i]<0 )
+            {
+              // A negative id cannot be used as a row index.
+              cerr << "WARNING: ignoring negative source id " << l1[i] << '\n';
+              continue;
+            }
+          if( l1[i]>=int(vsi.size()) )
+            {
+              cerr << "I have to resize: " << l1[i] << endl;
+              vsi.resize(l1[i]+1);
+            }
+          map<int,int>&theset=vsi[l1[i]];
+          totalElems-=theset.size();
+          for(unsigned int j=0;j<l2.size();++j)
+            theset[l2[j]]++;
+          totalElems+=theset.size();
+        }
+      // maxElems is only non-zero in counts mode: flush the table once it
+      // holds more than maxElems entries and start over.
+      if( totalElems>maxElems&&maxElems )
+        {
+          cerr << "INFO: print out " << totalElems << " entries.\n";
+          printCooc(vsi,counts);
+          totalElems=0;
+          vsi.clear();
+          vsi.resize(voc1.size()+1000);
+        }
+    }
+  cerr << "END.\n";
+  printCooc(vsi,counts);
+}
+
diff --git a/GIZA++-v2/snt2plain.cpp b/GIZA++-v2/snt2plain.cpp
new file mode 100644
index 0000000..7b41c46
--- /dev/null
+++ b/GIZA++-v2/snt2plain.cpp
@@ -0,0 +1,90 @@
+#include <iostream>
+#include <string>
+#include <strstream>
+#include <fstream>
+#include <map>
+#include <vector>
+
+using namespace std;
+
+// Reads a vocabulary stream into `voc`, mapping the first token of each line
+// (the word id) to the second token (the word).  Id "1" is always mapped to
+// "UNK" first.  Lines with fewer than two tokens are reported on cerr and
+// skipped.
+void readVoc(istream&in,map<string,string>&voc)
+{
+  string line,s1,s2;
+  voc["1"]="UNK";
+  if( !in )cerr <<"Vocabulary does not exist.\n";
+  while(getline(in,line))
+    {
+      istrstream eingabe(line.c_str());
+      if( !(eingabe>>s1>>s2))
+        {
+          cerr << "ERROR in vocabulary '" << line << "'\n";
+          // Do not store the entry: s1/s2 would hold partial input or
+          // values left over from the previous line.
+          continue;
+        }
+      voc[s1]=s2;
+    }
+}
+
+// snt2plain: converts a GIZA++ snt-file back into two plain-text files,
+// <output_prefix>1.txt (source side) and <output_prefix>2.txt (target side),
+// replacing word ids by the words of the two vocabulary files.  Ids missing
+// from a vocabulary are written unchanged.  With "-counts" the sentence
+// counts are printed to cout.
+int main(int argc,char **argv)
+{
+  if( argc!=5&&argc!=6 )
+    {
+      cerr << "Usage: " << argv[0] << " vcb1 vcb2 snt12 output_prefix [ -counts ]\n";
+      cerr << "Converts GIZA++ snt-format into plain text.\n";
+      exit(1);
+    }
+  bool counts=0;
+  if( argc==6 )
+    {
+      if(string(argv[5])!="-counts")
+        cerr << "ERROR: wrong option " << argv[5] << endl;
+      counts=1;
+    }
+  ifstream v1(argv[1]),v2(argv[2]),t(argv[3]);
+  string prefix(argv[4]);
+  string outfil1=prefix+"1.txt";
+  string outfil2=prefix+"2.txt";
+  ofstream out1(outfil1.c_str());
+  ofstream out2(outfil2.c_str());
+  map<string,string>voc1,voc2;
+  readVoc(v1,voc1);
+  readVoc(v2,voc2);
+  int source=0,target=0;
+  string line1,line2,line3;
+  int printed=0;   // only the first unknown-entry error is reported
+  // Each sentence pair is three lines: count, source ids, target ids.
+  while(getline(t,line1)&&getline(t,line2)&&getline(t,line3))
+    {
+      istrstream eingabe1(line1.c_str()),eingabe2(line2.c_str()),eingabe3(line3.c_str());
+      double count;
+      string word;
+      eingabe1>>count;
+      vector<string>l1,l2;
+      while(eingabe2>>word)
+        l1.push_back(word);
+      while(eingabe3>>word)
+        l2.push_back(word);
+      if( counts )
+        cout << count << '\n';
+      for(unsigned int p=0;p<l1.size();p++)
+        {
+          if(voc1.count(l1[p])==0)
+            {
+              if( printed++==0)
+                cerr << "ERROR: source vocabulary entry " << l1[p] << " unknown.\n";
+              out1 << l1[p]<<' ';
+            }
+          else
+            out1 << voc1[l1[p]] << ' ';
+          source++;
+        }
+      for(unsigned int p=0;p<l2.size();p++)
+        {
+          if(voc2.count(l2[p])==0)
+            {
+              if( printed++ ==0)
+                cerr << "ERROR: target vocabulary entry " << l2[p] << " unknown.\n";
+              out2 <<l2[p]<<' ';
+            }
+          else
+            // Only translate known ids.  Without this `else` an unknown id
+            // was written a second time as an empty word and inserted into
+            // voc2 by operator[].
+            out2 << voc2[l2[p]] << ' ';
+          target++;
+        }
+      out1<<'\n';
+      out2<<'\n';
+    }
+}
diff --git a/GIZA++-v2/trainGIZA++.sh b/GIZA++-v2/trainGIZA++.sh
new file mode 100755
index 0000000..230aa9f
--- /dev/null
+++ b/GIZA++-v2/trainGIZA++.sh
@@ -0,0 +1,19 @@
+#! /bin/csh
+# Trains word classes for both vocabularies and then runs a GIZA++ training.
+# Arguments: $1 = vcb1, $2 = vcb2, $3 = snt file.
+
+if( $# != 3 ) then
+
+ echo Usage: trainGIZA++.sh vcb1 vcb2 snt
+ echo " "
+ echo Performs a training of word classes and a standard GIZA training.
+
+else
+
+ # Convert the snt file into the plain-text files PLAIN1.txt and PLAIN2.txt.
+ snt2plain.out $1 $2 $3 PLAIN
+
+ # Run mkcls on each plain-text file (presumably -c50 requests 50 classes -
+ # confirm against the mkcls documentation); output goes to mkcls?.log and
+ # the plain-text file is removed afterwards.
+ mkcls -m2 -pPLAIN1.txt -c50 -V$1.classes opt >& mkcls1.log
+ rm PLAIN1.txt
+ mkcls -m2 -pPLAIN2.txt -c50 -V$2.classes opt >& mkcls2.log
+ rm PLAIN2.txt
+ # Run GIZA++ itself; its output is collected in GIZA++.log.
+ GIZA++ -S $1 -T $2 -C $3 -p0 0.98 -o GIZA++ >& GIZA++.log
+
+endif
diff --git a/GIZA++-v2/transpair_model1.h b/GIZA++-v2/transpair_model1.h
new file mode 100644
index 0000000..00d7875
--- /dev/null
+++ b/GIZA++-v2/transpair_model1.h
@@ -0,0 +1,108 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef transpair_model1_h_fjo_defined
+#define transpair_model1_h_fjo_defined
+//#include "logprob.h"
+#include "defs.h"
+#include "Array2.h"
+#include "defs.h"
+#include "Vector.h"
+#include "NTables.h"
+#include "ATables.h"
+#include "TTables.h"
+#include "alignment.h"
+#include <math.h>
+#include <algorithm>
+#include "Array2.h"
+#include "mystl.h"
+
+// Per-sentence-pair cache for Model 1: stores the translation probabilities
+// t(i,j) of source position i (0..l) and target position j (1..m) together
+// with copies of the two sentences.
+class transpair_model1
+{
+ public:
+  bool verboseTP;
+  Array2<PROB, Vector<PROB> > t;   // t(i,j): probability taken from the t-table
+  WordIndex l, m;                  // es.size()-1 and fs.size()-1
+  Vector<WordIndex> E,F;           // copies of the source / target sentence
+
+  void setMode(bool)
+  {}
+
+  // Fills t(i,j) from tTable for all i in 0..l and j in 1..m and reports on
+  // cerr every entry that is not >= PROB_SMOOTH.
+  transpair_model1(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, const tmodel<COUNT, PROB>&tTable)
+    : verboseTP(0),t(es.size(), fs.size()),l(es.size()-1), m(fs.size()-1),E(es),F(fs)
+  {
+    const WordIndex lastSource=es.size()-1;
+    const WordIndex lastTarget=fs.size()-1;
+    for(WordIndex i=0;i<=lastSource;i++)
+      {
+        for(WordIndex j=1;j<=lastTarget;j++)
+          {
+            t(i, j)=tTable.getProb(es[i], fs[j]);
+            if( t(i,j)>=PROB_SMOOTH )
+              continue;
+            cerr << "ERROR IN PROBABILITY: " << t(i,j) << " " << PROB_SMOOTH << endl;
+          }
+      }
+  }
+  /* transpair_model1(const Vector<WordIndex>&es, const Vector<WordIndex>&fs)
+     : verboseTP(0),t(es.size(), fs.size()),l(es.size()-1), m(fs.size()-1),E(es),F(fs)
+     {
+     WordIndex l=es.size()-1,m=fs.size()-1;
+     for(WordIndex i=0;i<=l;i++)
+     for(WordIndex j=1;j<=m;j++)
+     {
+     const string&estr=globeTrainVcbList->getVocabList()[es[i]].word;
+     const string&fstr=globfTrainVcbList->getVocabList()[fs[j]].word;
+     if( lev(estr,fstr)==0 )
+     t(i,j)=1.0;
+     else
+     t(i,j)=1/100.0;
+     massert( t(i,j)>=PROB_SMOOTH );
+     }
+     }*/
+
+  WordIndex get_l()const
+  {
+    return l;
+  }
+  WordIndex get_m()const
+  {
+    return m;
+  }
+  const PROB&get_t(WordIndex i, WordIndex j)const
+  {
+    massert( t(i,j)>=PROB_SMOOTH);
+    return t(i, j);
+  }
+  WordIndex get_es(int i)const
+  {
+    return E[i];
+  }
+  WordIndex get_fs(int j)const
+  {
+    return F[j];
+  }
+  bool greedyHillClimbing()const
+  {
+    return 0;
+  }
+  void computeScores(const alignment&,vector<double>&)const
+  {}
+
+  // Ratio of the t-probabilities after / before re-linking target position j
+  // from its current source position to new_i.
+  LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double=-1.0)const
+  {
+    const int current_i=a(j);
+    return (t(new_i, j) /t(current_i, j));
+  }
+
+  // Ratio of the t-probabilities after / before exchanging the source
+  // positions linked to target positions j1 and j2.
+  LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double=-1.0)const
+  {
+    const WordIndex src1=a(j1);
+    const WordIndex src2=a(j2);
+    return (t(src2, j1)/t(src1, j1))*(t(src1, j2)/t(src2, j2));
+  }
+
+  // Product over all target positions j of t(al(j),j)/(l+1), with l and the
+  // number of target positions taken from the alignment.
+  LogProb prob_of_target_and_alignment_given_source(const alignment&al)const
+  {
+    const int lp1=al.get_l()+1;
+    LogProb result=1.0;
+    for(unsigned int j=1;j<=al.get_m();++j)
+      result*=t(al(j),j)/lp1;
+    return result;
+  }
+};
+#endif
diff --git a/GIZA++-v2/transpair_model2.h b/GIZA++-v2/transpair_model2.h
new file mode 100644
index 0000000..f3d53e3
--- /dev/null
+++ b/GIZA++-v2/transpair_model2.h
@@ -0,0 +1,52 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef transpair_model2_defined_h
+#define transpair_model2_defined_h
+
+#include "defs.h"
+#include "Vector.h"
+#include "NTables.h"
+#include "ATables.h"
+#include "TTables.h"
+#include "alignment.h"
+#include <math.h>
+#include "transpair_model1.h"
+
+
+// Extends the Model 1 cache with the alignment probabilities a(i,j) of the
+// sentence pair, read from the given alignment table.
+class transpair_model2 : public transpair_model1
+{
+ protected:
+  Array2<PROB, Vector<PROB> > a;   // a(i,j) = aTable.getValue(i, j, l, m)
+ public:
+  transpair_model2(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, const tmodel<COUNT, PROB>&tTable,
+                   const amodel<PROB>&aTable)
+    : transpair_model1(es,fs,tTable),a(es.size(),fs.size())
+  {
+    // l and m are the members initialized by the transpair_model1 base.
+    for(WordIndex srcPos=0;srcPos<=l;srcPos++)
+      {
+        for(WordIndex trgPos=1;trgPos<=m;trgPos++)
+          {
+            a(srcPos, trgPos)=aTable.getValue(srcPos, trgPos, l, m);
+          }
+      }
+  }
+  const PROB&get_a(WordIndex i, WordIndex j)const
+  {
+    return a(i, j);
+  }
+};
+#endif
diff --git a/GIZA++-v2/transpair_model3.cpp b/GIZA++-v2/transpair_model3.cpp
new file mode 100644
index 0000000..0ab4c54
--- /dev/null
+++ b/GIZA++-v2/transpair_model3.cpp
@@ -0,0 +1,197 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/*--
+transpair_model3: representation of a translation pair for model3 training
+allowing for fast access (esp. to t table).
+
+Franz Josef Och (30/07/99)
+--*/
+#include "transpair_model3.h"
+#include <algorithm>
+
+// Builds the Model 3 cache for one sentence pair: on top of the Model 2
+// tables it stores the distortion probabilities d(i,j) and, for every source
+// position i>0, the fertility probabilities n(i,f) for f<MAX_FERTILITY.
+// n(i,MAX_FERTILITY) is set to PROB_SMOOTH.  The trailing void* is unused.
+transpair_model3::transpair_model3(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, tmodel<COUNT, PROB>&tTable, amodel<PROB>&aTable, amodel<PROB>&dTable, nmodel<PROB>&nTable, double _p1, double _p0, void*)
+  : transpair_model2(es,fs,tTable,aTable),d(es.size(), fs.size()),n(es.size(), MAX_FERTILITY+1), p0(_p0), p1(_p1)
+{
+  const WordIndex lastSource=es.size()-1;
+  const WordIndex lastTarget=fs.size()-1;
+  for(WordIndex i=0;i<=lastSource;i++)
+    {
+      for(WordIndex j=1;j<=lastTarget;j++)
+        d(i, j)=dTable.getValue(j, i, lastSource, lastTarget);
+      // Source position 0 gets no fertility entries.
+      if( !(i>0) )
+        continue;
+      for(WordIndex f=0;f<MAX_FERTILITY;f++)
+        n(i, f)=nTable.getValue(es[i], f);
+      n(i,MAX_FERTILITY)=PROB_SMOOTH;
+    }
+}
+
+// Returns the factor by which the alignment score changes when target
+// position j is re-linked from its current source position a(j) to new_i.
+// The unnamed double parameter is ignored.  The terms guarded by `forModel3`
+// (fertility counts and the distortion table d) are replaced by 1.0 when the
+// flag is false.
+LogProb transpair_model3::scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j, double,bool forModel3)const
+{
+ LogProb change;
+ const WordIndex old_i=a(j);
+ // f0: fertility of source position 0 (presumably the empty word - confirm).
+ WordIndex f0=a.fert(0);
+ if (old_i == new_i)
+ // Nothing moves, so the score is unchanged.
+ change=1.0;
+ else if (old_i == 0)
+ // Case 1: j leaves source position 0 and is linked to new_i.
+ change=((double)p0*p0/p1) *
+ (( (DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):f0)*(m-f0+1.0)) / ((m-2*f0+1)*(m-2*f0+2.0))) *
+ ((PROB)(forModel3?(a.fert(new_i)+1.0):1.0)) *
+ (get_fertility(new_i, a.fert(new_i)+1) / get_fertility(new_i, a.fert(new_i)))*
+ (t(new_i, j)/t(old_i, j))*
+ (forModel3?d(new_i, j):1.0);
+ else if (new_i == 0)
+ // Case 2: j leaves old_i and is linked to source position 0.
+ change=(double(p1) / (p0*p0)) *
+ (double((m-2*f0)*(m-2*f0-1))/( (DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):(1+f0))*(m-f0))) *
+ (forModel3?(1.0/a.fert(old_i)):1.0) *
+ (get_fertility(old_i, a.fert(old_i)-1) /get_fertility(old_i, a.fert(old_i)))*
+ (t(new_i, j) /t(old_i, j)) *
+ (forModel3?(1.0 / d(old_i, j)):1.0);
+ else
+ // Case 3: both old_i and new_i are non-zero source positions; only the
+ // fertility, translation and distortion terms of these two positions change.
+ change=(forModel3?((a.fert(new_i)+1.0)/a.fert(old_i)):1.0) *
+ (get_fertility(old_i,a.fert(old_i)-1) / get_fertility(old_i,a.fert(old_i))) *
+ (get_fertility(new_i,a.fert(new_i)+1) /get_fertility(new_i,a.fert(new_i))) *
+ (t(new_i,j)/t(old_i,j)) *
+ (forModel3?(d(new_i,j)/d(old_i,j)):1.0);
+ return change;
+}
+
+// Returns the factor by which the alignment score changes when the source
+// positions linked to target positions j1 and j2 (j1<j2) are exchanged.
+// The result is 1 when both are linked to the same source position.  The
+// distortion terms are only applied if `forModel3` is set, and only for
+// non-zero source positions.  The unnamed double parameter is ignored.
+LogProb transpair_model3::scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2, double,bool forModel3)const
+{
+  PROB ratio=1;
+  assert(j1<j2);
+  const WordIndex src1=a(j1);
+  const WordIndex src2=a(j2);
+  if (src1==src2)
+    return ratio;
+
+  ratio=(t(src2, j1)/t(src1, j1))*(t(src1, j2)/t(src2, j2));
+  if( !forModel3 )
+    return ratio;
+
+  if (src1)
+    ratio *= d(src1, j2)/d(src1, j1);
+  if (src2)
+    ratio *= d(src2, j1)/d(src2, j2);
+  return ratio;
+}
+
+// Debug dump of a transpair_model3: one "EF-I:<i>" line per source position
+// with its (t,d) pairs and, for i>0, its fertility values for
+// 1..MAX_FERTILITY-1, followed by the complete T, D, A and N tables and
+// p0, p1.
+ostream&operator<<(ostream&out, const transpair_model3&m)
+{
+  for(WordIndex srcPos=0;srcPos<=m.get_l();srcPos++)
+    {
+      out << "EF-I:"<<srcPos<<' ';
+      for(WordIndex trgPos=1;trgPos<=m.get_m();trgPos++)
+        out << "("<<m.t(srcPos,trgPos)<<","<<m.d(srcPos,trgPos)<<")";
+      if( srcPos>0 )
+        {
+          for(WordIndex fert=1;fert<MAX_FERTILITY;fert++)
+            out << "(fert:"<<m.get_fertility(srcPos,fert)<<")";
+        }
+      out << '\n';
+    }
+  out << "T:" << m.t << "D:" << m.d << "A:" << m.a << "N:" << m.n << m.p0 << m.p1 << '\n';
+  return out;
+}
+
+// Computes the move score directly: the alignment is copied, the move is
+// applied to the copy, and the probabilities of both alignments are divided.
+// Returns 1e20 if only the original probability is zero and 1.0 if both are.
+// The unnamed double parameter is ignored.
+LogProb transpair_model3::_scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double)const
+{
+  alignment moved(a);
+  moved.set(j, new_i);
+  LogProb before=prob_of_target_and_alignment_given_source(a);
+  LogProb after=prob_of_target_and_alignment_given_source(moved);
+  if( before )
+    return after/before;
+  if( after )
+    return 1e20;
+  return 1.0;
+}
+
+// Computes the swap score directly: the alignment is copied, the links of j1
+// and j2 are exchanged in the copy, and the probabilities of both alignments
+// are divided.  `thisValue` is used as the probability of `a`; it is
+// recomputed when the value is negative.
+// Returns 1e20 if only the original probability is zero and 1.0 if both are.
+LogProb transpair_model3::_scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const
+{
+  alignment swapped(a);
+  swapped.set(j1, a(j2));
+  swapped.set(j2, a(j1));
+
+  LogProb before=thisValue;
+  if( before<0.0 )
+    before=prob_of_target_and_alignment_given_source(a);
+  massert(before==prob_of_target_and_alignment_given_source(a));
+
+  LogProb after=prob_of_target_and_alignment_given_source(swapped);
+  if( before )
+    return after/before;
+  if( after )
+    return 1e20;
+  return 1.0;
+}
+
+// Computes the Model 3 score of the alignment `al` as the product of four
+// groups of factors (see the inline comments).  With `verb` set, every
+// intermediate value is traced on cerr.  A product of zero is replaced by
+// the constant 1E-299.
+LogProb transpair_model3::prob_of_target_and_alignment_given_source(const alignment&al,bool verb)const
+{
+ LogProb total = 1.0 ;
+ static const LogProb zero = 1E-299 ;
+ // (1) (1-p1)^(m-2*f0) * p1^f0, where f0 = al.fert(0).
+ total *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
+ if( verb) cerr << "IBM-3: (1-p1)^(m-2 f0)*p1^f0: " << total << '\n';
+ // (2) For i=1..f0: (m-f0-i+1) divided by i, or by
+ // max(2,m)/DeficientDistortionForEmptyWord when that setting is non-zero.
+ for (WordIndex i = 1 ; i <= al.fert(0) ; i++)
+ total *= double(m - al.fert(0) - i + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):i)) ;
+ if( verb) cerr << "IBM-3: +NULL:binomial+distortion " << total << '\n';
+ // (3) Fertility probability times fertility factorial for every source
+ // position 1..l.
+ for (WordIndex i = 1 ; i <= l ; i++)
+ {
+ total *= get_fertility(i, al.fert(i)) * (LogProb) factorial(al.fert(i));
+ if( verb) cerr << "IBM-3: fertility of " << i << " with factorial " << get_fertility(i, al.fert(i)) * (LogProb) factorial(al.fert(i)) << " -> " << total << '\n';
+ }
+ // (4) Translation probability of every target position and, for positions
+ // not linked to source position 0, the distortion probability.
+ for (WordIndex j = 1 ; j <= m ; j++)
+ {
+ total*= get_t(al(j), j) ;
+ massert( get_t(al(j), j)>=PROB_SMOOTH );
+ if( verb) cerr << "IBM-3: t of " << j << " " << al(j) << ": " << get_t(al(j), j) << " -> " << total << '\n';
+ if (al(j))
+ {
+ total *= get_d(al(j), j);
+ if( verb) cerr << "IBM-3: d of " << j << ": " << get_d(al(j), j) << " -> " << total << '\n';
+ }
+ }
+ return total?total:zero;
+}
+
+
+// Appends four partial scores of the alignment `al` to the vector `d`, in
+// this order: the p1/source-position-0 term (total1), the fertility term
+// (total2), the translation term (total3) and the distortion term (total4).
+// Note that the parameter `d` hides the member table `d` inside this
+// function; distortion values are read through get_d().
+void transpair_model3::computeScores(const alignment&al,vector<double>&d)const
+{
+ LogProb total1 = 1.0,total2=1.0,total3=1.0,total4=1.0 ;
+ // total1: (1-p1)^(m-2*f0) * p1^f0 times the per-link factors for the f0
+ // words linked to source position 0.
+ total1 *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
+ for (WordIndex i = 1 ; i <= al.fert(0) ; i++)
+ total1 *= double(m - al.fert(0) - i + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):i)) ;
+ // total2: fertility probability times fertility factorial per source position.
+ for (WordIndex i = 1 ; i <= l ; i++)
+ {
+ total2 *= get_fertility(i, al.fert(i)) * (LogProb) factorial(al.fert(i));
+ }
+ // total3 / total4: translation and distortion probabilities per target
+ // position; distortion is skipped for positions linked to source position 0.
+ for (WordIndex j = 1 ; j <= m ; j++)
+ {
+ total3*= get_t(al(j), j) ;
+ massert( get_t(al(j), j)>=PROB_SMOOTH );
+ if (al(j))
+ {
+ total4 *= get_d(al(j), j);
+ }
+ }
+ d.push_back(total1);//5
+ d.push_back(total2);//6
+ d.push_back(total3);//7
+ d.push_back(total4);//8
+}
diff --git a/GIZA++-v2/transpair_model3.h b/GIZA++-v2/transpair_model3.h
new file mode 100644
index 0000000..5fa43db
--- /dev/null
+++ b/GIZA++-v2/transpair_model3.h
@@ -0,0 +1,84 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+/*--
+transpair_model3: representation of a translation pair for model3 training
+allowing for fast access (esp. to t table).
+
+Franz Josef Och (30/07/99)
+--*/
+#ifndef transpair_model3_h_fjo_defined
+#define transpair_model3_h_fjo_defined
+#include "Array2.h"
+#include "defs.h"
+#include "Vector.h"
+#include "NTables.h"
+#include "ATables.h"
+#include "TTables.h"
+#include "alignment.h"
+#include <math.h>
+#include "transpair_model2.h"
+
+extern double factorial(int n);
+// Returns true when a and b compare equal, or when their relative deviation
+// |1 - a/b| is below 1e-10. Otherwise writes both values, their ratio and the
+// deviation to cerr and returns false.
+inline bool doubleEqual(const double a, const double b)
+{
+  if( a==b )
+    return true;
+  // Evaluate the ratio once; it feeds both the check and the diagnostic.
+  // NOTE(review): with b==0 and a!=0 the ratio is non-finite under IEEE-754
+  // arithmetic, so the tolerance check fails and the values are reported.
+  const double ratio=a/b;
+  const double deviation=1.0-ratio;
+  if( fabs(deviation)<1e-10 )
+    return true;
+  cerr << "DIFFERENT: " << a << " " << b << " " << ratio << " " << deviation << endl;
+  return false;
+}
+
+
+// Translation pair for Model 3 training. Extends transpair_model2 with the
+// d and n arrays and the p0/p1 parameters.
+class transpair_model3 : public transpair_model2
+{
+ protected:
+  Array2<PROB, Vector<PROB> > d, n;
+  PROB p0, p1;
+
+ public:
+  typedef transpair_model3 simpler_transpair_model;
+
+  transpair_model3(const Vector<WordIndex>&es, const Vector<WordIndex>&fs,
+                   tmodel<COUNT, PROB>&tTable,
+                   amodel<PROB>&aTable, amodel<PROB>&dTable, nmodel<PROB>&nTable,
+                   double _p1, double _p0, void*x=0);
+
+  // Plain element accessors.
+  const PROB&get_d(WordIndex i, WordIndex j)const
+  {
+    return d(i, j);
+  }
+  const PROB&get_a(WordIndex i, WordIndex j)const
+  {
+    return a(i, j);
+  }
+
+  // Fertilities of MAX_FERTILITY and above all map to the MAX_FERTILITY entry.
+  const PROB&get_fertility(WordIndex i, WordIndex f)const
+  {
+    massert(i>0);
+    if( f>=MAX_FERTILITY )
+      return n(i, MAX_FERTILITY);
+    return n(i, f);
+  }
+
+  int modelnr()const
+  {
+    return 3;
+  }
+
+  // Model 3 keeps no separate score for change computations; always -1.0.
+  LogProb scoreOfAlignmentForChange(const alignment&)const
+  {
+    return -1.0;
+  }
+
+  LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j, double thisValue=-1.0,bool withDistortions=1)const;
+  LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2, double thisValue=-1.0,bool withDistortions=1)const ;
+  LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
+  LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const;
+
+  friend ostream&operator<<(ostream&out, const transpair_model3&m);
+
+  LogProb prob_of_target_and_alignment_given_source(const alignment&al,bool verb=0)const;
+
+  bool isSubOptimal()const
+  {
+    return true;
+  }
+
+  void computeScores(const alignment&al,vector<double>&d)const;
+};
+#endif
diff --git a/GIZA++-v2/transpair_model4.cpp b/GIZA++-v2/transpair_model4.cpp
new file mode 100644
index 0000000..ebc2666
--- /dev/null
+++ b/GIZA++-v2/transpair_model4.cpp
@@ -0,0 +1,179 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "transpair_model4.h"
+#include "Parameter.h"
+
+GLOBAL_PARAMETER(float,d4modelsmooth_factor,"model4SmoothFactor","smooting parameter for alignment probabilities in Model 4",PARLEV_SMOOTH,0.2);
+
+// Straightforward move score: ratio of the full alignment probability after
+// re-aligning position j to new_i over the probability before (works on a copy).
+LogProb transpair_model4::_scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double)const
+{
+  LogProb before=prob_of_target_and_alignment_given_source(a);
+
+  alignment moved(a);
+  moved.set(j, new_i);
+  LogProb after=prob_of_target_and_alignment_given_source(moved);
+
+  if( before )
+  {
+    return after/before;
+  }
+  // The old alignment scored zero: report a large gain if the new one does not.
+  if( after )
+  {
+    return 1e20;
+  }
+  return 1.0;
+}
+// Straightforward swap score: ratio of the full alignment probability after
+// exchanging the alignments of j1 and j2 over the probability before (works on a copy).
+LogProb transpair_model4::_scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double)const
+{
+  LogProb before=prob_of_target_and_alignment_given_source(a);
+
+  alignment swapped(a);
+  swapped.set(j1, a(j2));
+  swapped.set(j2, a(j1));
+  LogProb after=prob_of_target_and_alignment_given_source(swapped);
+
+  if( before )
+  {
+    return after/before;
+  }
+  // The old alignment scored zero: report a large gain if the new one does not.
+  if( after )
+  {
+    return 1e20;
+  }
+  return 1.0;
+}
+// Faster counterpart of _scoreOfMove: no alignment copy is made. The alignment
+// is changed in place through a const_cast and restored before returning.
+LogProb transpair_model4::scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue)const
+{
+  const WordIndex old_i=a(j);
+  if( old_i==new_i )
+  {
+    return 1.0;
+  }
+
+  // Model 3 part of the change, with its distortion factors switched off.
+  LogProb ratio=transpair_model3::scoreOfMove(a,new_i,j,-1.0,0);
+
+  // A negative thisValue means the caller did not pass the current score.
+  LogProb a_prob=thisValue;
+  if( a_prob<0.0 )
+  {
+    a_prob=prob_of_target_and_alignment_given_source(a,2);
+  }
+  massert(a_prob==prob_of_target_and_alignment_given_source(a,2));
+
+  alignment&editable=const_cast<alignment&>(a);
+  editable.set(j,new_i);
+  LogProb scoreAfter=prob_of_target_and_alignment_given_source(a,2);
+  editable.set(j,old_i);
+
+  ratio*=scoreAfter/a_prob;
+  return ratio;
+}
+// Faster counterpart of _scoreOfSwap: the two positions are exchanged in place
+// through a const_cast and put back afterwards, so no alignment copy is made.
+LogProb transpair_model4::scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const
+{
+  const WordIndex aj1=a(j1);
+  const WordIndex aj2=a(j2);
+  if( aj1==aj2 )
+  {
+    return 1.0;
+  }
+
+  // Model 3 part of the change, with its distortion factors switched off.
+  LogProb ratio=transpair_model3::scoreOfSwap(a,j1,j2,-1.0,0);
+
+  // A negative thisValue means the caller did not pass the current score.
+  LogProb a_prob=thisValue;
+  if( a_prob<0.0 )
+  {
+    a_prob=prob_of_target_and_alignment_given_source(a,2);
+  }
+  massert(a_prob==prob_of_target_and_alignment_given_source(a,2));
+
+  alignment&editable=const_cast<alignment&>(a);
+  editable.set(j1,aj2);
+  editable.set(j2,aj1);
+  LogProb scoreAfter=prob_of_target_and_alignment_given_source(a,2);
+  editable.set(j1,aj1);
+  editable.set(j2,aj2);
+
+  if( verboseTP )
+  {
+    cerr << "scoreOfSwap: " << ratio << ' ' << a_prob << ' ' << scoreAfter << ' ' << endl;
+  }
+  ratio*=scoreAfter/a_prob;
+  if( verboseTP )
+  {
+    // Cross-check against the copy-based implementation.
+    cerr << "resulting: " << ratio << " should be " << _scoreOfSwap(a,j1,j2) << endl;
+  }
+  return ratio;
+}
+
+// Non-distortion part of the Model 4 score: empty-word term, fertilities
+// (without the factorial factor) and lexical probabilities. With `verb` set,
+// each intermediate value is traced on cerr.
+LogProb transpair_model4::prob_of_target_and_alignment_given_source_1(const alignment&al,bool verb)const
+{
+  LogProb score = 1.0 ;
+
+  score *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
+  if( verb)
+  {
+    cerr << "IBM-4: (1-p1)^(m-2 f0)*p1^f0: " << score << endl;
+  }
+
+  for (WordIndex k = 1 ; k <= al.fert(0) ; ++k)
+  {
+    score *= double(m - al.fert(0) - k + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):k)) ;
+  }
+  if( verb)
+  {
+    cerr << "IBM-4: +NULL:binomial+distortion " << score << endl;
+  }
+
+  for (WordIndex ePos = 1 ; ePos <= l ; ++ePos)
+  {
+    // Unlike the Model 3 score, no factorial(al.fert(ePos)) factor is applied here.
+    score *= get_fertility(ePos, al.fert(ePos));
+    if( verb)
+    {
+      cerr << "IBM-4: fertility of " << ePos << " " << get_fertility(ePos, al.fert(ePos)) << " -> " << score << endl;
+    }
+  }
+
+  for (WordIndex fPos = 1 ; fPos <= m ; ++fPos)
+  {
+    score*= get_t(al(fPos), fPos) ;
+    if( verb)
+    {
+      cerr << "IBM-4: t of j:" << fPos << " i:" << al(fPos) << ": " << get_t(al(fPos), fPos) << " -> " << score << endl;
+    }
+  }
+  return score;
+}
+
+// Model 4 score of alignment `al`. Bit 1 of distortionType selects the
+// non-distortion part (prob_of_target_and_alignment_given_source_1), bit 2 the
+// distortion part. A zero result is replaced by 1E-299.
+LogProb transpair_model4::prob_of_target_and_alignment_given_source(const alignment&al, short distortionType,bool verb)const
+{
+  LogProb total = 1.0 ;
+  static const LogProb almostZero = 1E-299 ;
+
+  if( distortionType&1 )
+  {
+    total *= prob_of_target_and_alignment_given_source_1(al,verb);
+  }
+
+  if( distortionType&2 )
+  {
+    for(WordIndex j=1;j<=m;j++)
+    {
+      // Positions aligned to 0 carry no distortion factor.
+      if( !al(j) )
+      {
+        continue;
+      }
+      if( al.get_head(al(j))==j )
+      {
+        // j is the head of its cept: use the table for the previous cept.
+        int ep=al.prev_cept(al(j));
+        float x2=probFirst[ep](j,al.get_center(ep));
+        massert(x2<=1.0);
+        total*=x2;
+        if( verb)
+        {
+          cerr << "IBM-4: d=1 of " << j << ": " << x2 << " -> " << total << endl;
+        }
+      }
+      else
+      {
+        // Later word of a cept: depends on the previous position in that cept.
+        float x2=probSecond(j,al.prev_in_cept(j));
+        massert(x2<=1.0);
+        total*=x2;
+        if( verb)
+        {
+          cerr << "IBM-4: d>1 of " << j << ": " << x2 << " -> " << total << endl;
+        }
+      }
+    }
+  }
+
+  if( total )
+  {
+    return total;
+  }
+  return almostZero;
+}
+
+// Appends four partial Model 4 scores of alignment `al` to `d`, in this order:
+// empty-word term, fertility term (no factorial factor), lexical term,
+// distortion term.
+void transpair_model4::computeScores(const alignment&al,vector<double>&d)const
+{
+  LogProb emptyWordScore = 1.0;
+  LogProb fertilityScore = 1.0;
+  LogProb lexicalScore = 1.0;
+  LogProb distortionScore = 1.0;
+
+  emptyWordScore *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
+  for (WordIndex k = 1 ; k <= al.fert(0) ; ++k)
+  {
+    emptyWordScore *= double(m - al.fert(0) - k + 1) / (double(DeficientDistortionForEmptyWord?(max(2,int(m))/DeficientDistortionForEmptyWord):k)) ;
+  }
+
+  for (WordIndex ePos = 1 ; ePos <= l ; ++ePos)
+  {
+    fertilityScore *= get_fertility(ePos, al.fert(ePos));
+  }
+
+  for (WordIndex fPos = 1 ; fPos <= m ; ++fPos)
+  {
+    lexicalScore *= get_t(al(fPos), fPos) ;
+  }
+
+  for(WordIndex fPos=1;fPos<=m;++fPos)
+  {
+    // Positions aligned to 0 carry no distortion factor.
+    if( !al(fPos) )
+    {
+      continue;
+    }
+    if( al.get_head(al(fPos))==fPos )
+    {
+      // Head of its cept: use the table for the previous cept.
+      int ep=al.prev_cept(al(fPos));
+      float factor=probFirst[ep](fPos,al.get_center(ep));
+      distortionScore*=factor;
+    }
+    else
+    {
+      float factor=probSecond(fPos,al.prev_in_cept(fPos));
+      distortionScore*=factor;
+    }
+  }
+
+  d.push_back(emptyWordScore);//9
+  d.push_back(fertilityScore);//10
+  d.push_back(lexicalScore);//11
+  d.push_back(distortionScore);//12
+}
diff --git a/GIZA++-v2/transpair_model4.h b/GIZA++-v2/transpair_model4.h
new file mode 100644
index 0000000..730fbe7
--- /dev/null
+++ b/GIZA++-v2/transpair_model4.h
@@ -0,0 +1,79 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef transpair_model4_h_fjo_defined
+#define transpair_model4_h_fjo_defined
+#include "Array2.h"
+#include "defs.h"
+#include "Vector.h"
+#include "NTables.h"
+#include "ATables.h"
+#include "TTables.h"
+#include "alignment.h"
+#include "D4Tables.h"
+#include "transpair_model3.h"
+
+extern double factorial(int n);
+
+// Translation pair for Model 4. Extends transpair_model3 with two caches of
+// distortion probabilities that the constructor fills from a d4model.
+class transpair_model4 : public transpair_model3
+{
+ private:
+ d4model&d4m;
+ // probSecond(j1,j2): d4m.getProb_bigger value, filled only for j2<j1.
+ Array2<double> probSecond;
+ // probFirst[i](j1,j2): d4m.getProb_first value for position i of es (0..l).
+ Vector<Array2<double> > probFirst;
+ public:
+ typedef transpair_model3 simpler_transpair_model;
+ // Builds the Model 3 base and precomputes both caches. _d4m is dereferenced
+ // unconditionally, so it must not be null.
+ transpair_model4(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, tmodel<COUNT, PROB>&tTable, amodel<PROB>&aTable, amodel<PROB>&dTable, nmodel<PROB>&nTable, double _p1, double _p0,d4model*_d4m)
+ : transpair_model3(es, fs, tTable, aTable, dTable, nTable, _p1, _p0),
+ d4m(*_d4m),probSecond(m+1,m+1,0.0),probFirst(l+1)
+ {
+ // Entries with j2>=j1 (and row/column 0) keep their initial value 0.0.
+ for(unsigned int j1=1;j1<=m;++j1)
+ for(unsigned int j2=1;j2<j1;++j2)
+ {
+ probSecond(j1,j2)=d4m.getProb_bigger(j1,j2,0,d4m.fwordclasses.getClass(get_fs(j1)),l,m);
+ }
+ for(unsigned int i=0;i<=l;++i)
+ {
+ // Replace the slot with a zero-filled (m+1)x(m+1) array, then fill it.
+ Array2<double> &pf=probFirst[i]=Array2<double>(m+1,m+1,0.0);
+ for(unsigned int j1=1;j1<=m;++j1)
+ {
+ // One iterator lookup per (i,j1) word-class pair, reused for every j2.
+ map<m4_key,d4model::Vpff,compare1 >::const_iterator ci=d4m.getProb_first_iterator(d4m.ewordclasses.getClass(get_es(i)),d4m.fwordclasses.getClass(get_fs(j1)),l,m);
+ for(unsigned int j2=0;j2<=m;++j2)
+ {
+ pf(j1,j2)=d4m.getProb_first_withiterator(j1,j2,m,ci);
+ // Cross-check the iterator-based value against the direct lookup.
+ massert(pf(j1,j2)==d4m.getProb_first(j1,j2,d4m.ewordclasses.getClass(get_es(i)),d4m.fwordclasses.getClass(get_fs(j1)),l,m));
+ }
+ }
+ }
+ }
+ LogProb prob_of_target_and_alignment_given_source_1(const alignment&al,bool verb)const;
+ // Score used as the baseline for move/swap changes: distortion part only
+ // (distortionType 2).
+ LogProb scoreOfAlignmentForChange(const alignment&a)const
+ {return prob_of_target_and_alignment_given_source(a,2); }
+ LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
+ LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ;
+ LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
+ LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ;
+ int modelnr()const{return 4;}
+ // distortionType is a bit mask; the default 3 selects both score parts.
+ LogProb prob_of_target_and_alignment_given_source(const alignment&al, short distortionType=3,bool verb=0)const;
+ void computeScores(const alignment&al,vector<double>&d)const;
+};
+#endif
diff --git a/GIZA++-v2/transpair_model5.cpp b/GIZA++-v2/transpair_model5.cpp
new file mode 100644
index 0000000..7baa5ca
--- /dev/null
+++ b/GIZA++-v2/transpair_model5.cpp
@@ -0,0 +1,243 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "transpair_model5.h"
+#include "Parameter.h"
+
+int m5scorefound=0,m5scorenotfound=0;
+
+GLOBAL_PARAMETER(float,d5modelsmooth_factor,"model5SmoothFactor","smooting parameter for distortion probabilities in Model 5 (linear interpolation with constant)",PARLEV_SMOOTH,0.1);
+float d5modelsmooth_countoffset=0.0;
+
+// Straightforward move score for Model 5: ratio of the full alignment
+// probability after re-aligning j to new_i over the probability before.
+// Delegates to the Model 4 version when doModel4Scoring is set.
+LogProb transpair_model5::_scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double)const
+{
+  if( doModel4Scoring )
+  {
+    return transpair_model4::_scoreOfMove(a,new_i,j);
+  }
+
+  alignment moved(a);
+  moved.set(j, new_i);
+  LogProb before=prob_of_target_and_alignment_given_source(a);
+  LogProb after=prob_of_target_and_alignment_given_source(moved);
+
+  if( before )
+  {
+    return after/before;
+  }
+  // The old alignment scored zero: report a large gain if the new one does not.
+  if( after )
+  {
+    return 1e20;
+  }
+  return 1.0;
+}
+// Straightforward swap score for Model 5: ratio of the full alignment
+// probability after exchanging the alignments of j1 and j2 over the
+// probability before. Delegates to the Model 4 version when doModel4Scoring is set.
+LogProb transpair_model5::_scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const
+{
+  if( doModel4Scoring )
+  {
+    return transpair_model4::_scoreOfSwap(a,j1,j2,thisValue);
+  }
+
+  alignment swapped(a);
+  swapped.set(j1, a(j2));
+  swapped.set(j2, a(j1));
+  LogProb a_prob=prob_of_target_and_alignment_given_source(a);
+  LogProb b_prob=prob_of_target_and_alignment_given_source(swapped);
+
+  // Both scores are expected to be non-zero; the fallbacks below only matter
+  // when assertions are compiled out.
+  assert(a_prob);
+  assert(b_prob);
+
+  if( a_prob )
+  {
+    return b_prob/a_prob;
+  }
+  if( b_prob )
+  {
+    return 1e20;
+  }
+  return 1.0;
+}
+
+// Score ratio for re-aligning position j from a(j) to new_i under Model 5.
+// The non-distortion part of the ratio is computed in closed form below; the
+// distortion part is obtained by scoring the alignment before and after.
+// Note: unlike the Model 4 version, this one does copy the alignment (b).
+// thisValue, if non-negative, is taken as the current distortion score of a.
+LogProb transpair_model5::scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue)const
+{
+ if( doModel4Scoring )
+ return transpair_model4::scoreOfMove(a,new_i,j,thisValue);
+ // b is the alignment after the move; a stays untouched.
+ alignment b(a);
+ b.set(j,new_i);
+
+ LogProb change;
+ const WordIndex old_i=a(j);
+ WordIndex f0=a.fert(0);
+ if (old_i == new_i)
+ change=1.0;
+ // Moving j away from position 0: p0/p1 and f0 factor, fertility gain of
+ // new_i, and the t ratio.
+ else if (old_i == 0)
+ change=((double)p0*p0/p1) *
+ ((f0*(m-f0+1.0)) / ((m-2*f0+1)*(m-2*f0+2.0))) *
+ ((PROB)(1.0)) *
+ (get_fertility(new_i, a.fert(new_i)+1) / get_fertility(new_i, a.fert(new_i)))*
+ (t(new_i, j)/t(old_i, j))*
+ 1.0;
+ // Moving j onto position 0: inverse p0/p1 and f0 factor, fertility loss of
+ // old_i, and the t ratio.
+ else if (new_i == 0)
+ change=(double(p1) / (p0*p0)) *
+ (double((m-2*f0)*(m-2*f0-1))/((1+f0)*(m-f0))) *
+ (1.0) *
+ (get_fertility(old_i, a.fert(old_i)-1) /get_fertility(old_i, a.fert(old_i)))*
+ (t(new_i, j) /t(old_i, j)) *
+ (1.0);
+ // Neither end is position 0: fertility loss of old_i, fertility gain of
+ // new_i, and the t ratio.
+ else
+ change=(1.0) *
+ (get_fertility(old_i,a.fert(old_i)-1) / get_fertility(old_i,a.fert(old_i))) *
+ (get_fertility(new_i,a.fert(new_i)+1) /get_fertility(new_i,a.fert(new_i))) *
+ (t(new_i,j)/t(old_i,j)) *
+ (1.0);
+ // A negative thisValue means the caller did not pass the current score.
+ LogProb a_prob=thisValue;
+ if( a_prob<0.0 )
+ a_prob=prob_of_target_and_alignment_given_source(a,2);
+ massert(a_prob==prob_of_target_and_alignment_given_source(a,2));
+
+ // Distortion part (distortionType 2) of the moved alignment.
+ LogProb b_prob=prob_of_target_and_alignment_given_source(b,2);
+ change*=b_prob/a_prob;
+ return change;
+}
+// Score ratio for exchanging the alignments of j1 and j2 under Model 5.
+// The non-distortion part comes from the Model 3 swap score (distortions off);
+// the distortion part from scoring the alignment before and after the swap.
+// Delegates to the Model 4 version when doModel4Scoring is set.
+LogProb transpair_model5::scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue)const
+{
+  if( doModel4Scoring )
+  {
+    return transpair_model4::scoreOfSwap(a,j1,j2,thisValue);
+  }
+
+  alignment swapped(a);
+  swapped.set(j1,a(j2));
+  swapped.set(j2,a(j1));
+
+  LogProb ratio=transpair_model3::scoreOfSwap(a,j1,j2,-1.0,0);
+
+  // A negative thisValue means the caller did not pass the current score.
+  LogProb a_prob=thisValue;
+  if( a_prob<0.0 )
+  {
+    a_prob=prob_of_target_and_alignment_given_source(a,2);
+  }
+  massert(a_prob==prob_of_target_and_alignment_given_source(a,2));
+
+  LogProb scoreAfter=prob_of_target_and_alignment_given_source(swapped,2);
+  ratio*=scoreAfter/a_prob;
+  return ratio;
+}
+
+// Model 5 score of alignment `al`. Bit 1 of distortionType selects the
+// non-distortion part (empty-word term, fertilities, lexical probabilities),
+// bit 2 the vacancy-based distortion part. With `verb` set, intermediate
+// values are traced on cerr. A zero result is replaced by 1E-299.
+LogProb transpair_model5::prob_of_target_and_alignment_given_source(const alignment&al, short distortionType,bool verb)const
+{
+ if( doModel4Scoring )
+ // Forward verb as well, so verbose tracing is not lost on the Model 4 path.
+ return transpair_model4::prob_of_target_and_alignment_given_source(al,distortionType,verb);
+ LogProb total = 1.0 ;
+ static const LogProb almostZero = 1E-299 ;
+ double x2;
+ if( distortionType&1 )
+ {
+ total *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
+ if( verb) cerr << "IBM-5: (1-p1)^(m-2 f0)*p1^f0: " << total << endl;
+ for (WordIndex i = 1 ; i <= al.fert(0) ; i++)
+ total *= double(m - al.fert(0) - i + 1) / i ; // IBM-5 is not deficient!
+ if( verb) cerr << "IBM-5: +NULL:binomial+distortion " << total << endl;
+ for (WordIndex i = 1 ; i <= l ; i++)
+ {
+ total *= get_fertility(i, al.fert(i));
+ if( verb) cerr << "IBM-5: fertility of " << i << " " << get_fertility(i, al.fert(i)) << " -> " << total << endl;
+ }
+ for (WordIndex j = 1 ; j <= m ; j++)
+ {
+ total*= get_t(al(j), j) ;
+ if( verb) cerr << "IBM-5: t of j:" << j << " i:" << al(j) << ": " << get_t(al(j), j) << " -> " << total << endl;
+ }
+ }
+ if( distortionType&2 )
+ {
+ // vac[j]==1 marks position j as already placed; vac_all counts the
+ // positions that are still free.
+ PositionIndex prev_cept=0;
+ PositionIndex vac_all=m;
+ Vector<char> vac(m+1,0);
+ for(WordIndex i=1;i<=l;i++)
+ {
+ // Walk the positions aligned to i via the als_i / als_j[].next chain.
+ PositionIndex cur_j=al.als_i[i];
+ PositionIndex prev_j=0;
+ PositionIndex k=0;
+ if(cur_j) { // process first word of cept
+ k++;
+ // previous position
+ total*= (x2=d5m.getProb_first(vacancies(vac,cur_j),vacancies(vac,al.get_center(prev_cept)),d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-al.fert(i)+k));
+
+ vac_all--;
+ assert(vac[cur_j]==0);
+ vac[cur_j]=1;
+
+ if( verb) cerr << "IBM-5: d=1 of " << cur_j << ": " << x2 << " -> " << total << endl;
+ prev_j=cur_j;
+ cur_j=al.als_j[cur_j].next;
+ }
+ while(cur_j) { // process following words of cept
+ k++;
+ // previous position
+ int vprev=vacancies(vac,prev_j);
+ total*= (x2=d5m.getProb_bigger(vacancies(vac,cur_j),vprev,d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-vprev/*was missing before*/-al.fert(i)+k));
+
+
+ vac_all--;
+ vac[cur_j]=1;
+
+
+ if( verb) cerr << "IBM-5: d>1 of " << cur_j << ": " << x2 << " -> " << total << endl;
+ prev_j=cur_j;
+ cur_j=al.als_j[cur_j].next;
+ }
+ // Every word of the cept must have been visited exactly once.
+ assert(k==al.fert(i));
+ if( k )
+ prev_cept=i;
+ }
+ // Whatever is still vacant must be the positions aligned to 0.
+ assert(vac_all==al.fert(0));
+ }
+ total = total?total:almostZero;
+ return total;
+}
+
+
+// Appends four partial Model 5 scores of alignment `al` to `d`, in this order:
+// empty-word term, fertility term, lexical term, vacancy-based distortion term.
+// NOTE(review): unlike prob_of_target_and_alignment_given_source, this does not
+// consult doModel4Scoring.
+void transpair_model5::computeScores(const alignment&al,vector<double>&d)const
+{
+ LogProb total1 = 1.0,total2=1.0,total3=1.0,total4=1.0 ;
+ total1 *= pow(double(1-p1), m-2.0 * al.fert(0)) * pow(double(p1), double(al.fert(0)));
+ for (WordIndex i = 1 ; i <= al.fert(0) ; i++)
+ total1 *= double(m - al.fert(0) - i + 1) / i ; // IBM-5 is not deficient!
+ for (WordIndex i = 1 ; i <= l ; i++)
+ total2 *= get_fertility(i, al.fert(i));
+ for (WordIndex j = 1 ; j <= m ; j++)
+ total3*= get_t(al(j), j) ;
+ // vac[j]==1 marks position j as already placed; vac_all counts the
+ // positions that are still free.
+ PositionIndex prev_cept=0;
+ PositionIndex vac_all=m;
+ Vector<char> vac(m+1,0);
+ for(WordIndex i=1;i<=l;i++)
+ {
+ // Walk the positions aligned to i via the als_i / als_j[].next chain.
+ PositionIndex cur_j=al.als_i[i];
+ PositionIndex prev_j=0;
+ PositionIndex k=0;
+ if(cur_j) { // process first word of cept
+ k++;
+ total4*=d5m.getProb_first(vacancies(vac,cur_j),vacancies(vac,al.get_center(prev_cept)),d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-al.fert(i)+k);
+ vac_all--;
+ assert(vac[cur_j]==0);
+ vac[cur_j]=1;
+ prev_j=cur_j;
+ cur_j=al.als_j[cur_j].next;
+ }
+ while(cur_j) { // process following words of cept
+ k++;
+ int vprev=vacancies(vac,prev_j);
+ total4*=d5m.getProb_bigger(vacancies(vac,cur_j),vprev,d5m.fwordclasses.getClass(get_fs(cur_j)),l,m,vac_all-vprev/*was missing before*/-al.fert(i)+k);
+ vac_all--;
+ vac[cur_j]=1;
+ prev_j=cur_j;
+ cur_j=al.als_j[cur_j].next;
+ }
+ // Every word of the cept must have been visited exactly once.
+ assert(k==al.fert(i));
+ if( k )
+ prev_cept=i;
+ }
+ // Whatever is still vacant must be the positions aligned to 0.
+ assert(vac_all==al.fert(0));
+ d.push_back(total1);//13
+ d.push_back(total2);//14
+ d.push_back(total3);//15
+ d.push_back(total4);//16
+}
diff --git a/GIZA++-v2/transpair_model5.h b/GIZA++-v2/transpair_model5.h
new file mode 100644
index 0000000..5ecf49d
--- /dev/null
+++ b/GIZA++-v2/transpair_model5.h
@@ -0,0 +1,74 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef transpair_model5_h_fjo_defined
+#define transpair_model5_h_fjo_defined
+#include "Array2.h"
+#include "defs.h"
+#include "Vector.h"
+#include "NTables.h"
+#include "ATables.h"
+#include "TTables.h"
+#include "alignment.h"
+#include "D5Tables.h"
+#include "transpair_model4.h"
+
+extern double factorial(int n);
+
+// Counts how many of the entries vac[1..u] are zero, i.e. the positions up to
+// and including u that have not been marked yet. Entry 0 is never examined.
+inline int vacancies(const Vector<char>&vac,int u)
+{
+  const char *const first=&(vac[0])+1;
+  const char *const last=&(vac[0])+u+1;
+  int freeCount=0;
+  for(const char *p=first;p<last;++p)
+  {
+    if( *p==0 )
+      ++freeCount;
+  }
+  return freeCount;
+}
+
+// Translation pair for Model 5. Builds on transpair_model4, which is
+// constructed from the d4model held inside the d5model, and can route scoring
+// to the Model 4 implementation through doModel4Scoring.
+class transpair_model5 : public transpair_model4
+{
+ private:
+  const d5model&d5m;
+  // Set to false by the constructor below.
+  bool doModel4Scoring;
+
+ public:
+  typedef transpair_model3 simpler_transpair_model;
+  mutable map<Vector<PositionIndex>,LogProb> scores[4];
+
+  // _d5m is dereferenced unconditionally, so it must not be null.
+  transpair_model5(const Vector<WordIndex>&es, const Vector<WordIndex>&fs,
+                   tmodel<COUNT, PROB>&tTable, amodel<PROB>&aTable,
+                   amodel<PROB>&dTable, nmodel<PROB>&nTable,
+                   double _p1, double _p0,
+                   const d5model*_d5m)
+    : transpair_model4(es, fs, tTable, aTable, dTable, nTable, _p1, _p0,&_d5m->d4m),
+      d5m(*_d5m),
+      doModel4Scoring(false)
+  {
+  }
+
+  // Baseline score for move/swap changes: distortion part only
+  // (distortionType 2), from Model 4 or Model 5 depending on doModel4Scoring.
+  LogProb scoreOfAlignmentForChange(const alignment&a)const
+  {
+    if( doModel4Scoring )
+    {
+      return transpair_model4::prob_of_target_and_alignment_given_source(a,2);
+    }
+    return prob_of_target_and_alignment_given_source(a,2);
+  }
+
+  LogProb scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
+  LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ;
+  LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double thisValue=-1.0)const;
+  LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double thisValue=-1.0)const ;
+
+  int modelnr()const
+  {
+    return 5;
+  }
+
+  LogProb prob_of_target_and_alignment_given_source(const alignment&al, short distortionType=3,bool verb=0)const;
+  void computeScores(const alignment&al,vector<double>&d)const;
+};
+#endif
diff --git a/GIZA++-v2/transpair_modelhmm.h b/GIZA++-v2/transpair_modelhmm.h
new file mode 100644
index 0000000..2b38913
--- /dev/null
+++ b/GIZA++-v2/transpair_modelhmm.h
@@ -0,0 +1,223 @@
+/*
+
+Copyright (C) 2000,2001 Franz Josef Och (RWTH Aachen - Lehrstuhl fuer Informatik VI)
+
+This file is part of GIZA++ ( extension of GIZA ).
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef transpair_modelhmm_h_fjo_defined
+#define transpair_modelhmm_h_fjo_defined
+#include "Array2.h"
+#include "defs.h"
+#include "Vector.h"
+#include "NTables.h"
+#include "ATables.h"
+#include "TTables.h"
+#include "alignment.h"
+#include <math.h>
+#include "transpair_model2.h"
+#include "ForwardBackward.h"
+#include "hmm.h"
+
+// Translation pair scored with an HMM network. The network is built in the
+// constructor via hmm::makeHMMNetwork and deleted in the destructor.
+// NOTE(review): the class owns `net` through a raw pointer but declares no
+// copy constructor or assignment operator; copying an instance would delete
+// the network twice. Confirm that instances are never copied.
+class transpair_modelhmm : public transpair_model2
+{
+ public:
+ typedef transpair_modelhmm simpler_transpair_model;
+ HMMNetwork*net;
+ // The unnamed amodel/nmodel/double parameters are accepted but unused; they
+ // presumably keep the signature parallel to the other transpair classes.
+ transpair_modelhmm(const Vector<WordIndex>&es, const Vector<WordIndex>&fs, const tmodel<COUNT, PROB>&tTable,
+ const amodel<PROB>&aTable,const amodel<PROB>&,const nmodel<PROB>&,
+ double, double,const hmm*h)
+ : transpair_model2(es,fs,tTable,aTable),net(h->makeHMMNetwork(es,fs,0))
+ {}
+ ~transpair_modelhmm() { delete net; }
+ int modelnr()const{return 6;}
+ // Score ratio for re-aligning position j from a(j) to _new_i, computed
+ // incrementally from the network's node, transition, initial and final
+ // probabilities instead of rescoring the whole alignment.
+ LogProb scoreOfMove(const alignment&a, WordIndex _new_i, WordIndex j,double=-1.0)const
+ {
+ int new_i=_new_i;
+ LogProb change=1.0;
+ int old_i=a(j);
+ if (old_i == new_i)
+ change=1.0;
+ else
+ {
+ // Switch to the 0-based indices used by the network.
+ int theJ=j-1;
+ old_i--;
+ new_i--;
+ // Find the nearest earlier position that is not aligned to 0.
+ int jj=j-1;
+ while(jj>0&&a(jj)==0)
+ jj--;
+ int theIPrev= (jj>0)?(a(jj)-1):0;
+ // If the immediate predecessor is aligned to 0, the previous state lies
+ // in the upper half of the state range (index + l).
+ if( j>1&&a(j-1)==0 )
+ theIPrev+=l;
+ // An alignment to 0 (now -1) is mapped to the previous state, shifted
+ // into the upper half if it is not already there.
+ if( old_i==-1 ){old_i = theIPrev;if(old_i<int(l))old_i+=l;}
+ if( new_i==-1 ){new_i = theIPrev;if(new_i<int(l))new_i+=l;}
+ int theIPrevOld=theIPrev,theIPrevNew=theIPrev;
+ if( theJ==0 )
+ {
+ change*=net->getAlphainit(new_i)/net->getAlphainit(old_i);
+ }
+ // Update position j and every directly following position aligned to 0,
+ // since their states depend on the state chosen for j.
+ do
+ {
+ if( new_i!=old_i )
+ {
+ change*=net->nodeProb(new_i,theJ)/net->nodeProb(old_i,theJ);
+ }
+ if( theJ>0)
+ change*=net->outProb(theJ,theIPrevNew,new_i)/net->outProb(theJ,theIPrevOld,old_i);
+ theIPrevOld=old_i;
+ theIPrevNew=new_i;
+ theJ++;
+ if( theJ<int(m) && a(theJ+1)==0 )
+ {
+ if( new_i<int(l)) new_i+=l;
+ if( old_i<int(l)) old_i+=l;
+ }
+ } while( theJ<int(m) && a(theJ+1)==0 );
+ if(theJ==int(m))
+ {
+ // The run reached the end of the sentence: apply the final factor.
+ change*=net->getBetainit(new_i)/net->getBetainit(old_i);
+ }
+ else
+ {
+ // Transition into the next position, whose own state is unchanged.
+ new_i=a(theJ+1)-1;
+ if( new_i==-1)
+ new_i=theIPrevNew;
+ change*=net->outProb(theJ,theIPrevNew,new_i)/net->outProb(theJ,theIPrevOld,new_i);
+ }
+ }
+ return change;
+ }
+ // No separate baseline score is kept for this model; always -1.0.
+ LogProb scoreOfAlignmentForChange(const alignment&)const
+ {return -1.0; }
+ // Swaps have no incremental implementation; the full rescoring is used.
+ LogProb scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double=-1.0)const
+ {
+ return _scoreOfSwap(a,j1,j2);
+ }
+ // Move score by full rescoring of a copy of the alignment.
+ LogProb _scoreOfMove(const alignment&a, WordIndex new_i, WordIndex j,double=-1.0)const
+ {
+ alignment b(a);
+ b.set(j, new_i);
+ LogProb a_prob=prob_of_target_and_alignment_given_source(a);
+ LogProb b_prob=prob_of_target_and_alignment_given_source(b);
+ if( a_prob )
+ return b_prob/a_prob;
+ else if( b_prob )
+ return 1e20;
+ else
+ return 1.0;
+ }
+ // Swap score by full rescoring. The alignment is modified in place through
+ // a const_cast and restored before returning, so no copy is made.
+ LogProb _scoreOfSwap(const alignment&a, WordIndex j1, WordIndex j2,double=-1.0)const
+ {
+ WordIndex aj1=a(j1),aj2=a(j2);
+ if( aj1==aj2 )
+ return 1.0;
+ LogProb a_prob=prob_of_target_and_alignment_given_source(a);
+
+ /*alignment b(a);
+ b.set(j1, a(j2));
+ b.set(j2, a(j1));
+ LogProb b_prob=prob_of_target_and_alignment_given_source(b);*/
+
+ const_cast<alignment&>(a).set(j1,aj2);
+ const_cast<alignment&>(a).set(j2,aj1);
+ LogProb b_prob=prob_of_target_and_alignment_given_source(a);
+ const_cast<alignment&>(a).set(j1,aj1);
+ const_cast<alignment&>(a).set(j2,aj2);
+
+ if( a_prob )
+ return b_prob/a_prob;
+ else if( b_prob )
+ return 1e20;
+ else
+ return 1.0;
+ }
+ inline friend ostream&operator<<(ostream&out, const transpair_modelhmm&)
+ {
+ return out << "NO-OUTPUT for transpair_modelhmm\n";
+ }
+ // Full score of alignment `al`: product over all positions of the node
+ // probability and the initial/transition probability, times the final
+ // probability at the last position and net->finalMultiply. With `verbose`
+ // set, each factor is traced on cout.
+ LogProb prob_of_target_and_alignment_given_source(const alignment&al,bool verbose=0)const
+ {
+ double prob=1.0;
+ int theIPrev=0;
+ for(unsigned int j=1;j<=m;j++)
+ {
+ int theJ=j-1;
+ int theI=al(j)-1;
+ // Alignment to 0: use the upper-half state matching the previous state.
+ if( theI==-1 )
+ theI=(theIPrev%l)+l;
+ prob*=net->nodeProb(theI,theJ);
+ if( verbose )
+ cout << "NP " << net->nodeProb(theI,theJ) << ' ';
+ if( j==1 )
+ {
+ prob*=net->getAlphainit(theI);
+ if( verbose )
+ cout << "AP0 " << net->getAlphainit(theI) << ' ';
+ }
+ else
+ {
+ prob*=net->outProb(theJ,theIPrev,theI);
+ if( verbose )
+ cout << "AP1 " << net->outProb(theJ,theIPrev,theI) << ' ';
+ }
+ theIPrev=theI;
+ if( j==m )
+ {
+ prob*=net->getBetainit(theI);
+ if( verbose )
+ cout << "AP2 " << net->getBetainit(theI) << ' ';
+ }
+ if( verbose )
+ cout << "j:"<<theJ<<" i:"<<theI << "; ";
+ }
+ if( verbose )
+ cout << '\n';
+ return prob*net->finalMultiply;
+ }
+ // Appends two partial scores of `al` to `d`: the product of node
+ // probabilities, then the product of initial/transition/final probabilities.
+ // net->finalMultiply is not applied here.
+ void computeScores(const alignment&al,vector<double>&d)const
+ {
+ double prob1=1.0,prob2=1.0;
+ int theIPrev=0;
+ for(unsigned int j=1;j<=m;j++)
+ {
+ int theJ=j-1;
+ int theI=al(j)-1;
+ // Alignment to 0: use the upper-half state matching the previous state.
+ if( theI==-1 )
+ theI=(theIPrev%l)+l;
+ prob1*=net->nodeProb(theI,theJ);
+ if( j==1 )
+ {
+ prob2*=net->getAlphainit(theI);
+ }
+ else
+ {
+ prob2*=net->outProb(theJ,theIPrev,theI);
+ }
+ theIPrev=theI;
+ if( j==m )
+ {
+ prob2*=net->getBetainit(theI);
+ }
+ }
+ d.push_back(prob1);
+ d.push_back(prob2);
+ }
+
+ bool isSubOptimal()const{return 0;}
+};
+#endif
diff --git a/GIZA++-v2/utility.cpp b/GIZA++-v2/utility.cpp
new file mode 100644
index 0000000..4e9607a
--- /dev/null
+++ b/GIZA++-v2/utility.cpp
@@ -0,0 +1,30 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "mymath.h"
+
+// Returns n! as a double. Any n below 2, including negative values, yields 1.
+double factorial(int n)
+{
+  double product=1;
+  int factor=2;
+  // Multiply in ascending order: 2, 3, ..., n.
+  while( factor<=n )
+  {
+    product *= factor;
+    ++factor;
+  }
+  return product;
+}
diff --git a/GIZA++-v2/utility.h b/GIZA++-v2/utility.h
new file mode 100644
index 0000000..078a2a0
--- /dev/null
+++ b/GIZA++-v2/utility.h
@@ -0,0 +1,54 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef utility_h
+#define utility_h
+#include <iostream>
+#include "Perplexity.h"
+#include "Vector.h"
+#include "TTables.h"
+#include "getSentence.h"
+#include "vocab.h"
+
+// Declarations only: none of these functions is defined in this header.
+// NOTE(review): the purposes noted below are inferred from the names and
+// signatures; confirm against the definitions.
+
+// Presumably command-line/configuration handling.
+extern void printHelp(void);
+extern void parseConfigFile (char * fname );
+extern void parseArguments(int argc, char *argv[]);
+// Presumably writes a perplexity summary for the four Perplexity objects to `of`.
+extern void generatePerplexityReport(const Perplexity& trainperp,
+ const Perplexity& testperp,
+ const Perplexity& trainVperp,
+ const Perplexity& testVperp,
+ ostream& of, int trainsize,
+ int testsize, unsigned int last, bool);
+
+extern void printSentencePair(Vector<WordIndex>& es, Vector<WordIndex>& fs, ostream& of);
+
+extern void printOverlapReport(const tmodel<COUNT, PROB>& tTable,
+ sentenceHandler& testHandler, vcbList& trainEList,
+ vcbList& trainFList, vcbList& testEList, vcbList& testFList);
+
+extern void printAlignToFile(const Vector<WordIndex>& es, const Vector<WordIndex>& fs,
+ const Vector<WordEntry>& evlist, const Vector<WordEntry>& fvlist,
+ ostream& of2, const Vector<WordIndex>& viterbi_alignment, int pair_no,
+ double viterbi_score);
+
+// Returns n! as a double (defined in utility.cpp).
+extern double factorial(int) ;
+
+#endif
diff --git a/GIZA++-v2/vocab.cpp b/GIZA++-v2/vocab.cpp
new file mode 100644
index 0000000..9ad171c
--- /dev/null
+++ b/GIZA++-v2/vocab.cpp
@@ -0,0 +1,90 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "vocab.h"
+
+void vcbList::readVocabList()
+  // Reads the vocabulary file named by `fname`. Each line is expected to be
+  //
+  //     token_id token_string frequency
+  //
+  // Entry 0 is reserved for the special token "NULL" and is created here.
+  // Every accepted line stores the token string under its id in `list`,
+  // records the reverse mapping in `s2i` and sets noUniqueTokens to id+1.
+  // The frequency column is parsed and validated but not stored: freq starts
+  // at 0 for every entry.
+  // A missing file name, an unreadable file, a reserved/too large/duplicate
+  // id or a negative frequency terminates the program.
+{
+  int freq=0;
+  WordIndex word_id=0;
+  WordEntry entry("NULL",0) ;
+
+  string line, word ;
+  if(!fname){
+    // Streaming or opening a null file name would be undefined behaviour.
+    cerr << "\nCannot open vocabulary file: no file name set\n";
+    exit(1);
+  }
+  cerr << "Reading vocabulary file from:" << fname << "\n";
+  ifstream vFile(fname);
+  if(!vFile){
+    cerr << "\nCannot open vocabulary file " << fname << "\n";
+    exit(1);
+  }
+
+  list.push_back(entry);
+  s2i[entry.word]=list.size()-1;
+
+  while(getline(vFile, line)){
+    // Reset the parse targets so that a malformed line can never be
+    // interpreted with values left over from the previous line (or with an
+    // uninitialised id on the first line).
+    word_id=0;
+    word.clear();
+    freq=0;
+    istrstream buffer(line.c_str());
+    if(!(buffer >> word_id >> word >> freq))
+      cerr << "ERROR: reading vocabulary; " << word_id << ' ' << word << ' ' << freq << endl;
+    if (word_id == 0){
+      cerr << "ERROR: TOKEN ID 0 is reserved for special token NULL, in line: \n"<< line<<"\n" ;
+      exit(-1);
+    }
+    else if (word_id >= MAX_VOCAB_SIZE){
+      cerr << "ERROR: TOKEN ID is greater than maximum vocabulary size "
+           << MAX_VOCAB_SIZE << " in line :\n"<< line <<"\n" ;
+      exit(-1);
+    }
+    else if (freq < 0){
+      cerr << "ERROR: frequency must be a positive integer, in line :\n"
+           << line <<"\n";
+      exit(-1);
+    }
+    else {
+      if(word_id >= list.size()){
+        // New highest id: grow the table; the new slots have an empty word.
+        list.resize(word_id+1);
+      }
+      else if(list[word_id].word != "\0"){
+        cerr << "ERROR: TOKEN ID must be unique for each token, in line :\n"
+             << line <<"\n";
+        cerr << "TOKEN ID " << word_id << " has already been assigned to: " <<
+          list[word_id].word << "\n";
+        exit(-1);
+      }
+      // line has valid information
+      list[word_id].word = word ;
+      s2i[word]=word_id;
+      list[word_id].freq = 0 ;
+      // NOTE(review): this follows the id of the most recent line, so ids
+      // listed out of order can lower it again - confirm that is intended.
+      noUniqueTokens = word_id + 1 ;
+    }
+  } // end of while
+}
+
diff --git a/GIZA++-v2/vocab.h b/GIZA++-v2/vocab.h
new file mode 100644
index 0000000..ab637c2
--- /dev/null
+++ b/GIZA++-v2/vocab.h
@@ -0,0 +1,103 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#ifndef _vocab_h
+#define _vocab_h 1
+
+#include "defs.h"
+#include "Vector.h"
+
+#include <fstream>
+#include <strstream>
+#include <map>
+
+class WordEntry {
+  // One vocabulary record: the token string and its (fractional) frequency.
+ public:
+  string word ;
+  double freq ;
+  // A default-constructed entry has an empty word and zero frequency.
+  WordEntry()
+    : word(), freq(0) {}
+  WordEntry(string w, int f)
+    : word(w), freq(f) {}
+};
+
+class vcbList{
+  // Vocabulary table: maps token ids (WordIndex) to WordEntry records and,
+  // through s2i, token strings back to ids.
+ private:
+  Vector<WordEntry> list ;            // entries indexed by token id
+  map<string,int> s2i;                // token string -> token id
+  double total;                       // sum of the frequencies added via incFreq()
+  WordIndex noUniqueTokens ;          // set by readVocabList() to (id of last line read)+1
+  WordIndex noUniqueTokensInCorpus ;  // entries whose frequency left zero via incFreq()
+  const char* fname ;                 // vocabulary file name; the pointer is only stored
+ public:
+  vcbList(const char* f=0):list(), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f){};
+  void setName(const char*f)
+  { fname=f; }
+  // Copies the table including the string->id map, so that operator()(string)
+  // keeps working on the copy.
+  // NOTE(review): noUniqueTokensInCorpus restarts at 0 although the copied
+  // entries keep their frequencies - kept as in the original, confirm intent.
+  vcbList(const vcbList& a):list(a.list), s2i(a.s2i), total(a.total), noUniqueTokens(a.noUniqueTokens), noUniqueTokensInCorpus(0), fname(a.fname){};
+  inline WordIndex size()const {return (list.size());};
+  inline WordIndex uniqTokens()const {return noUniqueTokens;};
+  inline WordIndex uniqTokensInCorpus()const {return noUniqueTokensInCorpus;};
+  inline double totalVocab() const {return total;};
+  inline Vector<WordEntry>& getVocabList() { return(list);};
+  inline const Vector<WordEntry>& getVocabList()const { return(list);};
+  void readVocabList();
+  // Adds f to the frequency of token `id`; ids outside the table are ignored.
+  void incFreq(WordIndex id , double f){
+    if(id < list.size()){
+      if (list[id].freq == 0)
+        noUniqueTokensInCorpus++;
+      list[id].freq += f ;
+      total += f ;
+    }
+  };
+  // Zeroes every frequency together with the derived totals.
+  void clearAllFreq(){
+    for (WordIndex id = 0 ; id < list.size() ; id++)
+      list[id].freq = 0 ;
+    total = 0 ;
+    noUniqueTokensInCorpus = 0 ;
+  };
+  // Token string -> id. Unknown strings are reported on cerr and map to 0.
+  int operator()(const string&x)const
+  {
+    map<string,int>::const_iterator i=s2i.find(x);
+    if( i!=s2i.end() )
+      return i->second;
+    else
+    {
+      cerr << "ERROR: no word index for '"<<x<<"'\n";
+      return 0;
+    }
+  }
+  // Token id -> string. An id outside the table yields an empty string
+  // (the former `return 0` constructed a std::string from a null pointer).
+  const string operator()(WordIndex id) const { // Yaser - 2000-12-13
+    if (id < list.size())
+      return list[id].word ;
+    return string() ;
+  }
+  // Same lookup as operator()(WordIndex).
+  const string operator[](WordIndex id) const { // Yaser - 2000-12-13
+    if (id < list.size())
+      return list[id].word ;
+    return string() ;
+  }
+  // Writes "id word freq" for every named entry with a positive frequency;
+  // entry 0 is never written.
+  void printVocabList(ostream& of){
+    for (WordIndex i = 1 ; i < list.size() ; i++){
+      if (list[i].word != "" && list[i].freq > 0)
+        of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n';
+    }
+  }
+
+};
+#endif
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..afe9e9d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,14 @@
+
+# Top-level driver: builds and cleans the two sub-projects.
+# None of the targets names a file, so all of them are declared phony;
+# otherwise a file called "all" or "clean" would silently disable the rule.
+.PHONY: all gizapp mkcls-v2 clean
+
+all: gizapp mkcls-v2
+
+gizapp:
+	$(MAKE) -C GIZA++-v2
+
+mkcls-v2:
+	$(MAKE) -C mkcls-v2
+
+clean:
+	$(MAKE) -C GIZA++-v2 clean
+	$(MAKE) -C mkcls-v2 clean
diff --git a/README b/README
new file mode 100644
index 0000000..c4b4e34
--- /dev/null
+++ b/README
@@ -0,0 +1,8 @@
+This package contains the GIZA++ toolkit and the mkcls tool, originally
+written by F.J. Och and several other authors.
+
+For more information, refer to the README files and the following pages:
+ http://www.fjoch.com/mkcls.html
+ http://www.fjoch.com/GIZA++.html
+
+
diff --git a/mkcls-v2/Array.h b/mkcls-v2/Array.h
new file mode 100644
index 0000000..5647fd0
--- /dev/null
+++ b/mkcls-v2/Array.h
@@ -0,0 +1,370 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#ifndef ARRAY_H_DEFINED
+#define ARRAY_H_DEFINED
+using namespace std;
+#include "myassert.h"
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <functional>
+#include "my.h"
+
+#define ARRAY_DEBUG
+
+
+template<class T> class Array
+{
+  // Growable array with bounds-checked element access.
+  // size() is maxWritten+1 and may be smaller than the allocated capacity
+  // realSize. When the auto-expand flag is set, writing through operator[]
+  // at index size() grows the array by one element.
+ private:
+  T *p;            // element buffer of realSize elements (0 if never allocated)
+  int realSize;    // allocated capacity
+  int maxWritten;  // highest valid index, -1 when empty
+  char a;          // non-zero: operator[] may append at index size()
+
+  void copy(T *a,const T *b,int n);
+  void copy(T *a,T *b,int n);
+  void _expand();
+
+ public:
+  // Empty array with auto-expansion switched on.
+  Array()
+    : p(0),realSize(0),maxWritten(-1) ,a(1)
+  {
+#ifdef VERY_ARRAY_DEBUG
+    cout << "MAKE ARRAY: " << this<<" "<<(void*)p << endl;
+#endif
+  }
+  // Copies the size() valid elements of x; spare capacity is not copied.
+  Array(const Array<T> &x)
+    : p(new T[x.maxWritten+1]),realSize(x.maxWritten+1),maxWritten(x.maxWritten),a(x.a)
+  {
+    copy(p,x.p,realSize);
+#ifdef VERY_ARRAY_DEBUG
+    cout << "MAKE ARRAY copy: " << this << " " << realSize <<" "<<(void*)p<< endl;
+#endif
+  }
+  // n default-constructed elements, auto-expansion off.
+  explicit Array(int n)
+    : p(new T[n]),realSize(n),maxWritten(n-1),a(0)
+  {
+#ifdef VERY_ARRAY_DEBUG
+    cout << "MAKE ARRAY with parameter n: " << this << " " << realSize<<" "<<(void*)p << endl;
+#endif
+  }
+  // n copies of _init; _a selects auto-expansion.
+  Array(int n,const T&_init,int _a=0)
+    : p(new T[n]),realSize(n),maxWritten(n-1),a(_a)
+  {
+    for(int iii=0;iii<n;iii++)p[iii]=_init;
+#ifdef VERY_ARRAY_DEBUG
+    cout << "MAKE ARRAY with parameter n and init: " << this << " " << realSize<<" "<<(void*)p << endl;
+#endif
+  }
+
+  ~Array()
+  {
+#ifdef VERY_ARRAY_DEBUG
+    cout << "FREE ARRAY: " << this << " " << realSize<<" "<<(void*)p << endl;
+#endif
+    delete [] p;
+  }
+
+  // Replaces the contents with a copy of x's valid elements.
+  // The new buffer is built completely before the old one is released, so
+  // *this is left untouched if allocation or element assignment throws.
+  Array<T>& operator=(const Array<T>&x)
+  {
+    if( this!= &x )
+    {
+      const int newSize = x.maxWritten+1;
+      T *fresh = new T[newSize];
+      try
+      {
+        copy(fresh,x.p,newSize);
+      }
+      catch(...)
+      {
+        delete [] fresh;
+        throw;
+      }
+#ifdef VERY_ARRAY_DEBUG
+      cout << "FREE ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << endl;
+#endif
+      delete [] p;
+      p = fresh;
+      realSize = newSize;
+      maxWritten = x.maxWritten;
+      a = x.a;
+#ifdef VERY_ARRAY_DEBUG
+      cout << "NEW ARRAY because of operator=: " << this << " " << realSize<<" "<<(void*)p << endl;
+#endif
+    }
+    return *this;
+  }
+
+  // Overload for non-const sources; same behaviour as the const version.
+  Array<T>& operator=(Array<T>&x)
+  {
+    return operator=(static_cast<const Array<T>&>(x));
+  }
+
+  // Makes index n valid, growing the buffer as needed; never shrinks size().
+  void allowAccess(int n)
+  {
+    while( realSize<=n )
+      _expand();
+    maxWritten=max(maxWritten,n);
+    massert( maxWritten<realSize );
+  }
+  // Sets size() to n. Growing allocates; shrinking keeps the buffer.
+  void resize(int n)
+  {
+    while( realSize<n )
+      _expand();
+    maxWritten=n-1;
+  }
+  // Sorts the first `until` elements ascending (all of them for -1).
+  void sort(int until=-1)
+  {
+    if( until== -1 ) until=size();
+    std::sort(p,p+until);
+  }
+  // Sorts the first `until` elements descending (all of them for -1).
+  void invsort(int until=-1)
+  {
+    if( until== -1 ) until=size();
+    std::sort(p,p+until,greater<T>());
+  }
+  // Re-creates the array as n copies of _init.
+  // The replacement is filled before the old buffer is freed: _init may
+  // refer to an element of this array, and a failed allocation must not
+  // leave p dangling.
+  void init(int n,const T&_init,bool _a=0)
+  {
+    T *fresh=new T[n];
+    try
+    {
+      for(int iii=0;iii<n;iii++)fresh[iii]=_init;
+    }
+    catch(...)
+    {
+      delete [] fresh;
+      throw;
+    }
+#ifdef VERY_ARRAY_DEBUG
+    cout << "FREE ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << endl;
+#endif
+    delete []p;
+    p=fresh;
+    realSize=n;
+    a=_a;
+    maxWritten=n-1;
+#ifdef VERY_ARRAY_DEBUG
+    cout << "NEW ARRAY because of init: " << this << " " << realSize<<" "<<(void*)p << endl;
+#endif
+  }
+  inline int size() const
+  {massert( maxWritten<realSize );
+    return maxWritten+1;}
+  inline int low() const
+  { return 0; }
+  inline int high() const
+  { return maxWritten; }
+  inline bool autoexpand() const
+  {return a;}
+  inline void autoexpand(bool autoExp)
+  {a=autoExp;}
+  int findMax() const;
+  int findMin() const;
+  const void errorAccess(int n) const;
+  inline T*getPointerToData(){return p;}
+
+  // Checked access. With auto-expansion on, index size() appends an element;
+  // any other out-of-range index is reported through errorAccess().
+  inline T& operator[](int n)
+  {
+    if( a && n==maxWritten+1 )
+      allowAccess(n);
+    if( n<0 || n>maxWritten )
+      errorAccess(n);
+    return p[n];
+  }
+  inline const T& operator[](int n) const
+  {
+    if(n<0 || n>maxWritten )
+      errorAccess(n);
+    return p[n];
+  }
+  // n-th element counted from the end (0 is the last element).
+  const T&top(int n=0) const
+  {return (*this)[maxWritten-n];}
+  T&top(int n=0)
+  {return (*this)[maxWritten-n];}
+  // Stores x at index size(); this only succeeds while auto-expansion is on.
+  T&push(const T&x)
+  {
+    (*this)[maxWritten+1]=x;
+    return top();
+  }
+  // Writes "Array <size> <flag>" followed by one element per line.
+  bool writeTo(ostream&out) const
+  {
+    out << "Array ";
+    out << size() << " ";
+    out << a << endl;
+    for(int iv=0;iv<=maxWritten;iv++)
+    {
+      writeOb(out,(*this)[iv]);
+      out << endl;
+    }
+    return 1;
+  }
+  // Reads the format produced by writeTo(). Returns 0 when the stream is
+  // bad, the "Array" tag is missing or the header cannot be parsed.
+  bool readFrom(istream&in)
+  {
+    string s;
+    if( !in )
+    {
+      cerr << "ERROR(Array): file cannot be opened.\n";
+      return 0;
+    }
+    in >> s;
+    if( !(s=="Array") )
+    {
+      cerr << "ERROR(Array): Array!='"<<s<<"'\n";
+      return 0;
+    }
+    int biggest=-1;
+    in >> biggest;
+    in >> a;
+    if( !in || biggest<0 )
+    {
+      // Never resize from an unread or negative element count.
+      cerr << "ERROR(Array): cannot read array header.\n";
+      return 0;
+    }
+    resize(biggest);
+    for(int iv=0;iv<size();iv++)
+    {
+      readOb(in,(*this)[iv]);
+    }
+    return 1;
+  }
+};
+
+template<class T> bool operator==(const Array<T> &x, const Array<T> &y)
+{
+  // Same object, or same size and element-wise equal (using T's operator==).
+  if( &x == &y )
+    return 1;
+  if( y.size()!=x.size() )
+    return 0;
+  for(int pos=0;pos<x.size();pos++)
+  {
+    if( !(x[pos]==y[pos]) )
+      return 0;
+  }
+  return 1;
+}
+
+template<class T> bool operator<(const Array<T> &x, const Array<T> &y)
+{
+  // Lexicographic order; a proper prefix sorts before the longer array.
+  if( &x == &y )
+    return 0;
+  // x is longer, so the arrays differ and x<y is exactly !(y<x). This also
+  // guarantees that the loop below never runs past the end of y.
+  if( y.size()<x.size() )
+    return !(y<x);
+  for(int pos=0;pos<x.size();pos++)
+  {
+    massert( pos!=y.size() );
+    if( x[pos]<y[pos] )
+      return 1;
+    if( y[pos]<x[pos] )
+      return 0;
+  }
+  // All elements of x matched: x is smaller only if y has more elements.
+  return x.size()!=y.size();
+}
+
+
+template<class T> const void Array<T>:: errorAccess(int n) const
+{
+  // Reports an out-of-range index on cerr and on cout, then asserts;
+  // builds without DEBUG additionally abort().
+  ostream *sinks[2]={&cerr,&cout};
+  for(int s=0;s<2;s++)
+  {
+    *sinks[s] << "ERROR: Access to array element " << n
+              << " (" << maxWritten << "," << realSize << "," << (void*)p << " " << a << ")\n";
+  }
+  massert(0);
+#ifndef DEBUG
+  abort();
+#endif
+}
+
+template<class T> ostream& operator<<(ostream&o,const Array<T>&a)
+{
+  // Human-readable dump: "Array(size,autoexpand){  0:v0; 1:v1;}".
+  o << "Array(" << a.size() << "," << a.autoexpand() << "){ ";
+  int pos=0;
+  while( pos<a.size() )
+  {
+    o << " " << pos << ":" << a[pos] << ";";
+    ++pos;
+  }
+  o << "}\n";
+  return o;
+}
+
+// Placeholder extraction operator: reads nothing and leaves the array
+// untouched. Array::readFrom() is the function that actually parses input.
+template<class T> istream& operator>>(istream&in, Array<T>&)
+{return in;}
+
+template<class T> int Hash(const Array<T>&a)
+{
+  // Position-weighted sum of the element hashes plus 47 times the size.
+  int weighted=0;
+  int pos=0;
+  while( pos<a.size() )
+  {
+    weighted+=Hash(a[pos])*(pos+1);
+    ++pos;
+  }
+  return weighted+a.size()*47;
+}
+template<class T> void Array<T>::copy(T *aa,const T *bb,int n)
+{
+  // Assigns bb[0..n-1] to aa[0..n-1] in ascending order; no-op for n<=0.
+  int pos=0;
+  while( pos<n )
+  {
+    aa[pos]=bb[pos];
+    ++pos;
+  }
+}
+template<class T> void Array<T>::copy(T *aa,T *bb,int n)
+{
+  // Non-const-source overload: assigns bb[0..n-1] to aa[0..n-1] in
+  // ascending order; no-op for n<=0.
+  int pos=0;
+  while( pos<n )
+  {
+    aa[pos]=bb[pos];
+    ++pos;
+  }
+}
+
+template<class T> void Array<T>::_expand()
+{
+  // Grows the capacity from realSize to 2*realSize+1, keeping all elements.
+  // realSize and p are only updated once the larger buffer exists and is
+  // filled, so a throwing allocation or assignment leaves the array usable.
+#ifdef VERY_ARRAY_DEBUG
+  cout << "FREE ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << endl;
+#endif
+  const int grownSize=realSize*2+1;
+  T *grown=new T[grownSize];
+  try
+  {
+    copy(grown,p,realSize);
+  }
+  catch(...)
+  {
+    delete [] grown;
+    throw;
+  }
+  delete [] p;
+  p=grown;
+  realSize=grownSize;
+#ifdef VERY_ARRAY_DEBUG
+  cout << "NEW ARRAY because of _expand: " << this << " " << realSize<<" "<<(void*)p << endl;
+#endif
+}
+
+template<class T> int Array<T>::findMax() const
+{
+  // Index of the first largest element (by operator<), or -1 when empty.
+  if( size()==0 )
+    return -1;
+  int best=0;
+  for(int pos=1;pos<size();pos++)
+  {
+    if( (*this)[best]<(*this)[pos] )
+      best=pos;
+  }
+  return best;
+}
+template<class T> int Array<T>::findMin() const
+{
+  // Index of the first smallest element (by operator<), or -1 when empty.
+  if( size()==0 )
+    return -1;
+  int best=0;
+  for(int pos=1;pos<size();pos++)
+  {
+    if( (*this)[pos]<(*this)[best] )
+      best=pos;
+  }
+  return best;
+}
+
+#endif
diff --git a/mkcls-v2/FixedArray.h b/mkcls-v2/FixedArray.h
new file mode 100644
index 0000000..39da0b1
--- /dev/null
+++ b/mkcls-v2/FixedArray.h
@@ -0,0 +1,287 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#ifndef FIXARRAY_H_DEFINED
+#define FIXARRAY_H_DEFINED
+#include <iostream>
+#include <string>
+#include <functional>
+
+template<class T>
+bool writeOb(ostream&out,const T&f)
+{
+  // Writes f followed by a single blank; always reports success.
+  out << f;
+  out << " ";
+  return true;
+}
+
+template<class T>
+bool readOb(istream&in,T&f)
+{
+  // Reads one value written by writeOb(): the value, then the blank that
+  // follows it. Always returns 1; a missing blank trips the assertion.
+  in >> f;
+  // Initialised so that a failed get() (e.g. at end of input) is checked
+  // against a defined value instead of an indeterminate one.
+  char c=0;
+  in.get(c);
+  massert(c==' ');
+  return 1;
+}
+
+template<class T>
+bool writeOb(ostream&out,const string &s,const T&f)
+{
+  // Writes the tag s, a blank, the value f and a trailing blank.
+  out << s;
+  out << " ";
+  out << f;
+  out << " ";
+  return true;
+}
+template<class T>
+bool readOb(istream&in,const string&s,T&f)
+{
+  // Reads a value written by the tagged writeOb(): the tag must equal s,
+  // then the value and its trailing blank follow.
+  // Returns 0 (after reporting on cerr) when the tag does not match.
+  string ss;
+  in >> ss;
+  if( s!=ss )
+  {
+    cerr << "ERROR: readOb should be '" << s << "' and is '" << ss << "'" << endl;
+    return 0;
+  }
+  in >> f;
+  // Initialised so that a failed get() is checked against a defined value.
+  char c=0;
+  in.get(c);
+  massert(c==' ');
+  return 1;
+}
+
+template<class T> class FixedArray
+{
+  // Heap array with value semantics and an explicit size.
+  // operator[] performs no bounds checking.
+ private:
+  // Assigns bb[0..nnn-1] to aa[0..nnn-1].
+  void copy(T *aa,const T *bb,int nnn)
+  {for(int iii=0;iii<nnn;iii++)aa[iii]=bb[iii];}
+
+ public:
+  T *p;          // element storage
+  int realSize;  // number of elements reported by size()
+  FixedArray()
+    : p(0),realSize(0){}
+  FixedArray(const FixedArray<T> &x)
+    : p(new T[x.realSize]),realSize(x.realSize) {copy(p,x.p,realSize);}
+  explicit FixedArray(int n)
+    : p(new T[n]),realSize(n){}
+  FixedArray(int n,const T&_init)
+    : p(new T[n]),realSize(n){for(int z=0;z<n;z++)p[z]=_init;}
+  // Copy of f with t appended as an additional last element.
+  FixedArray(const FixedArray&f,const T&t)
+    : p(new T[f.size()+1]),realSize(f.size()+1){for(int z=0;z<f.size();z++)p[z]=f[z];p[f.size()]=t;}
+  ~FixedArray()
+  { delete [] p;p=0;realSize=-1;}
+
+  // The copy is built completely before the old buffer is released, so
+  // *this stays valid if allocation or element assignment throws.
+  FixedArray<T>& operator=(const FixedArray<T>&x)
+  {
+    if( this!= &x )
+    {
+      T *fresh = new T[x.realSize];
+      try
+      {
+        copy(fresh,x.p,x.realSize);
+      }
+      catch(...)
+      {
+        delete [] fresh;
+        throw;
+      }
+      delete [] p;
+      p = fresh;
+      realSize = x.realSize;
+    }
+    return *this;
+  }
+  // Growing reallocates and keeps the existing elements; n<=size() only
+  // lowers the reported size (see shrink()).
+  void resize(int n)
+  {
+    if( n<=realSize )
+      shrink(n);
+    else
+    {
+      T*np=new T[n];
+      copy(np,p,realSize);
+      delete []p;
+      p=np;
+      realSize=n;
+    }
+  }
+  // Lowers the reported size without touching the buffer.
+  void shrink(int n)
+  {
+    assert(n<=realSize);
+    realSize=n;
+  }
+  // Re-creates the array as n copies of _init. The replacement is filled
+  // before the old buffer is freed because _init may refer to one of this
+  // array's own elements.
+  void init(int n,const T&_init)
+  {
+    T *fresh=new T[n];
+    try
+    {
+      for(int l=0;l<n;l++)fresh[l]=_init;
+    }
+    catch(...)
+    {
+      delete [] fresh;
+      throw;
+    }
+    delete []p;
+    p=fresh;
+    realSize=n;
+  }
+  // n-th element counted from the end (0 is the last element).
+  inline const T&top(int n=0) const
+  {return (*this)[realSize-1-n];}
+  inline int size() const
+  {return realSize;}
+
+  inline T*begin(){ return p; }
+  inline T*end(){ return p+realSize; }
+
+  inline const T*begin()const{ return p; }
+  inline const T*end()const{return p+realSize;}
+
+  inline int low() const
+  {return 0;}
+  inline int high() const
+  {return realSize-1;}
+  const void errorAccess(int n) const;
+
+  inline T& operator[](int n)
+  {
+    return p[n];
+  }
+  inline const T& operator[](int n) const
+  {
+    return p[n];
+  }
+  // Writes "FixedArray <size>" and all elements on one line.
+  bool writeTo(ostream&out) const
+  {
+    out << "FixedArray ";
+    out << size() << " ";
+    for(int a=0;a<size();a++)
+    {
+      writeOb(out,(*this)[a]);
+      out << " ";
+    }
+    out << endl;
+    return 1;
+  }
+  // Reads the format produced by writeTo(); returns 0 on a bad stream or a
+  // missing "FixedArray" tag.
+  bool readFrom(istream&in)
+  {
+    string s;
+    if( !in )
+    {
+      cerr << "ERROR(FixedArray): file cannot be opened.\n";
+      return 0;
+    }
+    in >> s;
+    if( !(s=="FixedArray") )
+    {
+      cerr << "ERROR(FixedArray): FixedArray!='"<<s<<"'\n";
+      return 0;
+    }
+    int biggest;
+    in >> biggest;
+    resize(biggest);
+    for(int a=0;a<size();a++)
+      readOb(in,(*this)[a]);
+    return 1;
+  }
+  // Sorts the first `until` elements ascending (all of them for -1).
+  void sort(int until=-1)
+  {
+    if( until== -1 ) until=size();
+    std::sort(p,p+until);
+  }
+  // Sorts the first `until` elements descending (all of them for -1).
+  void invsort(int until=-1)
+  {
+    if( until== -1 ) until=size();
+    std::sort(p,p+until,greater<T>());
+  }
+  // Lower-bound position of t, or -1 when t is greater than every element.
+  // The array must already be sorted ascending.
+  int binary_locate(const T&t)
+  {
+    T*ppos=std::lower_bound(p,p+size(),t);
+    int pos=ppos-p;
+    if( pos>=-1&&pos<size() )
+      return pos;
+    else
+      return -1;
+  }
+  // Index of an element equal to t, or -1 when there is none.
+  // The array must already be sorted ascending.
+  int binary_search(const T&t)
+  {
+    T*ppos=std::lower_bound(p,p+size(),t);
+    int pos=ppos-p;
+    if( pos>=0&&pos<size()&& *ppos==t )
+      return pos;
+    else
+      return -1;
+  }
+  typedef T* iterator;
+  typedef const T* const_iterator;
+};
+
+template<class T> bool operator<(const FixedArray<T> &x, const FixedArray<T> &y)
+{
+  // Lexicographic comparison of the two element ranges.
+  const T *xFirst=x.begin();
+  const T *xLast=x.end();
+  const T *yFirst=y.begin();
+  const T *yLast=y.end();
+  return lexicographical_compare(xFirst,xLast,yFirst,yLast);
+}
+
+
+template<class T> bool operator==(const FixedArray<T> &x, const FixedArray<T> &y)
+{
+  // Same object, or same size and element-wise equal (using T's operator==).
+  if( &x == &y )
+    return 1;
+  const int count = x.size();
+  if( count!=y.size() )
+    return 0;
+  int pos=0;
+  while( pos<count )
+  {
+    if( !(x.p[pos]==y.p[pos]) )
+      return 0;
+    ++pos;
+  }
+  return 1;
+}
+
+template<class T> int Hash(const FixedArray<T>&a)
+{
+  // Polynomial hash with multiplier 13 over the element hashes.
+  int code=0;
+  const int count=a.size();
+  int pos=0;
+  while( pos<count )
+  {
+    code=13*code+Hash(a.p[pos]);
+    ++pos;
+  }
+  return code;
+}
+
+template<class T> const void FixedArray<T>:: errorAccess(int n) const
+{
+  // Reports an invalid index n together with the array's size and buffer.
+  // The message is written before asserting so that it is still emitted
+  // when the assertion stops the program (same order as Array::errorAccess).
+  cerr << "ERROR: Access to array element " << n
+       << " (" << realSize << "," << (void*)p << ")\n";
+  massert(0);
+}
+
+template<class T> ostream& operator<<(ostream&o,const FixedArray<T>&a)
+{
+  // Human-readable dump: "FixedArray(size){  0:v0; 1:v1;}".
+  o << "FixedArray(" << a.size() << "){ ";
+  int pos=0;
+  while( pos<a.size() )
+  {
+    o << " " << pos << ":" << a[pos] << ";";
+    ++pos;
+  }
+  o << "}\n";
+  return o;
+}
+
+// Placeholder extraction operator: reads nothing and leaves the array
+// untouched. FixedArray::readFrom() is the function that parses input.
+template<class T> istream& operator>>(istream&in, FixedArray<T>&)
+{ return in;}
+
+template<class T> FixedArray<T> operator+(const FixedArray<T>&a,const FixedArray<T>&b)
+{
+  // Element-wise sum; both operands must have the same size.
+  massert(a.size()==b.size());
+  FixedArray<T> sum(a.size());
+  int pos=0;
+  while( pos<a.size() )
+  {
+    sum[pos]=a[pos]+b[pos];
+    ++pos;
+  }
+  return sum;
+}
+template<class T> FixedArray<T> operator|(const FixedArray<T>&aaa,const FixedArray<T>&bbb)
+{
+  // Element-wise logical OR (||, not bitwise); operands must match in size.
+  iassert(aaa.size()==bbb.size());
+
+  FixedArray<T> either(aaa.size());
+  int pos=0;
+  while( pos<aaa.size() )
+  {
+    either.p[pos]=aaa.p[pos]||bbb.p[pos];
+    ++pos;
+  }
+  return either;
+}
+
+#endif
+
+
+
diff --git a/mkcls-v2/FlexArray.h b/mkcls-v2/FlexArray.h
new file mode 100644
index 0000000..ede3e9e
--- /dev/null
+++ b/mkcls-v2/FlexArray.h
@@ -0,0 +1,48 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#ifndef CLASS_FlexArray_defined
+#define CLASS_FlexArray_defined
+#include "FixedArray.h"
+
+template<class T>
+class FlexArray
+{
+  // Array addressed by indices in [start,end] instead of [0,size).
+  // Element i is stored at offset i-start; accesses are not range-checked.
+private:
+  FixedArray<T> p;
+  int start,end;
+public:
+  // The default range [0,-1] is empty.
+  FlexArray(int _start=0,int _end=-1)
+    : p(_end-_start+1),start(_start),end(_end) {}
+  T&operator[](int i)
+  {return p[i-start];}
+  // Fixed: the statement read "returnp[i-start];" (missing blank), which is
+  // not valid C++.
+  const T&operator[](int i)const
+  {return p[i-start];}
+  int low()const{return start;}
+  int high()const{return end;}
+};
+
+
+#endif
diff --git a/mkcls-v2/GDAOptimization.cpp b/mkcls-v2/GDAOptimization.cpp
new file mode 100644
index 0000000..a9e2fa7
--- /dev/null
+++ b/mkcls-v2/GDAOptimization.cpp
@@ -0,0 +1,159 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "GDAOptimization.h"
+#include "ProblemTest.h"
+#include <cmath>
+
+#define GDAOptimization GDAOptimization
+#define IterOptimization IterOptimization
+
+
+
+// Start value used by GDAOptimization(Problem&,int). 1e100 doubles as the
+// "not set" marker that zInitialize() replaces with problem.value().
+double GDAOptimization::defaultTemperatur=1e100;
+
+
+// Cooling factor used by GDAOptimization(Problem&,int); optimizeValue()
+// overwrites it while scanning candidate values.
+double GDAOptimization::defaultAlpha=0.001;
+
+
+
+GDAOptimization::GDAOptimization(Problem &p,int m)
+: IterOptimization(p,m) ,temperatur(defaultTemperatur),alpha(defaultAlpha),gdaEndFlag(0)
+{
+  // Takes temperatur and alpha from the class-wide defaults.
+  // gdaEndFlag starts cleared so that end() never reads an indeterminate
+  // value before abkuehlen() has run for the first time.
+}
+
+
+GDAOptimization::GDAOptimization(Problem &p,double t,double a,int m)
+: IterOptimization(p,m) ,temperatur(t) ,alpha(a) ,gdaEndFlag(0)
+{
+  // Explicit start value t and cooling factor a.
+  // gdaEndFlag starts cleared so that end() never reads an indeterminate
+  // value before abkuehlen() has run for the first time.
+}
+
+
+GDAOptimization::GDAOptimization(GDAOptimization &o)
+: IterOptimization(o)
+  ,temperatur(o.temperatur)
+  ,alpha(o.alpha)
+  ,gdaEndFlag(o.gdaEndFlag)
+{
+  // Copies the base-class state and the three GDA members of o.
+}
+
+
+void GDAOptimization::zInitialize()
+{
+  // Base-class initialisation first; then, if temperatur still holds the
+  // 1e100 "not set" marker, start from the problem's current value.
+  IterOptimization::zInitialize();
+  if(temperatur==1e100)
+    temperatur=problem.value();
+  assert(alpha>=0);
+}
+
+short GDAOptimization::accept(double delta)
+{
+  // 1 when the value after applying delta stays below temperatur, else 0.
+  return ( curValue + delta < temperatur ) ? 1 : 0;
+}
+
+void GDAOptimization::abkuehlen()
+{
+  // "Cool down": move temperatur towards curValue by the fraction alpha.
+  // gdaEndFlag records whether this step changed temperatur by less than 1e-30.
+  const double cooled = temperatur - alpha*(temperatur - curValue);
+  gdaEndFlag = ( fabs(temperatur - cooled)<1e-30 ) ? 1 : 0;
+  temperatur = cooled;
+}
+
+short GDAOptimization::end()
+{
+  // Finished only when endFlag is positive and the last abkuehlen() step
+  // no longer changed temperatur.
+  if( !( endFlag>0 ) )
+    return 0;
+  return gdaEndFlag ? 1 : 0;
+}
+
+void GDAOptimization::makeGraphOutput()
+{
+  // Base-class output followed by the distance between temperatur and the
+  // current value.
+  IterOptimization::makeGraphOutput();
+  const double distance = temperatur-curValue;
+  *GraphOutput << distance;
+}
+
+
+
+
+double GDAOptimization::optimizeValue(Problem &p,int proParameter,int numParameter,int typ,
+ int optimierungsschritte,int print)
+{
+  // Tries numParameter+1 values of defaultAlpha (0.002 * 200^(i/numParameter))
+  // through solveProblem() and returns the one with the smallest result.
+  // Only typ==1 is supported; any other value terminates the program.
+  // Side effects: resets defaultTemperatur to 1e100 and leaves defaultAlpha
+  // at 0.03. With `print` set, one statistics line per candidate is written
+  // to cout.
+  if(typ!=1)
+  {
+    cerr << "Error: wrong parameter-type in GDAOptimization::optimizeValue ("
+         << typ << ")\n";
+    exit(1);
+  }
+
+  double bestAlpha=-1;
+  double bestValue=1e100;
+  double now;
+  if( print )
+    cout << "#GDA-optimizeValues: " << numParameter<<endl;
+
+  defaultTemperatur=1e100;
+
+  for(int step=0;step<=numParameter;step++)
+  {
+    StatVar endStat,laufzeit,init;
+    defaultAlpha = pow(pow(200,1.0/numParameter),step)*0.002;
+    solveProblem(0,p,proParameter,optimierungsschritte,GDA_OPT,now,endStat,
+                 laufzeit,init);
+    if( bestValue>now )
+    {
+      bestValue=now;
+      bestAlpha=defaultAlpha;
+    }
+    if( !print )
+      continue;
+    cout << defaultAlpha <<" ";
+    cout << endStat.getMean() << " " << endStat.quantil(0.2) << " "
+         << endStat.quantil(0.79) << " " << laufzeit.getMean() << " "
+         << endStat.quantil(0.0) << " " << endStat.getSigma() << " "
+         << endStat.getSigmaSmaller()<< " "<< endStat.getSigmaBigger()<< endl;
+  }
+  if( print )
+    cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit"
+      " Bester Sigma SigmaSmaller SigmaBigger\n";
+  defaultAlpha=0.03;
+  return bestAlpha;
+}
+
diff --git a/mkcls-v2/GDAOptimization.h b/mkcls-v2/GDAOptimization.h
new file mode 100644
index 0000000..33bcec3
--- /dev/null
+++ b/mkcls-v2/GDAOptimization.h
@@ -0,0 +1,80 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+#ifndef GDAOPTIMIZATION
+#define GDAOPTIMIZATION
+#include "IterOptimization.h"
+
+// Iterative optimisation with a threshold ("temperatur") that is lowered
+// towards the current value after each step.
+// NOTE(review): the name suggests the Great Deluge Algorithm - confirm.
+class GDAOptimization : public IterOptimization
+{
+
+ private:
+ // Acceptance threshold; 1e100 means "not set" until zInitialize() runs.
+ double temperatur;
+ // Fraction by which abkuehlen() moves temperatur towards the current value.
+ double alpha;
+ // Set by abkuehlen() once temperatur has stopped changing.
+ short gdaEndFlag;
+
+
+ protected:
+ // Base initialisation plus replacement of an unset temperatur.
+ virtual void zInitialize();
+
+
+ // 1 if the value after applying delta stays below temperatur, else 0.
+ virtual short accept(double delta);
+
+
+ // Cooling step ("abkuehlen" is German for "cool down").
+ virtual void abkuehlen();
+
+
+ // Non-zero when the optimisation is finished.
+ virtual short end();
+
+
+ // Base output followed by temperatur minus the current value.
+ virtual void makeGraphOutput();
+
+
+ public:
+ GDAOptimization(Problem &p,double temperatur,double alpha,
+ int maxIter=-1);
+
+
+ // Uses defaultTemperatur and defaultAlpha.
+ GDAOptimization(Problem &p,int maxIter=-1);
+
+
+ GDAOptimization(GDAOptimization &o);
+
+
+ // Scans candidate values for defaultAlpha and returns the best one found.
+ static double optimizeValue(Problem &p,int proParameter,
+ int numParameter,int typ,int schritte= -1,int verbose=1);
+
+
+
+ static double defaultTemperatur;
+ static double defaultAlpha;
+
+};
+#endif
+
+
diff --git a/mkcls-v2/GNU.GPL b/mkcls-v2/GNU.GPL
new file mode 100644
index 0000000..5b2225e
--- /dev/null
+++ b/mkcls-v2/GNU.GPL
@@ -0,0 +1,282 @@
+
+
+Preamble
+
+The licenses for most software are designed to take away your freedom
+to share and change it. By contrast, the GNU General Public License is
+intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the software, or if you modify it.
+
+For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on,
+we want its recipients to know that what they have is not the
+original, so that any problems introduced by others will not reflect
+on the original authors' reputations.
+
+Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at
+all.
+
+The precise terms and conditions for copying, distribution and
+modification follow.
+
+
+TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+0. This License applies to any program or other work which contains a
+notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the Program
+(independent of having been made by running the Program). Whether that
+is true depends on what the Program does.
+
+1. You may copy and distribute verbatim copies of the Program's source
+code as you receive it, in any medium, provided that you conspicuously
+and appropriately publish on each copy an appropriate copyright notice
+and disclaimer of warranty; keep intact all the notices that refer to
+this License and to the absence of any warranty; and give any other
+recipients of the Program a copy of this License along with the
+Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a
+fee.
+
+2. You may modify your copy or copies of the Program or any portion of
+it, thus forming a work based on the Program, and copy and distribute
+such modifications or work under the terms of Section 1 above,
+provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that
+ in whole or in part contains or is derived from the Program or
+ any part thereof, to be licensed as a whole at no charge to all
+ third parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you
+ provide a warranty) and that users may redistribute the program
+ under these conditions, and telling the user how to view a copy
+ of this License. (Exception: if the Program itself is interactive
+ but does not normally print such an announcement, your work based
+ on the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of
+ Sections 1 and 2 above on a medium customarily used for software
+ interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt otherwise
+to copy, modify, sublicense or distribute the Program is void, and
+will automatically terminate your rights under this License. However,
+parties who have received copies, or rights, from you under this
+License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted
+herein. You are not responsible for enforcing compliance by third
+parties to this License.
+
+
+7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+9. The Free Software Foundation may publish revised and/or new
+versions of the General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Program does not specify a
+version number of this License, you may choose any version ever
+published by the Free Software Foundation.
+
+10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the
+author to ask for permission. For software which is copyrighted by the
+Free Software Foundation, write to the Free Software Foundation; we
+sometimes make exceptions for this. Our decision will be guided by the
+two goals of preserving the free status of all derivatives of our free
+software and of promoting the sharing and reuse of software generally.
+
+NO WARRANTY
+
+11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE
+LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS
+AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF
+ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+
+12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+END OF TERMS AND CONDITIONS
diff --git a/mkcls-v2/HCOptimization.cpp b/mkcls-v2/HCOptimization.cpp
new file mode 100644
index 0000000..0c6a729
--- /dev/null
+++ b/mkcls-v2/HCOptimization.cpp
@@ -0,0 +1,57 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "HCOptimization.h"
+
+// Creates an optimizer for problem p with a step budget of m.
+// A non-positive budget is replaced by the problem's own estimate
+// (Problem::expectedNumberOfIterations()).
+HCOptimization::HCOptimization(Problem &p,int m)
+  : IterOptimization(p,m)
+{
+  const bool budgetGiven = ( maxStep>0 );
+  if( !budgetGiven )
+    {
+      maxStep=(int)(problem.expectedNumberOfIterations());
+    }
+}
+// Copy constructor. HCOptimization declares no data members of its own,
+// so copying is delegated entirely to the IterOptimization base.
+HCOptimization::HCOptimization(HCOptimization &o)
+  : IterOptimization(o)
+{ }
+
+
+// Acceptance rule: a change is taken only if it strictly lowers the
+// objective (delta < 0). Returns 1 for accept, 0 for reject.
+short HCOptimization::accept(double delta)
+{
+  return ( delta < 0 ) ? 1 : 0;
+}
+// Stop criterion: non-zero once the base class has raised endFlag.
+short HCOptimization::end()
+{
+  const bool stop = ( endFlag>0 );
+  return stop;
+}
+// Schedule hook ("abkuehlen" is German for "cool down").
+// This optimizer keeps no schedule state, so there is nothing to update.
+void HCOptimization::abkuehlen()
+{
+  // intentionally empty
+}
+
+
+
diff --git a/mkcls-v2/HCOptimization.h b/mkcls-v2/HCOptimization.h
new file mode 100644
index 0000000..ec147b2
--- /dev/null
+++ b/mkcls-v2/HCOptimization.h
@@ -0,0 +1,54 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+#ifndef HCOPTIMIZATION
+#define HCOPTIMIZATION
+#include "IterOptimization.h"
+
+// Optimizer that plugs a strict-improvement acceptance rule into the
+// IterOptimization loop (see HCOptimization.cpp: accept() returns 1 only
+// for delta < 0). It adds no data members to the base class.
+class HCOptimization : public IterOptimization
+{
+
+ protected:
+ // Returns 1 if the proposed change (objective difference delta) is taken.
+ virtual short accept(double delta);
+
+
+ // Schedule hook ("abkuehlen" = "cool down"); a no-op for this class.
+ virtual void abkuehlen();
+
+
+ // Non-zero when the loop in IterOptimization::minimize() should stop.
+ virtual short end();
+
+
+ public:
+ // maxIter <= 0 (the default) lets the constructor take the step budget
+ // from Problem::expectedNumberOfIterations().
+ HCOptimization(Problem &p,int maxIter=-1);
+
+
+ // Copy constructor; takes a non-const reference.
+ HCOptimization(HCOptimization &o);
+
+
+};
+#endif
diff --git a/mkcls-v2/IterOptimization.cpp b/mkcls-v2/IterOptimization.cpp
new file mode 100644
index 0000000..258cb1f
--- /dev/null
+++ b/mkcls-v2/IterOptimization.cpp
@@ -0,0 +1,199 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#include "IterOptimization.h"
+#include "ProblemTest.h"
+
+// Optional stream for progress curves. Zero-initialized as a global; while it
+// is null, IterOptimization::minimize() emits no graph output.
+ostream *GraphOutput;
+
+
+
+// Creates an optimizer for problem p with a step budget of m.
+// Every data member is given a defined value here (in declaration order).
+// Previously curStep, curValue, bestStep, bestValue, endFlag and endFlag2
+// stayed indeterminate until the first minimize() call ran zInitialize(),
+// so getCurStep()/getCurrentValue() could read garbage before that.
+IterOptimization::IterOptimization(Problem& p,int m)
+  : maxNonBetterIterations(0),problem(p),curStep(0),curValue(0.0),
+    bestStep(0),bestValue(0.0),maxStep(m),initialisiert(0),
+    endFlag(0),endFlag2(0)
+{
+}
+
+
+
+// Copy constructor: the copy refers to the same Problem object and takes
+// over the complete optimizer state of o.
+// curStep is now copied as well. It used to be left out, and because
+// initialisiert is copied too, zInitialize() would not run for the copy, so
+// the "steps since last improvement" test (curStep - bestStep) in minimize()
+// worked on an indeterminate value.
+IterOptimization::IterOptimization(IterOptimization& o)
+  : Optimization(),
+    maxNonBetterIterations(o.maxNonBetterIterations),
+    problem(o.problem),
+    curStep(o.curStep),
+    curValue(o.curValue),
+    bestStep(o.bestStep),
+    bestValue(o.bestValue),
+    maxStep(o.maxStep),
+    initialisiert(o.initialisiert),
+    endFlag(o.endFlag),
+    endFlag2(o.endFlag2)
+{
+}
+
+
+
+// Runs the generic propose/evaluate/accept loop and returns the objective
+// value reached (curValue).
+//   steps == 0 : only initializes (if needed) and returns the current value.
+//   steps  > 0 : performs at most that many iterations in this call.
+//   steps  < 0 : t never equals steps, so the loop runs until end() or
+//                problem.endCriterion() reports true.
+// The subclass hooks abkuehlen(), accept() and end() are called once per
+// iteration, in that order relative to the other statements below.
+double IterOptimization::minimize(int steps)
+{
+ if( !initialisiert )
+ zInitialize();
+
+ if( steps==0 )
+ return curValue;
+
+ // t counts the iterations of this call; curStep counts across calls.
+ int t=0;
+ // Interval (in steps) between two graph-output samples.
+ int every=(steps<0)?10000:(steps/1000+1);
+
+ do
+ {
+ curStep++;
+ t++;
+ // Progress line every 1000 steps, overwritten in place via '\r'.
+ if(verboseMode&&(curStep%1000==0))
+ {
+ if(steps>0)
+ cout << "Processed: " << 100.0*(curStep/(double)max(maxStep,1)) << " percent. (IterOptimization run) "
+ << curValue << " max:" << maxStep << " " << steps << " \r";
+ else
+ cout << "In step:" << curStep << " currentValue: " << curValue
+ << " bestValue: " << bestValue-curValue << " " << curStep-bestStep << ". \r";
+ cout.flush();
+ }
+
+
+ // Ask the problem for a candidate change; the object is deleted at the
+ // end of this iteration, so it is presumably heap-allocated by
+ // Problem::change() -- confirm against Problem.
+ ProblemChange *change= &(problem.change());
+
+
+ // Objective difference the candidate would cause.
+ double delta=problem.valueChange(*change);
+
+
+ // The schedule hook runs before the acceptance test.
+ abkuehlen();
+
+
+ if( accept(delta) )
+ {
+
+ problem.doChange(*change);
+
+
+ curValue+=delta;
+
+
+ // Only an improvement of more than 1e-10 counts as a new best and
+ // resets both stagnation flags.
+ if( curValue<bestValue-1e-10 )
+ {
+ bestValue=curValue;
+ bestStep=curStep;
+ endFlag2=endFlag=0;
+ }
+
+ if( verboseMode>1 )
+ cout<<"in step: "<<curStep<<" accepted with : "<<delta<<endl;
+ }
+
+ // Stagnation detection: endFlag after more than maxNonBetterIterations
+ // steps without a new best, endFlag2 after more than twice that many.
+ // Disabled when maxNonBetterIterations is not positive.
+ if(curStep - bestStep>maxNonBetterIterations && maxNonBetterIterations>0)
+ endFlag=1;
+ if(curStep - bestStep>2*maxNonBetterIterations && maxNonBetterIterations>0)
+ endFlag2=1;
+
+
+
+ // Periodic sample for the optional graph stream.
+ if( GraphOutput&&((curStep%every)==0) )
+ {
+ makeGraphOutput();
+ *GraphOutput<<" "<<delta<<endl;
+ }
+
+ delete change;
+ } while( t!=steps && (!end()) && (!problem.endCriterion()) );
+
+ // Final sample after the loop.
+ if( GraphOutput)
+ {
+ makeGraphOutput();
+ *GraphOutput<<endl;
+ }
+ return curValue;
+}
+
+
+// Resets the optimizer state from the problem: the current and best value
+// both start at problem.value(), the step counters at zero, and the
+// stagnation limit is taken from the problem.
+void IterOptimization::zInitialize()
+{
+  initialisiert = 1;
+
+  curValue  = problem.value();
+  bestValue = curValue;
+
+  maxNonBetterIterations = problem.maxNonBetterIterations();
+
+  curStep  = 0;
+  bestStep = 0;
+
+  endFlag  = 0;
+  endFlag2 = 0;
+}
+
+
+// Writes "<curStep> <curValue> " to the graph stream. GraphOutput is
+// dereferenced without a check, so callers must ensure it is non-null.
+void IterOptimization::makeGraphOutput()
+{
+  ostream &graph = *GraphOutput;
+  graph << curStep;
+  graph << " ";
+  graph << curValue;
+  graph << " ";
+}
+
+
+// Recursive interval search for a good value of a tuning parameter.
+// Each level evaluates the points at one third and two thirds of [min,max]
+// by writing the trial value into `parameter` and calling solveProblem()
+// (which presumably reads the value through the variable that `parameter`
+// refers to -- confirm in ProblemTest), then recurses into the two-thirds
+// sub-interval that contains the better point.
+//   nRun      : forwarded to solveProblem().
+//   nPar      : remaining evaluation budget; every level consumes 2.
+//   verfahren : optimization method id forwarded to solveProblem()
+//               ("Verfahren" is German for "method").
+//   bv        : receives the best mean value found on this branch.
+// Returns the parameter value belonging to bv. When nPar<=0 the interval
+// midpoint is returned without being evaluated and bv is left untouched.
+double IterOptimizationOptimizeParameter(Problem &p,
+ double &parameter,double min,double max,
+ int nRun,int nPar,int verfahren,
+ double &bv)
+{
+ if( nPar<=0 )
+ return (max+min)/2;
+
+ StatVar end1,time1,init1;
+ StatVar end2,time2,init2;
+ double mean1,mean2;
+ double par1,par2;
+
+ // First probe: one third into the interval.
+ parameter = par1 = min + (max-min)/3;
+ solveProblem(0,p,nRun,-1,verfahren,mean1,end1,time1,init1);
+ cout << parameter << " " << mean1 << " " << end1.quantil(0.0) << " " << end1.quantil(1.0) << endl;
+
+ // Second probe: two thirds into the interval.
+ parameter = par2 = min + 2*(max-min)/3;
+ solveProblem(0,p,nRun,-1,verfahren,mean2,end2,time2,init2);
+ cout << parameter << " " << mean2 << " " << end2.quantil(0.0) << " " << end2.quantil(1.0) << endl;
+
+ // Recurse into the sub-interval around the better probe. bestVal is
+ // pre-loaded with that probe's mean so that it stays valid if the
+ // recursion bottoms out immediately.
+ double bestPar,bestVal;
+ if(mean1<mean2)
+ {
+ bestVal = mean1;
+ bestPar=IterOptimizationOptimizeParameter(p,parameter,min,min+2*(max-min)/3,nRun,nPar-2,verfahren,bestVal);
+ }
+ else
+ {
+ bestVal = mean2;
+ bestPar=IterOptimizationOptimizeParameter(p,parameter,min+(max-min)/3,max,nRun,nPar-2,verfahren,bestVal);
+ }
+ // Report whichever of {probe 1, probe 2, recursive result} is best.
+ // NOTE(review): because bestVal starts at min(mean1,mean2), the first two
+ // branches need a strictly better probe than the recursion's result, so
+ // ties fall through to bestPar -- which at the bottom of the recursion is
+ // an unevaluated midpoint.
+ if( mean1<bestVal&&mean1<=mean2 )
+ {
+ bv = mean1;
+ return par1;
+ }
+ else if(mean2<bestVal && mean2<=mean1)
+ {
+ bv = mean2;
+ return par2;
+ }
+ else
+ {
+ bv = bestVal;
+ return bestPar;
+ }
+}
diff --git a/mkcls-v2/IterOptimization.h b/mkcls-v2/IterOptimization.h
new file mode 100644
index 0000000..ba39b55
--- /dev/null
+++ b/mkcls-v2/IterOptimization.h
@@ -0,0 +1,123 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef ITEROPTIMIZATION
+#define ITEROPTIMIZATION
+
+#include "Optimization.h"
+
+
+
+
+
+// German: "number of deteriorations". Not referenced in IterOptimization.cpp;
+// NOTE(review): presumably used by other optimizers -- confirm before removing.
+#define ANZ_VERSCHLECHTERUNGEN 500
+
+// Optional stream for progress curves, defined in IterOptimization.cpp.
+// minimize() writes to it only when it is non-null.
+extern ostream *GraphOutput;
+
+
+// Base class for iterative local-search optimizers. minimize() implements
+// the shared loop (propose a change, evaluate it, accept or reject it);
+// subclasses supply the acceptance rule, the schedule update and the stop
+// criterion through the pure virtual hooks below.
+class IterOptimization : public Optimization
+ {
+
+
+ private:
+ // Stagnation limit in steps; taken from the problem in zInitialize().
+ // Values <= 0 disable stagnation detection in minimize().
+ int maxNonBetterIterations;
+
+
+ protected:
+ // The problem being optimized (held by reference, not owned).
+ Problem &problem;
+ // Number of iterations performed so far (across minimize() calls).
+ int curStep;
+ // Objective value of the current state.
+ double curValue;
+ // Step at which bestValue was reached.
+ int bestStep;
+ // Lowest objective value seen so far.
+ double bestValue;
+ // Step budget passed to the constructor.
+ int maxStep;
+ // Non-zero once zInitialize() has run ("initialisiert" = "initialized").
+ int initialisiert;
+ // Set by minimize() after more than maxNonBetterIterations steps without
+ // a new best value.
+ short endFlag;
+ // Set after more than twice that many steps.
+ short endFlag2;
+
+
+
+
+ // Writes the current step and value to *GraphOutput.
+ virtual void makeGraphOutput();
+
+
+ // Hook: non-zero when minimize() should stop.
+ virtual short end()=0;
+
+
+ // Hook: per-step schedule update ("abkuehlen" = "cool down"); called
+ // before accept() in every iteration.
+ virtual void abkuehlen()=0;
+
+
+ // Hook: non-zero if a change with objective difference delta is taken.
+ virtual short accept(double delta)=0;
+
+
+ // Resets step counters, values and flags from the problem.
+ virtual void zInitialize();
+
+
+ public:
+ IterOptimization(Problem &p,int maxIter=-1);
+
+
+ // Copy constructor; takes a non-const reference and shares the Problem.
+ IterOptimization(IterOptimization &o);
+
+
+ // Runs the optimization loop; see IterOptimization.cpp for the meaning
+ // of steps (0 = initialize only, <0 = run until a stop criterion fires).
+ virtual double minimize(int steps=-1);
+
+
+ inline int getCurStep();
+
+
+ inline double getCurrentValue();
+
+
+ inline const Problem& getProblem();
+
+
+};
+
+// Recursive interval search for a tuning parameter in [min,max]; writes each
+// trial value into `parameter`, stores the best mean value in bv and returns
+// the corresponding parameter value. Defined in IterOptimization.cpp.
+double IterOptimizationOptimizeParameter(Problem &p,
+ double &parameter,double min,double max,
+ int nRun,int nPar,int verfahren,double &bv);
+
+// Returns the number of iterations performed so far.
+// (The stray ';' after the function body was an empty declaration, which
+// pedantic C++98/03 builds reject; it has been removed.)
+inline int IterOptimization::getCurStep()
+{
+  return curStep;
+}
+// Returns the objective value of the current state.
+// (The stray ';' after the function body was an empty declaration, which
+// pedantic C++98/03 builds reject; it has been removed.)
+inline double IterOptimization::getCurrentValue()
+{
+  return curValue;
+}
+// Returns read-only access to the problem being optimized.
+// (The stray ';' after the function body was an empty declaration, which
+// pedantic C++98/03 builds reject; it has been removed.)
+inline const Problem& IterOptimization::getProblem()
+{
+  return problem;
+}
+
+#endif
+
+
+
+
diff --git a/mkcls-v2/KategProblem.cpp b/mkcls-v2/KategProblem.cpp
new file mode 100644
index 0000000..7318fb6
--- /dev/null
+++ b/mkcls-v2/KategProblem.cpp
@@ -0,0 +1,1001 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "KategProblem.h"
+#include "KategProblemTest.h"
+
+#include "ProblemTest.h"
+
+// Global sigma setting defined elsewhere; copied into each KategProblem at
+// construction time.
+extern double SigmaVerfaelschung;
+
+// Lookup tables filled by the KategProblem constructor for 1 <= i < MAX_H_TABLE:
+//   h_table[i]   = i*log(i)
+//   l_table[i]   = log(i)
+//   hmy_table[i] = i*log(verfaelsche(i,sigma))
+// hmy_sigma records the sigma value that hmy_table was built with.
+double h_table[MAX_H_TABLE],l_table[MAX_H_TABLE],hmy_table[MAX_H_TABLE],hmy_sigma;
+
+// Used by the INIT_LWRW initialization: classes with index below
+// nKats*LWRW_Faktor are seeded directly with the most frequent words.
+double LWRW_Faktor=0.5;
+
+// qsort-style comparator for ints: returns a negative, zero or positive
+// value when *p is less than, equal to or greater than *j.
+// The comparison is done explicitly instead of by subtraction, because
+// `a - b` overflows (undefined behaviour, possibly the wrong sign) for
+// operands of opposite sign and large magnitude.
+static int intcompare(const void *p,const void *j)
+{
+  const int lhs = *(const int *)p;
+  const int rhs = *(const int *)j;
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+// Builds a word-classification problem for aw words and mak classes.
+// The frequency tables are sized mak+2. The constructor also (re)fills the
+// file-level lookup tables h_table, l_table and hmy_table using this
+// instance's sigma value.
+KategProblem::KategProblem(int aw,int mak,int _initialisierung,int _auswertung,
+                           int _nachbarschaft,int mindestAnzahl)
+: Problem(mak,aw,_initialisierung,_auswertung,_nachbarschaft),
+  sigmaVerfaelschung(SigmaVerfaelschung),katWasEmpty(0),nwg(mak+2),ngw(mak+2),_katOfWord(aw,-1),words(0),kats(0),
+  wordFreq(aw,mindestAnzahl),katFreq(mak+2,(_auswertung==CRITERION_MY)?SigmaVerfaelschung:0.0),
+  initLike(aw,-1)
+{
+  if( auswertung == CRITERION_MY )
+    {
+      cout << "Sigma-Verfaelschung: " << sigmaVerfaelschung << endl;
+    }
+
+  _maxComp=aw;
+  _maxCompVal=mak;
+
+  massert(katFreq.nKats>0);
+  massert(mak<=aw);
+
+  // Entries for i >= 1; index 0 of h_table/l_table is set below.
+  int idx=1;
+  while( idx<MAX_H_TABLE )
+    {
+      h_table[idx]=idx*log((double)(idx));
+      l_table[idx]=log((double)(idx));
+      hmy_table[idx]=idx*log(verfaelsche(idx,sigmaVerfaelschung));
+      idx++;
+    }
+  hmy_sigma=sigmaVerfaelschung;
+  h_table[0]=0;
+  l_table[0]=0;
+
+  // With K_BEST class selection the second search dimension collapses to 1.
+  if( katwahl()==K_BEST )
+    {
+      _maxCompVal=1;
+    }
+}
+
+// Releases the optional word-name table and the class membership table.
+// Both pointers start out null in the constructor; deleting null is harmless.
+KategProblem::~KategProblem()
+{
+  delete words;
+  delete kats;
+}
+
+// Convenience overload: initializes with strategy initTyp and passes -1 as
+// the special fixed word.
+void KategProblem::_initialize(int initTyp)
+{
+  const int noSpecialFixedWord = -1;
+  _initialize(initTyp,noSpecialFixedWord);
+}
+
+// Assigns every word to a start class according to initTyp and rebuilds the
+// class bigram counts in katFreq from the word successor lists.
+//   initTyp          : one of INIT_OTHER, INIT_RAN, INIT_AIO, INIT_FREQ,
+//                      INIT_LWRW; any other value prints an error and exits.
+//   specialFixedWord : forwarded unchanged to wordFreq.init().
+// All strategies place ordinary words in classes with index >= 2.
+// NOTE(review): classes 0 and 1 therefore look reserved -- confirm in
+// KategProblemWBC / fastPutWord.
+void KategProblem::_initialize(int initTyp,int specialFixedWord)
+
+{
+ massert(wordFreq.filled);
+ initialisierung = initTyp;
+ int i;
+
+ // Clear the class-to-class count matrix.
+ for(i=0;i<katFreq.nKats;i++)
+ for(int j=0;j<katFreq.nKats;j++)
+ katFreq.setN(i,j,0);
+
+
+
+
+ // Un-assign every word and mark the "$"-style tokens
+ // ("$", "1$", "2$", "3$", "4$") via setDollar().
+ for(i=0;i<wordFreq.nWords;i++)
+ {
+ setKatOfWord(i,-1);
+ if( strcmp(getString(i),"$")==0||strcmp(getString(i),"1$")==0||strcmp(getString(i),"2$")==0||strcmp(getString(i),"3$")==0||strcmp(getString(i),"4$")==0 )
+ wordFreq.setDollar(i);
+ }
+ wordFreq.init(specialFixedWord);
+
+
+
+
+ _maxComp=wordFreq.nTranspWords;
+
+ switch(initTyp)
+ {
+ case INIT_OTHER:
+
+ // Reuse the class assignment stored in initLike.
+ if(verboseMode>2)cout << "KategProblem::_initialize(INIT_OTHER)\n";
+ for(i=0;i<wordFreq.nWords;i++)
+ fastPutWord(i,initLike[i]);
+ break;
+ case INIT_RAN:
+
+ // Random class: within [minIndex,maxIndex] when both bounds are
+ // positive, otherwise among the classes from index 2 upwards
+ // (assuming randomInt(n) yields 0..n-1 -- TODO confirm).
+ if(verboseMode>2)cout << "KategProblem::_initialize(INIT_RAN)\n";
+ for(i=0;i<wordFreq.nWords;i++)
+ {
+ if( wordFreq.minIndex[i]>0 && wordFreq.maxIndex[i]>0 )
+ fastPutWord(i,wordFreq.minIndex[i]+randomInt(wordFreq.maxIndex[i]-wordFreq.minIndex[i]+1));
+ else
+ fastPutWord(i,2+randomInt(katFreq.nKats-2));
+ }
+
+
+ break;
+ case INIT_AIO:
+
+ // "All in one": every word goes into class 2.
+ if(verboseMode>2)cout << "KategProblem::_initialize(INIT_AIO)\n";
+ for(i=0;i<wordFreq.nWords;i++)
+ fastPutWord(i,2);
+ break;
+ case INIT_FREQ:
+
+ // Walk the words in the order given by wordFreq.absteigend
+ // ("absteigend" = "descending"): the i-th word gets class i+2, and
+ // everything beyond the last class is put into the last class.
+ if(verboseMode>2)cout << "KategProblem::_initialize(INIT_FREQ)\n";
+ for(i=0;i<wordFreq.nWords;i++)
+ {
+ int to=i+2;
+ if( to>=katFreq.nKats )
+ to=katFreq.nKats-1;
+ fastPutWord((*(wordFreq.absteigend))[i],to);
+ }
+ curComp=katFreq.nKats-2;
+ break;
+ case INIT_LWRW:
+
+ {
+ // markList[w]==1 means word w has not been placed yet.
+ Array<int> markList(wordFreq.nWords,1);
+ int to=2;
+ // Note: this i shadows the function-level i inside this case.
+ int i=0;
+ if(verboseMode>2)cout << "KategProblem::_initialize(INIT_LWRW)\n";
+ // Phase 1: classes below nKats*LWRW_Faktor each receive one word,
+ // taken in the order of wordFreq.absteigend.
+ for(to=2;to<katFreq.nKats*LWRW_Faktor;to++)
+ {
+ int w=(*(wordFreq.absteigend))[to-2];
+ fastPutWord(w,to);
+ markList[w]=0;
+ }
+ // Phase 2: for successive words (same order) fill the next class with
+ // the still-unplaced neighbours of that word, alternating between its
+ // `before` list (even i) and its `after` list (odd i). A class index
+ // is consumed only if at least one word was placed.
+ while(to<katFreq.nKats-1 && i<wordFreq.nWords)
+ {
+ int toFilled=0;
+ int word=(*(wordFreq.absteigend))[i];
+ if(i%2)
+ {
+ ManyFreq &after=wordFreq.after[word];
+ for(int j=0;j<after.size();j++)
+ {
+ int w=after[j].w;
+ if( markList[w] )
+ fastPutWord(w,to),toFilled++;
+ markList[w]=0;
+ }
+ }
+ else
+ {
+ ManyFreq &before=wordFreq.before[word];
+ for(int j=0;j<before.size();j++)
+ {
+ int w=before[j].w;
+ if( markList[w] )
+ fastPutWord(w,to),toFilled++;
+ markList[w]=0;
+ }
+ }
+ i++;
+ if( toFilled>0 )
+ to++;
+ }
+ // Phase 3: everything still unplaced goes into the last class.
+ for(i=0;i<wordFreq.nWords;i++)
+ if(markList[i])
+ fastPutWord(i,katFreq.nKats-1);
+ }
+ break;
+ default:
+ cerr << "Wrong _initialize in KategProblem: " << initTyp << endl;
+ exit(1);
+ }
+
+
+
+ // Rebuild the class bigram counts: every (word, successor) pair with
+ // count n adds n to the cell (class of word, class of successor).
+ for(int word=0;word<wordFreq.nWords;word++)
+ {
+ Array<OneFreq>& aft=wordFreq.after[word];
+
+ int nAft=aft.size();
+
+ for(i=0;i<nAft;i++)
+ katFreq.addN(katOfWord(word),katOfWord(aft[i].w),aft[i].n);
+ }
+
+ if(verboseMode>2)
+ {
+ cout << "\nInitialization of KategProblem:";
+ dumpOn(cout);
+ }
+}
+
+// Returns the objective difference that applying change c would cause.
+// Counts the call in numberOfPartEvaluations and refreshes the per-word
+// neighbour counts (fillNWG) for the affected word before evaluating.
+// c is treated as a KategProblemChange.
+double KategProblem::valueChange(ProblemChange&c)
+{
+  numberOfPartEvaluations++;
+
+  KategProblemChange *katChange=(KategProblemChange *)&c;
+  fillNWG(katChange->word);
+
+  return _valueChange(*katChange);
+}
+
+
+// Creates a heap-allocated copy of this problem: same dimensions and
+// settings, the same successor frequencies, word names (if any), class
+// assignment and initLike table. The caller receives the new object.
+Problem *KategProblem::makeEqualProblem()
+{
+  KategProblem *copy = new KategProblem(wordFreq.nWords,katFreq.nKats-2,initialisierung,
+                                        auswertung,nachbarschaft);
+  KategProblemWBC &target=copy->wordFreq;
+
+  // First size the neighbour lists of every word ...
+  for(int word=0;word<wordFreq.nWords;word++)
+    {
+      target.setAfterWords(word,wordFreq.after[word].size());
+      target.setBeforeWords(word,wordFreq.before[word].size());
+    }
+
+  // ... then transfer the individual successor counts.
+  int w;
+  for(w=0;w<wordFreq.nWords;w++)
+    {
+      for(int k=0;k<wordFreq.after[w].size();k++)
+        {
+          target.setFreq(w,wordFreq.after[w][k].w,wordFreq.after[w][k].n);
+        }
+    }
+  target.testFull();
+  target.mindestAnzahl = wordFreq.mindestAnzahl;
+
+  if( words )
+    {
+      copy->words = new leda_array<string>(*words);
+    }
+
+  for(w=0;w<wordFreq.nWords;w++)
+    {
+      copy->setKatOfWord(w,katOfWord(w));
+      copy->initLike[w]=initLike[w];
+    }
+
+  copy->setValuesFrom(this);
+  return copy;
+}
+
+// Converts an objective value into the reported figure exp((v+h-k)/n), where
+// h = wordFreq.get_h_of_words(), n = wordFreq.numberOfWords() and k is
+// katFreq.myCriterionTerm() for CRITERION_MY (0 otherwise).
+// Passing the sentinel 1e100 for val means "use the current value()".
+double KategProblem::nicevalue(double val)
+{
+  const double v = ( val!=1e100 ) ? val : value();
+  const double h = wordFreq.get_h_of_words();
+  const double n = wordFreq.numberOfWords();
+
+  double k = 0;
+  if( auswertung == CRITERION_MY )
+    {
+      k = katFreq.myCriterionTerm();
+    }
+
+  return exp((v+h-k)/n);
+}
+
+// Rebuilds `kats`, the per-class sets of word indices, from the current
+// word-to-class assignment. Any previous table is discarded first.
+void KategProblem::makeKats()
+{
+  // delete on a null pointer is a no-op, so no explicit check is needed.
+  delete kats;
+  kats = new leda_array<intSet>(katFreq.nKats);
+
+  for(int word=0;word<wordFreq.nWords;word++)
+    {
+      (*kats)[katOfWord(word)].insert(word);
+    }
+}
+
+// Writes a one-line summary (";KategProblem:cats: <nKats-2> words: <nWords>")
+// followed by endl to strm.
+void KategProblem::dumpInfos(ostream &strm)
+{
+  strm << ";KategProblem:" << "cats: " << katFreq.nKats-2
+       << " words: " << wordFreq.nWords << endl;
+}
+
+// Writes the class assignment to strm via writeClasses(). If the optional
+// stream PrintBestTo2 is set, additionally writes a detailed listing there:
+// the summary line, one line per class with its members (indices, or names
+// when a word table is present), the number of non-empty classes, and
+// finally the base-class dump.
+void KategProblem::dumpOn(ostream &strm)
+
+{
+ writeClasses(_katOfWord,*this,strm);
+ if(PrintBestTo2)
+ {
+ dumpInfos(*PrintBestTo2);
+ makeKats();
+ // NOTE(review): makeKats() assigns a freshly allocated table to kats, so
+ // this kats==0 branch looks unreachable -- confirm and consider removing.
+ if( kats==0 )
+ {
+ if( words==0 )
+ {
+ for(int i=0;i<wordFreq.nWords;i++)
+ {
+ *PrintBestTo2 << i << ":" << katOfWord(i) << " ";
+ }
+ }
+ else
+ {
+ for(int i=0;i<wordFreq.nWords;i++)
+ *PrintBestTo2 << (*words)[i] << ":" << katOfWord(i) << " ";
+ }
+ }
+ else
+ {
+ // anzkat counts the classes that contain at least one word
+ // ("Anzahl Kategorien" = "number of categories").
+ int anzkat=0;
+ for(int i=0;i<katFreq.nKats;i++)
+ {
+ int printed=0;
+ *PrintBestTo2 << i << ":";
+ leda_set<int>&theSet = (*kats)[i];
+ if( words==0 )
+ {
+ // No word names available: print indices, separated by ", ".
+ int nr=0;
+ forall_set(leda_set<int>,nr,theSet)
+ {
+ *PrintBestTo2 << nr << ", ";
+ printed=1;
+ }
+ }
+ else
+ {
+ // Print word names, separated by "," (no space).
+ int nr=0;
+ forall_set(leda_set<int>,nr,theSet)
+ {
+ *PrintBestTo2 << (*words)[nr]<< ",";
+ printed=1;
+ }
+ }
+ if(printed==1)anzkat++;
+ *PrintBestTo2 << endl;
+ }
+ *PrintBestTo2 << ";I have " << anzkat << " categories used.\n";
+ }
+ *PrintBestTo2 << endl;
+ Problem::dumpOn(*PrintBestTo2);
+ }
+}
+
+
+
+
+
+
+const char *KategProblem::getString(int i)
+// Returns the C string of word i, or the placeholder "<>" when no word list is attached.
+{
+ if(words!=0)
+ return ((*words)[i]).c_str();
+ // No word list: hand back the fixed placeholder literal.
+ return "<>";
+}
+
+string KategProblem::getTheString(int i) // Returns word i by value; yields "<>" when no word list is attached, matching getString().
+{
+ return words==0 ? string("<>") : (*words)[i]; // guard added: words was dereferenced unconditionally although it may be null
+}
+
+int KategProblem::maxNonBetterIterations()
+// Returns wordFreq.nTranspWords under K_BEST; for every other category selection that number times katFreq.nKats.
+{
+ if(katwahl()!=K_BEST)
+ return katFreq.nKats*wordFreq.nTranspWords;
+ // K_BEST: the count is not multiplied by the number of categories.
+ return wordFreq.nTranspWords;
+}
+
+int KategProblem::expectedNumberOfIterations()
+// Returns 10*nTranspWords under K_BEST and 13*nKats*nTranspWords for the other category selections.
+{
+ if(katwahl()!=K_BEST)
+ return 13*katFreq.nKats*wordFreq.nTranspWords;
+ // K_BEST: the estimate does not scale with the number of categories.
+ return 10*wordFreq.nTranspWords;
+
+}
+
+void KategProblem::makeTitle(char x[512])
+// Formats a short description of the configuration (category and word counts, word/category selection, initialisation) into x.
+{
+ const char *ww; // label of the word-selection mode; const because it points at string literals
+ const char *kw; // label of the category-selection mode
+ const char *in; // label of the initialisation mode
+ switch(wortwahl())
+ {
+ case W_RAN:
+ ww="zufaellig";
+ break;
+ case W_DET_DECR:
+ ww="absteigend";
+ break;
+ case W_DET_INCR:
+ ww="aufsteigend";
+ break;
+ default:
+ cerr << "Error: unknown word selection\n";
+ exit(1);
+ }
+ switch(katwahl())
+ {
+ case K_DET:
+ kw="rotierend";
+ break;
+ case K_RAN:
+ kw="zufaellig";
+ break;
+ case K_BEST:
+ kw="best ";
+ break;
+ default:
+ cerr << "Error: unknown category selection\n"; // reported on cerr like the word-selection error above
+ exit(1);
+ }
+ switch(initialisierung)
+ {
+ case INIT_RAN:
+ in="zufaellig ";
+ break;
+ case INIT_AIO:
+ in="all-in-one";
+ break;
+ case INIT_LWRW:
+ in="lwrw ";
+ break;
+ case INIT_FREQ:
+ in="freq ";
+ break;
+ case INIT_OTHER:
+ in="other ";
+ break;
+ default:
+ cerr << "Error: unknown initialization\n";
+ exit(1);
+ }
+ snprintf(x,512,"(c:%d,w:%d(%d),ww:%s,kw:%s,in:%s)",katFreq.nKats,wordFreq.nWords, // bounded by the 512 bytes stated in the signature
+ wordFreq.nTranspWords,ww,kw,in);
+}
+
+
+
+
+int KategProblem::_change(ProblemChange **p)
+// Proposes the next move (word -> category). Returns 1 with the change stored in *p, or 0 with *p==0 when no move is proposed.
+{
+ *p=0;
+ int word=curDimension();
+ switch( wortwahl() ) // map the current dimension onto a word id through wordFreq.absteigend
+ {
+ case W_RAN:
+ word=(*(wordFreq.absteigend))[randomInt(wordFreq.nTranspWords)];
+ break;
+ case W_DET_DECR:
+ word=(*(wordFreq.absteigend))[word];
+ break;
+ case W_DET_INCR:
+ word=(*(wordFreq.absteigend))[wordFreq.nTranspWords-word-1];
+ break;
+ default:
+ cerr << "Error: Unknown word selection\n";
+ exit(1);
+ }
+
+ int kat=curDimensionVal()+2; // target categories start at index 2; 0 and 1 are never proposed
+ switch( katwahl() )
+ {
+ case K_RAN:
+ kat=randomInt(katFreq.nKats-2)+2;
+ // deliberate fall-through: a random target goes through the same checks as K_DET
+ case K_DET:
+
+ // reject: target equals the current category, or an empty target after one was already used in this direction
+ if( kat==katOfWord(word)||(katWasEmpty&&katFreq.n1(kat)==0) )
+ return 0;
+ else if( wordFreq.minIndex[word]>0 && wordFreq.maxIndex[word]>0 && (kat<wordFreq.minIndex[word]||kat>wordFreq.maxIndex[word]))
+ {
+ // reject: the word has an index window and the target lies outside it
+ return 0;
+ }
+ else
+ {
+ KategProblemChange *c = new KategProblemChange;
+ c->toKat=kat;
+ c->word=word;
+ c->fromKat=katOfWord(c->word);
+ massert( c->toKat < katFreq.nKats );
+ massert( c->fromKat < katFreq.nKats );
+ massert( c->word < wordFreq.nWords );
+ massert( c->toKat!=0 && c->toKat!=1 );
+ massert( c->fromKat!=0 && c->fromKat!=1 );
+ if(katFreq.n1(kat)==0)
+ katWasEmpty=1;
+ *p=c;
+ return 1;
+ }
+ break;
+ case K_BEST:
+ {
+ fillNWG(word); // _valueChange reads nwg/ngw, so they must describe this word
+ double smallest=1e100; // 1e100 doubles as the "nothing found yet" sentinel
+ KategProblemChange &smallestChange = *new KategProblemChange;
+ short withEmpty=0; // set once an empty category has been passed; later empty ones are skipped
+
+ // scan every admissible target category and keep the one with the smallest value change
+ int startKat=2;
+ int endKat=katFreq.nKats;
+ if( wordFreq.minIndex[word]>0&&wordFreq.maxIndex[word]>0 )
+ {
+ startKat = max(2,wordFreq.minIndex[word]);
+ endKat = min(katFreq.nKats,wordFreq.maxIndex[word]+1);
+ }
+ for(kat=startKat;kat<endKat;kat++)
+ {
+ if( kat!=katOfWord(word) && (withEmpty==0 || katFreq.n1(kat)
+ || katFreq.n2(kat)) )
+ {
+ KategProblemChange c;
+ c.toKat=kat;
+ c.word=word;
+ c.fromKat=katOfWord(word);
+ double n=_valueChange(c);
+ if(n<smallest)
+ {
+ smallest=n;
+ smallestChange=c;
+ }
+ }
+ if( katFreq.n1(kat)==0 && katFreq.n2(kat)==0 )
+ withEmpty=1;
+ }
+ if( smallest==1e100 ) { delete &smallestChange; return 0; } // no candidate found: release the change object and report "no move" instead of returning a stale change
+ *p= &smallestChange;
+ return 1;
+ }
+ break;
+ default:
+ cerr << "Error: Unknown category selection\n";
+ exit(1);
+ return 0;
+ }
+}
+
+void KategProblem::_doChange(ProblemChange &c)
+// Applies a move: the change's word is put into its target category (toKat).
+{
+ KategProblemChange &move=static_cast<KategProblemChange &>(c);
+ putWord(move.word,move.toKat);
+ // The downcast is unchecked: c must really be a KategProblemChange.
+}
+
+void KategProblem::_undoChange(ProblemChange &c)
+// Reverts a move: the change's word is put back into its source category (fromKat).
+{
+ KategProblemChange &move=static_cast<KategProblemChange &>(c);
+ putWord(move.word,move.fromKat);
+ // The downcast is unchecked: c must really be a KategProblemChange.
+}
+
+void KategProblem::incrementDirection()
+// Advances via Problem::incrementDirection() and clears katWasEmpty, the flag _change sets when it proposes an empty category.
+{
+ Problem::incrementDirection();
+ katWasEmpty=0;
+ massert( _maxComp==wordFreq.nTranspWords );
+}
+
+double KategProblem::_value()
+// Returns the criterion value computed from scratch by katFreq.fullBewertung for the configured criterion (auswertung).
+{
+
+ return katFreq.fullBewertung(auswertung);
+}
+
+
+double mkat_h_full(int n,double tf)
+{
+ // Returns n*log(tf); a tf that is not strictly positive contributes 0.
+ if( !(tf>0) )
+ return 0.0;
+ // tf is strictly positive here, so the logarithm is defined.
+ const double logTf=log(tf);
+ return n*logTf;
+}
+
+double mkat_h_part(int n,double cf)
+{
+ // Returns n*log(cf); a cf that is not strictly positive contributes 0.
+ if( !(cf>0.0) )
+ return 0.0;
+ // cf is strictly positive here, so the logarithm is defined.
+ const double logCf=log(cf);
+ return n*logCf;
+}
+
+double KategProblem::kat_h_full(int n) // mkat_h_full of n and verfaelsche(n,sigmaVerfaelschung)
+{
+ return mkat_h_full(n,verfaelsche(n,sigmaVerfaelschung));
+}
+double KategProblem::kat_h_full(double n) // double overload: aborts immediately
+{
+ abort();
+ return mkat_h_full((int)n,verfaelsche(n,sigmaVerfaelschung)); // not reached because of the abort() above
+}
+
+double KategProblem::kat_h_part(int n) // mkat_h_part of n and verfaelsche(n,sigmaVerfaelschung)
+{
+ return mkat_h_part(n,verfaelsche(n,sigmaVerfaelschung));
+}
+double KategProblem::kat_h_part(double n) // double overload: aborts immediately
+{
+ abort();
+ return mkat_h_part((int)n,verfaelsche(n,sigmaVerfaelschung)); // not reached because of the abort() above
+}
+
+
+
+
+double KategProblem::nmo_my(int i,int j)
+// kat_h_full(new count) - kat_h_full(old count) for cell (i,j), where new = katFreq.n(i,j) + nstrich(i,j).
+{
+ FreqType n=nstrich(i,j),k=katFreq.n(i,j);
+ return kat_h_full(n+k)-kat_h_full(k);
+}
+double KategProblem::nmo(int i,int j)
+// kat_h(new count) - kat_h(old count) for cell (i,j), where new = katFreq.n(i,j) + nstrich(i,j).
+{
+ FreqType n=nstrich(i,j),k=katFreq.n(i,j);
+ return kat_h(n+k)-kat_h(k);
+}
+double KategProblem::nmo_lo(int i,int j,int &e0,int &e1)
+// Like nmo, but with the term n*kat_mlog(n-1-rhoLo); also adjusts e0/e1 when cell (i,j) enters or leaves the values 0 and 1.
+{
+ FreqType kij=katFreq.n(i,j); // count before the move
+ FreqType nij=nstrich(i,j)+kij; // count after the move
+ if( kij!=nij)
+ {
+ if( nij==0 ) // new value: one more cell equal to 0 or 1
+ e0++;
+ else if(nij==1)
+ e1++;
+ if( kij==0 ) // old value: one fewer cell equal to 0 or 1
+ e0--;
+ else if(kij==1)
+ e1--;
+ }
+ return nij*kat_mlog(nij-1-rhoLo)-kij*kat_mlog(kij-1-rhoLo);
+}
+
+
+double KategProblem::_valueChange(KategProblemChange &k)
+// Returns the change of the criterion value that moving k.word from k.fromKat to k.toKat would cause.
+{
+ double v=0;
+ int i=0;
+ // NOTE(review): reads nwg/ngw/nww as left by fillNWG; presumably the caller has filled them for k.word -- confirm
+ ursprung=k.fromKat; // nstrich() and the nmo* helpers read ursprung/ziel
+ ziel=k.toKat;
+
+ if( auswertung==CRITERION_LO )
+ {
+ int e0a=katFreq.eta0,e1a=katFreq.eta1; // working copies of eta0/eta1, updated by nmo_lo
+ v-=nmo_lo(ursprung,ursprung,e0a,e1a)+nmo_lo(ziel,ziel,e0a,e1a) // the four cells formed by source and target themselves
+ +nmo_lo(ursprung,ziel,e0a,e1a)+nmo_lo(ziel,ursprung,e0a,e1a);
+ i=0;
+ while(i<nwg.anzNot0) // remaining cells in the rows of source and target
+ {
+ int cl=nwg.not0[i];
+ if( cl!= ursprung && cl!=ziel )
+ v -= nmo_lo(ursprung,cl,e0a,e1a)+nmo_lo(ziel,cl,e0a,e1a);
+ i++;
+ }
+ i=0;
+ while(i<ngw.anzNot0) // remaining cells in the columns of source and target
+ {
+ int cl=ngw.not0[i];
+ if( cl!= ursprung && cl!=ziel )
+ v -= nmo_lo(cl,ursprung,e0a,e1a)+nmo_lo(cl,ziel,e0a,e1a);
+ i++;
+ }
+ // marginal terms: n1/n2 of source shrink and n1/n2 of target grow by the word's own n1/n2
+ v+=kat_hlo(katFreq.n1(ursprung)-wordFreq.n1(k.word))
+ -kat_hlo(katFreq.n1(ursprung))
+ +kat_hlo(katFreq.n2(ursprung)-wordFreq.n2(k.word))
+ -kat_hlo(katFreq.n2(ursprung))
+ +kat_hlo(katFreq.n1(ziel)+wordFreq.n1(k.word))
+ -kat_hlo(katFreq.n1(ziel))
+ +kat_hlo(katFreq.n2(ziel)+wordFreq.n2(k.word))
+ -kat_hlo(katFreq.n2(ziel));
+ // old0/new0: the same expression fullBewertung passes to kat_etaFkt, before and after the move
+ int old0=katFreq.c1_0*katFreq.nKats+katFreq.c2_0*katFreq.nKats
+ -katFreq.c1_0*katFreq.c2_0;
+ int nc1_0=katFreq.c1_0,nc2_0=katFreq.c2_0;
+ if( wordFreq.n1(k.word)>0 && katFreq.n1(ursprung)==wordFreq.n1(k.word) ) // source n1 drops to 0
+ nc1_0++;
+ if( wordFreq.n2(k.word)>0 && katFreq.n2(ursprung)==wordFreq.n2(k.word) ) // source n2 drops to 0
+ nc2_0++;
+ if( wordFreq.n1(k.word)>0 && katFreq.n1(ziel)==0 ) nc1_0--; // target n1 leaves 0
+ if( wordFreq.n2(k.word)>0 && katFreq.n2(ziel)==0 ) nc2_0--; // target n2 leaves 0
+ int new0=nc1_0*katFreq.nKats+nc2_0*katFreq.nKats-nc1_0*nc2_0;
+ v-=kat_etaFkt(e0a,e1a,new0,katFreq.nKats)
+ -kat_etaFkt(katFreq.eta0,katFreq.eta1,old0,katFreq.nKats);
+ vassert(NULLFLOAT(Problem::valueChange(k)-v));
+ }
+ else if(auswertung==CRITERION_ML)
+ {
+ v-=nmo(ursprung,ursprung)+nmo(ziel,ziel) // same traversal as the LO branch, using nmo and kat_h
+ +nmo(ursprung,ziel)+nmo(ziel,ursprung);
+ i=0;
+ while(i<nwg.anzNot0)
+ {
+ int cl=nwg.not0[i];
+ if( cl!= ursprung && cl!=ziel )
+ v -= nmo(ursprung,cl)+nmo(ziel,cl);
+ i++;
+ }
+ i=0;
+ while(i<ngw.anzNot0)
+ {
+ int cl=ngw.not0[i];
+ if( cl!= ursprung && cl!=ziel )
+ v -= nmo(cl,ursprung)+nmo(cl,ziel);
+ i++;
+ }
+ v+=kat_h(katFreq.n1(ursprung)-wordFreq.n1(k.word))
+ -kat_h(katFreq.n1(ursprung))
+ +kat_h(katFreq.n2(ursprung)-wordFreq.n2(k.word))
+ -kat_h(katFreq.n2(ursprung))
+ +kat_h(katFreq.n1(ziel)+wordFreq.n1(k.word))
+ -kat_h(katFreq.n1(ziel))
+ +kat_h(katFreq.n2(ziel)+wordFreq.n2(k.word))
+ -kat_h(katFreq.n2(ziel));
+ }
+ else if( auswertung==CRITERION_MY )
+ {
+ v-=nmo_my(ursprung,ursprung)+nmo_my(ziel,ziel) // same traversal again, using nmo_my and kat_h_part
+ +nmo_my(ursprung,ziel)+nmo_my(ziel,ursprung);
+ i=0;
+ while(i<nwg.anzNot0)
+ {
+ int cl=nwg.not0[i];
+ if( cl!= ursprung && cl!=ziel )
+ v -= nmo_my(ursprung,cl)+nmo_my(ziel,cl);
+ i++;
+ }
+ i=0;
+ while(i<ngw.anzNot0)
+ {
+ int cl=ngw.not0[i];
+ if( cl!= ursprung && cl!=ziel )
+ v -= nmo_my(cl,ursprung)+nmo_my(cl,ziel);
+ i++;
+ }
+ v+=kat_h_part(katFreq.n1(ursprung)-wordFreq.n1(k.word))
+ -kat_h_part(katFreq.n1(ursprung))
+ +kat_h_part(katFreq.n2(ursprung)-wordFreq.n2(k.word))
+ -kat_h_part(katFreq.n2(ursprung))
+ +kat_h_part(katFreq.n1(ziel)+wordFreq.n1(k.word))
+ -kat_h_part(katFreq.n1(ziel))
+ +kat_h_part(katFreq.n2(ziel)+wordFreq.n2(k.word))
+ -kat_h_part(katFreq.n2(ziel));
+ double bishZusatz = katFreq.myCriterionTerm(); // extra term before the move
+ _doChange(k); // the move is really applied here ...
+ double neuZusatz = katFreq.myCriterionTerm(); // ... to read the extra term after it ...
+ _undoChange(k); // ... and then reverted
+ if(verboseMode>2)
+ cout << "ZUSATZ: " << bishZusatz << " " << neuZusatz << " " <<neuZusatz-bishZusatz<<" " << v << endl;
+ v+=neuZusatz-bishZusatz;
+ }
+ else
+ {
+ cerr << "Fatal error: Unknown criterion: '"<<auswertung<<"'\n"; // NOTE(review): despite the wording, execution continues and v (0) is returned
+ }
+ vassert( NULLFLOAT(Problem::valueChange(k)-v) );
+ return v;
+}
+
+
+void KategProblem::fillNWG(int w)
+// Fills nwg, ngw and nww for word w; does nothing when they already describe w (nwgWord==w).
+{
+ if(nwgWord==w)
+ return;
+ else
+ {
+ Array<OneFreq> &after=wordFreq.after[w];
+ int size=after.size(),i;
+ nww=0;
+ nwg.init();
+ for(i=0;i<size;i++) // nwg: counts of w's after entries, summed per category of the entry's word
+ {
+ nwg.addFreq(katOfWord(after[i].w),after[i].n);
+ if(after[i].w==w) // entry that pairs w with itself
+ nww=after[i].n;
+ }
+ // ngw: the same aggregation over w's before entries
+ Array<OneFreq> &before=wordFreq.before[w];
+ size=before.size();
+ ngw.init();
+ for(i=0;i<size;i++)
+ ngw.addFreq(katOfWord(before[i].w),before[i].n);
+ nwgWord=w; // remember which word the tables describe; setKatOfWord resets this to -1
+ }
+}
+
+void KategProblem::vnstrich(int i,int j)
+// Prints to cout katFreq.n(i,j) followed by the individual terms, labelled a) to h), under the same conditions nstrich(i,j) uses.
+{
+ cout << ".) " << katFreq.n(i,j) << " ";
+ if( i==ursprung )
+ cout << "a) "<<-nwg.getFreq(j) << " ";
+ if( i==ziel )
+ cout << "b) " <<nwg.getFreq(j) << " ";
+
+ if( j==ursprung )
+ cout << "c) " <<-ngw.getFreq(i) << " ";
+ if( j==ziel )
+ cout << "d) " <<+ngw.getFreq(i) << " " ;
+
+ if( i==ursprung && j==ursprung )
+ cout << "e) " <<+nww << " ";
+ if( i==ziel && j==ziel )
+ cout << "f) " <<+nww << " " ;
+ if( i==ursprung && j==ziel )
+ cout << "g) " <<-nww << " ";
+ if( i==ziel && j==ursprung )
+ cout << "h) " <<-nww << " ";
+}
+
+
+
+
+
+void KategProblem::fastPutWord(int word,int toKat)
+// Assigns a category to a word that has none yet (katOfWord(word)==-1); unlike putWord it does not touch katFreq.
+{
+ massert(toKat>=0 && toKat<katFreq.nKats);
+
+
+ // a non-negative fixedWord entry overrides the requested category
+ if( wordFreq.fixedWord[word]>=0 )
+ toKat=wordFreq.fixedWord[word];
+ massert(katOfWord(word)==-1);
+ setKatOfWord(word,toKat);
+}
+
+void KategProblem::fixInitLike() // Turns every non-negative initLike entry into a wordFreq.fixedWord entry and resets the initLike entry to -1.
+{
+ int fixed=0,fixed2=0; // fixed: entries converted; fixed2: those that were shifted by minIndex
+ over_arr(initLike,i) // NOTE(review): over_arr presumably iterates i over all indices of initLike -- confirm
+ if(initLike[i]>=0 )
+ {
+ fixed++;
+ if( initLike[i]>=wordFreq.minIndex[i] || initLike[i]==1 ) // value is taken as is
+ wordFreq.fixedWord[i]=initLike[i];
+ else
+ {
+ wordFreq.fixedWord[i]=wordFreq.minIndex[i]+initLike[i]-2; // value is shifted to minIndex[i]+value-2
+ fixed2++;
+ }
+ initLike[i]=-1;
+ }
+ cout << "Fixed from file are: " << fixed << " " << fixed2 << " words.\n";
+}
+
+void KategProblem::putWord(int word,int toKat)
+// Moves word from its current category to toKat and updates the counts in katFreq accordingly.
+{
+ massert(toKat!=0);massert(toKat!=1);
+ massert(word<wordFreq.nWords);
+ massert(toKat<katFreq.nKats);
+ massert(wordFreq.fixedWord[word]<0);
+ int k=katOfWord(word); // current (source) category
+ massert(k!=0&&k!=1);
+ Array<OneFreq>& aft=wordFreq.after[word];
+ Array<OneFreq>& bef=wordFreq.before[word];
+ int nAft=aft.size();
+ int nBef=bef.size();
+ int i;
+ if(verboseMode>4)
+ cout << "putWord(" << word << "," << toKat << ")" << k << " nAft"
+ << nAft << " nBef" << nBef << " k" << k << "\n";
+
+ massert( k!=-1 );
+ massert( k!=toKat );
+ // step 1: subtract the word's counts while it still belongs to k
+ for(i=0;i<nAft;i++)
+ {
+ katFreq.addN(k,katOfWord(aft[i].w),-aft[i].n);
+ if(verboseMode>4)
+ cout << k << " " << katOfWord(aft[i].w) << " " << -aft[i].n << endl;
+ }
+ for(i=0;i<nBef;i++)
+ if( bef[i].w!=word ) // the before list skips the entry pairing word with itself
+ {
+ katFreq.addN(katOfWord(bef[i].w),k,-bef[i].n);
+ if(verboseMode>4)
+ cout << katOfWord(bef[i].w) << " " << k << " " << -bef[i].n << endl;
+ }
+ // step 2: switch the assignment
+ setKatOfWord(word,toKat);
+ // step 3: add the same counts back under toKat
+ for(i=0;i<nAft;i++)
+ katFreq.addN(toKat,katOfWord(aft[i].w),aft[i].n);
+ for(i=0;i<nBef;i++)
+ if( bef[i].w!=word )
+ katFreq.addN(katOfWord(bef[i].w),toKat,bef[i].n);
+
+}
+
+
+
+
+
+
+
+
+
+
+static KategProblemChange theOneKategProblemChange; // static instance returned by KategProblemChange::operator new for the first live object
+static int anzKategProblemChange=0; // incremented by operator new, decremented by operator delete
+
+void *KategProblemChange::operator new(size_t size) // Returns the static instance for the first live object and a malloc'ed block for any further one.
+{
+ anzKategProblemChange++;
+ massert(anzKategProblemChange>0);
+ massert(anzKategProblemChange<2); // more than one live object is treated as an error when massert is active
+ if( anzKategProblemChange==1 )
+ return &theOneKategProblemChange;
+ else
+ {
+ if( verboseMode>1 )
+ cout << "generate instance of KategProblemChange: " << size
+ << " " << anzKategProblemChange<< endl;
+ return malloc(size); // NOTE(review): a failed malloc returns a null pointer that is not checked here
+ }
+}
+void KategProblemChange::operator delete(void *ptr,size_t size)
+// Counterpart of KategProblemChange::operator new: the static instance is never freed, only malloc'ed blocks are.
+{ (void)size; massert(size==sizeof(KategProblemChange)); // the parameter was unnamed, so this check could not compile with massert enabled
+ anzKategProblemChange--;
+ if( ptr!= &theOneKategProblemChange)
+ free(ptr);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+NWG::NWG(int n) : freq(n,0),timeOfFreq(n,0),not0(n),word(-1) // n slots; every time stamp starts at 0
+{
+ massert(n>0);
+ curTime=1;
+ init(); // bumps curTime to 2, so all stamps (0) count as stale and every slot reads as 0
+}
+
+void NWG::init() // Empties the table without touching the arrays: a new curTime makes every stored stamp stale.
+{
+ curTime++;
+ anzNot0=0;
+}
+
+void NWG::sort() // Sorts the first anzNot0 entries of not0 with intcompare.
+{
+ massert(anzNot0<=not0.size()); // validate the element count before qsort uses it, not afterwards
+ qsort(not0.getPointerToData(),anzNot0,sizeof(int),intcompare);
+}
+
+
+int KategProblem::maxDimension() // Returns _maxComp (asserted equal to wordFreq.nTranspWords in incrementDirection).
+{
+ return _maxComp;
+}
+
+int KategProblem::maxDimensionVal() // Returns _maxCompVal, which katwahl(int) sets to 1 for K_BEST and to nKats-2 otherwise.
+{
+ return _maxCompVal;
+}
+
diff --git a/mkcls-v2/KategProblem.h b/mkcls-v2/KategProblem.h
new file mode 100644
index 0000000..e5a5a46
--- /dev/null
+++ b/mkcls-v2/KategProblem.h
@@ -0,0 +1,439 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+#ifndef KATEG_OPT_H
+#define KATEG_OPT_H
+#include <string>
+
+#include <stdlib.h>
+#include "Problem.h"
+
+extern double rhoLo; // constant subtracted inside the CRITERION_LO terms; defined in KategProblemKBC.cpp
+
+typedef int Kategory; // category index
+typedef int Word; // word index
+
+
+// FreqType is the count type used by the frequency tables: double if FREQTYPE_DOUBLE is defined, int otherwise.
+#ifdef FREQTYPE_DOUBLE
+typedef double FreqType;
+#else
+typedef int FreqType;
+#endif
+
+
+#include "KategProblemWBC.h"
+
+
+#include "KategProblemKBC.h"
+
+
+enum { // initialisation modes; KategProblem::makeTitle switches over these values of initialisierung
+ INIT_RAN=1,
+ INIT_AIO=2,
+ INIT_LWRW=3,
+ INIT_FREQ=4,
+ INIT_OTHER=5
+ };
+
+
+enum { // word-selection modes; compared by equality against nachbarschaft&CHOOSE_WORD
+ W_RAN=(8|16), // shares bit 16 with W_DET_DECR, so the masked value must be compared as a whole
+ W_DET_DECR=(16),
+ W_DET_INCR =(32)
+};
+#define CHOOSE_WORD (8|16|32) // mask of all word-selection bits inside nachbarschaft
+
+
+enum { // category-selection modes; compared by equality against nachbarschaft&CHOOSE_KAT
+ K_DET=(64),
+ K_RAN=(128),
+ K_BEST=(64|128) // both bits set, so the masked value must be compared as a whole
+};
+#define CHOOSE_KAT (64|128) // mask of all category-selection bits inside nachbarschaft
+
+
+enum { // optimisation criteria; selects the branch in fullBewertung and _valueChange
+ CRITERION_ML=0,
+ CRITERION_LO=1,
+ CRITERION_MY=2
+};
+
+
+
+class NWG // Per-category frequency accumulator that can be emptied without clearing its arrays (see curTime).
+{
+ private:
+ Array<FreqType> freq; // accumulated frequency per category; valid only if the stamp matches curTime
+
+ Array<int> timeOfFreq; // stamp of the last write per category
+
+
+
+ // current stamp; init() increments it, which makes every older entry read as 0
+ int curTime;
+ public:
+ NWG(int n);
+ void init();
+
+ int anzNot0; // number of valid entries in not0
+
+ // categories touched since the last init(), in order of first touch
+ Array<int> not0;
+
+ int word; // set to -1 by the constructor; not used elsewhere in this header
+
+ inline void addFreq(int C,FreqType n);
+
+ void sort();
+ // Returns the accumulated frequency of category i, or 0 if i was not touched since the last init().
+ FreqType getFreq(int i)
+ {
+ if( timeOfFreq[i]==curTime )
+ return freq[i];
+ else
+ return 0;
+ };
+};
+
+inline void NWG::addFreq(int g,FreqType n) // Adds n to the frequency of category g.
+{
+ if(timeOfFreq[g]==curTime) // g already touched since the last init(): accumulate
+ freq[g]+=n;
+ else // first touch: overwrite the stale value and record g in not0
+ {
+ timeOfFreq[g]=curTime;
+ freq[g]=n;
+ not0[anzNot0++]=g;
+ }
+}
+
+
+
+struct KategProblemChange : public ProblemChange // One proposed move: put word into toKat, coming from fromKat.
+{
+ void *operator new(size_t size); // class-specific allocation, defined in KategProblem.cpp
+ void operator delete(void *ptr,size_t size);
+
+ int word; // index of the word to move
+ int toKat; // target category
+ int fromKat; // category the word is in before the move
+};
+
+class KategProblem : public Problem // Problem subclass whose state is an assignment of words to categories (_katOfWord).
+{
+ private:
+ double kat_h_full(int n);
+ double kat_h_full(double n); // aborts when called
+ double kat_h_part(int n);
+ double kat_h_part(double n); // aborts when called
+ double sigmaVerfaelschung; // second argument of every verfaelsche() call in kat_h_full/kat_h_part
+ short katWasEmpty; // set by _change when it proposes an empty category; cleared by incrementDirection
+
+
+ // word that nwg/ngw/nww currently describe, or -1 if they are out of date
+ int nwgWord;
+
+ NWG nwg; // filled by fillNWG from the word's after list
+ NWG ngw; // filled by fillNWG from the word's before list
+ FreqType nww; // count of the after entry that pairs the word with itself
+
+ int ursprung,ziel; // source and target category of the move being evaluated; set by _valueChange
+
+ Array<int> _katOfWord; // category of every word; -1 means unassigned
+
+ int _maxComp,_maxCompVal; // returned by maxDimension() and maxDimensionVal()
+
+ double nmo_my(int i,int j);
+ double nmo(int i,int j);
+
+
+ double nmo_lo(int i,int j,int &e0,int &e1);
+
+ // Moves an assigned word to category to and updates katFreq.
+ void putWord(int word,int to);
+
+ // Assigns a category to a word that has none yet, without updating katFreq.
+ void fastPutWord(int word,int to);
+
+ // Stores category k for word w and invalidates the nwg/ngw cache; prints an error if k conflicts with fixedWord[w].
+ void setKatOfWord(int w,int k)
+{
+ if( !(wordFreq.fixedWord[w]==k||wordFreq.fixedWord[w]==-1||k==-1) )
+ {
+ cout << "mkcls::setKatOfWord::ERROR: " << w << " " << k << " " << wordFreq.fixedWord[w] << " " << (*words)[w] << endl; // NOTE(review): dereferences words, which other members treat as possibly null
+ }
+ _katOfWord[w]=k; // the assignment is performed even after the error message
+ nwgWord=-1;
+};
+
+ // Fills nwg/ngw/nww for word w unless they already describe it.
+ void fillNWG(int w);
+
+ // Change of cell (i,j) that the move ursprung -> ziel would cause.
+ inline FreqType nstrich(int i,int j);
+
+ // Prints the individual terms of nstrich(i,j) to cout.
+ void vnstrich(int i,int j);
+
+
+
+ protected:
+ virtual int _change(ProblemChange **p);
+
+
+ virtual void _doChange(ProblemChange &c);
+
+
+ virtual void _undoChange(ProblemChange &c);
+
+
+ virtual double _value();
+
+
+ double _valueChange(KategProblemChange &k);
+
+
+ virtual void incrementDirection();
+
+
+ virtual int maxDimensionVal(void) ;
+
+
+ virtual int maxDimension(void) ;
+
+
+public:
+ leda_array<string> *words; // optional word strings; several members check it against 0
+typedef leda_set<int> intSet;
+ // per-category word sets, rebuilt by makeKats()
+leda_array<intSet> *kats;
+
+ KategProblemWBC wordFreq; // word-level frequency tables
+ KategProblemKBC katFreq; // category-level frequency tables
+
+ Array<int> initLike; // per-word entries consumed by fixInitLike(); -1 means none
+
+ KategProblem(int aw,int mak,int _initialisierung,int _auswertung,
+ int _nachbarschaft,int minw=0);
+
+
+ virtual ~KategProblem();
+
+
+ virtual void _initialize(int initTyp);
+ virtual void _initialize(int initTyp,int specialFixedWord);
+
+
+ virtual double valueChange(ProblemChange&c);
+
+ // Returns a new heap-allocated problem that mirrors this one.
+ virtual Problem *makeEqualProblem();
+
+ // 1e100 is the sentinel meaning "use value()".
+ virtual double nicevalue(double value=1e100);
+
+
+ void makeKats();
+
+
+ virtual void dumpOn(ostream &strm);
+
+
+ virtual void dumpInfos(ostream &strm);
+
+
+
+
+ // Setter: replaces the category-selection bits of nachbarschaft and adapts _maxCompVal.
+ inline void katwahl(int k);
+
+ // Setter: replaces the word-selection bits of nachbarschaft.
+ inline void wortwahl(int w);
+
+
+
+
+ // Category currently assigned to word w.
+ inline int katOfWord(int w);
+
+ // Getter: nachbarschaft & CHOOSE_WORD.
+ inline short wortwahl();
+
+ // Getter: nachbarschaft & CHOOSE_KAT.
+ inline short katwahl() ;
+
+
+ virtual int maxNonBetterIterations();
+
+
+ virtual int expectedNumberOfIterations();
+
+ // getString returns "<>" when words is 0.
+ const char *getString(int i);
+ string getTheString(int i);
+
+ // Formats a description of the configuration into x.
+ void makeTitle(char x[512]);
+
+
+ void fixInitLike();
+
+};
+
+inline int KategProblem::katOfWord(int w){return _katOfWord[w];}; // category stored for word w
+inline short KategProblem::wortwahl(){return nachbarschaft&CHOOSE_WORD;}; // word-selection bits of nachbarschaft
+inline short KategProblem::katwahl() {return nachbarschaft&CHOOSE_KAT;}; // category-selection bits of nachbarschaft
+
+inline void KategProblem::katwahl(int k) // Replaces the category-selection bits of nachbarschaft by k and adapts _maxCompVal.
+ {
+ nachbarschaft = (nachbarschaft&(~CHOOSE_KAT)) | k;
+ if(k==K_BEST) // under K_BEST _maxCompVal is 1
+ _maxCompVal=1;
+ else // otherwise it is the category count minus 2
+ _maxCompVal=katFreq.nKats-2;
+ };
+
+inline void KategProblem::wortwahl(int w) // Replaces the word-selection bits of nachbarschaft by w.
+ {
+ nachbarschaft = (nachbarschaft&(~CHOOSE_WORD)) | w;
+ };
+
+
+
+inline FreqType KategProblem::nstrich(int i,int j) // Change of cell (i,j) for the move ursprung -> ziel, computed from nwg, ngw and nww.
+{
+ FreqType n=0;
+ // row effect: nwg counts leave row ursprung and enter row ziel
+ if( i==ursprung )
+ n-=nwg.getFreq(j);
+ if( i==ziel )
+ n+=nwg.getFreq(j);
+ // column effect: ngw counts leave column ursprung and enter column ziel
+ if( j==ursprung )
+ n-=ngw.getFreq(i);
+ if( j==ziel )
+ n+=ngw.getFreq(i);
+ // corrections by nww on the four cells formed by ursprung and ziel
+ if( i==ursprung && j==ursprung )
+ n+=nww;
+ if( i==ziel && j==ziel )
+ n+=nww;
+
+ if( i==ursprung && j==ziel )
+ n-=nww;
+ if( i==ziel && j==ursprung )
+ n-=nww;
+
+ return n;
+}
+
+
+
+
+
+#define MAX_H_TABLE 4000 // kat_mlog(int) and kat_h(int) use the tables only for arguments below this bound
+extern double h_table[],l_table[],hmy_table[],hmy_sigma; // h_table[n] is asserted to be n*log(n) and l_table[s] to be log(s) by the inline functions in this header
+
+
+inline double kat_mlog(double x)
+{
+ // Clamped logarithm: arguments at or below 1e-9 map to 0,
+ // every other argument maps to log(x).
+ const bool usable = !(x<=1e-9);
+ return usable ? log(x) : 0.0;
+}
+
+
+inline double kat_mlog(int s) // Integer variant: 0 for s<=0, l_table[s] for s below MAX_H_TABLE, log(s) otherwise.
+{
+ if(s<=0)
+ return 0;
+ else if( s<MAX_H_TABLE )
+ {
+ massert( s==0 || l_table[s]==log(s) );
+ return l_table[s];
+ }
+ else
+ return log((double)(s));
+}
+
+
+
+inline double kat_hlo(int n) // n*kat_mlog(n-1), integer variant
+{
+ return n*kat_mlog(n-1);
+}
+
+inline double kat_hlo(double n) // n*kat_mlog(n-1), double variant
+{
+ return n*kat_mlog(n-1);
+}
+
+
+inline double kat_h(int n) // n*log(n) for integers: 0 for n<=0, h_table[n] below MAX_H_TABLE, computed directly otherwise.
+{
+ massert(n>=-1);
+ if(n<=0)
+ return 0;
+ else
+ if(n<MAX_H_TABLE)
+ {
+ massert(n==0||fabs(h_table[n]-n*log((double)n))<1e-8);
+ return h_table[n];
+ }
+ else
+ return n*log((double)(n));
+}
+inline double kat_h(double n)
+{
+ // n*log(n) with a floor: arguments at or below 1e-9 yield 0,
+ // every other argument yields n*log(n).
+ const bool usable = !(n<=1e-9);
+ return usable ? n*log(n) : 0.0;
+}
+
+
+inline double kat_etaFkt(int _e0,int e1,int immer0,int cats) // Term used by CRITERION_LO: e1*log((ePlus-1)/(e0+1)*rhoLo), or 0 if the guard fails.
+{
+ int e0 = _e0 - immer0; // _e0 reduced by immer0
+ int ePlus = cats*cats - _e0; // number of cells minus _e0
+ if(cats*cats-e0>1) // NOTE(review): the guard uses e0 while the log argument uses ePlus (from _e0); with immer0>0 ePlus-1 may be <=0 -- confirm intent
+ return e1*log( (ePlus-1.0)/(e0+1.0)*rhoLo );
+ else
+ return 0;
+}
+
+double mkat_h_full(int n,double tf); // n*log(tf), or 0 if tf<=0 (defined in KategProblem.cpp)
+double mkat_h_part(int n,double cf); // n*log(cf), or 0 if cf<=0 (defined in KategProblem.cpp)
+
+int Hash(const string& s); // defined elsewhere
+
+
+#endif
+
diff --git a/mkcls-v2/KategProblemKBC.cpp b/mkcls-v2/KategProblemKBC.cpp
new file mode 100644
index 0000000..97c40fc
--- /dev/null
+++ b/mkcls-v2/KategProblemKBC.cpp
@@ -0,0 +1,243 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include <stdlib.h>
+#include "KategProblem.h"
+
+double rhoLo=0.75; // constant used by the CRITERION_LO terms
+#define MAX_VERFAELSCHUNG 5000 // size of the verfTab lookup table
+double verfTab[MAX_VERFAELSCHUNG],verfTabSigma=-1.0; // table read by verfaelsche(int,double) and the sigma it is taken to be valid for
+double verfaelsche(int a,double b)
+{
+// Returns b*(erf(10000.0)-erf(a/b))/2+a; read from verfTab when 0<=a<MAX_VERFAELSCHUNG and b equals verfTabSigma.
+ if( a>=0&&verfTabSigma==b&&a<MAX_VERFAELSCHUNG )
+ {
+ massert(verfTab[a]== b*(erf(10000.0) - erf(a/b))/2+a);
+ return verfTab[a]; // NOTE(review): nothing visible in this chunk fills verfTab -- confirm it is populated before verfTabSigma is set
+ }
+ else
+ {
+ double x = b*(erf(10000.0) - erf(a/b))/2+a;
+ return x;
+ }
+}
+double verfaelsche(double,double b) // double overload: aborts immediately
+{
+ abort();
+ return b; // not reached because of the abort() above
+}
+
+KategProblemKBC::KategProblemKBC(int s,double sv) : // s: number of categories; sv: sigma, where 0.0 switches the verf bookkeeping off
+ _n(s),_n1(s,0),_n2(s,0),sigmaVerfaelschung(sv),withVerfaelschung(sv!=0.0),
+ _nverf(s),_n1verf(s,0.0),_n2verf(s,0.0),_nWords(0),
+ eta0(s*s),eta1(0),c1_0(s),c2_0(s), // all s*s cells and all 2*s marginals start at zero
+ _bigramVerfSum(0.0),_unigramVerfSum1(0.0),_unigramVerfSum2(0.0),nKats(s)
+
+{
+ verfInit0=0.0;
+ int i;
+ if( withVerfaelschung )
+ {
+ verfInit0=verfaelsche(0,sv); // verf value of a zero count
+ cout << "VERFAELSCHUNG wird mitgefuehrt => LANGSAMER!!!\n";
+ }
+ for(i=0;i<s;i++) // set up s x s tables and the running sums for an all-zero state
+ {
+ _n[i].init(s,0);
+ _nverf[i].init(s,verfInit0);
+ _n1verf[i]=_n2verf[i]=verfInit0;
+ _bigramVerfSum+=verfInit0*s;
+ _unigramVerfSum1+=verfInit0;
+ _unigramVerfSum2+=verfInit0;
+ }
+ if( withVerfaelschung )
+ {
+ cout << "VERFAELSCHUNG " << _bigramVerfSum << " " << _unigramVerfSum1 << " " << _unigramVerfSum2 << endl;
+ }
+ verfTabSigma=sigmaVerfaelschung; // from now on verfaelsche(int,double) reads verfTab for this sigma
+
+
+
+}
+
+void KategProblemKBC::setN(int w1,int w2, FreqType n)
+// Sets cell (w1,w2) to n by first subtracting the current value and then adding n, so addN's bookkeeping stays in effect.
+{
+ addN(w1,w2,-_n[w1][w2]);
+ addN(w1,w2,n);
+}
+
+
+double KategProblemKBC::fullBewertung(int auswertung) // Computes the value of criterion auswertung from the complete tables; exits on an unknown criterion.
+{
+
+ double bewertung=0;
+ int c1,c2;
+
+
+ switch( auswertung )
+ {
+ case CRITERION_ML: // sum of kat_h over the marginals minus sum of kat_h over all cells
+ for(c1=0;c1<nKats;c1++)
+ {
+ for(c2=0;c2<nKats;c2++)
+ bewertung-=kat_h(_n[c1][c2]);
+ bewertung+=kat_h(_n1[c1])+kat_h(_n2[c1]);
+ }
+ break;
+ case CRITERION_MY: // same shape with the mkat_h_* terms on the verf values, plus an extra log term
+ {
+ for(c1=0;c1<nKats;c1++)
+ {
+ for(c2=0;c2<nKats;c2++)
+ bewertung-=mkat_h_full((int)n(c1,c2),nverf(c1,c2));
+ bewertung+=mkat_h_part((int)(n1(c1)),n1verf(c1))+mkat_h_part((int)(n2(c1)),n2verf(c1));
+ }
+ double u1=_unigramVerfSum1-verfInit0*c1_0; // running sums with the verfInit0 share of zero rows, columns and cells removed
+ double u2=_unigramVerfSum2-verfInit0*c2_0;
+ double b=_bigramVerfSum-verfInit0*(c1_0*nKats+c2_0*nKats-c1_0*c2_0);
+ if( verboseMode>1 )
+ {
+ cout << "CRITERION_MY: " << bewertung << endl;
+ cout << "U1:"<<_unigramVerfSum1 << " n:"<<u1<< " "
+ << "U2:"<<_unigramVerfSum2 << " n:"<<u2<< " "
+ << "U3:"<<_bigramVerfSum << " n:"<<b<< endl;
+ }
+ if(b>0.000001) // the log term is applied only for a clearly positive b
+ {
+
+
+ if(verboseMode>1 )
+ cout << " NEU: " <<_nWords*log( u1 * u2 / b ) << endl;
+ bewertung -= _nWords*log( u1 * u2 / b );
+ if(verboseMode>1)
+ cout << "SCHLUSSBEWERTUNG: " << bewertung << endl;
+ }
+ else
+ cout << "B zu klein " << b << endl;
+ }
+ break;
+ case CRITERION_LO: // n*kat_mlog(n-1[-rhoLo]) terms minus the kat_etaFkt term
+ for(c1=0;c1<nKats;c1++)
+ {
+ for(c2=0;c2<nKats;c2++)
+ bewertung-=_n[c1][c2]*kat_mlog(_n[c1][c2]-1-rhoLo);
+ bewertung+=_n1[c1]*kat_mlog(_n1[c1]-1)+_n2[c1]*kat_mlog(_n2[c1]-1);
+ }
+ bewertung-=kat_etaFkt(eta0,eta1,(c1_0*nKats+c2_0*nKats-c1_0*c2_0),nKats);
+ break;
+ default:
+ cerr << "Error: wrong criterion " << auswertung << endl;
+ exit(1);
+ }
+ return bewertung;
+}
+
+double KategProblemKBC::myCriterionTerm() // Returns -_nWords*log(u1*u2/b), the extra term of CRITERION_MY.
+{
+ iassert( withVerfaelschung );
+ double r;
+ double u1=_unigramVerfSum1-verfInit0*c1_0; // same u1, u2 and b as in fullBewertung
+ double u2=_unigramVerfSum2-verfInit0*c2_0;
+ double b=_bigramVerfSum-verfInit0*(c1_0*nKats+c2_0*nKats-c1_0*c2_0);
+
+ // NOTE(review): unlike fullBewertung there is no b>0.000001 guard before dividing by b here -- confirm this is intended
+ if( verboseMode>1 )
+ {
+ cout << "nwords divisor:"<<_nWords << " " << u1 * u2 / b << endl;
+ cout << "ergebnis: "<<_nWords*log( u1 * u2 / b ) << endl;
+ cout << "0: "<<c1_0 << endl;
+ }
+ r = _nWords*log( u1 * u2 / b );
+
+ return -r;
+}
+
+
+
+
+double KategProblemKBC::bigramVerfSum() // Recomputes the sum of nverf over all cells, prints it to cout and returns it.
+{
+ double sum=0;
+ for(int c1=0;c1<nKats;c1++)
+ for(int c2=0;c2<nKats;c2++)
+ sum+=nverf(c1,c2);
+ cout << "BIGRAMVERFSUM: " << sum << endl;
+ return sum;
+}
+
+double KategProblemKBC::unigramVerfSum1() // Recomputes the sum of n1verf over all categories, prints it to cout and returns it.
+{
+ double sum=0;
+ for(int c1=0;c1<nKats;c1++)
+ sum+=n1verf(c1);
+ cout << "UNIGRAMVERFSUM1: " << sum << endl;
+ return sum;
+}
+
+double KategProblemKBC::unigramVerfSum2() // Recomputes the sum of n2verf over all categories, prints it to cout and returns it.
+{
+ double sum=0;
+ for(int c1=0;c1<nKats;c1++)
+ sum+=n2verf(c1);
+ cout << "UNIGRAMVERFSUM2: " << sum << endl;
+ return sum;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/mkcls-v2/KategProblemKBC.h b/mkcls-v2/KategProblemKBC.h
new file mode 100644
index 0000000..4bac62a
--- /dev/null
+++ b/mkcls-v2/KategProblemKBC.h
@@ -0,0 +1,157 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef KATEGPROBLEMKBC_H
+#define KATEGPROBLEMKBC_H
+
+typedef Array<FreqType> FreqArray; // one row of counts
+typedef Array<double> FreqArrayReal; // one row of verf values
+
+// verfaelsche is defined in KategProblemKBC.cpp; the (double,double) overload aborts when called.
+double verfaelsche(int a,double b);
+double verfaelsche(double a,double b);
+
+class KategProblemKBC
+// Category-level count tables with incrementally maintained statistics; every update goes through addN.
+
+{
+ friend class KategProblem;
+
+ private:
+ Array<FreqArray> _n; // _n[w1][w2]: count of cell (w1,w2)
+ Array<FreqType> _n1; // _n1[w1]: sum over row w1, maintained by addN
+
+ Array<FreqType> _n2; // _n2[w2]: sum over column w2, maintained by addN
+
+
+ double sigmaVerfaelschung; // second argument of verfaelsche()
+ short withVerfaelschung; // true iff the sigma passed to the constructor was not 0.0
+
+ Array<FreqArrayReal> _nverf; // verfaelsche() of each cell; maintained only if withVerfaelschung
+ Array<double> _n1verf; // verfaelsche() of each _n1 entry
+ Array<double> _n2verf; // verfaelsche() of each _n2 entry
+ FreqType _nWords; // sum of all n added via addN while withVerfaelschung is set
+
+ protected:
+ int eta0; // number of cells equal to 0
+ int eta1; // number of cells equal to 1
+ int c1_0; // number of _n1 entries equal to 0
+ int c2_0; // number of _n2 entries equal to 0
+ double _bigramVerfSum; // running sum of _nverf
+ double _unigramVerfSum1; // running sum of _n1verf
+ double _unigramVerfSum2; // running sum of _n2verf
+ double verfInit0; // verfaelsche(0,sigma), or 0.0 without Verfaelschung
+
+ public:
+ int nKats; // number of categories (table dimension)
+
+ KategProblemKBC(int nKats,double sv);
+
+ // Computes the criterion value from the full tables.
+ double fullBewertung(int auswertung);
+
+ // Count of cell (w1,w2).
+ FreqType n(int w1,int w2) { return _n[w1][w2]; };
+
+ // Row sum of w.
+ FreqType n1(int w) { return _n1[w];};
+
+ // Column sum of w.
+ FreqType n2(int w) { return _n2[w];};
+
+ // The three *VerfSum functions recompute the sums from the tables and print them.
+ double bigramVerfSum();
+ double unigramVerfSum1();
+ double unigramVerfSum2();
+
+ double nverf(int w1,int w2) { return _nverf[w1][w2]; }
+
+ double n1verf(int w) { return _n1verf[w]; };
+
+ double n2verf(int w) { return _n2verf[w]; };
+ // Adds n to cell (w1,w2) and updates all derived statistics.
+ inline void addN(int w1,int w2, FreqType n);
+
+ // Sets cell (w1,w2) to n via two addN calls.
+ void setN(int w1,int w2, FreqType n);
+
+ // Extra term of CRITERION_MY.
+ double myCriterionTerm();
+
+};
+
+inline void KategProblemKBC::addN(int w1,int w2, FreqType n) // Adds n to cell (w1,w2) and keeps marginals, zero/one counters and verf sums consistent.
+{
+ if(n!=0)
+ {
+ FreqType &s= _n[w1][w2];
+ if(s==0) // remove the old values from the counters; they are re-added after the update below
+ eta0--;
+ else if(s==1)
+ eta1--;
+ if(_n1[w1]==0)
+ c1_0--;
+ if(_n2[w2]==0)
+ c2_0--;
+
+ if(withVerfaelschung) // update stored verf values and their running sums by the difference new-old
+ {
+ double verfOld=verfaelsche(s,sigmaVerfaelschung);
+ double verfNew=verfaelsche(s+n,sigmaVerfaelschung);
+ double verfOld1=verfaelsche(_n1[w1],sigmaVerfaelschung);
+ assert(verfOld1==_n1verf[w1]);
+ double verfNew1=verfaelsche(_n1[w1]+n,sigmaVerfaelschung);
+ double verfOld2=verfaelsche(_n2[w2],sigmaVerfaelschung);
+ assert(verfOld2==_n2verf[w2]);
+ double verfNew2=verfaelsche(_n2[w2]+n,sigmaVerfaelschung);
+ _n1verf[w1]=verfNew1;
+ _unigramVerfSum1+=verfNew1-verfOld1;
+ _n2verf[w2]=verfNew2;
+ _unigramVerfSum2+=verfNew2-verfOld2;
+ _nverf[w1][w2]=verfNew;
+ _bigramVerfSum+=verfNew-verfOld;
+ _nWords+=n;
+ }
+ s+=n;_n1[w1]+=n;_n2[w2]+=n; // the actual update of cell, row sum and column sum
+
+ assert(_n[w1][w2]>=0);
+ assert(_n1[w1]>=0);
+ assert(_n2[w2]>=0);
+
+ if(s==0) // re-add the new values to the counters
+ eta0++;
+ else if(s==1)
+ eta1++;
+ if(_n1[w1]==0)
+ c1_0++;
+ if(_n2[w2]==0)
+ c2_0++;
+ }
+};
+#endif
diff --git a/mkcls-v2/KategProblemTest.cpp b/mkcls-v2/KategProblemTest.cpp
new file mode 100644
index 0000000..8c76ce5
--- /dev/null
+++ b/mkcls-v2/KategProblemTest.cpp
@@ -0,0 +1,700 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "KategProblemTest.h"
+
+#include "ProblemTest.h"
+#include "HCOptimization.h"
+#include "TAOptimization.h"
+#include "RRTOptimization.h"
+#include "GDAOptimization.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string>
+#include <strstream>
+
+typedef pair<string,string> PSS;
+
+#define NEW_SENTENCE_END "mkcls-mapped-dollar-symbol-$"
+
+#ifdef NeXT
+char *strdup(char *a)
+{
+ // Replacement for strdup(), compiled only when NeXT is defined.
+ // Returns a malloc()ed copy of the NUL-terminated string a; the caller
+ // releases it with free(). Returns 0 when the allocation fails, so the
+ // copy is never written through a null pointer.
+ char *p = (char *)malloc(strlen(a)+1);
+ if( p==0 )
+ return 0;
+ strcpy(p,a);
+ return p;
+}
+
+#endif
+
+
+void writeClasses(Array<Kategory> &katOfWord,KategProblem &problem,ostream &to)
+{
+  // Writes one "<word>\t<class>" line per word to `to`.
+  // The word "$" itself is not written; the placeholder that stands for a
+  // literal dollar sign in the corpus is written back as "$".
+  const int total=katOfWord.size();
+  for(int idx=0;idx<total;++idx)
+  {
+    const char *word=problem.getString(idx);
+    if( strcmp(word,"$")==0 )
+      continue;
+    if( strcmp(word,"mkcls-mapped-dollar-symbol-$")==0 )
+      word="$";
+    to << word << "\t" << katOfWord[idx] << endl;
+  }
+}
+
+
+void mysplit(const string &s,string &s1,string &s2)
+{
+  // Splits s at the first run of blanks/tabs: s1 receives the text before
+  // the run, s2 everything after it (s2 may itself contain blanks).
+  // Both parts are expected to be non-empty.
+  const char *const blanks=" \t";
+
+  string::size_type cut=s.find_first_of(blanks);
+  if( cut==string::npos )
+    cut=s.length();
+  s1=s.substr(0,cut);
+
+  string::size_type rest=s.find_first_not_of(blanks,cut);
+  if( rest==string::npos )
+    rest=s.length();
+  s2=s.substr(rest);
+
+  iassert(s1.size());
+  iassert(s2.size());
+}
+
+
+
+int fromCatFile(KategProblem *p,const char *fname,bool verb)
+{
+  // Loads an initial word->category assignment from the text file fname
+  // into p->initLike. Every line is split by mysplit() into a word and a
+  // category label; the labels "0" and "1" keep their numbers, any other
+  // label gets the next free number, starting at 2.
+  // Terminates the program if the file cannot be opened or if more
+  // categories are needed than p->katFreq.nKats provides.
+  // Returns 1 if at least one word ends up without a category, else 0;
+  // verb controls whether each such word is reported on cerr.
+  ifstream in(fname);
+  if( !in )
+  {
+    cerr << "Error: File '" << fname << "' cannot be opened.\n";
+    exit(1);
+  }
+
+  const int wordCount=p->wordFreq.nWords;
+  for(int w=0;w<wordCount;++w)
+    (p->initLike)[w]= -1;
+
+  // Unknown labels read as -1.
+  leda_h_array<string,int> translation(-1);
+  translation["1"]=1;
+  translation["0"]=0;
+  int maxCat=2;
+
+  string line;
+  while( getline(in,line) )
+  {
+    string word,label;
+    mysplit(line,word,label);
+    const int idx=p->words->binary_locate(word);
+    if( idx<0 || !((*(p->words))[idx]==word) )
+    {
+      cerr << "Warning: Word '" << word << "' " << idx << " is not in training corpus.\n";
+      continue;
+    }
+
+    int cat=translation[label];
+    if( cat==-1 )
+    {
+      cat=maxCat++;
+      translation[label]=cat;
+    }
+    if( (p->initLike)[idx]!= -1 )
+      cerr << "Warning: Word '" << ((*(p->words))[idx])<< "' is already in a category.\n";
+    (p->initLike)[idx]=cat;
+  }
+
+  if( verboseMode )
+    cout << "We have " << maxCat << " read non-empty categories"
+      " (with words from the corpus).\n";
+
+  if( maxCat>p->katFreq.nKats )
+  {
+    cerr << "Error: Not enough categories reserved (only "
+         << p->katFreq.nKats << ", but i need " << maxCat << ").\n";
+    exit(1);
+  }
+
+  // The word "$" always goes to category 0.
+  const int dollar=p->words->binary_locate("$");
+  if( dollar>=0 && (*(p->words))[dollar]=="$" )
+    (p->initLike)[dollar]=0;
+  else if( verboseMode )
+    cerr << "Warning: No '$' in vocabulary!\n";
+
+  int errors=0;
+  for(int w=0;w<wordCount;++w)
+  {
+    if( (p->initLike)[w]!= -1 )
+      continue;
+    if( verb )
+      cerr << "Error: I don't know the category of word " << w
+           << " (" << (*(p->words))[w] << ") " << ".\n";
+    errors=1;
+  }
+  return errors;
+}
+
+
+
+KategProblem *makeKategProblem(const leda_h_array<PSS,FreqType>&cTbl,const leda_set<string>&setVokabular, int maxClass,int initialisierung,
+ int auswertung,int nachbarschaft,int minWordFrequency)
+{
+ // Builds a KategProblem from a bigram table.
+ // cTbl maps a (first word, second word) pair to its frequency and
+ // setVokabular holds the words used in cTbl. The remaining arguments are
+ // handed unchanged to the KategProblem constructor.
+ // Both the returned problem and the vocabulary array it references through
+ // k->words are allocated with new and are not released in this function.
+
+ int nwrd=0;
+ leda_array<string>&sVok = *new leda_array<string>(setVokabular.size());
+ string s;
+ unsigned int ctr=0;
+ forall_set(leda_set<string>,s,setVokabular)
+ {
+ if( verboseMode>2 )
+ cout << "mkcls:Wort " << ctr << " " << s << endl;
+ sVok[ctr++]=s;
+ }
+ // The set is expected to hand out its elements in ascending order.
+ // The bound is written "z+1<ctr" because ctr is unsigned: with an empty
+ // vocabulary "ctr-1" would wrap around and the loop would index far past
+ // the end of sVok.
+ for(unsigned int z=0;z+1<ctr;z++)
+ iassert( sVok[z]<sVok[z+1] );
+ sVok.sort();
+
+ if( verboseMode>2 )
+ cout << "*****Vocabulary: " << sVok;
+
+ unsigned int vokSize=sVok.size();
+ massert(vokSize==ctr); massert(vokSize==setVokabular.size());
+ if(verboseMode)
+ {cout << "Size of vocabulary: " << vokSize << "\n";cout.flush();}
+
+ KategProblem *k = new KategProblem(vokSize,maxClass,initialisierung,
+ auswertung,nachbarschaft,minWordFrequency);
+ KategProblemWBC &w=k->wordFreq;
+ k->words=&sVok;
+
+ Array<int> after(vokSize,0);
+ Array<int> before(vokSize,0);
+
+ // Pass 1: count, for every word, how many different bigrams it starts
+ // (after) and ends (before), so the per-word lists can be sized exactly.
+ // Bigrams with an empty second word and the bigram ("$","$") are ignored.
+ nwrd=0;
+ {
+ PSS s;
+ forall_defined_h2(PSS,FreqType,s,cTbl)
+ {
+ const string&ss1=s.first;
+ const string&ss2=s.second;
+ if( ss2.length()&&(ss1!="$" || ss2!="$") )
+ {
+ int i1=sVok.binary_search(ss1);
+ int i2=sVok.binary_search(ss2);
+ iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
+ after[i1]++;
+ before[i2]++;
+ }
+ if( verboseMode&&((nwrd++)%10000==0) )
+ {cout<<"Statistiken-1 " << nwrd<< ". \r";cout.flush();}
+ }
+ }
+
+ for(unsigned int i=0;i<vokSize;i++)
+ {
+ w.setAfterWords(i,after[i]);
+ w.setBeforeWords(i,before[i]);
+ }
+
+ // Pass 2: store the frequencies of the same bigrams that pass 1 counted.
+ {
+ nwrd=0;
+ PSS s;
+ forall_defined_h2(PSS,FreqType,s,cTbl)
+ {
+ const string&ss1=s.first;
+ const string&ss2=s.second;
+ FreqType p=cTbl[s];
+ if( ss2.length()&&(ss1!="$" || ss2!="$") )
+ {
+ int i1=sVok.binary_search(ss1);
+ int i2=sVok.binary_search(ss2);
+ iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
+ w.setFreq(i1,i2,p);
+ if( verboseMode>2 )
+ cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << " "
+ << ss2 << ":" << i2 << " " << p << endl;
+ }
+ if( verboseMode&&((nwrd++)%10000==0) )
+ {cout<<"Statistiken-2 " <<nwrd<< ". \r";cout.flush();}
+ }
+ }
+
+ // testFull() without arguments exits the program if a per-word list was
+ // not filled exactly to the size reserved above.
+ w.testFull();
+ if(verboseMode){cout << "Datenintegritaet getestet.\n";cout.flush();}
+ return k;
+}
+
+KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung,
+ int auswertung,int nachbarschaft,int minWordFrequency)
+{
+  // Reads whitespace-separated "<count> <word1> <word2>" records from the
+  // file str and builds a KategProblem from them via makeKategProblem().
+  // Returns 0 if the file cannot be opened, if a word is empty or if a
+  // count is 0. Reading stops at the first record that cannot be parsed.
+  ifstream file(str);
+  if( !file )
+    return 0;
+
+  leda_set<string> setVokabular;
+  leda_h_array<PSS,FreqType> cTbl;
+  if( verboseMode )
+    cout << "NGRFILE: " << str << endl;
+
+  double count=0;
+  string first,second;
+  for(;;)
+  {
+    if( !(file >> count >> first >> second) )
+      break;
+    if( first.empty() || second.empty() )
+    {
+      cerr << "ERROR: strings are zero: " << first.length() <<" " << first <<" " << second.length()<<" " << second << endl;
+      return 0;
+    }
+    if( count==0 )
+    {
+      cerr << "Count ist 0 " << first << " " << second << endl;
+      return 0;
+    }
+
+    cTbl[PSS(first,second)]=(FreqType)count;
+    setVokabular.insert(first);
+    setVokabular.insert(second);
+    if( verboseMode>1 )
+      cout << "R: " << first << " " << second << " " << count << endl;
+    count=0;
+  }
+
+  return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
+}
+
+
+
+
+
+
+
+
+KategProblem *fromKModel(const char *str,int maxClass,int initialisierung, // Reads the plain-text file str line by line, counts the bigrams of neighbouring tokens and builds a KategProblem from them; returns 0 if the file cannot be opened.
+ int auswertung,int nachbarschaft,int minWordFrequency)
+{
+ string oldText,text,line;
+ ifstream f(str);
+ if( !f )
+ {
+ cerr << "ERROR: can not open file " << str << ".\n";
+ return 0;
+ }
+ // cTbl counts (previous token, current token) pairs; entries that do not exist yet start at 0.
+ leda_set<string> setVokabular;
+ leda_h_array<PSS,FreqType> cTbl(0);
+ oldText="$"; // "$" is the line boundary token, so the first token of the file follows "$"
+ while(1)
+ {
+ getline(f,line);
+ if(f.fail() && !f.bad() && !f.eof()) // read failed although neither badbit nor eofbit is set: warn and reset the state
+ {
+ cerr << "WARNING: strange characters in stream (getline) " << endl;f.clear();
+ }
+ if(!f)break;
+ // Tokenise the line on whitespace.
+ istrstream f2(line.c_str());
+ while( 1 )
+ {
+ f2 >> text;
+ if(f2.fail() && !f2.bad() && !f2.eof())
+ {
+ cerr << "WARNING: strange characters in stream (>>) !\n";
+ f2.clear(ios::failbit); // clear(state) SETS the state to failbit, so the check below still leaves the loop
+ }
+ if(!f2){break;}
+
+
+
+
+
+ // A literal "$" in the text is renamed so that it cannot be confused with the boundary token.
+ if( text == "$" )
+ text = "mkcls-mapped-dollar-symbol-$";
+ if( !setVokabular.member(text) )setVokabular.insert(text);
+ cTbl[pair<string,string>(oldText,text)]++;
+ oldText=text;
+ }
+ text="$"; // end of line: count the bigram (last token, "$")
+ if( !setVokabular.member(text) )setVokabular.insert(text);
+ cTbl[pair<string,string>(oldText,text)]++;
+ oldText=text;
+ }
+ return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
+}
+
+
+
+
+
+void KategProblemSetParameters(KategProblem &p)
+{
+  // Chooses the default annealing rates / alpha of the TA, RRT and GDA
+  // optimizers depending on whether the problem selects categories with
+  // K_BEST or not, and reports the choice in verbose mode.
+  const bool best = ( p.katwahl()==K_BEST );
+
+  TAOptimization::defaultAnnRate  = best ? 0.7  : 0.4;
+  RRTOptimization::defaultAnnRate = best ? 0.95 : 0.6;
+  GDAOptimization::defaultAlpha   = best ? 0.05 : 0.0125;
+
+  if( verboseMode )
+    cout << ( best ? "Parameter-setting like W-DET-BEST\n"
+                   : "Parameter-setting like W-DET-DET\n" );
+}
+
+
+
+
+KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue, // Creates a heap-allocated KategProblem with ANZ_WORD words and ANZ_CLS classes whose word bigram counts are drawn at random.
+ int auswertung,int nachbarschaft,float relInit) // relInit scales the number of random draws: ANZ_WORD*ANZ_WORD*relInit
+{
+ KategProblem &k=
+ *new KategProblem(ANZ_WORD,ANZ_CLS,initValue,auswertung,nachbarschaft);
+ KategProblemWBC &w=k.wordFreq;
+ Array<int> after(ANZ_WORD,0); // number of distinct successors per word
+ Array<int> before(ANZ_WORD,0); // number of distinct predecessors per word
+ Array<FreqArray> twoD(ANZ_WORD); // dense ANZ_WORD x ANZ_WORD count matrix
+ int i;
+ for(i=0;i<ANZ_WORD;i++) twoD[i].init(ANZ_WORD,0);
+ // Sanity check: everything starts at zero.
+ for(i=0;i<ANZ_WORD;i++)
+ {
+ massert(after[i]==0);
+ massert(before[i]==0);
+ for(int j=0;j<ANZ_WORD;j++)
+ {
+ massert(twoD[i][j]==0);
+ }
+ }
+ for(i=0;i<ANZ_WORD*ANZ_WORD*relInit;i++) // each draw picks a cell (x,y) and adds randomInt(10)+1 to it
+ {
+ int x=randomInt(ANZ_WORD);
+ int y=randomInt(ANZ_WORD);
+ if(twoD[x][y]==0) // first hit of this cell: one more distinct successor/predecessor
+ {
+ after[x]++;
+ before[y]++;
+ }
+ twoD[x][y]+=randomInt(10)+1;
+ }
+ for(i=0;i<ANZ_WORD;i++)
+ {
+ w.setAfterWords(i,after[i]);
+ w.setBeforeWords(i,before[i]);
+ }
+ // Copy every non-zero cell into the sparse statistics of the problem.
+ for(i=0;i<ANZ_WORD;i++)
+ {
+ for(int j=0;j<ANZ_WORD;j++)
+ if( twoD[i][j] )
+ w.setFreq(i,j,twoD[i][j]);
+ }
+ w.testFull();
+ return k;
+}
+
+
+
+
+char *makeTitle(KategProblem &problem,int verfahren)
+{
+ // Returns a strdup()ed title consisting of a short tag for the
+ // optimization method verfahren followed by whatever
+ // problem.makeTitle() writes behind it. The caller owns the string.
+ // An unknown method now yields an empty tag; the buffer used to be left
+ // uninitialised in that case and strlen() read indeterminate memory.
+ const char *tag="";
+ switch(verfahren)
+ {
+ case HC_OPT:
+ tag="HC ";
+ break;
+ case SA_OPT:
+ tag="SA ";
+ break;
+ case TA_OPT:
+ tag="TA ";
+ break;
+ case GDA_OPT:
+ tag="GDA ";
+ break;
+ case RRT_OPT:
+ tag="RRT ";
+ break;
+ default:
+ break;
+ }
+ char x[1024];
+ strcpy(x,tag);
+ problem.makeTitle(x+strlen(x));
+ return strdup(x);
+}
+
+
+
+
+#define MAX_MULTIPLE 10
+
+Array<KategProblem *> &_izrOptimization(Array<KategProblem *> &probs, // One step of the iterated state-space reduction: merges words that share a class in several solved problems, solves the reduced problem and recurses; the class of every original word is written to katOfWord.
+int anzprob,double timeForOneRed,double maxClock,Array<Kategory> &katOfWord,
+int anzIter,int verfahren)
+{
+ massert(anzprob>1);
+ massert(probs[0]->wordFreq.mindestAnzahl<=1);
+ KategProblem *p0=probs[0];
+
+ int nWords=p0->wordFreq.nWords;
+ int nKats=p0->katFreq.nKats;
+ int minimumNumberOfWords = max(1,int(nWords*0.95)); // a reduction only counts if fewer than this many merged words remain
+
+ int indexOfDurchschnitt; // number of merged words found so far ("Durchschnitt" = intersection)
+ Array<int> newWords(nWords); // maps an old word index to its merged word index
+ int useAnzprob=anzprob;
+ do
+ {
+ int w,k;
+ indexOfDurchschnitt=0;
+ for(w=0;w<nWords;w++)
+ newWords[w]=-1;
+ for(k=0;k<useAnzprob;k++)
+ {
+ massert(probs[k]->wordFreq.nWords==nWords);
+ probs[k]->makeKats();
+ }
+
+ for(w=0;w<nWords;w++)
+ {
+ if( newWords[w]==-1 )
+ {
+
+
+ // Intersect the classes that contain w in the first useAnzprob problems.
+ leda_set<int> durchschnitt=(*p0->kats)[p0->katOfWord(w)];
+ for(k=1;k<useAnzprob;k++)
+ durchschnitt = durchschnitt & (*probs[k]->kats)[probs[k]->katOfWord(w)];
+
+ // Every word of the intersection becomes the same merged word.
+ int _anzInDurchschnitt=0;
+ int nr=0;
+ forall_set(leda_set<int>,nr,durchschnitt)
+ {
+ _anzInDurchschnitt++;
+ newWords[nr]=indexOfDurchschnitt;
+ }
+ if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 ) // list the merged groups once, in the first iteration; '*' marks words with n1==1
+ {
+ cout << "- (";
+ forall_set(leda_set<int>,nr,durchschnitt)
+ {
+ cout << p0->getString(nr);
+ if( p0->wordFreq.n1(nr)==1 )
+ cout << "* ";
+ else
+ cout << " ";
+ }
+ cout << ")\n";
+ }
+
+
+
+ // NOTE(review): the result of this subtraction is not read again before durchschnitt goes out of scope.
+ for(k=0;k<useAnzprob;k++)
+ {
+ durchschnitt = durchschnitt - (*probs[k]->kats)[probs[k]->katOfWord(w)];
+ }
+ indexOfDurchschnitt++;
+ }
+ }
+ // Too little reduction: retry with one problem fewer in the intersection.
+ if(indexOfDurchschnitt>=minimumNumberOfWords)
+ {
+ if(useAnzprob==1)
+ {
+ cout << "useAnzProb==1 => mysterious.\n";
+ break;
+ }
+ useAnzprob--;
+ }
+ }
+ while(indexOfDurchschnitt>=minimumNumberOfWords);
+
+ // Build and solve reduced problems; neu has room for MAX_MULTIPLE*anzprob of them.
+ Array<KategProblem *> &neu=*new Array<KategProblem *>(MAX_MULTIPLE*anzprob,(KategProblem *)0);
+ qsort(probs.getPointerToData(),useAnzprob,sizeof(KategProblem *),compareProblem);
+ massert(useAnzprob<=probs.size());
+ double startTime=clockSec();
+ int i, numberOfNew;
+ for(numberOfNew=0; (clockSec()-startTime<timeForOneRed) // continue while time is left OR fewer than anzprob problems exist
+ || (numberOfNew < anzprob) ; numberOfNew++)
+ {
+ int w;
+ if( numberOfNew==anzprob*MAX_MULTIPLE-1 ) // hard upper limit, keeps the index inside neu
+ break;
+ KategProblem *p
+ = neu[numberOfNew]
+ = new KategProblem(indexOfDurchschnitt,nKats-2, // NOTE(review): presumably nKats includes two reserved classes that the constructor adds again - confirm
+ p0->initialisierung,p0->auswertung,p0->nachbarschaft);
+
+ for(w=0;w<indexOfDurchschnitt;w++) // start every list with 5 entries; testFull(1) below resizes the lists to the used length
+ {
+ p->wordFreq.setAfterWords(w,5);
+ p->wordFreq.setBeforeWords(w,5);
+ }
+ for(w=0;w<nWords;w++) // accumulate the bigram counts of p0 on the merged word indices
+ {
+ Array<OneFreq> &after=p0->wordFreq.after[w];
+ int size=after.size();
+ for(i=0;i<size;i++)
+ p->wordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n);
+ }
+ p->wordFreq.testFull(1);
+
+
+
+
+
+ // The reduced problem takes over the value returned by p0's get_h_of_words().
+ p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words());
+ double w1=0.0,w2=0.0;
+ if(numberOfNew<useAnzprob)
+ {
+ // The first useAnzprob reduced problems start from the corresponding old solution and are refined by hill climbing.
+ for(i=0;i<nWords;i++)
+ (p->initLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i);
+ p->_initialize(5);
+ HCOptimization hc(*p,-1);
+ if(verboseMode)
+ {
+ w1=p->nicevalue();
+ cout << "from old category system:" << w1 << endl;
+ }
+ hc.minimize(-1);
+ if(verboseMode)
+ {
+ w2=p->nicevalue();
+ if(w2<w1)
+ cout << "improvement: " << w1-w2 << endl;
+ }
+ }
+ else // all further reduced problems are initialised with _initialize(1) and solved with method verfahren
+ {
+ p->_initialize(1);
+ double mean;
+ StatVar end,laufzeit,start;
+ solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start);
+ w2=p->value();
+ if(verboseMode)
+ cout << "new category system: " << w2 << " (" << p->nicevalue()
+ << ") Zeit: " << clockSec() << "\n";
+ }
+ }
+ int p;
+ for(p=0;p<probs.size();p++) // the input problems are deleted here, so the caller must not use them afterwards
+ {
+ if( probs[p] )
+ delete probs[p];
+ }
+ qsort(neu.getPointerToData(),numberOfNew,sizeof(Problem *),compareProblem);
+ massert(numberOfNew<=neu.size());
+ if( verboseMode )
+ cout << "Iterierte Zustandsraum-Reduktion: " << indexOfDurchschnitt
+ << " words. costs: " << neu[0]->value() << " "
+ << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: "
+ << clockSec() << endl;
+ if( indexOfDurchschnitt<=nKats // stop: no more merged words than classes, or the time limit (if maxClock!=0) is exceeded
+ || (clockSec()>maxClock&&maxClock) )
+ {
+ if( clockSec()>maxClock&&maxClock )
+ cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n";
+ for(i=0;i<nWords;i++)
+ katOfWord[i]=neu[0]->katOfWord(newWords[i]);
+ return neu;
+ }
+ else // otherwise reduce again and map the result back through newWords
+ {
+ Array<Kategory> &newKatOfWord=
+ *(new Array<Kategory>(neu[0]->wordFreq.nWords,-1)); // NOTE(review): allocated with new and not deleted in this function
+ Array<KategProblem *> &erg=_izrOptimization(neu,anzprob,timeForOneRed,
+ maxClock,newKatOfWord,
+ anzIter+1,verfahren);
+ for(i=0;i<nWords;i++)
+ katOfWord[i]=newKatOfWord[newWords[i]];
+ return erg;
+ }
+}
+
+
+
+
+KategProblem *izrOptimization(KategProblem &p,int minN,int firstN, // Solves copies of p, runs _izrOptimization() on them, then refines the resulting assignment by hill climbing on a fresh copy of p, which is returned.
+ double clockForOneRed,double maxClock,int verfahren)
+{
+ Array<Kategory> katOfWord(p.wordFreq.nWords,-1);
+ int startN;
+ if( clockForOneRed<=0 ) // no time budget given: solve exactly firstN copies, otherwise allow up to 1000
+ startN=firstN;
+ else
+ startN=1000;
+ Array<KategProblem *> probs(startN);
+ double val1=0.0,val2=0.0;
+ double endTime=-1;
+
+ double startTime=clockSec();
+ int i;
+ for(i=0;i<startN;i++)
+ {
+ StatVar end,laufzeit,start;
+ double mean;
+ probs[i] = (KategProblem *)((KategProblem *)p.makeEqualProblem());
+ solveProblem(0,*(probs[i]),1,-1,verfahren,mean,end,laufzeit,start);
+ if( i==minN-1 ) // remember when the first minN problems were solved
+ endTime = clockSec();
+ if( i>=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) ) // NOTE(review): on this break i is the index of the last solved problem (i+1 are solved) and the time test holds while budget is still left - confirm both against the intent
+ break;
+ }
+ if( endTime<0 )
+ endTime=clockSec();
+ massert(i>=firstN); // NOTE(review): cannot hold after a break at i==firstN-1 - confirm
+ // Sort the first i entries with compareProblem.
+ qsort(probs.getPointerToData(),i,sizeof(KategProblem *),compareProblem);
+ massert(i<=probs.size());
+ if( clockForOneRed<=0 ) // no budget given: use the time the initial runs took
+ {
+ clockForOneRed=endTime-startTime;
+ if( verboseMode )
+ cout << "time for one reduction: " << clockForOneRed << endl;
+ }
+ _izrOptimization(probs,minN,clockForOneRed,maxClock,katOfWord,0,verfahren); // fills katOfWord and deletes the problems in probs; the returned array is not used here
+ // Start a fresh copy of p from the assignment in katOfWord and refine it with hill climbing.
+ KategProblem *n=(KategProblem *)(p.makeEqualProblem());
+ n->initLike= katOfWord;
+ n->_initialize(5);
+ if( verboseMode )
+ val1=n->value();
+ HCOptimization hc(*n,-1);
+ hc.minimize(-1);
+ val2=n->value();
+ if( verboseMode )
+ cout << "last improvement: " << val2-val1 << "\n";
+ cout << "final costs: " << val2 << " " << n->nicevalue() << endl; // not governed by the if above: printed regardless of verboseMode
+ if(PrintBestTo)
+ n->dumpOn(*PrintBestTo);
+ return n;
+}
+
+
+
+
+
+
+
+
+
+
+
diff --git a/mkcls-v2/KategProblemTest.h b/mkcls-v2/KategProblemTest.h
new file mode 100644
index 0000000..7767b7d
--- /dev/null
+++ b/mkcls-v2/KategProblemTest.h
@@ -0,0 +1,60 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#include "KategProblem.h"
+
+
+KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initialisierung,
+ int auswertung,int nachbarschaft,float relInit=0.1); // Heap-allocated problem with random bigram counts; relInit scales the number of random draws (ANZ_WORD*ANZ_WORD*relInit).
+
+
+ // Builds a problem from a plain-text corpus file (bigrams of neighbouring tokens); returns 0 if the file cannot be opened.
+KategProblem *fromKModel(const char *str,int maxClass,int initialisierung,
+ int auswertung,int nachbarschaft,int minWordFrequency);
+
+ // Builds a problem from a file of "<count> <word1> <word2>" records; returns 0 if the file cannot be opened or a record is invalid.
+KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung,
+ int auswertung,int nachbarschaft,int minWordFrequency);
+ // Writes one "<word>\t<class>" line per word to `to`; the word "$" is not written.
+void writeClasses(Array<Kategory> &katOfWord,KategProblem &problem,ostream &to);
+
+
+ // Reads an initial word->category assignment from file s into p->initLike; returns 1 if some word has no category, else 0.
+int fromCatFile(KategProblem *p,const char *s,bool verb=1);
+
+
+ // Iterated state-space reduction followed by hill climbing; returns a newly created problem holding the result.
+KategProblem *izrOptimization(KategProblem &p,int minN,int firstN,
+double clockForOneRed,double maxClock,int verfahren);
+
+
+ // Sets the default parameters of the TA, RRT and GDA optimizers depending on p.katwahl().
+void KategProblemSetParameters(KategProblem &p);
+
+
diff --git a/mkcls-v2/KategProblemWBC.cpp b/mkcls-v2/KategProblemWBC.cpp
new file mode 100644
index 0000000..1a0d439
--- /dev/null
+++ b/mkcls-v2/KategProblemWBC.cpp
@@ -0,0 +1,344 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include <stdlib.h>
+#include "KategProblem.h"
+
+static int oneFreqCompareSteigend(const void *p,const void *j)
+{
+ // qsort() comparator for OneFreq entries: ascending by frequency n
+ // ("steigend" = ascending). Returns -1, 0 or +1.
+ // The explicit comparison is valid for every FreqType, so no separate
+ // subtraction variant (which can overflow or truncate when converted
+ // to int) is needed for the non-double build.
+ const FreqType lhs=static_cast<const OneFreq *>(p)->n;
+ const FreqType rhs=static_cast<const OneFreq *>(j)->n;
+ if( lhs<rhs )
+ return -1;
+ if( lhs>rhs )
+ return +1;
+ return 0;
+}
+static int oneFreqCompareFallend(const void *p,const void *j)
+{
+ // qsort() comparator for OneFreq entries: descending by frequency n
+ // ("fallend" = descending). Returns -1, 0 or +1.
+ // The explicit comparison is valid for every FreqType, so no separate
+ // subtraction variant (which can overflow or truncate when converted
+ // to int) is needed for the non-double build.
+ const FreqType lhs=static_cast<const OneFreq *>(p)->n;
+ const FreqType rhs=static_cast<const OneFreq *>(j)->n;
+ if( lhs>rhs )
+ return -1;
+ if( lhs<rhs )
+ return +1;
+ return 0;
+}
+
+
+KategProblemWBC::KategProblemWBC(int n,int minw) // Creates empty statistics for n words; minw becomes mindestAnzahl. Totals and fill counters start at 0, fixedWord/minIndex/maxIndex at -1.
+: _n1(n,0),_n2(n,0),with_h_of_words(0),afterFilled(n,0),beforeFilled(n,0),filled(0),fixedWord(n,-1),absteigend(0),nWords(n),nTranspWords(0),
+ mindestAnzahl(minw),after(n),before(n),minIndex(n,-1),maxIndex(n,-1)
+ // h_of_words is not initialised here; get_h_of_words() only returns it once with_h_of_words is non-zero.
+{
+}
+
+KategProblemWBC::~KategProblemWBC()
+{
+  // Releases the sorted word list, if one was created.
+  massert( after.size()==nWords);
+  delete absteigend;   // deleting a null pointer has no effect
+}
+
+void KategProblemWBC::init(int specialFixedWord)
+{
+  // Marks as fixed (fixedWord==1) every word whose n1 and n2 are both
+  // below mindestAnzahl and whose minIndex is at most 1, as well as the
+  // word specialFixedWord. All other words that are still unmarked
+  // (fixedWord<0) are counted in nTranspWords. The descending word list
+  // is created on the first call.
+  const int total=_n1.size();
+  nTranspWords=0;
+  for(int w=0;w<total;++w)
+  {
+    const bool rare = _n1[w]<mindestAnzahl && _n2[w]<mindestAnzahl && minIndex[w]<=1;
+    if( rare || w==specialFixedWord )
+    {
+      // Only an unmarked or already fixed word is expected here.
+      if( fixedWord[w]!=1 && fixedWord[w]!= -1 )
+        cerr << "mkcls:KategProblemWBC::init::ERROR: " << w << " " << fixedWord[w] << endl;
+      fixedWord[w]=1;
+    }
+    else
+    {
+      if( fixedWord[w]<0 )
+        ++nTranspWords;
+    }
+  }
+
+  if( absteigend==0 )
+    absteigend= &(getSortedList(0));
+
+  if( verboseMode && nTranspWords!=_n1.size()-1 )
+    cout << "Es sind: " <<nTranspWords<<" transportierbar.\n";
+}
+
+void KategProblemWBC::set_h_of_words(double s)
+{
+  // The value is kept with inverted sign; get_h_of_words() inverts it again.
+  h_of_words = -s;
+  with_h_of_words=1;
+}
+
+double KategProblemWBC::get_h_of_words()
+{
+  // Returns -h_of_words. On first use h_of_words is computed as the sum
+  // of 0.5*(kat_h(n2)+kat_h(n1)) over all words and then cached.
+  if( !with_h_of_words )
+  {
+    double total=0;
+    for(int w=0;w<nWords;++w)
+      total+=0.5*(kat_h(_n2[w])+kat_h(_n1[w]));
+    h_of_words=total;
+    with_h_of_words=1;
+  }
+  return -h_of_words;
+}
+
+
+void KategProblemWBC::setAfterWords(int w,int anzahl)
+{
+  // (Re)creates the successor list of word w with anzahl placeholder
+  // entries (w=-1, n=0) and marks it as empty.
+  OneFreq placeholder={-1,0};
+  after[w].init(anzahl,placeholder,1);
+  afterFilled[w]=0;
+}
+void KategProblemWBC::setBeforeWords(int w,int anzahl)
+{
+  // (Re)creates the predecessor list of word w with anzahl placeholder
+  // entries (w=-1, n=0) and marks it as empty.
+  OneFreq placeholder={-1,0};
+  before[w].init(anzahl,placeholder,1);
+  beforeFilled[w]=0;
+}
+
+
+void KategProblemWBC::setFreq(int w1,int w2,FreqType anzahl)
+{
+  // Appends the bigram (w1,w2) with frequency anzahl to the successor
+  // list of w1 and to the predecessor list of w2 and adds anzahl to both
+  // totals. No check is made whether the pair is already stored.
+  OneFreq successor;
+  successor.w=w2;
+  successor.n=anzahl;
+  const int afterSlot=afterFilled[w1]++;
+  after[w1][afterSlot]=successor;
+  _n1[w1]+=anzahl;
+
+  OneFreq predecessor;
+  predecessor.w=w1;
+  predecessor.n=anzahl;
+  const int beforeSlot=beforeFilled[w2]++;
+  before[w2][beforeSlot]=predecessor;
+  _n2[w2]+=anzahl;
+}
+
+// Adds anzahl to the entry for `word` among the first `used` elements of
+// `entries`; if there is none, a new entry is stored at position `used`,
+// which is then incremented.
+static void accumulateOneFreq(ManyFreq &entries,int &used,int word,FreqType anzahl)
+{
+  // Searching from the back selects the same element as a complete
+  // forward scan that remembers the last match.
+  int pos=used-1;
+  while( pos>=0 && entries[pos].w!=word )
+    --pos;
+  if( pos>=0 )
+  {
+    entries[pos].n+=anzahl;
+    return;
+  }
+  OneFreq fresh;
+  fresh.n=anzahl;
+  fresh.w=word;
+  entries[used++]=fresh;
+}
+
+void KategProblemWBC::addFreq(int w1,int w2,FreqType anzahl)
+{
+  // Like setFreq(), but a bigram (w1,w2) that is already stored has its
+  // frequency increased instead of being stored a second time.
+  accumulateOneFreq(after[w1],afterFilled[w1],w2,anzahl);
+  _n1[w1]+=anzahl;
+
+  accumulateOneFreq(before[w2],beforeFilled[w2],w1,anzahl);
+  _n2[w2]+=anzahl;
+}
+
+
+short KategProblemWBC::testFull(int doIt)
+ // Checks that every after/before list is filled exactly to its size. Returns 1 if so, else 0. With doIt!=0 the lists are resized to the filled length; with doIt==0 an unfilled list ends the program.
+{
+ int enaNom=0; // number of words with n1==1 and n2==1
+ int afterFilledSum=0,beforeFilledSum=0;
+ int ret=1,i;
+ for(i=0;i<nWords;i++)
+ {
+ if( n1(i)==1 && n2(i)==1 )
+ enaNom++;
+ afterFilledSum+=afterFilled[i];
+ beforeFilledSum+=beforeFilled[i];
+ if(afterFilled[i]!=after[i].size())
+ {
+ ret=0;
+ if( doIt )
+ after[i].resize(afterFilled[i]);
+ }
+ if(beforeFilled[i]!=before[i].size())
+ {
+ ret=0;
+ if( doIt )
+ before[i].resize(beforeFilled[i]);
+ }
+ // (ret stays 0 even when the lists were resized)
+ }
+ if( ret==0 && !doIt )
+ {
+ cerr << "Error: Unfilled word bigram statistics.\n";
+ exit(1);
+ }
+ else
+ filled=1;
+ if( verboseMode>1 )
+ {
+ cout << "MEAN(|L(w)|+|R(w)|)=" << (beforeFilledSum/(float)nWords)
+ +(afterFilledSum/(float)nWords) << endl;
+ cout << "Hapaslegomena: " << enaNom << endl;
+ }
+ int symmetrisch=1; // becomes 0 if some word has n1!=n2
+ for(i=0;i<nWords;i++)
+ {
+ int j;
+ massert(before[i].size()==beforeFilled[i]);
+ massert( after[i].size()== afterFilled[i]);
+ FreqType sum=0; // the list entries must add up to the stored totals
+ for(j=0;j<after[i].size();j++)
+ sum+=after[i][j].n;
+ massert( sum==_n1[i] );
+ sum=0;
+ for(j=0;j<before[i].size();j++)
+ sum+=before[i][j].n;
+ massert(sum==_n2[i]);
+ if(_n1[i]!=_n2[i])
+ {
+ symmetrisch=0;
+ if( verboseMode>1 )
+ cout << "Asymmetrie: " << i << " " << _n1[i] << " " << _n2[i] << endl;
+ }
+
+ }
+ if(verboseMode && symmetrisch==0)
+ cout << "Warning: word bigram statistic is not symmetric "
+ "(this is possibly an error)\n";
+ return ret;
+}
+
+Array<Word> &KategProblemWBC::getSortedList(int steigend)
+{
+  // Returns a list, allocated with new, of all word indices: first the
+  // words with fixedWord<0, sorted by _n1 (ascending if steigend is
+  // non-zero, descending otherwise), then the remaining words in index
+  // order. Requires that testFull() has set `filled`.
+  const int siz=_n2.size();
+  massert(filled);
+  Array<Word> &sortedList =*new Array<Word>(siz);
+  Array<OneFreq> list(siz);
+
+  // Pass 0 collects the words with fixedWord<0, pass 1 appends the rest.
+  int pos=0;
+  int anzFree=0;
+  for(int pass=0;pass<2;++pass)
+  {
+    for(int w=0;w<siz;++w)
+    {
+      const bool isFree = fixedWord[w]<0;
+      if( isFree!=(pass==0) )
+        continue;
+      list[pos].w=w;
+      list[pos].n=_n1[w];
+      ++pos;
+    }
+    if( pass==0 )
+      anzFree=pos;
+  }
+  massert(pos==siz);
+
+  // Only the leading anzFree entries take part in the sort.
+  qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),
+        steigend ? oneFreqCompareSteigend : oneFreqCompareFallend);
+  massert( anzFree<=list.size() );
+
+  for(int k=0;k<siz;++k)
+  {
+    sortedList[k]=list[k].w;
+    massert(steigend || k==0 || k>=anzFree || list[k-1].n>=list[k].n );
+    massert((!steigend) || k==0 || k>=anzFree || list[k-1].n<=list[k].n );
+  }
+  return sortedList;
+}
+
+FreqType KategProblemWBC::numberOfWords()
+{
+  // Returns the sum of all _n1 totals. For non-double frequencies the
+  // sum of the _n2 totals is asserted to be the same.
+  FreqType sumFirst=0;
+  FreqType sumSecond=0;
+  const int total=_n1.size();
+  for(int w=0;w<total;++w)
+  {
+    sumFirst+=_n1[w];
+    sumSecond+=_n2[w];
+  }
+#ifndef FREQTYPE_DOUBLE
+  massert(sumFirst==sumSecond);
+#endif
+  return sumFirst;
+}
+
+void KategProblemWBC::setDollar(int n)
+{
+  // Sets fixedWord[n] to 0; if the word was still unmarked (<0) it no
+  // longer counts in nTranspWords.
+  int &state=fixedWord[n];
+  if( state<0 )
+    --nTranspWords;
+  state=0;
+}
+
+void KategProblemWBC::initializeIndex(const leda_array<string>&words,char firstChar,int unten,int oben,bool noHapas)
+{
+  // Sets minIndex/maxIndex to unten/oben for every word that starts with
+  // firstChar. Unless noHapas is set, a word is only selected if its n1
+  // or n2 total reaches mindestAnzahl.
+  massert(-1<unten);massert(unten<oben);
+  if( verboseMode )
+    cout << "InitializeIndex: " << firstChar << " u:" << unten << " o:" << oben << " " << noHapas << endl;
+
+  int matched=0;
+  int idx;
+  over_array(words,idx)
+  {
+    if( words[idx][0]==firstChar )
+    {
+      // Same evaluation order as a chained ||: later tests only run
+      // while the word has not been accepted yet.
+      bool take=noHapas;
+      if( !take )
+        take = ((short)(n1(idx)+0.0001))>=mindestAnzahl;
+      if( !take )
+        take = ((short)(n2(idx)+0.0001))>=mindestAnzahl;
+      if( take )
+      {
+        minIndex[idx]=unten;
+        maxIndex[idx]=oben;
+        matched++;
+      }
+    }
+  }
+  if( verboseMode )
+    cout << "InitializeIndex gefunden fuer " << matched << " Woerter.\n";
+}
+
diff --git a/mkcls-v2/KategProblemWBC.h b/mkcls-v2/KategProblemWBC.h
new file mode 100644
index 0000000..8a399e5
--- /dev/null
+++ b/mkcls-v2/KategProblemWBC.h
@@ -0,0 +1,131 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef KATEGPROBLEMWBC_H
+#define KATEGPROBLEMWBC_H
+
+struct OneFreq // one entry of a word's successor/predecessor list
+{
+ int w; // index of the other word of the bigram (-1 in unused placeholder entries)
+ FreqType n; // frequency of the bigram
+};
+
+typedef Array<OneFreq> ManyFreq;
+
+class KategProblemWBC
+ // Word bigram counts: for every word the list of its successors (after) and predecessors (before) with frequencies, plus per-word totals.
+{
+
+ friend class KategProblem;
+
+ private:
+ Array<FreqType> _n1; // per word: sum of the frequencies in its after list
+
+ Array<FreqType> _n2; // per word: sum of the frequencies in its before list
+
+
+ double h_of_words; // cached value, stored with inverted sign (see get_h_of_words/set_h_of_words)
+
+
+ short with_h_of_words; // non-zero once h_of_words holds a value
+
+ Array<int> afterFilled; // number of used entries in after[w]
+ Array<int> beforeFilled; // number of used entries in before[w]
+
+ Array<int> &getSortedList(int steigend); // new list of word indices, non-fixed words first and sorted by _n1 (ascending if steigend!=0)
+
+
+ protected:
+ KategProblemWBC(int n,int minw); // n words, minw becomes mindestAnzahl
+
+
+ ~KategProblemWBC();
+
+
+ short filled; // set to 1 by testFull()
+
+ Array<int> fixedWord; // -1 = not fixed; init() sets 1, setDollar() sets 0
+ Array<int> *absteigend; // word list in descending order of _n1, created by init(); deleted in the destructor
+
+ void init(int specialFixedWord=-1); // marks rare words (and specialFixedWord) as fixed and counts the others in nTranspWords
+
+
+ public:
+ int nWords; // number of words
+ int nTranspWords; // number of words still unmarked (fixedWord<0) as counted by init()
+ short mindestAnzahl; // frequency threshold used by init() and initializeIndex()
+ Array<ManyFreq> after; // after[w]: successors of w with bigram frequencies
+ Array<ManyFreq> before; // before[w]: predecessors of w with bigram frequencies
+ Array<int> minIndex; // per word, -1 until set by initializeIndex()
+ Array<int> maxIndex; // per word, -1 until set by initializeIndex()
+
+
+ // Re-initialise the successor list of w with anzahl placeholder entries.
+ void setAfterWords(int w,int anzahl);
+
+ // Re-initialise the predecessor list of w with anzahl placeholder entries.
+ void setBeforeWords(int w,int anzahl);
+
+ // Append bigram (w1,w2) with frequency anzahl to both lists (no check for duplicates).
+ void setFreq(int w1,int w2, FreqType anzahl);
+
+ // Add anzahl to bigram (w1,w2), creating the entries if they do not exist yet.
+ void addFreq(int w1,int w2,FreqType anzahl);
+
+ // Set fixedWord[n] to 0 and remove the word from nTranspWords if it was counted.
+ void setDollar(int n);
+
+ // Returns fixedWord[w].
+ int fixed(int w)
+ {
+ return fixedWord[w];
+ }
+
+ FreqType n1(int w) { return _n1[w];};
+
+
+ FreqType n2(int w) { return _n2[w];};
+
+ // Sum of all _n1 totals.
+ FreqType numberOfWords();
+
+ // Consistency check of the lists; returns 1 if all were filled exactly, with doIt!=0 unfilled lists are resized.
+ short testFull(int doIt=0);
+
+ // Returns the value set by set_h_of_words() or, if none was set, computes and caches it.
+ double get_h_of_words();
+
+
+ void set_h_of_words(double s);
+
+ // Sets minIndex/maxIndex to min/max for the words starting with firstChar.
+ void initializeIndex(const leda_array<string>&words,char firstChar,int min,int max,bool noHapas);
+};
+
+#endif
diff --git a/mkcls-v2/LICENSE b/mkcls-v2/LICENSE
new file mode 100644
index 0000000..5b2225e
--- /dev/null
+++ b/mkcls-v2/LICENSE
@@ -0,0 +1,282 @@
+
+
+Preamble
+
+The licenses for most software are designed to take away your freedom
+to share and change it. By contrast, the GNU General Public License is
+intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the
+rights. These restrictions translate to certain responsibilities for
+you if you distribute copies of the software, or if you modify it.
+
+For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on,
+we want its recipients to know that what they have is not the
+original, so that any problems introduced by others will not reflect
+on the original authors' reputations.
+
+Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at
+all.
+
+The precise terms and conditions for copying, distribution and
+modification follow.
+
+
+TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+0. This License applies to any program or other work which contains a
+notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the Program
+(independent of having been made by running the Program). Whether that
+is true depends on what the Program does.
+
+1. You may copy and distribute verbatim copies of the Program's source
+code as you receive it, in any medium, provided that you conspicuously
+and appropriately publish on each copy an appropriate copyright notice
+and disclaimer of warranty; keep intact all the notices that refer to
+this License and to the absence of any warranty; and give any other
+recipients of the Program a copy of this License along with the
+Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a
+fee.
+
+2. You may modify your copy or copies of the Program or any portion of
+it, thus forming a work based on the Program, and copy and distribute
+such modifications or work under the terms of Section 1 above,
+provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that
+ in whole or in part contains or is derived from the Program or
+ any part thereof, to be licensed as a whole at no charge to all
+ third parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you
+ provide a warranty) and that users may redistribute the program
+ under these conditions, and telling the user how to view a copy
+ of this License. (Exception: if the Program itself is interactive
+ but does not normally print such an announcement, your work based
+ on the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of
+ Sections 1 and 2 above on a medium customarily used for software
+ interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt otherwise
+to copy, modify, sublicense or distribute the Program is void, and
+will automatically terminate your rights under this License. However,
+parties who have received copies, or rights, from you under this
+License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted
+herein. You are not responsible for enforcing compliance by third
+parties to this License.
+
+
+7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+9. The Free Software Foundation may publish revised and/or new
+versions of the General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation. If the Program does not specify a
+version number of this License, you may choose any version ever
+published by the Free Software Foundation.
+
+10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the
+author to ask for permission. For software which is copyrighted by the
+Free Software Foundation, write to the Free Software Foundation; we
+sometimes make exceptions for this. Our decision will be guided by the
+two goals of preserving the free status of all derivatives of our free
+software and of promoting the sharing and reuse of software generally.
+
+NO WARRANTY
+
+11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE
+LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS
+AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF
+ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+
+12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+END OF TERMS AND CONDITIONS
diff --git a/mkcls-v2/MSBOptimization.cpp b/mkcls-v2/MSBOptimization.cpp
new file mode 100644
index 0000000..9478826
--- /dev/null
+++ b/mkcls-v2/MSBOptimization.cpp
@@ -0,0 +1,229 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "MSBOptimization.h"
+#include <stdlib.h>
+#include "ProblemTest.h"
+
+#ifdef __GNUC__ // only when the compiler defines __GNUC__
+template class Array<double>; // explicit instantiation of Array<double>
+template class Array<ProbAndOpt>; // explicit instantiation of Array<ProbAndOpt>
+#endif
+
+struct doubleInt { double a; int i; }; // a value `a` tagged with an index `i`
+static int doubleintcompare(const void *p,const void *j) // qsort() comparator: orders doubleInt ascending by `a`
+{
+ if(((struct doubleInt *)p)->a < ((doubleInt *)j)->a)
+ return -1; // *p sorts before *j
+ else if(((struct doubleInt *)p)->a == ((doubleInt *)j)->a)
+ return 0; // equal values
+ else
+ return 1; // *p sorts after *j
+}
+
+
+MSBOptimization::MSBOptimization(Problem &p,int verf,int anz,Array<double> &pos,Array<double> &por) // p, verf, anz are forwarded to PopOptimization
+: PopOptimization(p,verf,anz),
+percentOfSteps(pos),percentOfRun(por),nachMinimierung(0) // pos/por are copied into by-value members; 0 = minimize() not finished yet
+{
+} // expectedSteps is not set here; zInitialize() assigns it
+
+
+void MSBOptimization::zInitialize() // Initializes the population, then estimates expectedSteps
+{
+ PopOptimization::zInitialize(); // creates the problems and optimizers and sets initialisiert
+
+ int iterationsschritte; // "iteration steps": filled in by solveProblem() through the pointer below
+ double mean;
+ StatVar end,laufzeit,start; // outputs of solveProblem(); not used afterwards in this function
+ zufallSeed();
+
+ // solveProblem() is defined elsewhere; presumably it runs the chosen method on the original
+ // problem and reports statistics -- TODO confirm the meaning of the literal arguments 2, -1, 0.
+
+ solveProblem(ProblemTestVerboseMode,*originalProblem,2,-1,verfahren,mean,
+ end,laufzeit,start,0,&iterationsschritte);
+ expectedSteps=(int)(iterationsschritte);
+
+ if(verboseMode)
+ cout << "MSB:mean number of steps for one run: " << expectedSteps << endl;
+}
+
+
+double MSBOptimization::minimize(int) // The step argument is unnamed and ignored
+{
+ if( initialisiert==0 )
+ zInitialize(); // lazy initialization on first use
+ // Returns problem(0)->value() after the final sort() and sets nachMinimierung to 1.
+ int i;
+ int anz=size(); // total population size
+ int numproblems=anz; // how many entries (from index 0) are still being optimized
+ // Verbose only: estimate the step count of the staged schedule relative to running everything fully.
+ if( verboseMode )
+ {
+ double usedSteps=0;
+ for(i=0;i<percentOfSteps.size();i++)
+ {
+ usedSteps+=expectedSteps*(percentOfSteps[i]-
+ (i==0?0:percentOfSteps[i-1]))*numproblems;
+ numproblems=(int)(ceil(anz*(1.0-percentOfRun[i]))); // NOTE(review): uses ceil(), the real loop below truncates -- confirm which is intended
+ if( numproblems<1 )numproblems=1;
+ }
+ usedSteps+=expectedSteps*
+ (1.0-percentOfSteps[percentOfSteps.size()-1])*numproblems; // NOTE(review): indexes element -1 if percentOfSteps is empty -- confirm callers never pass an empty array
+ cout << "MSB: speed factor: "
+ << (double)usedSteps/(expectedSteps*size()) << endl;
+ numproblems=anz=size(); // restore the counters changed by the estimate
+ }
+ // Staged run: each stage advances the surviving entries, sorts, then shrinks the survivor count.
+ for(i=0;i<percentOfSteps.size();i++)
+ {
+ // steps for this stage = expectedSteps * (this stage's fraction minus the previous one)
+ int steps=(int)(expectedSteps*(percentOfSteps[i]-
+ (i==0?0:percentOfSteps[i-1])));
+
+
+ for(int a=0;a<numproblems;a++)
+ {
+
+ double v;
+ v= optimization(a)->minimize(steps);
+ if(verboseMode)cout << "MSB:" << i << " " << a << ":" << v << endl;
+ }
+
+ sort(); // PopOptimization::sort(); presumably orders by Problem::value() ascending -- TODO confirm
+
+ if(verboseMode)
+ cout << "MSB: best:" << problem(0)->value()
+ << " worst:" << problem(numproblems-1)->value() << endl;
+
+ // keep the fraction (1 - percentOfRun[i]) of the original population, at least one entry
+ numproblems=(int)(anz*(1.0-percentOfRun[i])); // percentOfRun is indexed with percentOfSteps' range; assumes it is at least as long
+ if( numproblems<1 )
+ numproblems=1;
+ if(verboseMode)
+ cout << "MSB: now i have : " << numproblems << " Problem's." << endl;
+ if(numproblems==1)
+ break; // only one survivor left: skip the remaining stages
+ }
+ assert( numproblems>0 );
+
+ // Final phase: run the survivors with -1 steps (presumably "until the optimizer stops" -- TODO confirm)
+ for(int a=0;a<numproblems;a++)
+ optimization(a)->minimize(-1);
+ sort();
+
+ double ergebnisWert = problem(0)->value(); // "ergebnisWert" = result value
+ cout << "MSB: value:" << ergebnisWert << " (nicevalue:"
+ << problem(0)->nicevalue() << ")\n"; // printed regardless of verboseMode
+ nachMinimierung=1; // bestProblem() asserts on this flag
+ return ergebnisWert;
+}
+
+
+
+void MSBOptimization::optimizeValues(Problem &p,int verfahren) // Prints three tables derived from 20 runs sampled at 20 checkpoints
+{
+ int i;
+ struct doubleInt ri[20]; // final value of each run, tagged with the run number
+ double mean;
+ StatVar end,laufzeit,start;
+ solveProblem(ProblemTestVerboseMode,p,5,-1,verfahren,mean,end,laufzeit,start); // defined elsewhere; fills laufzeit
+ double fivePercentSteps=(int)(laufzeit.getMean()/20.0); // one twentieth of laufzeit's mean, truncated to an integer
+ double qualitaet[20][20]; // qualitaet[run][checkpoint]
+ for(i=0;i<20;i++)
+ {
+ Optimization *o=(Optimization *)genIterOptimizer(verfahren,p,-1); // fresh optimizer per run; deleted below
+ for(int a=0;a<20;a++)
+ {
+ qualitaet[i][a]=o->minimize((int)fivePercentSteps); // value returned after the next slice of steps
+ cout << qualitaet[i][a] << " ";
+ }
+ ri[i].a=o->minimize(-1); // final value of this run
+ ri[i].i=i;
+ cout << ri[i].a << endl;
+ delete o;
+ }
+ qsort(ri,20,sizeof(struct doubleInt),doubleintcompare); // ascending: ri[0] is the run with the smallest final value
+ // German header text: "pruning matrix that keeps the three best runs"
+ cout << "#Beschneidungsmatrix, welche die drei besten Laeufe erhaelt: ";
+ for(i=0;i<20;i++) // one output row per checkpoint i
+ {
+ int a;
+ struct doubleInt v[20];
+ for(a=0;a<20;a++)
+ { v[a].i=a;v[a].a=qualitaet[a][i];} // value of run a at checkpoint i
+ qsort(v,20,sizeof(struct doubleInt),doubleintcompare); // rank the runs at this checkpoint
+ int nr=0;
+ for(a=0;a<20;a++)
+ if( v[a].i==ri[0].i || v[a].i==ri[1].i || v[a].i==ri[2].i )
+ nr=a; // ends as the last (largest) rank held by one of the three best final runs
+ float percent=(1.0-nr/20.0)*100.0;
+ if(nr==2)
+ percent=100.0; // the three runs occupy ranks 0..2 exactly
+ cout << "# " << i << " " << (i/20.0)*100 << "% " << percent << "%\n";
+ }
+ cout << "#Beschneidungsmatrix, welche die zwei besten Laeufe erhaelt: "; // same table for the two best runs
+ for(i=0;i<20;i++)
+ {
+ int a;
+ struct doubleInt v[20];
+ for(a=0;a<20;a++)
+ { v[a].i=a;v[a].a=qualitaet[a][i];}
+ qsort(v,20,sizeof(struct doubleInt),doubleintcompare);
+ int nr=0;
+ for(a=0;a<20;a++)
+ if( v[a].i==ri[0].i || v[a].i==ri[1].i )
+ nr=a; // last rank held by one of the two best final runs
+ float percent=(1.0-nr/20.0)*100.0;
+ if(nr==1)
+ percent=100.0; // the two runs occupy ranks 0..1 exactly
+ cout << "# " << i << " " << (i/20.0)*100 << "% " << percent << "%\n";
+ }
+ cout << "#Beschneidungsmatrix, welche den besten Lauf erhaelt: "; // same table for the single best run
+ for(i=0;i<20;i++)
+ {int a;
+ struct doubleInt v[20];
+ for(a=0;a<20;a++)
+ { v[a].i=a;v[a].a=qualitaet[a][i];}
+ qsort(v,20,sizeof(struct doubleInt),doubleintcompare);
+ int nr=0;
+ for(a=0;a<20;a++)
+ if( v[a].i==ri[0].i )
+ nr=a; // rank of the best final run at this checkpoint
+ float percent=(1.0-nr/20.0)*100.0;
+ if(nr==0)
+ percent=100.0; // already ranked first (the formula gives the same result)
+ cout << "# " << i << " " << (i/20.0)*100 << "% " << percent << "%\n";
+ }
+}
+
+
+Problem& MSBOptimization::bestProblem() // Returns the entry at index 0 of the population
+{
+ assert(nachMinimierung==1); // minimize() must have completed first; the check vanishes when NDEBUG is defined
+ return *(problem(0));
+}
diff --git a/mkcls-v2/MSBOptimization.h b/mkcls-v2/MSBOptimization.h
new file mode 100644
index 0000000..ab30c98
--- /dev/null
+++ b/mkcls-v2/MSBOptimization.h
@@ -0,0 +1,77 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef MSBOPTIMIZATION
+#define MSBOPTIMIZATION
+
+#include "PopOptimization.h"
+
+class MSBOptimization : public PopOptimization
+ {
+ // Population optimizer that runs its members in stages and narrows the population between stages.
+ protected:
+ // Both arrays are copied from the constructor arguments and read by minimize().
+ Array<double> percentOfSteps; // per stage: cumulative fraction of expectedSteps
+ Array<double> percentOfRun; // per stage: fraction of the population to drop afterwards
+
+ int expectedSteps; // set by zInitialize()
+ short nachMinimierung; // 0 until minimize() has finished, then 1
+
+ virtual void zInitialize();
+
+
+ public:
+ MSBOptimization(Problem &s,int verf,int anz,Array<double> &pos,
+ Array<double> &por);
+
+
+ virtual ~MSBOptimization(){}
+ // The definition in MSBOptimization.cpp ignores the steps argument.
+ virtual double minimize(int steps=-1);
+
+ // Static diagnostic helper: prints tables to cout and does not use any instance state.
+ static void optimizeValues(Problem &p,int verfahren);
+
+ // Only valid after minimize() has completed (asserted in the definition).
+ Problem& bestProblem();
+
+
+};
+#endif
+
+
+
+
+
+
+
+
+
+
+
diff --git a/mkcls-v2/MYOptimization.cpp b/mkcls-v2/MYOptimization.cpp
new file mode 100644
index 0000000..ced9d31
--- /dev/null
+++ b/mkcls-v2/MYOptimization.cpp
@@ -0,0 +1,85 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "MYOptimization.h"
+
+MYOptimization::MYOptimization(Problem &p,int m) // p and m are forwarded to IterOptimization
+: IterOptimization(p,m),acceptFlagsNumber(0),acceptions(0),total(0) // counters start at zero; acceptFlags[] is left unset
+{
+}
+MYOptimization::MYOptimization(MYOptimization &o) // copies only the IterOptimization base part of o
+: IterOptimization(o),acceptFlagsNumber(0),acceptions(0),total(0) // the acceptance history of o is not copied; counters restart at zero
+{
+}
+short MYOptimization::accept(double delta) // Returns 1 to accept a move, 0 to reject it
+ {
+ int doIt;
+ int verbesserung = delta<0; // "verbesserung" = improvement: 1 when delta is negative
+ if( delta < 0 )
+ doIt=1; // improvements are always accepted
+ else
+ {
+ if(total>=NUMBER_OF_ACCEPTIONS)
+ {
+ double prob = acceptions/(float)(NUMBER_OF_ACCEPTIONS); // share of improvements among the last NUMBER_OF_ACCEPTIONS calls
+ double zuf = zufall01(); // random number; presumably uniform in [0,1) -- TODO confirm
+ // accept a non-improving move with probability prob
+ doIt=zuf<prob;
+ }
+ else
+ doIt=0; // history not yet full: reject every non-improving move
+ }
+ if( total>=NUMBER_OF_ACCEPTIONS )
+ {
+ if( acceptFlags[acceptFlagsNumber] ) // the slot about to be overwritten held an improvement
+ acceptions--;
+ }
+ acceptFlags[acceptFlagsNumber]=verbesserung; // record this call in the circular history
+ if( verbesserung )
+ acceptions++;
+ total++;
+ acceptFlagsNumber++;
+ if(acceptFlagsNumber>=NUMBER_OF_ACCEPTIONS)
+ acceptFlagsNumber=0; // wrap around to the start of the history
+ return doIt;
+ }
+
+short MYOptimization::end() // Stop test: nonzero when all three conditions below hold
+ {
+ return endFlag>0 && total>NUMBER_OF_ACCEPTIONS && acceptions==0; // endFlag comes from outside this file; no improvement in the recorded history
+ }
+void MYOptimization::abkuehlen() // "abkuehlen" = cool down; intentionally empty for this method
+ {
+ }
+
+
+
+void MYOptimization::makeGraphOutput() // Writes the base-class output, then appends acceptions
+{
+ IterOptimization::makeGraphOutput();
+ *GraphOutput << acceptions; // GraphOutput is declared elsewhere; dereferenced without a null check here
+}
+
diff --git a/mkcls-v2/MYOptimization.h b/mkcls-v2/MYOptimization.h
new file mode 100644
index 0000000..a6ca70c
--- /dev/null
+++ b/mkcls-v2/MYOptimization.h
@@ -0,0 +1,61 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+#ifndef MYOPTIMIZATION
+#define MYOPTIMIZATION
+#include "IterOptimization.h"
+
+#define NUMBER_OF_ACCEPTIONS 100 // length of the acceptFlags history
+
+class MYOptimization: public IterOptimization {
+ // Iterative optimizer whose acceptance rule depends on the share of recent improving moves.
+ protected:
+ virtual short accept(double delta);
+
+ // "abkuehlen" = cool down; defined with an empty body in MYOptimization.cpp
+ virtual void abkuehlen();
+
+
+ virtual short end();
+
+
+ public:
+ MYOptimization(Problem &p,int maxIter=-1);
+
+ // Copies the base part only; the counters below restart at zero (see MYOptimization.cpp).
+ MYOptimization(MYOptimization &o);
+
+ // Circular history of improvement flags and the index of the next slot to write.
+ int acceptFlags[NUMBER_OF_ACCEPTIONS],acceptFlagsNumber;
+ int acceptions,total; // improvements currently in the history; total accept() calls
+
+ void makeGraphOutput();
+
+};
+
+#endif
diff --git a/mkcls-v2/Makefile b/mkcls-v2/Makefile
new file mode 100644
index 0000000..a37a70d
--- /dev/null
+++ b/mkcls-v2/Makefile
@@ -0,0 +1,23 @@
+OBJS = GDAOptimization.o HCOptimization.o Problem.o \
+ IterOptimization.o ProblemTest.o RRTOptimization.o \
+ MYOptimization.o SAOptimization.o TAOptimization.o \
+ Optimization.o KategProblemTest.o KategProblemKBC.o \
+ KategProblemWBC.o KategProblem.o StatVar.o general.o \
+ mkcls.o
+# NOTE(review): MSBOptimization.o and PopOptimization.o are not listed above -- confirm that is intended.
+CFLAGS = -Wall -W -DNDEBUG -O3 -Wno-deprecated
+# Suffix rule: compile each .cpp file into the .o file of the same name.
+.cpp.o:
+ $(CXX) $(CFLAGS) -c $< -o $@
+# Extra linker flags (empty by default).
+LDFLAGS =
+# Link all objects into the mkcls executable.
+mkcls: $(OBJS)
+ $(CXX) $(CFLAGS) -o mkcls $(OBJS) $(LDFLAGS)
+.PHONY: remove clean
+remove clean:
+ -rm -f *.o mkcls
+
+
+
+
diff --git a/mkcls-v2/Optimization.cpp b/mkcls-v2/Optimization.cpp
new file mode 100644
index 0000000..03e06df
--- /dev/null
+++ b/mkcls-v2/Optimization.cpp
@@ -0,0 +1,30 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "Optimization.h"
+
+Optimization::~Optimization() {} // out-of-line definition of the virtual destructor; nothing to release
+
diff --git a/mkcls-v2/Optimization.h b/mkcls-v2/Optimization.h
new file mode 100644
index 0000000..4c43427
--- /dev/null
+++ b/mkcls-v2/Optimization.h
@@ -0,0 +1,49 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef OPTIMIZATION
+#define OPTIMIZATION
+
+#include "Problem.h"
+#include "general.h"
+
+class Optimization
+{
+ // Abstract interface: derived classes implement minimize().
+public:
+ // Pure virtual. Callers in this code base pass -1 as well as positive step counts; the meaning is defined by the overrides.
+ virtual double minimize(int steps)=0;
+ virtual ~Optimization(); // virtual, so deleting through an Optimization* runs the derived destructor
+
+};
+#endif
+
+
+
+
diff --git a/mkcls-v2/PopOptimization.cpp b/mkcls-v2/PopOptimization.cpp
new file mode 100644
index 0000000..2e65a2c
--- /dev/null
+++ b/mkcls-v2/PopOptimization.cpp
@@ -0,0 +1,105 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "PopOptimization.h"
+#include "ProblemTest.h"
+
+
+int compareProbAndOpt(const void *p,const void *j) // qsort-style comparator: ascending by prob->value()
+{
+ double a=((ProbAndOpt *)p)->prob->value();
+ double b=((ProbAndOpt *)j)->prob->value();
+ if(a==b)
+ return 0; // equal values
+ if(a<b)
+ return -1; // *p sorts before *j
+ else
+ return +1; // *p sorts after *j
+}
+bool operator<(const ProbAndOpt&a, const ProbAndOpt&b) // orders entries by their problem's value()
+ {
+ return a.prob->value()<b.prob->value(); // both prob pointers must be valid; no null check
+ }
+bool operator==(const ProbAndOpt&a, const ProbAndOpt&b) // equal when both problems report the same value()
+ {
+ return a.prob->value()==b.prob->value(); // exact floating-point comparison; the pointers themselves are not compared
+ }
+
+ostream& operator<<(ostream&o , const ProbAndOpt&){return o;} // no-op: writes nothing and returns the stream
+istream& operator>>(istream&i , ProbAndOpt&){return i;} // no-op: reads nothing and returns the stream
+
+
+
+PopOptimization::PopOptimization(Problem &p,int verf,int anz) // anz entries; verf is stored as verfahren
+: probandopt(anz),initialisiert(0),verfahren(verf) // entries are not filled in until zInitialize() runs
+{
+ originalProblem = &p; // stores the address only; p must outlive this object
+}
+
+
+int PopOptimization::size() // Number of entries in the population
+{
+ return probandopt.size();
+}
+
+Problem *PopOptimization::problem(int i) // Returns the problem stored at index i
+{
+ assert(initialisiert); // zInitialize() must have run; the check vanishes when NDEBUG is defined
+ return probandopt[i].prob;
+}
+
+Optimization *PopOptimization::optimization(int i) // Returns the optimizer stored at index i
+{
+ assert(initialisiert); // zInitialize() must have run; the check vanishes when NDEBUG is defined
+ return probandopt[i].opt;
+}
+
+void PopOptimization::zInitialize() // Fills every entry with a problem and an optimizer for it
+{
+ int i;
+ zufallSeed();
+ for(i=0;i<size();i++)
+ {
+ probandopt[i].prob=originalProblem->makeEqualProblem(); // presumably returns a newly allocated copy -- TODO confirm
+ probandopt[i].prob->initialize();
+ }
+ // NOTE(review): nothing in this file deletes prob/opt and ~PopOptimization() in the header is empty -- possible leak, confirm ownership.
+ zufallSeed();
+ for(i=0;i<size();i++)
+ probandopt[i].opt=(Optimization *)genIterOptimizer(verfahren,
+ *(probandopt[i].prob),-1); // one optimizer per problem, created with -1 as the last argument
+
+ initialisiert=1; // problem(), optimization() and sort() assert on this flag
+}
+
+
+void PopOptimization::sort() // Sorts the whole population array
+{
+ assert(initialisiert);
+ // Array::sort is defined elsewhere; presumably it orders ascending via operator< on ProbAndOpt -- TODO confirm
+ probandopt.sort(size());
+}
+
diff --git a/mkcls-v2/PopOptimization.h b/mkcls-v2/PopOptimization.h
new file mode 100644
index 0000000..3ae5ff3
--- /dev/null
+++ b/mkcls-v2/PopOptimization.h
@@ -0,0 +1,89 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef POPULATIONOPTIMIZATION
+#define POPULATIONOPTIMIZATION
+
+#include "Optimization.h"
+
+// One population entry: a problem paired with the optimizer working on it.
+struct ProbAndOpt
+{
+  Optimization *opt;   // optimizer bound to `prob`
+  Problem *prob;       // problem instance of this entry
+};
+
+bool operator<(const ProbAndOpt&a, const ProbAndOpt&b);
+bool operator==(const ProbAndOpt&a, const ProbAndOpt&b);
+ostream& operator<<(ostream& , const ProbAndOpt&b);
+istream& operator>>(istream& , ProbAndOpt&b);
+
+inline DEFINE_STANDARD_COMPARE(ProbAndOpt)
+
+int compareProbAndOpt(const void *p,const void *j);
+
+// Base class for optimizers that keep a population of (problem, optimizer)
+// pairs, all derived from one original problem.
+class PopOptimization : public Optimization {
+
+
+ private:
+ // One entry per population member; sized by the constructor argument `anz`.
+ Array<ProbAndOpt> probandopt;
+
+ protected:
+ // Set to 1 at the end of zInitialize(); asserted by problem(),
+ // optimization() and sort().
+ int initialisiert;
+ // Address of the problem that was passed to the constructor.
+ Problem *originalProblem;
+
+
+ // Optimizer type that zInitialize() passes to genIterOptimizer().
+ int verfahren;
+
+
+ // Creates one problem copy and one optimizer per population slot.
+ virtual void zInitialize();
+
+
+ public:
+ // s: original problem, verf: optimizer type, anz: population size.
+ PopOptimization(Problem &s,int verf,int anz);
+
+
+ virtual ~PopOptimization() {}
+
+ // Number of population slots.
+ int size();
+
+
+ // Sorts the population entries (requires zInitialize()).
+ void sort();
+
+
+ // To be provided by derived classes.
+ virtual Problem& bestProblem()=0;
+
+
+ // Problem stored in slot i (requires zInitialize()).
+ Problem *problem(int i);
+
+
+ // Optimizer stored in slot i (requires zInitialize()).
+ Optimization *optimization(int i);
+
+
+};
+#endif
diff --git a/mkcls-v2/Problem.cpp b/mkcls-v2/Problem.cpp
new file mode 100644
index 0000000..6e126c8
--- /dev/null
+++ b/mkcls-v2/Problem.cpp
@@ -0,0 +1,165 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#include "Problem.h"
+#include "Optimization.h"
+
+// Nothing to release at this level.
+Problem::~Problem()
+{
+}
+
+// Stores the configuration and zeroes all counters.  `max` becomes
+// maxCompVal and `anz` becomes maxComp; the three remaining arguments select
+// initialization, evaluation and neighbourhood variants.  With
+// verboseMode>1 the two limits are printed.
+Problem::Problem(int max,int anz,int _initialisierung,int _auswertung,
+                 int _nachbarschaft)
+  : initialized(0),
+    curCompVal(0),
+    curCompChange(0),
+    maxCompVal(max),
+    maxComp(anz),
+    curComp(0),
+    initialisierung(_initialisierung),
+    auswertung(_auswertung),
+    nachbarschaft(_nachbarschaft),
+    numberOfFullEvaluations(0),
+    numberOfPartEvaluations(0),
+    numberOfDoChange(0)
+{
+  if( !(verboseMode>1) )
+    return;
+  cout << "Initialization of Problem: " << maxComp << " " << maxCompVal
+       << endl;
+}
+
+// Resets the iteration state and all counters, marks the problem as
+// initialized and delegates to _initialize().  The sentinel -23 (the
+// declared default argument) selects the mode stored in `initialisierung`.
+// Afterwards the dimension limits are re-read from the virtual hooks.
+void Problem::initialize(int i)
+{
+  curCompChange=0;
+  curCompVal=0;
+  curComp=0;
+  numberOfDoChange=0;
+  numberOfPartEvaluations=0;
+  numberOfFullEvaluations=0;
+  initialized=1;
+
+  const int mode=( i== -23 ) ? initialisierung : i;
+  _initialize(mode);
+
+  maxComp=maxDimension();
+  maxCompVal=maxDimensionVal();
+}
+
+// Applies change `c` via _doChange(), counts it, and sets the flag that
+// makes the next change() call advance to the next direction.
+void Problem::doChange(ProblemChange &c)
+{
+  assert (initialized);
+  curCompChange=1;
+  _doChange(c);
+  ++numberOfDoChange;
+}
+
+// Moves on to the next component (cyclically, modulo maxComp), clears the
+// "change applied" flag and restarts the value counter if it has reached
+// maxCompVal.
+void Problem::incrementDirection()
+{
+  if( curCompVal==maxCompVal )
+    curCompVal=0;
+  curCompChange=0;
+  const int next=curComp+1;
+  curComp=next%maxComp;
+}
+
+// Produces the next candidate change.
+//
+// Each round first advances the direction if a change was applied since the
+// last call or the value counter has reached maxCompVal, then asks
+// _change() for a candidate and increments the value counter.  Rounds are
+// repeated until _change() reports success (non-zero).
+//
+// The retry is a loop rather than a recursive call, so a long run of
+// rejected candidates no longer grows the call stack.
+ProblemChange& Problem::change()
+{
+  for(;;)
+    {
+      assert( initialized );
+      assert( maxCompVal>=curCompVal);
+
+      if( curCompChange||maxCompVal==curCompVal )
+        incrementDirection();
+
+      ProblemChange *p;
+      int changeFound=_change(&p);
+      curCompVal++;
+      if( changeFound!=0 )
+        return *p;
+    }
+}
+// Full evaluation: counts the call, initializes the problem on first use
+// and returns _value().
+double Problem::value()
+{
+  ++numberOfFullEvaluations;
+  if( initialized==0 )
+    initialize();
+  const double result=_value();
+  return result;
+}
+
+// Value difference that applying `x` would cause.  Determined by applying
+// the change, evaluating, and undoing it again; both steps are counted in
+// numberOfDoChange.  The final assert re-evaluates the problem to check
+// that the undo restored the previous value.
+double Problem::valueChange(ProblemChange &x)
+{
+  ++numberOfPartEvaluations;
+  assert( initialized );
+
+  const double before=value();
+  _doChange(x);
+  ++numberOfDoChange;
+
+  const double after=value();
+  _undoChange(x);
+  ++numberOfDoChange;
+
+  assert( before==value() );
+  return after-before;
+}
+
+// Writes the three configuration values followed by the evaluation and
+// change counters to `strm`.
+void Problem::dumpOn(ostream &strm)
+{
+  assert( initialized );
+  strm << "Problem(" << initialisierung << ",";
+  strm << auswertung << ",";
+  strm << nachbarschaft << ")\n";
+  strm << " #value: " << numberOfFullEvaluations << endl;
+  strm << "#valueChange: " << numberOfPartEvaluations << endl;
+  strm << " #doChange: " << numberOfDoChange << endl;
+}
+
+// Collects `anz` samples of value increases observed while stepping the
+// optimizer `s` one step at a time (s.minimize(1)).  Sampling gives up once
+// the try counter exceeds 50000.  The returned StatVar is allocated with
+// new; the caller has to delete it.
+StatVar& Problem::deviationStatVar(Optimization &s,int anz)
+{
+  assert( initialized );
+  StatVar *samples=new StatVar;
+  double cur=value();
+  for(int tries=0; samples->getNum()<anz; )
+    {
+      if( tries++>50000 )
+        break;
+      const double neuer=s.minimize(1);
+      if( neuer>cur )
+        samples->addValue(neuer-cur);
+      cur=neuer;
+      vassert(NULLFLOAT(cur-value()));
+    }
+  return *samples;
+}
+
+// Prints the generic header line; the initialization check happens after
+// the output.
+void Problem::dumpInfos(ostream &strm)
+{
+  strm << "Problem: ";
+  strm << endl;
+  assert( initialized );
+}
+
+
+// Default "nice" value: ignores its argument and returns value().
+double Problem::nicevalue(double)
+{
+  const double plain=value();
+  return plain;
+}
+
+// Default limit hooks: both report -1.
+int Problem::maxDimensionVal(void)
+{
+  return -1;
+}
+
+int Problem::maxDimension(void)
+{
+  return -1;
+}
+
+// ProblemChange carries no state at this level; both special members are
+// empty.
+ProblemChange::~ProblemChange()
+{
+}
+
+ProblemChange::ProblemChange()
+{
+}
+
+// Copies the three counters and the initialized flag from `p`.
+void Problem::setValuesFrom(Problem *p)
+{
+  const Problem &source=*p;
+  initialized=source.initialized;
+  numberOfDoChange=source.numberOfDoChange;
+  numberOfPartEvaluations=source.numberOfPartEvaluations;
+  numberOfFullEvaluations=source.numberOfFullEvaluations;
+}
diff --git a/mkcls-v2/Problem.h b/mkcls-v2/Problem.h
new file mode 100644
index 0000000..337390e
--- /dev/null
+++ b/mkcls-v2/Problem.h
@@ -0,0 +1,159 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef PROBLEMCHANGE
+#define PROBLEMCHANGE
+#include <iostream>
+#include "general.h"
+#include "StatVar.h"
+
+class Optimization;
+
+// Polymorphic base for a single modification of a Problem.  It declares no
+// data of its own; Problem passes objects of this type to _doChange(),
+// _undoChange() and _change().
+class ProblemChange
+
+{
+ public:
+ virtual ~ProblemChange();
+ ProblemChange();
+};
+
+// Abstract optimization problem.  The public methods keep the iteration
+// state and the evaluation counters; the real work is delegated to the pure
+// virtual _xxx() hooks implemented by derived classes.
+class Problem {
+
+ private:
+ // Set to 1 by initialize(); asserted by most public methods.
+ short initialized;
+ // Incremented by change() after every _change() call; reset to 0 by
+ // incrementDirection() once it equals maxCompVal.
+ int curCompVal;
+ // Set to 1 by doChange(), cleared by incrementDirection().
+ short curCompChange;
+ // Constructor argument, replaced by maxDimensionVal() in initialize().
+ int maxCompVal;
+ // Constructor argument, replaced by maxDimension() in initialize().
+ int maxComp;
+
+
+ protected:
+ // Current component; advanced modulo maxComp by incrementDirection().
+ int curComp;
+
+ // Copies the counters and the initialized flag from p.
+ void setValuesFrom(Problem *p);
+
+ // The base implementation returns -1.
+ virtual int maxDimensionVal(void) ;
+
+
+ // The base implementation returns -1.
+ virtual int maxDimension(void) ;
+
+
+ inline int curDimension(void) { assert(maxComp!=-1);return curComp;}
+
+
+ inline int curDimensionVal(void) { assert(maxComp!=-1);return curCompVal;}
+
+
+
+ // Applies change c.
+ virtual void _doChange(ProblemChange &c)=0;
+
+
+ // Stores a candidate change in *p; a return value of 0 makes change()
+ // try again.
+ virtual int _change(ProblemChange **p)=0;
+
+
+ // Reverts change c; used by valueChange() after _doChange().
+ virtual void _undoChange(ProblemChange &c)=0;
+
+
+ // Called by initialize() with the selected initialization mode.
+ virtual void _initialize(int initialisierung)=0;
+
+
+ // Called by value() to obtain the current value.
+ virtual double _value()=0;
+
+
+ public:
+ Problem(int maxCompVal=-1,int maxComp=-1,int _initialisierung=0,
+ int _auswertung=0,int _nachbarschaft=0);
+
+ virtual ~Problem();
+
+
+ // Applies c and counts it in numberOfDoChange.
+ void doChange(ProblemChange &c);
+
+
+ // Returns the next candidate change.
+ ProblemChange& change();
+
+
+ // Full evaluation; counted in numberOfFullEvaluations.
+ virtual double value();
+
+
+ // Value difference caused by c; counted in numberOfPartEvaluations.
+ virtual double valueChange(ProblemChange &c);
+
+
+ // Resets state and counters; the default -23 selects the stored
+ // `initialisierung` mode.
+ virtual void initialize(int a= -23);
+
+
+ // The inline definition below always returns 0.
+ inline virtual short endCriterion();
+
+
+ virtual int maxNonBetterIterations()=0;
+
+
+ virtual int expectedNumberOfIterations()=0;
+
+
+ // Writes configuration and counters to strm.
+ virtual void dumpOn(ostream &strm);
+
+
+ virtual void dumpInfos(ostream &strm);
+
+
+ // Returns a new Problem object; solveProblem() deletes the ones it
+ // requests.
+ virtual Problem *makeEqualProblem()=0;
+
+
+ // The base implementation ignores its argument and returns value().
+ virtual double nicevalue(double vorher=1e100);
+
+
+ // Returns a StatVar allocated with new; the caller deletes it.
+ virtual StatVar& deviationStatVar(Optimization &s,int anz);
+
+
+ virtual void incrementDirection();
+
+
+
+
+
+ // Configuration values passed to the constructor.
+ int initialisierung;
+ int auswertung;
+ int nachbarschaft;
+
+ // Counters maintained by value(), valueChange() and doChange().
+ int numberOfFullEvaluations;
+ int numberOfPartEvaluations;
+ int numberOfDoChange;
+
+
+
+};
+
+// Default end criterion: always reports 0.
+inline short Problem::endCriterion()
+{
+  const short notFinished=0;
+  return notFinished;
+}
+
+#endif
+
diff --git a/mkcls-v2/ProblemTest.cpp b/mkcls-v2/ProblemTest.cpp
new file mode 100644
index 0000000..40fea7a
--- /dev/null
+++ b/mkcls-v2/ProblemTest.cpp
@@ -0,0 +1,264 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "ProblemTest.h"
+#include "HCOptimization.h"
+#include "RRTOptimization.h"
+#include "SAOptimization.h"
+#include "TAOptimization.h"
+#include "GDAOptimization.h"
+#include "MYOptimization.h"
+#include <stdio.h>
+#include "general.h"
+#include <stdlib.h>
+
+// Verbosity that multiSolveProblem() passes to solveProblem().
+short ProblemTestVerboseMode=1;
+// When PrintBestTo is non-null, solveProblem() dumps the best problem to
+// that stream instead of cout.
+ofstream *PrintBestTo=0,*PrintBestTo2=0;
+
+
+// Three-way comparison of two array elements of type Problem* (passed as
+// untyped pointers) by value().  Returns -1, 0 or +1.
+int compareProblem(const void *p,const void *j)
+{
+  const double lhs=(*(Problem **)p)->value();
+  const double rhs=(*(Problem **)j)->value();
+  if( lhs<rhs )
+    return -1;
+  return ( lhs==rhs ) ? 0 : +1;
+}
+
+
+// Factory for the iterative optimizers.  Creates the optimizer selected by
+// `verfahren` for `problem` with an iteration limit of `maxIter`, then
+// re-initializes the problem.  For an unknown `verfahren` it returns 0 and
+// leaves the problem untouched.  The caller owns the returned object.
+IterOptimization *genIterOptimizer(int verfahren,Problem &problem,int maxIter)
+{
+  IterOptimization *created=0;
+  switch(verfahren)
+    {
+    case HC_OPT:  created = new HCOptimization(problem,maxIter);  break;
+    case GDA_OPT: created = new GDAOptimization(problem,maxIter); break;
+    case SA_OPT:  created = new SAOptimization(problem,maxIter);  break;
+    case TA_OPT:  created = new TAOptimization(problem,maxIter);  break;
+    case RRT_OPT: created = new RRTOptimization(problem,maxIter); break;
+    case MY_OPT:  created = new MYOptimization(problem,maxIter);  break;
+    default:      break;
+    }
+  if( created==0 )
+    return 0;
+  problem.initialize();
+  return created;
+}
+
+
+// Runs optimizer `verfahren` on `problem` up to `versuche` times and
+// gathers statistics over the runs.
+//
+//   verbose               0: silent, >0: summary, >1: also dump the best
+//                         problem, >2: per-run progress
+//   optimierungsschritte  iteration limit handed to the optimizer
+//   mean                  out: mean of the final values of all runs
+//   endNice / startNice   out: nicevalue() after / before each run
+//   auswertungen          out: per run numberOfPartEvaluations, or the
+//                         optimizer's step count if that counter is 0
+//   maxClock              clockSec() value after which no further run is
+//                         started; 0 disables the limit
+//   iterationsschritte    optional out: mean optimizer step count
+//
+// Returns the smallest final value of all runs.
+//
+// The snapshot of the best problem is deleted before a better one replaces
+// it, so only one snapshot is alive at any time.
+double solveProblem(int verbose,Problem &problem,int versuche,
+                    int optimierungsschritte,int verfahren,double &mean,
+                    StatVar &endNice,StatVar &auswertungen,StatVar &startNice,
+                    double maxClock,int *iterationsschritte)
+{
+  double smallestV=1e100;
+  Problem *bestP=0;
+  StatVar start,end;
+  StatVar dauer;
+  StatVar iterschritte;
+
+  for(int i=0;i<versuche;i++)
+    {
+      if(verbose>2)
+        {
+          cout << " " << i << " of " << versuche << ".\n";
+          cout.flush();
+        }
+      double vorher=clockSec();
+
+      IterOptimization *opt=genIterOptimizer(verfahren,problem,
+                                             optimierungsschritte);
+      problem.numberOfPartEvaluations=0;
+
+      startNice.addValue(problem.nicevalue());
+      start.addValue(problem.value());
+
+      double v=opt->minimize(optimierungsschritte);
+
+      if( problem.numberOfPartEvaluations==0)
+        auswertungen.addValue(opt->getCurStep());
+      else
+        auswertungen.addValue(problem.numberOfPartEvaluations);
+      iterschritte.addValue(opt->getCurStep());
+
+      endNice.addValue(problem.nicevalue());
+      end.addValue(problem.value());
+      dauer.addValue(clockSec()-vorher);
+      if( verbose>2 )
+        {
+          cout << i << ". " << v << ": ";
+          problem.dumpOn(cout);
+        }
+      delete opt;
+      if( v<smallestV && verbose>1 )
+        {
+          // Drop the previous snapshot first; deleting a null pointer is a
+          // no-op, so the first improvement needs no special case.
+          delete bestP;
+          bestP=problem.makeEqualProblem();
+          smallestV=v;
+        }
+      if( verbose>2 )
+        cout << " time: " << clockSec() << " best:" << endNice.quantil(0)
+             << " this:" << problem.nicevalue() << endl;
+      if( maxClock && clockSec()>maxClock )
+        {
+          if(verbose)
+            cout << "Stop because of time limit ( " << (clockSec()-maxClock)
+                 << " Sekunden)\n";
+          break;
+        }
+    }
+
+  if(verbose)
+    {
+      cout << "\n***** " << start.getNum() << " runs. (algorithm:";
+      switch(verfahren)
+        {
+        case HC_OPT:
+          cout << "HC";
+          break;
+        case RRT_OPT:
+          cout << "RRT";
+          break;
+        case GDA_OPT:
+          cout << "GDA";
+          break;
+        case TA_OPT:
+          cout << "TA";
+          break;
+        case SA_OPT:
+          cout << "SA";
+          break;
+        case MY_OPT:
+          cout << "MY";
+          break;
+        default:
+          cout << "!unknown!";
+        }
+      cout << ")*****\n";
+      problem.dumpInfos(cout);
+      cout << endl;
+      cout << "start-costs: "; start.dumpOn(cout); cout << endl;
+      cout << " end-costs: "; end.dumpOn(cout); cout << endl;
+      cout << " start-pp: "; startNice.dumpOn(cout); cout << endl;
+      cout << " end-pp: "; endNice.dumpOn(cout); cout << endl;
+      cout << " iterations: "; auswertungen.dumpOn(cout); cout << endl;
+      cout << " time: "; dauer.dumpOn(cout);
+      cout << endl;
+    }
+  if( bestP )
+    {
+      if(PrintBestTo)
+        bestP->dumpOn(*PrintBestTo);
+      else
+        bestP->dumpOn(cout);
+      delete bestP;
+    }
+  mean = end.getMean();
+  if( iterationsschritte )
+    *iterationsschritte=(int)(iterschritte.getMean());
+  return end.getSmallest();
+}
+
+
+
+// Runs every optimizer with an id below MAX_OPT_NR on `problem` (hill
+// climbing first, then the others in id order) with `versuche` runs each,
+// and prints three rankings of the final values: by mean, by best value and
+// by 20% quantile.  Every optimizer gets its own budget of `maxSeconds`
+// seconds; a value <= 0 disables the time limit.
+//
+// The statistics arrays hold MAX_OPT_NR entries, so only ids below
+// MAX_OPT_NR are labelled.  MY_OPT lies beyond that bound and is never run
+// here; the former `end[MY_OPT].title` assignment wrote past the end of the
+// array and has been removed.
+void multiSolveProblem(Problem &problem,int versuche,int maxSeconds)
+{
+  int i;
+  double rDummy;
+  StatVar end[MAX_OPT_NR],auswertungen[MAX_OPT_NR],start[MAX_OPT_NR];
+  double maxClock=clockSec()+maxSeconds;
+  if(maxSeconds<=0)maxClock=0;
+  solveProblem(ProblemTestVerboseMode,problem,versuche,-1,HC_OPT,rDummy,
+               end[HC_OPT],auswertungen[HC_OPT],start[HC_OPT],maxClock);
+  for(i=0;i<MAX_OPT_NR;i++)
+    {
+      if( i==HC_OPT )
+        continue;
+      // Fresh deadline for each optimizer.
+      maxClock=clockSec()+maxSeconds;
+      if(maxSeconds<=0)maxClock=0;
+      solveProblem(ProblemTestVerboseMode,problem,versuche, -1,i,rDummy,end[i],
+                   auswertungen[i],start[i],maxClock);
+    }
+  end[HC_OPT].title = " HC";
+  end[SA_OPT].title = " SA";
+  end[GDA_OPT].title = " GDA";
+  end[RRT_OPT].title = " RRT";
+  end[TA_OPT].title = " TA";
+
+  for(i=0;i<MAX_OPT_NR;i++)
+    end[i].quantil(0.5);
+
+  cout << "mean: \n";
+  compareStatVarQuantil=-1;
+  qsort(end,MAX_OPT_NR,sizeof(StatVar),compareStatVar);
+  for(i=0;i<MAX_OPT_NR;i++)
+    cout << end[i].title << " " << end[i].getMean() << endl;
+
+  cout << "\nbest: \n";
+  compareStatVarQuantil=0;
+  qsort(end,MAX_OPT_NR,sizeof(StatVar),compareStatVar);
+  for(i=0;i<MAX_OPT_NR;i++)
+    cout << end[i].title << " " << end[i].quantil(compareStatVarQuantil)
+         << endl;
+
+  cout << "\n20%-quantil: \n";
+  compareStatVarQuantil=0.2;
+  qsort(end,MAX_OPT_NR,sizeof(StatVar),compareStatVar);
+  for(i=0;i<MAX_OPT_NR;i++)
+    cout << end[i].title << " " << end[i].quantil(compareStatVarQuantil)
+         << endl;
+}
+
+
+// Tunes one default parameter each for TA, RRT, GDA and SA (in that order)
+// with IterOptimizationOptimizeParameter() and prints the best parameter
+// and the value it achieved.
+void metaOptimization(Problem &tp,int nLaeufe,int nPars)
+{
+  double bestValue;
+
+  const double taPar=IterOptimizationOptimizeParameter(
+      tp,TAOptimization::defaultAnnRate,0.0,1.0,nLaeufe,nPars,TA_OPT,bestValue);
+  cout << "#TA(defaultAnnRate) BEST-PAR: " << taPar << " BEST-VAL: " << bestValue << endl;
+
+  const double rrtPar=IterOptimizationOptimizeParameter(
+      tp,RRTOptimization::defaultAnnRate,0.0,1.0,nLaeufe,nPars,RRT_OPT,bestValue);
+  cout << "#RRT(defaultAnnRate) BEST-PAR: " << rrtPar << " BEST-VAL: " << bestValue << endl;
+
+  const double gdaPar=IterOptimizationOptimizeParameter(
+      tp,GDAOptimization::defaultAlpha,0.0,0.01,nLaeufe,nPars,GDA_OPT,bestValue);
+  cout << "#GDA(defaultAlpha) BEST-PAR: " << gdaPar << " BEST-VAL: " << bestValue << endl;
+
+  const double saPar=IterOptimizationOptimizeParameter(
+      tp,SAOptimization::defaultEndAnnRate,0.0,1.0,nLaeufe,nPars,SA_OPT,bestValue);
+  cout << "#SA(defaultEndAnnRate) BEST-PAR: " << saPar << " BEST-VAL: " << bestValue << endl;
+}
diff --git a/mkcls-v2/ProblemTest.h b/mkcls-v2/ProblemTest.h
new file mode 100644
index 0000000..4bd8bda
--- /dev/null
+++ b/mkcls-v2/ProblemTest.h
@@ -0,0 +1,65 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef PROBLEMTEST_H
+#define PROBLEMTEST_H
+
+#include "Problem.h"
+#include "StatVar.h"
+#include <fstream>
+
+
+// Ids of the optimization methods handled by genIterOptimizer().
+// MAX_OPT_NR (5) is also the loop bound and array size used by
+// multiSolveProblem(); MY_OPT comes after it, so its value (6) lies outside
+// that range.
+enum {TA_OPT, HC_OPT, SA_OPT,RRT_OPT,GDA_OPT,MAX_OPT_NR,MY_OPT };
+
+class IterOptimization;
+
+extern short ProblemTestVerboseMode;
+
+extern ofstream *PrintBestTo,*PrintBestTo2;
+
+double solveProblem(int verbose,Problem &problem,int versuche,
+int optimierungsschritte,int verfahren,double &mean,StatVar &endValue,
+StatVar &laufzeit,StatVar &initValue,double maxSec= 0,int *iterationsschritte=0);
+
+
+
+int compareProblem(const void *p,const void *j);
+
+
+
+void multiSolveProblem(Problem &problem,int versuche,int maxSeconds);
+
+
+
+IterOptimization *genIterOptimizer(int verfahren,Problem &problem,int maxIter);
+
+
+void metaOptimization(Problem &p,int nLaeufe,int nPars);
+
+#endif
diff --git a/mkcls-v2/README b/mkcls-v2/README
new file mode 100644
index 0000000..8e453df
--- /dev/null
+++ b/mkcls-v2/README
@@ -0,0 +1,10 @@
+========================================================================
+mkcls is a tool that trains word classes using a maximum-likelihood
+criterion. The resulting word classes are especially suited for
+language models or statistical translation models. The program mkcls
+was written by Franz Josef Och
+(och@informatik.rwth-aachen.de).
+========================================================================
+
+To see the available options of mkcls, simply start the program
+without arguments.
diff --git a/mkcls-v2/RRTOptimization.cpp b/mkcls-v2/RRTOptimization.cpp
new file mode 100644
index 0000000..55e2122
--- /dev/null
+++ b/mkcls-v2/RRTOptimization.cpp
@@ -0,0 +1,217 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "RRTOptimization.h"
+#include "ProblemTest.h"
+
+// Quantile of the sampled deteriorations that zInitialize() uses as the
+// initial deviation.
+double RRTOptimization::defaultAnnRate=0.6;
+// Factor applied to problem.expectedNumberOfIterations() when no step limit
+// was given.
+double RRTOptimization::defaultMultiple=2.0;
+
+
+
+// Explicit schedule: start with deviation `t` and lower it by `dt` on every
+// cooling step.  `t` must not be negative.
+RRTOptimization::RRTOptimization(Problem &p,double t,double dt,int m)
+  : IterOptimization(p,m),
+    deviation(t),
+    deltaDeviation(dt)
+{
+  assert(deviation>=0);
+}
+
+
+
+// Automatic schedule: the negative deviation tells zInitialize() to derive
+// the deviation and its step size from the problem.
+RRTOptimization::RRTOptimization(Problem &p,int m)
+  : IterOptimization(p,m),
+    deviation(-1),
+    deltaDeviation(0)
+{
+}
+
+
+
+// Copies the base-class state and the three schedule members of `o`.
+RRTOptimization::RRTOptimization(RRTOptimization &o)
+  : IterOptimization(o),
+    deviation(o.deviation),
+    deltaDeviation(o.deltaDeviation),
+    record(o.record)
+{
+}
+
+
+
+// Prepares a run.  If no deviation was given (deviation<0), deteriorations
+// are sampled via problem.deviationStatVar(); the defaultAnnRate quantile
+// of them becomes the initial deviation, and the step size is chosen so the
+// deviation reaches zero after n steps (4/5 of maxStep, or the expected
+// iteration count times defaultMultiple when maxStep is unset).  Sampling
+// moves the problem, so problem and base class are re-initialized
+// afterwards.  Finally the record is set to the current problem value.
+void RRTOptimization::zInitialize()
+{
+  IterOptimization::zInitialize();
+  if( deviation<0 )
+    {
+      StatVar *samples=&problem.deviationStatVar(*this,ANZ_VERSCHLECHTERUNGEN);
+
+      int n;
+      if( maxStep>0 )
+        n=(int)(maxStep*4.0/5.0);
+      else
+        {
+          n=(int)(problem.expectedNumberOfIterations()*defaultMultiple);
+          maxStep=n;
+        }
+
+      deviation = samples->quantil(defaultAnnRate);
+      deltaDeviation = deviation/(float)n;
+
+      if( verboseMode>0 )
+        {
+          cout << "#Algorithm: Record-To-Record-Travel: (anfAnnRate="
+               << defaultAnnRate << ",T=" << deviation << ",deltaT="
+               << deltaDeviation << ")\n";
+        }
+
+      curStep=0;
+      endFlag=0;
+      delete samples;
+      problem.initialize();
+      IterOptimization::zInitialize();
+    }
+  record=problem.value();
+  assert(deviation>=0);
+}
+
+// The run is over once the end flag is set and the deviation has reached
+// zero.
+short RRTOptimization::end()
+{
+  if( endFlag>0 && deviation==0.0 )
+    return 1;
+  return 0;
+}
+// Cooling step: lowers a non-negative deviation by deltaDeviation, never
+// letting it fall below zero.
+void RRTOptimization::abkuehlen()
+{
+  if( !(deviation>=0) )
+    return;
+  deviation -= deltaDeviation;
+  if( deviation<0 )
+    deviation=0;
+}
+// Acceptance rule.  While no deviation is set every move is accepted.
+// Otherwise a move is accepted if the resulting value stays within
+// `deviation` of the best value seen so far (the record); a value below
+// the record becomes the new record.
+short RRTOptimization::accept(double delta)
+{
+  if( deviation<0 )
+    return 1;
+
+  if( !(delta + curValue - deviation < record) )
+    return 0;
+
+  if( delta + curValue < record )
+    record = delta+curValue;
+  return 1;
+}
+
+// Appends the current deviation to the base-class graph output.
+void RRTOptimization::makeGraphOutput()
+{
+  IterOptimization::makeGraphOutput();
+  ostream &graph=*GraphOutput;
+  graph << deviation;
+}
+
+
+
+
+// Prints one result row of the parameter scan: the parameter, statistics of
+// the final values and the run time, and the mean reported by solveProblem().
+static void printRRTResultRow(double parameter,StatVar &end,StatVar &laufzeit,
+                              double now)
+{
+  cout << parameter << " ";
+  cout << end.getMean() << " " << end.quantil(0.2) << " "
+       << end.quantil(0.79) << " " << laufzeit.getMean() << " "
+       << end.quantil(0.0) << " " << end.getSigma() << " "
+       << end.getSigmaSmaller() << " " << end.getSigmaBigger()
+       << " " << now << endl;
+}
+
+// Scans one of the static defaults and returns the setting with the
+// smallest mean final value.
+//
+//   p                     problem to optimize
+//   proParameter          number of runs per parameter setting
+//   numParameter          number of settings scanned for typ 1
+//   typ                   1: scan defaultAnnRate, 10: scan defaultMultiple;
+//                         any other value prints an error and exits
+//   optimierungsschritte  iteration limit for every run
+//   print                 non-zero: print one row per setting and a legend
+//
+// The scanned static is overwritten during the scan and set to a fixed
+// value afterwards (0.8 for defaultAnnRate, 2.0 for defaultMultiple).
+// NOTE(review): 0.8 differs from the 0.6 that defaultAnnRate is initialized
+// with; confirm which one is intended.
+//
+// Each setting now uses its own StatVar objects in both scan types, so the
+// printed statistics of typ 10 no longer accumulate over earlier settings.
+double RRTOptimization::optimizeValue(Problem &p,int proParameter,int numParameter,int typ,
+                                      int optimierungsschritte,int print)
+{
+  switch(typ)
+    {
+    case 1:
+      {
+        double bestPar=-1,best=1e100;
+        if( print )
+          cout << "#RRT-optimizeValues: Quantil: " << numParameter << endl;
+        for(int i=0;i<=numParameter;i++)
+          {
+            StatVar end,laufzeit,init;
+            double now;
+            if(i==0) defaultAnnRate=0.2;
+            else defaultAnnRate = 0.3+(float)(0.6*i)/numParameter;
+            solveProblem(0,p,proParameter,optimierungsschritte,RRT_OPT,now,
+                         end,laufzeit,init);
+            if( best>now )
+              {
+                best=now;
+                bestPar=defaultAnnRate;
+              }
+            if( print )
+              printRRTResultRow(defaultAnnRate,end,laufzeit,now);
+          }
+        if( print )
+          cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit "
+            "Bester Sigma SigmaSmaller SigmaBigger\n";
+        defaultAnnRate=0.8;
+        return bestPar;
+      }
+    case 10:
+      {
+        double bestPar=-1,best=1e100;
+
+        if( print )
+          cout << "#RRT-optimizeValues: defaultMultiple" << 8 << endl;
+        for(double i=0.5;i<=10;i+=1.5)
+          {
+            // Fresh statistics for every setting.
+            StatVar end,laufzeit,init;
+            double now;
+            defaultMultiple = i;
+            solveProblem(0,p,proParameter,optimierungsschritte,RRT_OPT,now,
+                         end,laufzeit,init);
+            if( best>now )
+              {
+                best=now;
+                bestPar=defaultMultiple;
+              }
+            if( print )
+              printRRTResultRow(defaultMultiple,end,laufzeit,now);
+          }
+        if( print )
+          cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit "
+            "Bester Sigma SigmaSmaller SigmaBigger\n";
+        defaultMultiple=2.0;
+        return bestPar;
+      }
+    default:
+      cerr << "Error: wrong parameter-type in RRTOptimization::optimizeValue ("
+           << typ << ")\n";
+      exit(1);
+    }
+  return 1e100;
+}
+
+
diff --git a/mkcls-v2/RRTOptimization.h b/mkcls-v2/RRTOptimization.h
new file mode 100644
index 0000000..42ec6e2
--- /dev/null
+++ b/mkcls-v2/RRTOptimization.h
@@ -0,0 +1,79 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef RRTOPTIMIZATION
+#define RRTOPTIMIZATION
+#include "IterOptimization.h"
+
+// Record-to-record-travel optimizer: accepts a move as long as the new
+// value stays within `deviation` of the best value seen so far.
+class RRTOptimization : public IterOptimization {
+
+
+ private:
+ // Allowed distance above the record; negative until it has been set.
+ double deviation;
+ // Amount by which abkuehlen() lowers the deviation.
+ double deltaDeviation;
+ // Best value seen so far; set in zInitialize(), updated in accept().
+ double record;
+
+ protected:
+ // Derives deviation and deltaDeviation if none were given and sets the
+ // record to the current problem value.
+ virtual void zInitialize();
+
+
+ // Returns 1 if a move with value difference delta is accepted, else 0.
+ virtual short accept(double delta);
+
+
+ // Lowers the deviation by deltaDeviation, not below 0.
+ virtual void abkuehlen();
+
+
+ // Returns non-zero once endFlag>0 and the deviation is 0.
+ virtual short end();
+
+
+ // Writes the base-class output followed by the deviation.
+ virtual void makeGraphOutput();
+
+
+ public:
+ // Explicit schedule: initial deviation and per-step decrement.
+ RRTOptimization(Problem &p,double temperatur,
+ double deltaTemperatur,int maxIter=-1);
+
+
+ // Automatic schedule determined in zInitialize().
+ RRTOptimization(Problem &p,int maxIter=-1);
+
+
+ RRTOptimization(RRTOptimization &o);
+
+
+ // Scans defaultAnnRate (typ 1) or defaultMultiple (typ 10) and returns
+ // the setting with the smallest mean final value.
+ static double optimizeValue(Problem &p,int proParameter,
+ int numParameter,int typ,int schritte= -1,int verbose=1);
+
+
+ // Quantile used for the initial deviation.
+ static double defaultAnnRate;
+
+ // Multiplier for the expected number of iterations.
+ static double defaultMultiple;
+
+};
+
+#endif
diff --git a/mkcls-v2/SAOptimization.cpp b/mkcls-v2/SAOptimization.cpp
new file mode 100644
index 0000000..6ae589a
--- /dev/null
+++ b/mkcls-v2/SAOptimization.cpp
@@ -0,0 +1,280 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include <stdlib.h>
+#include <iostream>
+
+#include "SAOptimization.h"
+
+#include "ProblemTest.h"
+
+// Cooling factor that zInitialize() assigns to `alpha` and uses to compute
+// the number of steps per temperature level.
+#define ALPHA 0.95
+
+// Rate from which zInitialize() derives the start temperature.
+double SAOptimization::defaultAnfAnnRate=0.9;
+// Rate from which zInitialize() derives the end temperature.
+double SAOptimization::defaultEndAnnRate=1e-9;
+// Factor applied to problem.expectedNumberOfIterations() when no step limit
+// was given.
+double SAOptimization::defaultMultiple=2.0;
+
+
+
+// Automatic schedule: the negative temperature tells zInitialize() to
+// derive all schedule parameters from the problem.
+SAOptimization::SAOptimization(Problem &p,int m)
+  : IterOptimization(p,m),
+    temperatur(-1)
+{
+}
+
+
+
+
+// Explicit schedule: start temperature `t`, multiplied by `a` every `s`
+// steps.  Requires a<1, s>0 and t>0.
+// NOTE(review): endTemperatur and stepsForAbkuehlung are not set on this
+// path, and zInitialize() only assigns them when temperatur<0, yet end()
+// and abkuehlen() read them - confirm the intended values.
+SAOptimization::SAOptimization(Problem &p,double t,double a,int s,int m)
+  : IterOptimization(p,m),
+    temperatur(t),
+    alpha(a),
+    schrittzahl(s)
+{
+  assert(alpha<1);
+  assert(schrittzahl>0);
+  assert(t>0);
+}
+
+
+// Copies the base-class state and all five schedule members of `o`.
+SAOptimization::SAOptimization(SAOptimization &o)
+  : IterOptimization(o),
+    temperatur(o.temperatur),
+    endTemperatur(o.endTemperatur),
+    alpha(o.alpha),
+    schrittzahl(o.schrittzahl),
+    stepsForAbkuehlung(o.stepsForAbkuehlung)
+{
+}
+
+
+// Prepares a run.  If no temperature was given (temperatur<0), the schedule
+// is derived from sampled deteriorations: start and end temperature follow
+// from their mean and the two default rates, the cooling budget is 4/5 of
+// maxStep (or the expected iteration count times defaultMultiple when
+// maxStep is unset), and schrittzahl is the number of steps per temperature
+// level for cooling factor ALPHA.  Sampling moves the problem, so problem
+// and base class are re-initialized afterwards.
+void SAOptimization::zInitialize()
+{
+  IterOptimization::zInitialize();
+  if( !(temperatur<0) )
+    return;
+
+  StatVar *samples=&problem.deviationStatVar(*this,ANZ_VERSCHLECHTERUNGEN);
+
+  if( maxStep>0 )
+    stepsForAbkuehlung=(int)(maxStep*4.0/5.0);
+  else
+    {
+      stepsForAbkuehlung=(int)(problem.expectedNumberOfIterations()*
+                               defaultMultiple);
+      maxStep=stepsForAbkuehlung;
+    }
+
+  temperatur = samples->getMean()/log(1/defaultAnfAnnRate);
+  endTemperatur = samples->getMean()/log(1/defaultEndAnnRate);
+  schrittzahl = (int)(stepsForAbkuehlung/(log(endTemperatur/temperatur)/
+                                          log(ALPHA)));
+  if( schrittzahl==0 )
+    schrittzahl=1;
+  alpha = ALPHA;
+
+  if( verboseMode )
+    {
+      cout << "#Algorithm: Simulated Annealing(anfAnnRate="
+           << defaultAnfAnnRate <<",(endAnnRate=" << defaultEndAnnRate
+           << ",T0=" << temperatur<< ",Te=" << endTemperatur<< ",schrittzahl="
+           << schrittzahl<< ",stepsForAbkuehlung=" << stepsForAbkuehlung
+           << ")\n";
+    }
+  curStep=0;
+  endFlag=0;
+  delete samples;
+  problem.initialize();
+  IterOptimization::zInitialize();
+}
+
+// While the temperature is above the end temperature the current step is
+// recorded as bestStep.  The run is over once the end flag is set and the
+// temperature has dropped below the end temperature.
+short SAOptimization::end()
+{
+  if( temperatur>endTemperatur )
+    bestStep = curStep;
+  const bool finished=( endFlag>0 && temperatur<endTemperatur );
+  return finished ? 1 : 0;
+}
+// Cooling step for a non-negative temperature: multiply by alpha on every
+// schrittzahl-th step, and force the temperature to zero once the cooling
+// budget stepsForAbkuehlung is used up.
+void SAOptimization::abkuehlen()
+{
+  if( !(temperatur>=0) )
+    return;
+  if( curStep%schrittzahl == 0 )
+    temperatur=temperatur * alpha;
+  if( curStep> stepsForAbkuehlung)
+    temperatur = 0;
+}
+// Acceptance rule.  Everything is accepted while no temperature is set, and
+// so is every move that does not increase the value.  A deterioration is
+// rejected at temperature zero; otherwise it is accepted if a random number
+// from zufall01() (plus a small margin) does not exceed
+// exp(-delta/temperatur).
+short SAOptimization::accept(double delta)
+{
+  if( temperatur<0 )
+    return 1;
+  if( !(delta > 0) )
+    return 1;
+  if( temperatur==0 )
+    return 0;
+
+  double z=zufall01();
+  assert(z!=0.0);
+  if(z==0.0)
+    z+=1e-20;
+  const double e=exp(-delta/temperatur);
+
+  return z+0.000000000001<=e;
+}
+
+// Appends the current temperature to the base-class graph output.
+void SAOptimization::makeGraphOutput()
+{
+  IterOptimization::makeGraphOutput();
+  ostream &graph=*GraphOutput;
+  graph << temperatur;
+}
+
+
+
+
+// Prints one result row of the parameter scan: the parameter, statistics of
+// the final values and the run time, and the mean reported by solveProblem().
+static void printSAResultRow(double parameter,StatVar &end,StatVar &laufzeit,
+                             double now)
+{
+  cout << parameter << " ";
+  cout << end.getMean() << " " << end.quantil(0.2) << " "
+       << end.quantil(0.79) << " " << laufzeit.getMean() << " "
+       << end.quantil(0.0) << " " << end.getSigma() << " "
+       << end.getSigmaSmaller() << " " << end.getSigmaBigger()
+       << " " << now << endl;
+}
+
+// Scans one of the static defaults and returns the setting with the
+// smallest mean final value.
+//
+//   p                     problem to optimize
+//   proParameter          number of runs per parameter setting
+//   numParameter          number of settings scanned for typ 1 and 2
+//   typ                   1: defaultAnfAnnRate, 2: defaultEndAnnRate,
+//                         10: defaultMultiple; any other value prints an
+//                         error and exits
+//   optimierungsschritte  iteration limit for every run
+//   print                 non-zero: print one row per setting and a legend
+//
+// The scanned static is overwritten during the scan and set to a fixed
+// value afterwards (0.9, 1/10000.0 and 2.0 respectively).
+double SAOptimization::optimizeValue(Problem &p,int proParameter,int numParameter,
+                                     int typ,int optimierungsschritte,int print)
+{
+  double bestPar=-1;
+  double best=1e100;
+
+  if( typ==1 )
+    {
+      if( print )
+        cout << "#SA-optimizeValues: defaultAnfAnnRate" << endl;
+      for(int k=0;k<numParameter;k++)
+        {
+          StatVar end,laufzeit,init;
+          double now;
+          defaultAnfAnnRate=0.1 + (1.0/numParameter)*k;
+          solveProblem(0,p,proParameter,optimierungsschritte,SA_OPT,now,
+                       end,laufzeit,init);
+          if( best>now )
+            {
+              best=now;
+              bestPar=defaultAnfAnnRate;
+            }
+          if( print )
+            printSAResultRow(defaultAnfAnnRate,end,laufzeit,now);
+        }
+      if( print )
+        cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit "
+          "Bester Sigma SigmaSmaller SigmaBigger\n";
+      defaultAnfAnnRate=0.9;
+      return bestPar;
+    }
+
+  if( typ==2 )
+    {
+      if( print )
+        cout << "#Optimierung von SA: defaultEndAnnRate" << endl;
+      for(int k=1;k<=numParameter;k++)
+        {
+          StatVar end,laufzeit,init;
+          double now;
+          defaultEndAnnRate=1/(pow(10.0,k));
+          solveProblem(0,p,proParameter,optimierungsschritte,SA_OPT,now,end,
+                       laufzeit,init);
+          if( best>now )
+            {
+              best=now;
+              bestPar=defaultEndAnnRate;
+            }
+          if( print )
+            printSAResultRow(defaultEndAnnRate,end,laufzeit,now);
+        }
+      if( print )
+        cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit "
+          "Bester Sigma SigmaSmaller SigmaBigger\n";
+      defaultEndAnnRate=1/10000.0;
+      return bestPar;
+    }
+
+  if( typ==10 )
+    {
+      if( print )
+        cout << "#SA-optimizeValues: defaultMultiple " << 8 << endl;
+      for(int k=1;k<=6;k++)
+        {
+          StatVar end,laufzeit,init;
+          double now;
+          defaultMultiple = k;
+          solveProblem(0,p,proParameter,optimierungsschritte,SA_OPT,now,end,
+                       laufzeit,init);
+          if( best>now )
+            {
+              best=now;
+              bestPar=defaultMultiple;
+            }
+          if( print )
+            printSAResultRow(defaultMultiple,end,laufzeit,now);
+        }
+      if( print )
+        cout << "#Parameter Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit "
+          "Bester Sigma SigmaSmaller SigmaBigger\n";
+      defaultMultiple=2.0;
+      return bestPar;
+    }
+
+  cerr << "Error: wrong parameter-type in SAOptimization::optimizeValue ("
+       << typ << ")\n";
+  exit(1);
+  return 1e100;
+}
+
+
+
diff --git a/mkcls-v2/SAOptimization.h b/mkcls-v2/SAOptimization.h
new file mode 100644
index 0000000..97c528b
--- /dev/null
+++ b/mkcls-v2/SAOptimization.h
@@ -0,0 +1,86 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef SAOPTIMIZATION
+#define SAOPTIMIZATION
+#include "IterOptimization.h"
+
+class SAOptimization : public IterOptimization // "SA" iterative optimizer built on IterOptimization (presumably simulated annealing -- confirm in SAOptimization.cpp).
+ {
+
+
+ private:
+ double temperatur; // state of the cooling schedule; exact semantics live in SAOptimization.cpp
+ double endTemperatur;
+ double alpha;
+ int schrittzahl;
+ int stepsForAbkuehlung;
+
+ protected:
+ virtual void zInitialize(); // the hooks below are virtual members implemented in SAOptimization.cpp
+
+
+ virtual short accept(double delta);
+
+
+ virtual void abkuehlen();
+
+
+ virtual short end();
+
+
+ virtual void makeGraphOutput();
+
+
+ public:
+ SAOptimization(Problem &p,double temperatur,double alpha, // explicit schedule parameters
+ int schrittzahl,int maxIter=-1);
+
+
+ SAOptimization(Problem &p,int maxIter=-1); // presumably derives the schedule from the static defaults below -- TODO confirm
+
+
+ SAOptimization(SAOptimization &o); // copy constructor taking a non-const reference
+
+
+ static double optimizeValue(Problem &p,int proParameter, // parameter sweep: runs solveProblem() per candidate value and returns the best one;
+ int numParameter,int typ, // 'typ' selects which static default is swept (an unknown typ prints an error and exits)
+ int schritte= -1,int verbose=1);
+
+ // Tunables settable from the command line (mkcls.cpp setParameter, cases 1-3):
+ static double defaultAnfAnnRate; // "gamma_0 (SA)", checked to lie in [0,1]
+
+ static double defaultEndAnnRate; // "gamma_e (SA)", checked to lie in [0,1]
+
+ static double defaultMultiple; // "nu_e (SA)", checked to be > 0
+
+
+};
+#endif
+
diff --git a/mkcls-v2/StatVar.cpp b/mkcls-v2/StatVar.cpp
new file mode 100644
index 0000000..dbd76cd
--- /dev/null
+++ b/mkcls-v2/StatVar.cpp
@@ -0,0 +1,140 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#include "StatVar.h"
+#include <iostream>
+#include <stdlib.h>
+
+double compareStatVarQuantil=-1; // Sort key selector for compareStatVar(): a value >=0 is passed to StatVar::quantil(); a negative value (the default) means "compare by mean".
+
+StatV::~StatV() {} // Out-of-line body of the virtual destructor declared in StatVar.h; nothing to release.
+
+
+// qsort() comparator for arrays of double: 0 when equal, -1 when *p sorts
+// before *j, +1 otherwise.
+int doublecompare(const void *p,const void *j)
+{
+  const double lhs = *static_cast<const double *>(p);
+  const double rhs = *static_cast<const double *>(j);
+  if( lhs == rhs )
+    return 0;
+  return ( lhs - rhs < 0 ) ? -1 : 1;
+}
+
+// qsort()-style comparator for StatVar objects.  The key is the quantile
+// selected by the global compareStatVarQuantil when that is >=0, otherwise
+// the mean.  Returns -1 / 0 / +1.
+int compareStatVar(const void *p,const void *j)
+{
+  StatVar *lhs = (StatVar *)p;
+  StatVar *rhs = (StatVar *)j;
+  double a;
+  double b;
+  if( !(compareStatVarQuantil>=0) )
+    {
+      a = lhs->getMean();
+      b = rhs->getMean();
+    }
+  else
+    {
+      a = lhs->quantil(compareStatVarQuantil);
+      b = rhs->quantil(compareStatVarQuantil);
+    }
+  if( a==b )
+    return 0;
+  return ( a<b ) ? -1 : +1;
+}
+
+
+// Root-mean-square deviation from the mean, taken over only those samples
+// that lie strictly below the mean.  Returns 0 when there is no such sample.
+double StatVar::getSigmaSmaller()
+{
+  const double mean=getMean();   // loop-invariant, computed once
+  double ss=0;
+  int ns=0;
+  for(int i=0;i<n;i++)
+    {
+      if( values[i]<mean )
+        {
+          const double d=values[i]-mean;
+          ss+=d*d;
+          ns++;
+        }
+    }
+  if( ns==0 )                    // explicit guard instead of relying on 0.0/0 -> NaN
+    return 0;
+  const double variance=ss/ns;
+  return ( variance>0 ) ? sqrt(variance) : 0;
+}
+// Root-mean-square deviation from the mean, taken over only those samples
+// that lie strictly above the mean.  Returns 0 when there is no such sample.
+double StatVar::getSigmaBigger()
+{
+  const double mean=getMean();   // loop-invariant, computed once
+  double ss=0;
+  int ns=0;
+  for(int i=0;i<n;i++)
+    {
+      if( values[i]>mean )
+        {
+          const double d=values[i]-mean;
+          ss+=d*d;
+          ns++;
+        }
+    }
+  if( ns==0 )                    // explicit guard instead of relying on 0.0/0 -> NaN
+    return 0;
+  const double variance=ss/ns;
+  return ( variance>0 ) ? sqrt(variance) : 0;
+}
+
+
+
+// Writes a one-line summary (mean, observed range, sigma) to strm; no newline.
+void StatV::dumpOn(ostream &strm)
+{
+  strm << "MEAN: " << getMean();
+  strm << " (" << smallest << "-" << biggest;
+  strm << ") SIGMA:" << getSigma();
+  strm << " ";
+}
+
+
+
+double StatVar::quantil(double percent) // Returns the sample at position floor(n*percent) of the ascending-sorted values (percent=1 maps to the last sample).
+{
+ int index=(int)(n*percent);
+ if(index==n) // percent==1.0 would index one past the end; clamp to the last sample (for n==0 this yields -1)
+ index=n-1;
+ assert(index>=0&&index<n);
+ if(sortedFlag==0) // sort lazily; addValue() clears the flag again
+ {
+ qsort(values.getPointerToData(),n,sizeof(double),doublecompare);
+ assert(n<=values.size());
+ sortedFlag=1;
+ }
+ if(index<0) // only reachable when the assert above is compiled out
+ {
+ cerr << "WARNING: StatVar.cc\n";
+ return 0.0;
+ }
+ else
+ return values[index];
+}
+
+
+// Prints the empirical distribution to strm: one "i/n value" line per sample,
+// in ascending order.  Sorts the stored samples in place.
+void StatVar::printValues(ostream &strm)
+{
+  assert(n<=values.size());   // check the bound before qsort touches n elements
+  qsort(values.getPointerToData(),n,sizeof(double),doublecompare);
+  sortedFlag=1;               // remember the order so quantil() need not re-sort
+  for(int i=0;i<n;i++)
+    strm << i/(double)n << " " << values[i] << endl;
+}
diff --git a/mkcls-v2/StatVar.h b/mkcls-v2/StatVar.h
new file mode 100644
index 0000000..bdf1e19
--- /dev/null
+++ b/mkcls-v2/StatVar.h
@@ -0,0 +1,134 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#ifndef STATVAR_H
+#define STATVAR_H
+
+#include <stdlib.h>
+#include <iostream>
+#include "Array.h"
+#include "mystl.h"
+#include "myleda.h"
+#include <cmath>
+
+
+extern double compareStatVarQuantil;
+int compareStatVar(const void *p,const void *j);
+
+class StatV // Running statistics over a stream of doubles: count, sum, sum of squares, min and max.
+
+{
+ protected:
+ int n; // number of values added so far
+ double sum;
+ double squareSum;
+ double smallest,biggest;
+
+ public:
+ const char *title; // label; initialised to "" and not otherwise used in this class
+ StatV() : n(0),sum(0),squareSum(0),smallest(1e100),biggest(-1e100),title("") {}
+ virtual ~StatV();
+
+
+ virtual void addValue(double a) // folds one sample into the running totals and the min/max
+ {
+ n++;
+ sum+=a;
+ squareSum+=a*a;
+ if(smallest>a)
+ smallest=a;
+ if(biggest<a)
+ biggest=a;
+
+ }
+
+
+ double getMean() // NOTE(review): divides by n without a guard; with n==0 this is 0.0/0
+ { return sum/n; }
+
+
+ double getSigma() // sqrt(E[x^2]-E[x]^2), with non-positive variance mapped to 0
+ {
+ if(squareSum/n - getMean()*getMean()<=0)
+ return 0.0;
+ else
+ return sqrt(squareSum/n - getMean()*getMean());
+ }
+
+
+
+ double getBiggest() // stays at -1e100 until a value has been added
+ { return biggest; }
+
+
+ double getSmallest() // stays at 1e100 until a value has been added
+ { return smallest; }
+
+
+ int getNum()
+ { return n; }
+
+
+ void dumpOn(ostream &strm); // defined in StatVar.cpp
+
+
+};
+
+class StatVar : public StatV // StatV that additionally stores every sample so that quantiles can be computed.
+{
+ private:
+ Array<double> values; // the samples, in insertion order until sorted
+ short sortedFlag; // 1 once quantil() has sorted 'values'; cleared by addValue()
+ public:
+ StatVar()
+ : values(10,0.0,1),sortedFlag(0) {} // NOTE(review): presumably (initial size, fill value, auto-grow flag) -- confirm in Array.h
+ virtual ~StatVar(){}
+ double quantil(double percent=0.5); // default 0.5 selects the median position
+
+
+ inline double value(int i)
+ {return values[i];}
+
+
+ void printValues(ostream &strm);
+
+
+ virtual void addValue(double a) // stores the sample at index n, then updates the base-class totals (which increments n)
+ {
+ sortedFlag=0;
+ values[n]=a; // assumes Array grows on out-of-range write -- TODO confirm
+ StatV::addValue(a);
+ }
+
+ double getSigmaSmaller(); // defined in StatVar.cpp
+
+
+ double getSigmaBigger(); // defined in StatVar.cpp
+
+
+};
+
+
+#endif
diff --git a/mkcls-v2/TAOptimization.cpp b/mkcls-v2/TAOptimization.cpp
new file mode 100644
index 0000000..074ff62
--- /dev/null
+++ b/mkcls-v2/TAOptimization.cpp
@@ -0,0 +1,208 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "TAOptimization.h"
+#include "ProblemTest.h"
+
+
+double TAOptimization::defaultAnnRate=0.4; // quantile handed to StatVar::quantil() in zInitialize() to pick the start threshold
+double TAOptimization::defaultMultiple=2.0; // multiplies problem.expectedNumberOfIterations() in zInitialize() when no step limit was given
+
+
+// Construct with an explicit schedule: start threshold t, per-step decrement
+// d, and step limit m (forwarded to IterOptimization).  Both t and d must be
+// strictly positive.
+TAOptimization::TAOptimization(Problem &p,double t,double d,int m)
+  : IterOptimization(p,m),
+    temperatur(t),
+    deltaTemperatur(d)
+{
+  assert(t>0 && d>0);
+}
+
+
+
+// Construct without an explicit schedule.  temperatur=-1 is the marker that
+// makes zInitialize() derive the threshold and decrement itself.
+// deltaTemperatur is given a defined value so that copying the object before
+// zInitialize() has run does not read an indeterminate double.
+TAOptimization::TAOptimization(Problem&p,int m)
+: IterOptimization(p,m), temperatur(-1), deltaTemperatur(0)
+{
+}
+
+
+
+// Copy constructor: base-class state plus the current threshold and decrement.
+TAOptimization::TAOptimization(TAOptimization &o)
+  : IterOptimization(o),
+    temperatur(o.temperatur),
+    deltaTemperatur(o.deltaTemperatur)
+{
+}
+
+
+
+
+void TAOptimization::zInitialize() // Base initialisation, then (only when no schedule was given) derive threshold and decrement from the problem.
+{
+ IterOptimization::zInitialize();
+ if( temperatur<0) // negative threshold marks "not configured" (set by the two-argument constructor)
+ {
+
+
+ int n; // number of steps over which the threshold is reduced to 0
+
+ StatVar &v=problem.deviationStatVar(*this,ANZ_VERSCHLECHTERUNGEN); // returned object is released via 'delete &v' below
+
+ if(maxStep>0)
+ n=(int)(maxStep*4.0/5.0); // use 4/5 of the given step limit for cooling
+ else
+ maxStep=n=(int)(problem.expectedNumberOfIterations()*defaultMultiple);
+
+ temperatur = v.quantil(defaultAnnRate);
+ deltaTemperatur = temperatur/n; // NOTE(review): n can be 0 if maxStep is 1 or the expected iteration count is 0
+
+ if( verboseMode>0 )
+ cout << "#TA: (anfAnnRate="
+ << defaultAnnRate << ",T=" << temperatur << ",deltaT="
+ << deltaTemperatur << ")\n";
+ curStep=0;
+ endFlag=0;
+ delete &v; // assumes deviationStatVar() hands back a heap object owned by the caller -- TODO confirm
+ }
+}
+
+
+// Termination test.  While the threshold is still positive the end flag is
+// reset and bestStep is moved to the current step; the result is whether
+// endFlag is positive afterwards.
+short TAOptimization::end()
+{
+  const bool thresholdActive = temperatur>0;
+  if( thresholdActive )
+    {
+      endFlag=0;
+      bestStep=curStep;
+    }
+  return endFlag>0;
+}
+
+// Acceptance rule: every move is accepted while the threshold is negative
+// (not yet initialised); otherwise a move is accepted only when delta is
+// strictly below the current threshold.
+short TAOptimization::accept(double delta)
+{
+  if( temperatur<0 )
+    return 1;
+  return ( delta < temperatur ) ? 1 : 0;
+}
+
+// Cooling step: lowers the threshold by deltaTemperatur, never below 0.
+// Does nothing unless the threshold is currently >= 0.
+void TAOptimization::abkuehlen()
+{
+  if( !(temperatur>=0) )
+    return;
+  const double lowered = temperatur-deltaTemperatur;
+  if( lowered>0 )
+    temperatur = lowered;
+  else
+    temperatur = 0;
+}
+
+// Graph trace: the base class writes its part first, then the current
+// threshold is appended to the same stream.
+void TAOptimization::makeGraphOutput()
+{
+  this->IterOptimization::makeGraphOutput();
+  (*GraphOutput) << this->temperatur;
+}
+
+
+
+
+double TAOptimization::optimizeValue(Problem &p,int proParameter,int numParameter,int typ, // Parameter sweep: runs solveProblem() once per candidate value of a static default and returns the candidate with the smallest 'now'.
+ int optimierungsschritte,int print) // typ 1 sweeps defaultAnnRate, typ 10 sweeps defaultMultiple; any other typ prints an error and exits.
+{
+ switch(typ)
+ {
+ case 1: // sweep defaultAnnRate over i/numParameter for i=0..numParameter
+ {
+ double bestPar=-1,best=1e100;
+ if(print)cout << "#TA-optimizeValues: " << numParameter << endl;
+ for(int i=0;i<=numParameter;i++)
+ {
+ StatVar end,laufzeit,init;
+ double now;
+ defaultAnnRate = (float)(i)/numParameter; // NOTE(review): numParameter==0 makes this 0/0
+ solveProblem(0,p,proParameter,optimierungsschritte,TA_OPT,now,end,
+ laufzeit,init);
+ if( best>now ) // smaller 'now' is better
+ {
+ best=now;
+ bestPar=defaultAnnRate;
+ }
+ if( print)
+ {
+ cout << defaultAnnRate << " ";
+ cout << end.getMean() << " " << end.quantil(0.2) << " "
+ << end.quantil(0.79) << " " << laufzeit.getMean() << " "
+ << end.quantil(0.0) << " " << end.getSigma() << " "
+ << end.getSigmaSmaller() << " " << end.getSigmaBigger()
+ << " " << now << endl;
+ }
+ }
+ if( print )
+ cout << "#Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit Bester"
+ " Sigma SigmaSmaller SigmaBigger\n";
+ defaultAnnRate=0.5; // NOTE(review): the static initialiser in this file is 0.4, so this does not restore the original default
+ return bestPar;
+ }
+ break;
+ case 10: // sweep defaultMultiple over 1..6
+ {
+ double bestPar=-1,best=1e100;
+ if( print )
+ cout << "#TA-optimizeValues: defaultMultiple " << 10 << endl;
+ for(int i=1;i<=6;i++)
+ {
+ StatVar end,laufzeit,init;
+ double now;
+ defaultMultiple = i;
+ solveProblem(0,p,proParameter,optimierungsschritte,TA_OPT,now,
+ end,laufzeit,init);
+ if( best>now )
+ {
+ best=now;
+ bestPar=defaultMultiple;
+ }
+ if( print )
+ {
+ cout << defaultMultiple << " ";
+ cout << end.getMean() << " " << end.quantil(0.2) << " "
+ << end.quantil(0.79) << " " << laufzeit.getMean() << " "
+ << end.quantil(0.0) << " " << end.getSigma() << " "
+ << end.getSigmaSmaller() << " " << end.getSigmaBigger()
+ << " " << now << endl;
+ }
+ }
+ if( print )
+ cout << "#Mittelwert 0.2-Quantil 0.8-Quantil Laufzeit Bester Sigma "
+ " SigmaSmaller SigmaBigger\n";
+ defaultMultiple=2.0; // back to the value of the static initialiser
+ return bestPar;
+ }
+ break;
+ default:
+ cerr << "Error: wrong parameter-type in TAOptimization::optimizeValue ("
+ << typ << ")\n";
+ exit(1);
+ }
+ return 1e100; // not reached: both cases return and the default branch exits
+}
+
+
diff --git a/mkcls-v2/TAOptimization.h b/mkcls-v2/TAOptimization.h
new file mode 100644
index 0000000..3382306
--- /dev/null
+++ b/mkcls-v2/TAOptimization.h
@@ -0,0 +1,78 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+#ifndef TAOPTIMIZATION
+#define TAOPTIMIZATION
+
+#include "IterOptimization.h"
+
+class TAOptimization : public IterOptimization { // "TA" iterative optimizer: accepts a move when its delta is below a threshold that is lowered linearly (see TAOptimization.cpp).
+
+
+ private:
+ double temperatur; // current acceptance threshold; negative means "not initialised yet"
+ double deltaTemperatur; // amount subtracted from the threshold by abkuehlen()
+
+ protected:
+ virtual void zInitialize(); // derives threshold and decrement when none were given
+
+
+ virtual short accept(double delta); // 1 if the move is accepted, 0 otherwise
+
+
+ virtual void abkuehlen(); // lowers the threshold, clamped at 0
+
+
+ virtual short end(); // non-zero when the optimisation should stop
+
+
+ virtual void makeGraphOutput(); // base-class output followed by the current threshold
+
+
+ public:
+ TAOptimization(Problem &p,double temperatur, // explicit schedule; both values must be > 0 (asserted)
+ double deltaTemperatur,int maxIter=-1);
+
+
+ TAOptimization(Problem &p,int maxIter=-1); // schedule is derived in zInitialize()
+
+
+ TAOptimization(TAOptimization &o); // copy constructor taking a non-const reference
+
+
+ static double optimizeValue(Problem &p,int proParameter, // parameter sweep; typ 1 = defaultAnnRate, typ 10 = defaultMultiple
+ int numParameter,int typ,int schritte= -1,int verbose=1);
+
+
+ static double defaultAnnRate; // "gamma_{TA}" in mkcls.cpp setParameter, checked to lie in [0,1]
+
+ static double defaultMultiple; // "nu_{TA}" in mkcls.cpp setParameter, checked to be > 0
+
+};
+#endif
diff --git a/mkcls-v2/general.cpp b/mkcls-v2/general.cpp
new file mode 100644
index 0000000..ddd5fe4
--- /dev/null
+++ b/mkcls-v2/general.cpp
@@ -0,0 +1,120 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+#include <stdlib.h>
+#include <stdio.h>
+
+
+extern "C" {
+#include <sys/time.h>
+#include <sys/resource.h>
+
+
+}
+
+#include "general.h"
+
+extern "C" { // fallback prototype for platforms whose headers do not declare getrusage()
+#ifndef __linux__
+int getrusage(int who, struct rusage *rusage);
+#endif
+};
+int verboseMode=0; // global verbosity level; set by the -v option in mkcls.cpp
+
+#ifdef aNeXT
+#define NO_TEMPLATES
+#endif
+
+
+// Reports a failed assertion (expression text plus source position) on cerr.
+// Only prints; it does not terminate the program.
+void myerror(int line,const char *file,const char *expression)
+{
+  cerr << "(general.h):Assertion failed: '" << expression;
+  cerr << "' ::: b " << file;
+  cerr << ":" << line << endl;
+}
+
+
+// Reports a failed input/usage check (expression text plus source position)
+// on cerr.  Only prints; it does not terminate the program.
+void imyerror(int line,const char *file,const char *expression)
+{
+  cerr << "Error: '" << expression;
+  cerr << "' ::: in Source " << file;
+  cerr << ":" << line << endl;
+}
+
+
+
+// Seeds the random generator used by zufall01(): srand48() everywhere except
+// on NeXT, where srandom() is used.
+void zufallSeed(int z)
+{
+#ifndef NeXT
+  srand48(z);
+#else
+  srandom(z);
+#endif
+}
+
+
+
+// Next pseudo-random double in [0,1): drand48() everywhere except on NeXT,
+// where the low 16 bits of random() are scaled instead.
+double zufall01()
+{
+#ifndef NeXT
+  return drand48();
+#else
+  return (double)(random()%65536)/65536.0;
+#endif
+}
+
+
+
+// Pseudo-random double in [min,max), obtained by scaling zufall01().
+double zufall(double min,double max)
+{
+  const double unit = zufall01();
+  double z = unit*(max-min)+min;
+  assert(z>=min&&z<max);
+  return z;
+}
+
+
+
+// Pseudo-random integer in [0,exclusive), by truncating zufall(0,exclusive).
+int randomInt(int exclusive)
+{
+  const double drawn = zufall(0,exclusive);
+  const int i = static_cast<int>(drawn);
+  assert(i>=0);
+  assert(i<exclusive);
+  return i;
+}
+
+// User-mode CPU time consumed by this process so far, in seconds.
+// RUSAGE_SELF is passed directly: the former Linux-only branch named the
+// glibc-internal type 'enum __rusage_who', which other C libraries do not
+// provide.
+double clockSec()
+{
+  struct rusage usage;
+  getrusage(RUSAGE_SELF, &usage);
+  return usage.ru_utime.tv_sec+usage.ru_utime.tv_usec/1000000.0;
+}
diff --git a/mkcls-v2/general.h b/mkcls-v2/general.h
new file mode 100644
index 0000000..8db48aa
--- /dev/null
+++ b/mkcls-v2/general.h
@@ -0,0 +1,89 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+
+
+
+
+#ifndef GENERAL_HEADER
+#define GENERAL_HEADER
+
+#include <iostream>
+#ifdef NeXT
+#include <minmax.h>
+#endif
+#include <string.h>
+
+
+
+// Tolerance-based floating point tests.  NULLFLOAT: |x| <= 1e-7.
+// EQUALFLOAT: |x-y| is below 1e-7 times (|x|+|y|).  The operands of the
+// subtraction are parenthesized so that compound arguments such as
+// EQUALFLOAT(a,b+c) expand to a-(b+c) rather than a-b+c.
+#define NULLFLOAT(x) ( fabs(x)<=0.0000001 )
+#define EQUALFLOAT(x,y) ( fabs((x)-(y))<(fabs(x)+fabs(y))/10000000.0 )
+
+
+
+
+#define TEST_RANDOM_SEED 532567487 // fixed default seed used when zufallSeed() is called without an argument
+
+double zufall01(); // pseudo-random double in [0,1)
+
+
+double zufall(double min,double max); // pseudo-random double in [min,max)
+
+
+int randomInt(int exclusive); // pseudo-random int in [0,exclusive)
+
+
+void zufallSeed(int z =TEST_RANDOM_SEED); // seeds the generator behind zufall01()
+
+
+
+
+#include "myassert.h"
+#include <cassert>
+#include "Array.h"
+
+
+
+
+
+
+double clockSec(); // user CPU time of this process in seconds (via getrusage)
+
+extern int verboseMode; // defined in general.cpp, initial value 0
+
+
+
+// String concatenation spelled with '&': returns a followed by b.
+inline string operator&(const string&a,const string&b)
+{
+  return a+b;
+}
+
+
+
+#endif
+
diff --git a/mkcls-v2/makePackage.sh b/mkcls-v2/makePackage.sh
new file mode 100644
index 0000000..2790e61
--- /dev/null
+++ b/mkcls-v2/makePackage.sh
@@ -0,0 +1,43 @@
+#! /bin/csh
+#
+# Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+#
+# mkcls - a program for making word classes .
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+# USA.
+#
+# Builds the mkcls-v2 release directory from the working sources, packs it
+# into mkcls.<date>.tar.gz and test-compiles the result.
+
+setenv VERSION `date +%Y-%m-%d`
+rm -rf mkcls-v2
+
+mkdir mkcls-v2
+foreach i ( Array.h FixedArray.h FlexArray.h GDAOptimization.C GDAOptimization.h HCOptimization.C HCOptimization.h IterOptimization.C IterOptimization.h KategProblem.C KategProblem.h KategProblemKBC.C KategProblemKBC.h KategProblemTest.C KategProblemTest.h KategProblemWBC.C KategProblemWBC.h MSBOptimization.C MSBOptimization.h MYOptimization.C MYOptimization.h Optimization.C Optimization.h PopOptimization.C PopOptimization.h Problem.C Problem.h ProblemTest.C ProblemTest.h RRTOptimization.C RRTOptimization.h SAOptimization.C SAOptimization.h StatVar.C StatVar.h TAOptimization.C TAOptimization.h general.C general.h makePackage.sh mkcls.C my.h myassert.h myleda.h mystl.h )
+ cat $i | filterIfdef.out NO_LIGHT_GIZA | filterIfdefInverse.out DEBUG | filterIfdefInverse.out DEBUG_TRICKY_IBM3 | filterIfdefInverse.out VDEBUG | stripcmt | addHead.out -file header > mkcls-v2/$i
+end
+
+cp Makefile.simple mkcls-v2/Makefile
+cp ../giza++/GNU.GPL mkcls-v2
+cp ../giza++/LICENSE mkcls-v2
+cp README mkcls-v2
+
+tar cf - mkcls-v2 | gzip -9 > mkcls.$VERSION.tar.gz
+
+cd mkcls-v2
+gmake -k
+cd ..
+
+
diff --git a/mkcls-v2/mkcls.cpp b/mkcls-v2/mkcls.cpp
new file mode 100644
index 0000000..90ebfde
--- /dev/null
+++ b/mkcls-v2/mkcls.cpp
@@ -0,0 +1,618 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include <stdio.h>
+#include <iostream>
+#include <stdlib.h>
+#include <ctype.h>
+#include "general.h"
+
+#include "KategProblem.h"
+#include "KategProblemTest.h"
+
+#include "ProblemTest.h"
+#include "TAOptimization.h"
+#include "GDAOptimization.h"
+#include "RRTOptimization.h"
+#include "SAOptimization.h"
+#include "HCOptimization.h"
+
+
+double SigmaVerfaelschung=5.0; // set by -y<value>; must be > 0
+int OneWithHapas=1; // set by -O<int>; passed to wordFreq.initializeIndex() in setKorpus()
+char *hapaxInitName=0; // set by -h<file>; when non-null, main() loads it via fromCatFile(p,name,0)
+
+
+
+
+
+static int nLaeufe=1,nLaeufeReduce=3; // -n<num> sets both to the same value
+
+
+static int optimizeParameterAnzahl=10; // -N<num>; candidate count passed to the optimizeValue() sweeps
+
+
+static int IterOptVerf=TA_OPT; // -a<name>; selected iterative optimisation method
+
+
+static int MaxIterOptSteps= -1; // -M<num>
+
+
+static int MaxSecs=0; // -s<num>; 0 disables the time limit (see makeIterOpt)
+
+
+
+
+
+static int InitValue=INIT_RAN; // -i<name>
+
+
+static int Criterion=CRITERION_ML; // -l sets 1, -y sets 2
+
+
+static int Wwahl=W_DET_DECR; // -w<name>; OR-ed with Kwahl when the problem is created
+
+
+static int Kwahl=K_BEST; // -k<name>
+
+
+static int NumberCategories=100; // -c<num>; must be >= 2
+
+
+static int MinWordFrequency=0; // -m<num>
+
+
+static int IterOptSet=0; // becomes 1 once setParameter() ran; 0 lets setKorpus() call KategProblemSetParameters()
+
+
+static KategProblem *p = 0; // the problem instance, created in setKorpus()
+
+
+char korpusName[1024]="train"; // -p<file> / -P<file>
+int korpusIsText=1; // cleared by -P: the corpus is then read with fromNgrFile() instead of fromKModel()
+
+
+char *FileForOther=0; // file name that follows "-iother"
+
+// Prints the command line help to cout and terminates the process with exit
+// status r.
+void printUsage(int r)
+{
+  static const char usageText[] =
+    "mkcls - a program for making word classes: Usage: \n"
+    " mkcls [-nnum] [-ptrain] [-Vfile] opt\n"
+    "-V output classes (Default: no file)\n"
+    "-n number of optimization runs (Default: 1); larger number => better results\n"
+    "-p filename of training corpus (Default: 'train')\n"
+    "Example:\n"
+    " mkcls -c80 -n10 -pin -Vout opt\n"
+    " (generates 80 classes for the corpus 'in' and writes the classes in 'out')\n"
+    "Literature: \n"
+    " Franz Josef Och: »Maximum-Likelihood-Schätzung von Wortkategorien mit Verfahren\n"
+    " der kombinatorischen Optimierung« Studienarbeit, Universität Erlangen-Nürnberg,\n"
+    " Germany,1995. \n";
+  cout << usageText;
+  exit(r);
+}
+
+
+
+
+
+
+
+// Runs the selected iterative optimisation nLaeufe times on the global
+// problem.  A MaxSecs of 0 means "no time limit" and is passed on as a
+// deadline of 0.
+void makeIterOpt()
+{
+  const double now=clockSec();
+  double maxTime=0;
+  if( MaxSecs!=0 )
+    maxTime=now+MaxSecs;
+  const int verbosity = (PrintBestTo!=0) ? 2 : 1;
+  double mean;
+  StatVar end,laufzeit,init;
+  solveProblem(verbosity,*p,nLaeufe,MaxIterOptSteps,IterOptVerf,
+               mean,end,laufzeit,init,maxTime);
+  if( verboseMode>1 )
+    p->dumpOn(cout);
+}
+
+
+
+// Runs izrOptimization() on the global problem with nLaeufeReduce for both
+// run-count arguments.  A MaxSecs of 0 is passed on as a deadline of 0.
+void makeIzrOpt()
+{
+  const double now=clockSec();
+  double maxTime=0;
+  if( MaxSecs!=0 )
+    maxTime=now+MaxSecs;
+  izrOptimization(*p,nLaeufeReduce,nLaeufeReduce,0,maxTime,IterOptVerf);
+}
+
+
+
+int makeMetaOpt(int argc,char **argv) // "meta-opt" mode: runs one optimizeValue() parameter sweep, chosen by argv[2] (and optionally argv[3]). Always returns 0.
+{
+ int ret=0;
+
+ if(argc==4 || argc==3) // argv[1] is "meta-opt", argv[2] the selector, argv[3] an optional type
+ {
+ int typ=0;
+ if( argc==4 )
+ {
+ sscanf(argv[3],"%d",&typ);
+ assert(typ>0 && typ<=11 );
+ }
+ if( isdigit(argv[2][0]) ) // numeric selector: fixed (algorithm, typ) pairs; 'typ' from argv[3] is not used here
+ {
+ int a;
+ sscanf(argv[2],"%d",&a);
+ switch(a)
+ {
+ case 1:
+ SAOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,1);
+ break;
+ case 2:
+ SAOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,2);
+ break;
+ case 3:
+ SAOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,10);
+ break;
+ case 4:
+ TAOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,1);
+ break;
+ case 5:
+ TAOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,10);
+ break;
+ case 6:
+ RRTOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,1);
+ break;
+ case 7:
+ RRTOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,10);
+ break;
+ case 8:
+ GDAOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,1);
+ break;
+ default:
+ cerr << "Error: Wrong number of parameter (" << argv[2]
+ << ").\n";
+ printUsage(1);
+ }
+ }
+ else // named selector: the algorithm comes from argv[2], the sweep type from argv[3]
+ { // NOTE(review): with argc==3 'typ' stays 0, which the TA and SA optimizeValue() reject with an error exit
+ if(strcasecmp(argv[2],"gda")==0)
+ {
+ GDAOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,typ);
+ }
+ else if(strcasecmp(argv[2],"ta")==0)
+ {
+ TAOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,typ);
+ }
+ else if(strcasecmp(argv[2],"rrt")==0)
+ {
+ RRTOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,typ);
+ }
+ else if(strcasecmp(argv[2],"sa")==0)
+ {
+ SAOptimization::optimizeValue(*p,nLaeufe,
+ optimizeParameterAnzahl,typ);
+ }
+
+
+
+
+ else
+ {
+ cerr << "Error: unknown algorithm" << argv[2] << endl;
+ printUsage(1);
+ }
+ }
+ }
+ else
+ {
+ cerr << "Error: wrong number of arguments: " << argc << endl;
+ printUsage(1);
+ }
+ return ret;
+}
+
+
+
+
+
+
+
+
+
+
+// Selects the iterative optimisation method from its (case-insensitive)
+// name: rrt, ta, gda, sa or hc.  Any other name is reported and ends the
+// program through printUsage(1).
+void setVerfahren(char *p)
+{
+  static const struct { const char *name; int method; } known[] =
+    {
+      { "rrt", RRT_OPT },
+      { "ta",  TA_OPT  },
+      { "gda", GDA_OPT },
+      { "sa",  SA_OPT  },
+      { "hc",  HC_OPT  }
+    };
+  const int count = sizeof(known)/sizeof(known[0]);
+  for(int k=0;k<count;k++)
+    if( strcasecmp(p,known[k].name)==0 )
+      {
+        IterOptVerf=known[k].method;
+        return;
+      }
+  cerr << "Error: Unknown iterativ-optimizing algorithm '" << p << "'.\n";
+  printUsage(1);
+}
+
+
+
+// Selects the initialisation scheme from its (case-insensitive) name iv:
+// ran, aio, gda, freq or other.  "other" additionally needs the name of a
+// category file in fileForOther, which is duplicated into FileForOther.
+// Unknown names, or "other" without a file name, are reported and end the
+// program through printUsage(1).
+void setInitValue(char *iv,char *fileForOther)
+{
+  if(strcasecmp(iv,"ran")==0 )
+    InitValue=INIT_RAN;
+  else if(strcasecmp(iv,"aio")==0)
+    InitValue=INIT_AIO;
+  else if(strcasecmp(iv,"gda")==0)
+    InitValue=INIT_LWRW;   // the option name "gda" maps to INIT_LWRW
+  else if(strcasecmp(iv,"freq")==0)
+    InitValue=INIT_FREQ;
+  else if(strcasecmp(iv,"other")==0)
+    {
+      if( fileForOther==0 )  // strdup(NULL) is undefined behaviour
+        {
+          cerr << "Error: Initialization 'other' needs a file name.\n";
+          printUsage(1);
+        }
+      InitValue=INIT_OTHER;
+      FileForOther=strdup(fileForOther);
+    }
+  else
+    {
+      // Report the offending option text (iv), not the global problem pointer p.
+      cerr << "Error: Unknown initialization '" << iv << "'.\n";
+      printUsage(1);
+    }
+}
+
+
+// Selects the word-selection strategy from its (case-insensitive) name:
+// ran, det or incr.  Any other name is reported and ends the program through
+// printUsage(1).
+void setWwahl(const char *ww)
+{
+  if( strcasecmp(ww,"ran")==0 )
+    {
+      Wwahl=W_RAN;
+      return;
+    }
+  if( strcasecmp(ww,"det")==0 )
+    {
+      Wwahl=W_DET_DECR;
+      return;
+    }
+  if( strcasecmp(ww,"incr")==0 )
+    {
+      Wwahl=W_DET_INCR;
+      return;
+    }
+  cerr << "Error: Unknown word-selection '" << ww << "'.\n";
+  printUsage(1);
+}
+
+
+// Selects the category-selection strategy from its (case-insensitive) name:
+// det, ran or best.  Any other name is reported and ends the program through
+// printUsage(1).
+void setKwahl(const char *kw)
+{
+  if( strcasecmp(kw,"det")==0 )
+    {
+      Kwahl=K_DET;
+      return;
+    }
+  if( strcasecmp(kw,"ran")==0 )
+    {
+      Kwahl=K_RAN;
+      return;
+    }
+  if( strcasecmp(kw,"best")==0 )
+    {
+      Kwahl=K_BEST;
+      return;
+    }
+  cerr << "Error: Unknown category-selection '" << kw << "'.\n";
+  printUsage(1);
+}
+
+
+// Overrides one optimizer tunable.  nr1 is the parameter number (1..8) as
+// text, nr2 the new value as text:
+//   1 SA gamma_0, 2 SA gamma_e, 3 SA nu_e, 4 TA gamma, 5 TA nu,
+//   6 RRT gamma, 7 RRT nu, 8 GDA alpha.
+// Marks the parameters as user-set (IterOptSet=1).  Text that is missing or
+// cannot be parsed, and unknown parameter numbers, are reported and end the
+// program through printUsage(1).
+void setParameter(const char *nr1,const char *nr2)
+{
+  int n1=0;
+  float n2=0;
+  // Previously n1/n2 were used uninitialized when sscanf() failed, and a
+  // missing value (nr2==0) was handed straight to sscanf().
+  if( nr1==0 || nr2==0 || sscanf(nr1,"%d",&n1)!=1 || sscanf(nr2,"%f",&n2)!=1 )
+    {
+      cerr << "Error: Could not read parameter number and value.\n";
+      printUsage(1);
+    }
+  IterOptSet=1;
+  switch(n1)
+    {
+    case 1:
+      SAOptimization::defaultAnfAnnRate=n2;
+      if(verboseMode)cout << "Parameter gamma_0 (SA) set to "
+                          << SAOptimization::defaultAnfAnnRate << endl;
+      iassert(0<=SAOptimization::defaultAnfAnnRate&&
+              SAOptimization::defaultAnfAnnRate<=1);
+      break;
+    case 2:
+      SAOptimization::defaultEndAnnRate=n2;
+      if(verboseMode)cout << "Parameter gamma_e (SA) set to "
+                          << SAOptimization::defaultEndAnnRate << endl;
+      iassert(0<=SAOptimization::defaultEndAnnRate
+              &&SAOptimization::defaultEndAnnRate<=1);
+      break;
+    case 3:
+      SAOptimization::defaultMultiple=n2;
+      if(verboseMode)cout << "Parameter nu_e (SA) set to "
+                          << SAOptimization::defaultMultiple << endl;
+      iassert( SAOptimization::defaultMultiple>0 );
+      break;
+    case 4:
+      TAOptimization::defaultAnnRate=n2;
+      if(verboseMode)cout << "Parameter gamma_{TA} set to "
+                          << TAOptimization::defaultAnnRate << endl;
+      iassert(0<=TAOptimization::defaultAnnRate
+              &&TAOptimization::defaultAnnRate<=1);
+      break;
+    case 5:
+      TAOptimization::defaultMultiple=n2;
+      if(verboseMode)cout << "Parameter nu_{TA} set to "
+                          << TAOptimization::defaultMultiple << endl;
+      iassert( TAOptimization::defaultMultiple>0 );
+      break;
+    case 6:
+      RRTOptimization::defaultAnnRate=n2;
+      if(verboseMode)cout << "Parameter gamma_{RRT} set to "
+                          << RRTOptimization::defaultAnnRate << endl;
+      iassert(0<=RRTOptimization::defaultAnnRate
+              && RRTOptimization::defaultAnnRate<=1);
+      break;
+    case 7:
+      RRTOptimization::defaultMultiple=n2;
+      if(verboseMode)cout << "Parameter nu_{RRT} set to "
+                          << RRTOptimization::defaultMultiple << endl;
+      iassert( RRTOptimization::defaultMultiple>0 );
+      break;
+    case 8:
+      GDAOptimization::defaultAlpha=n2;
+      if(verboseMode)cout << "Parameter alpha set to "
+                          << GDAOptimization::defaultAlpha << endl;
+      iassert(0<=GDAOptimization::defaultAlpha
+              && GDAOptimization::defaultAlpha<1 );
+      break;
+    default:
+      cerr << "Error: Wrong parameter number " << nr1 << " " << n1 << endl;
+      printUsage(1);
+    }
+}
+
+
+
+// Stores the corpus file name s in the fixed-size global buffer korpusName.
+// A name that does not fit (including its terminator) is reported and ends
+// the program through printUsage(1) instead of overflowing the buffer.
+void setKorpusName(const char *s)
+{
+  if( strlen(s)>=sizeof(korpusName) )
+    {
+      cerr << "Error: File name of the corpus is too long.\n";
+      printUsage(1);
+    }
+  strcpy(korpusName,s);
+}
+
+// Remembers a private copy of the file name s in the global hapaxInitName.
+void setHapaxInitName(const char *s)
+{
+  char *copy = strdup(s);
+  hapaxInitName = copy;
+}
+
+// Creates the global problem p from the corpus file: fromKModel() for text
+// corpora, fromNgrFile() otherwise (followed by the two word-frequency index
+// set-ups).  A corpus that cannot be read is reported and ends the program
+// through printUsage(1).  Unless parameters were set explicitly,
+// KategProblemSetParameters() is applied at the end.
+void setKorpus()
+{
+  const bool isText = korpusIsText!=0;
+  if( isText )
+    p=fromKModel(korpusName,NumberCategories,InitValue,Criterion,Wwahl|Kwahl,
+                 MinWordFrequency);
+  else
+    p=fromNgrFile(korpusName,NumberCategories,InitValue,Criterion,Wwahl|Kwahl,
+                  MinWordFrequency);
+  if( p==0 )
+    {
+      cerr << "Error: Could not read the file '" << korpusName << "'.\n";
+      printUsage(1);
+    }
+  if( !isText )
+    {
+      p->wordFreq.initializeIndex(*(p->words),'1',2,1+NumberCategories/2,!OneWithHapas);
+      p->wordFreq.initializeIndex(*(p->words),'2',2+NumberCategories/2,1+NumberCategories,OneWithHapas);
+    }
+  if( IterOptSet==0 )
+    KategProblemSetParameters(*p);
+}
+
+
+
+
+
+
+// Program entry.  Parses the leading '-' options (option letter followed
+// directly by its value; -e and -iother additionally consume the next
+// argument), builds the problem from the corpus, optionally loads category
+// files, and then runs the mode named by the first remaining argument:
+// "opt" (also the default), "meta-opt" or "izr-opt".
+int main(int argc,char **argv)
+{
+  double startTime=clockSec();
+  zufallSeed();
+  while( argc>1 && argv[1][0]=='-' )
+    {
+      switch(argv[1][1])
+        {
+        case 'v':
+          sscanf(argv[1]+2,"%d",&verboseMode);
+          iassert(verboseMode>=0);
+          break;
+        case 'O':
+          sscanf(argv[1]+2,"%d",&OneWithHapas);
+          cout << "OneWithHapas: " << OneWithHapas << endl;
+          break;
+        case 'n':
+          sscanf(argv[1]+2,"%d",&nLaeufe);
+          nLaeufeReduce=nLaeufe;
+          iassert( nLaeufe>=1 );
+          break;
+        case 'l':
+          Criterion=1;
+          if( argv[1][2] )
+            {
+              sscanf(argv[1]+2,"%lf",&rhoLo);
+              if( verboseMode )
+                cout << "Parameter rho (for LO) set to" << rhoLo << ".\n";
+              iassert(0<=rhoLo && rhoLo<=1);
+            }
+          if( verboseMode )
+            cout << "Criterion LO used.\n";
+          break;
+        case 'y':
+          Criterion=2;
+          if( argv[1][2] )
+            {
+              sscanf(argv[1]+2,"%lf",&SigmaVerfaelschung);
+              if( verboseMode )
+                cout << "Parameter rho (for LO) set to" << SigmaVerfaelschung << ".\n";
+              iassert(0<SigmaVerfaelschung);
+            }
+          if( verboseMode )
+            cout << "My special criterion used.\n";
+          break;
+        case 'p':
+          setKorpusName(argv[1]+2);
+          // argv[2] is NULL when this is the last argument; the original
+          // assertion dereferenced it unconditionally.
+          assert(argv[2]==0 || argv[2][0]!='-' || argv[2][0]!='i');
+          break;
+        case 'P':
+          setKorpusName(argv[1]+2);
+          korpusIsText=0;
+          assert(argv[2]==0 || argv[2][0]!='-' || argv[2][0]!='i');
+          break;
+        case 'i':
+          // "-iother" takes the category file from the next argument.
+          if( argc<3 && strcasecmp(argv[1]+2,"other")==0 )
+            {
+              cerr << "Error: Option '" << argv[1] << "' needs a file name.\n";
+              printUsage(1);
+            }
+          setInitValue(argv[1]+2,argv[2]);
+          if( InitValue==INIT_OTHER )
+            argv++,argc--;
+          break;
+        case 'h':
+          setHapaxInitName(argv[1]+2);
+          break;
+        case 'k':
+          setKwahl(argv[1]+2);
+          break;
+        case 'w':
+          setWwahl(argv[1]+2);
+          break;
+        case 'c':
+          sscanf(argv[1]+2,"%d",&NumberCategories);
+          iassert(NumberCategories>=2);
+          break;
+        case 'm':
+          sscanf(argv[1]+2,"%d",&MinWordFrequency);
+          break;
+        case 'e':
+          // "-e<number> <value>": the value is the next argument.
+          if( argc<3 )
+            {
+              cerr << "Error: Option '" << argv[1] << "' needs a value.\n";
+              printUsage(1);
+            }
+          setParameter(argv[1]+2,argv[2]);
+          argv++,argc--;
+          break;
+        case 'a':
+          setVerfahren(argv[1]+2);
+          break;
+        case 'r':
+          {
+            int s;
+            sscanf(argv[1]+2,"%d",&s);
+            zufallSeed(s);
+          }
+          break;
+        case 'V':
+          if(argv[1][2])
+            {
+              // string instead of a fixed char[1024]: arbitrarily long file
+              // names no longer overflow a stack buffer.
+              const string classFile(argv[1]+2);
+              const string catsFile=classFile+".cats";
+              PrintBestTo=new ofstream(classFile.c_str());
+              PrintBestTo2=new ofstream(catsFile.c_str());
+            }
+          else
+            cout << "AUSGABE auf cout\n";
+          break;
+        case 'M':
+          sscanf(argv[1]+2,"%d",&MaxIterOptSteps);
+          break;
+        case 's':
+          sscanf(argv[1]+2,"%d",&MaxSecs);
+          break;
+        case 'N':
+          sscanf(argv[1]+2,"%d",&optimizeParameterAnzahl);
+          break;
+        case 'o':
+          GraphOutput = new ofstream(argv[1]+2);
+          // 'new' never yields 0 here; a failed open shows in the stream state.
+          if( !*GraphOutput )
+            cerr << "Warning: Open failed for file '" << argv[1]+2 << "'.\n";
+          break;
+        default:
+          cerr << "Fehlerhafte Option: " << argv[1] << endl;
+          printUsage(1);
+        }
+      argv++;
+      argc--;
+    }
+
+
+  setKorpus();
+  if( FileForOther )
+    {
+      fromCatFile(p,FileForOther);
+      p->initialisierung=InitValue;
+      p->_initialize(InitValue);
+    }
+
+  if( hapaxInitName )
+    {
+      fromCatFile(p,hapaxInitName,0);
+      p->fixInitLike();
+    }
+
+  double start2Time=clockSec();
+
+  if(argc>=2 && strcasecmp(argv[1],"opt")==0 )
+    makeIterOpt();
+  else if(argc>=2 && strcasecmp(argv[1],"meta-opt")==0)
+    makeMetaOpt(argc,argv);
+  else if(argc>=2 && strcasecmp(argv[1],"izr-opt")==0)
+    makeIzrOpt();
+  else
+    {
+      makeIterOpt();   // no (or an unknown) mode word: behave like "opt"
+    }
+
+  if( verboseMode )
+    {
+      cout << " full-time: " << clockSec()-startTime << endl;
+      cout << "optimize-time: " << clockSec()-start2Time << endl;
+    }
+  return 0;
+}
+
diff --git a/mkcls-v2/my.h b/mkcls-v2/my.h
new file mode 100644
index 0000000..ba06657
--- /dev/null
+++ b/mkcls-v2/my.h
@@ -0,0 +1,54 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#ifndef HEADER_my_DEFINED
+#define HEADER_my_DEFINED
+
+// Iteration helpers for containers that expose inclusive index bounds
+// through low() and high().
+//   over_array / backwards_array    : reuse an index variable owned by the caller.
+//   over_arr / backwards_arr        : declare `int i` inside the loop header.
+//   over_arrMAX                     : like over_arr, but stops below `max`.
+// The `max` argument is parenthesized so that expressions with low
+// precedence operators (e.g. `1<<k`) keep their meaning after expansion.
+#define over_array(a,i) for(i=(a).low();i<=(a).high();i++)
+#define backwards_array(a,i) for(i=(a).high();i>=(a).low();i--)
+#define over_arr(a,i) for(int i=(a).low();i<=(a).high();i++)
+#define over_arrMAX(a,i,max) for(int i=(a).low();i<=min((a).high(),(max)-1);i++)
+#define backwards_arr(a,i) for(int i=(a).high();i>=(a).low();i--)
+
+extern double n1mult,n2mult,n3mult;
+
+// Relative frequency n1/n2 as a double.
+// The iassert reports a negative count or a non-positive total; a zero
+// total is nevertheless replaced by 1 so the division stays defined.
+inline double realProb(int n1,int n2)
+{
+  massert(n1<=n2);
+  iassert(n1>=0&&n2>0);
+  const int total = (n2==0) ? 1 : n2;
+  return static_cast<double>(n1)/static_cast<double>(total);
+}
+
+// realProb(n1,n2) scaled by n1mult, n2mult or n3mult when the count n1
+// is exactly 1, 2 or 3; every other count is returned unscaled.
+inline double verfProb(int n1,int n2)
+{
+  const double ratio = realProb(n1,n2);
+  switch( n1 )
+  {
+    case 1: return ratio*n1mult;
+    case 2: return ratio*n2mult;
+    case 3: return ratio*n3mult;
+    default: return ratio;
+  }
+}
+
+#endif
diff --git a/mkcls-v2/myassert.h b/mkcls-v2/myassert.h
new file mode 100644
index 0000000..da86ffb
--- /dev/null
+++ b/mkcls-v2/myassert.h
@@ -0,0 +1,44 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#ifndef MY_ASSERT_DEFINED
+#define MY_ASSERT_DEFINED
+// Error reporters, defined elsewhere; they receive the source line, the
+// file name and the text of the failed expression.
+void myerror(int line,const char *file,const char *expression);
+void imyerror(int line,const char *file,const char *expression);
+
+// iassert: always-active check; calls imyerror() when the expression is false.
+#define iassert(expression) do {if (!(expression)) {imyerror(__LINE__,__FILE__,#expression);}} while (0)
+
+// massert: compiled out; its argument is never evaluated.
+#define massert(expr) do {} while(0)
+
+// vassert: compiled out; its argument is never evaluated.
+#define vassert(expr) do {} while(0)
+
+#include <assert.h>
+
+#endif
+
+
+
+
+
diff --git a/mkcls-v2/myleda.h b/mkcls-v2/myleda.h
new file mode 100644
index 0000000..6fc936b
--- /dev/null
+++ b/mkcls-v2/myleda.h
@@ -0,0 +1,278 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#ifndef myleda_HEADER_defined
+#define myleda_HEADER_defined
+using namespace std;
+#include "myassert.h"
+
+
+#if defined(USE_LEDA_array)||defined(USE_LEDA)
+#include <LEDA/array.h>
+#else
+
+#include "FixedArray.h"
+
+// Replacement for LEDA's array, used when LEDA is not available:
+// a thin wrapper that forwards construction to FixedArray<T>.
+template<class T>
+class leda_array : public FixedArray<T>
+{
+public:
+  // Empty array (default-constructed base).
+  leda_array() {}
+  // Forwards the requested size to the FixedArray<T>(int) constructor.
+  leda_array(int size) : FixedArray<T>(size) {}
+};
+#endif
+
+
+#if defined(USE_LEDA_set)||defined(USE_LEDA)
+#include <LEDA/set.h>
+#define forall_set(a,b,c) forall(b,c)
+#else
+#include <set>
+// std::set with the two LEDA-style member names used by this code base.
+template<class T>
+class leda_set : public set<T>
+{
+public:
+  // True when m is an element of the set.
+  bool member(const T&m) const
+  { return this->find(m)!=this->end(); }
+  // Removes m from the set; no effect when m is absent.
+  void del(const T&m)
+  { this->erase(m); }
+};
+// forall_set(SetType,var,s): visits every element of s, assigning it to var.
+#define forall_set(a,b,c) for(a::iterator __i__=c.begin();__i__!=c.end()&&((b=*__i__),1);++__i__)
+// Set intersection: the elements contained in both a and b.
+template<class T>
+leda_set<T> operator&(const leda_set<T>&a,const leda_set<T>&b)
+{
+  leda_set<T> common;
+  set_intersection(a.begin(),a.end(),b.begin(),b.end(),
+                   insert_iterator<set<T> >(common,common.begin()));
+  return common;
+}
+// Set difference: the elements of a that do not occur in b.
+template<class T>
+leda_set<T> operator-(const leda_set<T>&a,const leda_set<T>&b)
+{
+  leda_set<T> remaining;
+  set_difference(a.begin(),a.end(),b.begin(),b.end(),
+                 insert_iterator<set<T> >(remaining,remaining.begin()));
+  return remaining;
+}
+
+#endif
+
+
+#if defined(USE_LEDA_d_array)||defined(USE_LEDA)
+#include <LEDA/d_array.h>
+#define forall_defined_d(a,b,c,d) forall_defined(c,d)
+#define forall_d(a,b,c,d) forall(c,d)
+#else
+#include <map>
+// std::map-based replacement for LEDA's d_array.
+// Reading a missing key through the const operator[] reports the problem
+// via iassert and yields `init`; the non-const operator[] inserts `init`
+// for a missing key and returns a reference to the stored value.
+template<class A,class B>
+class leda_d_array : public map<A,B>
+{
+private:
+  B init;   // value used for keys that have no entry
+public:
+  // Value-initializes `init` so built-in types start at zero instead of
+  // holding an indeterminate value.
+  leda_d_array() : init() {}
+  // True when an entry for a exists.
+  // (Members of the dependent base must be reached through this->.)
+  bool defined(const A&a) const
+  { return this->find(a)!=this->end(); }
+  const B&operator[](const A&a)const
+  {
+    typename map<A,B>::const_iterator pos=this->find(a);
+    iassert(pos!=this->end());
+    if( pos==this->end() )
+      return init;
+    else
+      return pos->second;
+  }
+  B&operator[](const A&a)
+  {
+    typename map<A,B>::iterator pos=this->find(a);
+    if( pos==this->end() )
+    {
+      // insert() already returns the position of the new entry.
+      pos=this->insert(typename map<A,B>::value_type(a,init)).first;
+      iassert(pos!=this->end());
+    }
+    return pos->second;
+  }
+};
+
+// Iteration over a leda_d_array<a,b> named d (non-LEDA build):
+//   forall_defined_d assigns each key to c,
+//   forall_d         assigns each stored value to c.
+// Both expansions contain `typename`, so they are written for use inside templates.
+#define forall_defined_d(a,b,c,d) for(typename leda_d_array<a,b>::const_iterator __ii__=(d).begin();__ii__!=(d).end()&&((c=__ii__->first),1) ;++__ii__)
+#define forall_d(a,b,c,d) for(typename leda_d_array<a,b>::const_iterator __ii__=(d).begin();__ii__!=(d).end()&&((c=__ii__->second),1);++__ii__)
+#endif
+
+
+#if defined(USE_LEDA_h_array)||defined(USE_LEDA)
+#include <LEDA/h_array.h>
+#define forall_defined_h(a,b,c,d) forall_defined(c,d)
+#define forall_h(a,b,c,d) forall(c,d)
+#else
+
+double used_time();
+#if 0
+
+#include "my_hashmap.h"
+#define leda_h_array my_hashmap
+
+#else
+
+// Identity hash for plain ints. It is declared ahead of my_hash because
+// argument-dependent lookup does not apply to built-in types, so the call
+// inside the template can only find overloads that are already visible.
+inline int Hash(int value) { return value; }
+
+// Hash functor that forwards to the overloaded free function Hash().
+template<class T>
+class my_hash
+{
+public:
+  int operator()(const T&t)const {return Hash(t);}
+};
+
+#define MY_HASH_BASE hash_map<A,B,my_hash<A> >
+
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+using __gnu_cxx::hash;
+#else
+#include <hash_map>
+#endif
+// hash_map-based replacement for LEDA's h_array.
+// Missing keys read as `init`: the const operator[] returns it without
+// touching the table, the non-const operator[] inserts it first.
+template<class A,class B>
+class leda_h_array : public MY_HASH_BASE
+{
+private:
+  B init;   // value reported for keys that have no entry
+public:
+  // Value-initializes `init` so built-in types start at zero instead of
+  // holding an indeterminate value.
+  leda_h_array() : MY_HASH_BASE(),init() {}
+  leda_h_array(const B&_init)
+    : MY_HASH_BASE(),init(_init) {}
+  // True when an entry for a exists.
+  // (Members of the dependent base must be reached through this->.)
+  bool defined(const A&a) const
+  { return this->find(a)!=this->end(); }
+  const B&operator[](const A&a)const
+  {
+    typename MY_HASH_BASE::const_iterator pos=this->find(a);
+
+    if( pos==this->end() )
+      return init;
+    else
+      return pos->second;
+  }
+  B&operator[](const A&a)
+  {
+    typename MY_HASH_BASE::iterator pos=this->find(a);
+    if( pos==this->end() )
+    {
+      // insert() already returns the position of the new entry.
+      pos=this->insert(typename MY_HASH_BASE::value_type(a,init)).first;
+      iassert(pos!=this->end());
+    }
+    return pos->second;
+  }
+};
+
+// Iteration over a leda_h_array<a,b> named d (non-LEDA build):
+//   forall_defined_h  assigns each key to c (expansion uses `typename`),
+//   forall_defined_h2 is the same loop without `typename`,
+//   forall_h          assigns each stored value to c.
+#define forall_defined_h(a,b,c,d) for(typename leda_h_array<a,b>::const_iterator __jj__=(d).begin();__jj__!=(d).end()&&((c=__jj__->first),1); ++__jj__)
+#define forall_defined_h2(a,b,c,d) for(leda_h_array<a,b>::const_iterator __jj__=(d).begin();__jj__!=(d).end()&&((c=__jj__->first),1); ++__jj__)
+#define forall_h(a,b,c,d) for(typename leda_h_array<a,b>::const_iterator __jjj__=(d).begin();__jjj__!=(d).end()&&((c=__jjj__->second),1);++__jjj__)
+
+#endif
+
+#endif
+
+
+
+// Three-way comparison built from == and <:
+// 0 when a==b, -1 when a<b, 1 otherwise.
+template<class T> int compare(const T&a,const T&b)
+{
+  if( a==b )
+    return 0;
+  return (a<b) ? -1 : 1;
+}
+
+// Writes every key/value pair of w as "EL:<key> INH:<value>." inside
+// "h_array{ ... }"; entries after the first start on a new line.
+template<class T,class U>
+ostream & operator<<(ostream&out,const leda_h_array<T,U>&w)
+{
+  T key;
+  bool needSeparator=false;
+  out << "h_array{";
+  forall_defined_h(T,U,key,w)
+  {
+    if( needSeparator )
+      out << "\n ";
+    out << "EL:" << key << " INH:" << w[key] << ".";
+    needSeparator=true;
+  }
+  out << "}\n";
+  return out;
+}
+// Writes every key/value pair of w as "EL:<key> INH:<value>."; entries
+// after the first start on a new line.
+// The loop uses forall_defined_d: the _h variant declares a leda_h_array
+// iterator in the non-LEDA build and cannot traverse a leda_d_array.
+// The "h_array{" label is kept unchanged so existing output stays the same.
+template<class T,class U>
+ostream & operator<<(ostream&out,const leda_d_array<T,U>&w)
+{
+  T t;
+  bool makeNl=0;
+  out << "h_array{";
+  forall_defined_d(T,U,t,w)
+  {
+    if( makeNl )
+      out << "\n ";
+    out << "EL:" << t << " INH:" << w[t] << ".";
+    makeNl=1;
+  }
+  return out << "}\n";
+}
+
+// Prints the elements of s as "{e1, e2, ...}" followed by a newline and
+// returns the stream.
+template<class T>
+ostream&printSet(ostream&out,const leda_set<T>&s)
+{
+  T element;
+  bool atStart=true;
+  out << "{";
+  forall_set(typename set<T>,element,s)
+  {
+    if( !atStart )
+      out << ", ";
+    out << element;
+    atStart=false;
+  }
+  out << "}\n";
+  return out;
+}
+
+// Placeholder extraction operator: reads nothing, leaves the array
+// untouched and returns the stream unchanged.
+template<class T,class U>
+istream & operator>>(istream&in,leda_h_array<T,U>&)
+{
+ return in;
+}
+
+// Two h_arrays are equal when every key defined in either one maps to
+// equal values in both (operator[] is used for each lookup).
+template<class A,class B>
+bool operator==(const leda_h_array<A,B>&p1,const leda_h_array<A,B>&p2)
+{
+  A key;
+  forall_defined_h(A,B,key,p1)
+  {
+    if( !( p1[key]==p2[key]) )
+      return false;
+  }
+  forall_defined_h(A,B,key,p2)
+  {
+    if( !( p1[key]==p2[key]) )
+      return false;
+  }
+  return true;
+}
+// Two d_arrays are equal when every key defined in either one maps to
+// equal values in both (operator[] is used for each lookup).
+template<class A,class B>
+bool operator==(const leda_d_array<A,B>&p1,const leda_d_array<A,B>&p2)
+{
+  A key;
+  forall_defined_d(A,B,key,p1)
+  {
+    if( !( p1[key]==p2[key]) )
+      return false;
+  }
+  forall_defined_d(A,B,key,p2)
+  {
+    if( !( p1[key]==p2[key]) )
+      return false;
+  }
+  return true;
+}
+
+
+
+
+#endif
diff --git a/mkcls-v2/mystl.h b/mkcls-v2/mystl.h
new file mode 100644
index 0000000..bcda88d
--- /dev/null
+++ b/mkcls-v2/mystl.h
@@ -0,0 +1,124 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+#ifndef MY_STL_H_DEFINED
+#define MY_STL_H_DEFINED
+#include <string>
+#include <utility>
+#if __GNUC__>2
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+using __gnu_cxx::hash;
+#else
+#include <hash_map>
+#endif
+#include <cmath>
+
+using namespace std;
+
+// Loops i over every character index of string a. The argument is
+// parenthesized so that expressions such as `x+y` expand correctly.
+#define over_string(a,i) for(unsigned int i=0;i<(a).length();i++)
+
+// Polynomial string hash: sum = 5*sum + character, over all characters.
+// The accumulation is done in unsigned arithmetic so that long strings
+// wrap around instead of overflowing a signed int (undefined behaviour);
+// the returned values are the same as before on two's-complement targets.
+inline int Hash(const string& s)
+{
+  unsigned int sum=0;
+  for(string::const_iterator i=s.begin(),end=s.end();i!=end;++i)
+    sum=5u*sum+static_cast<unsigned int>(static_cast<int>(*i));
+  return static_cast<int>(sum);
+}
+
+// Hash of a homogeneous pair: Hash(first) + 4*Hash(second).
+// Combined in unsigned arithmetic so large component hashes wrap around
+// instead of overflowing a signed int.
+template<class V> int Hash(const pair<V,V>&a)
+{
+  const unsigned int firstHash=static_cast<unsigned int>(Hash(a.first));
+  const unsigned int secondHash=static_cast<unsigned int>(Hash(a.second));
+  return static_cast<int>(firstHash+4u*secondHash);
+}
+
+// Reads a pair written as "(first,second)". The parentheses and the comma
+// are optional; whitespace is skipped before '(' and before ',', and
+// blanks are skipped before ')'.
+//
+// Look-ahead is done with peek() and only while the stream is good, so:
+//  - hitting end of input while skipping whitespace can no longer loop
+//    forever on a stale character,
+//  - isspace() only ever sees values in the unsigned-char range,
+//  - a complete pair at the very end of the input (no closing ')') leaves
+//    the stream in a successful state.
+// If nothing but whitespace is available, failbit is set.
+template<class T1,class T2>
+istream& operator>>(istream &in,pair<T1,T2> &ir)
+{
+  const istream::int_type endMark=istream::traits_type::eof();
+  istream::int_type next=endMark;
+
+  // leading whitespace, then the optional '('
+  while( in.good() && (next=in.peek())!=endMark && isspace(next) )
+    in.get();
+  if( !in.good() )
+  {
+    in.setstate(ios::failbit);
+    return in;
+  }
+  if( next=='(' )
+    in.get();
+
+  in >> ir.first;
+
+  // whitespace, then the optional ','
+  while( in.good() && (next=in.peek())!=endMark && isspace(next) )
+    in.get();
+  if( in.good() && next==',' )
+    in.get();
+
+  in >> ir.second;
+
+  // blanks, then the optional ')'
+  while( in.good() && in.peek()==' ' )
+    in.get();
+  if( in.good() && in.peek()==')' )
+    in.get();
+
+  return in;
+}
+
+// Writes a pair as "(first,second)" and returns the stream.
+template<class T1,class T2>
+ostream& operator<<(ostream &out,const pair<T1,T2> &ir)
+{
+  return out << "(" << ir.first << "," << ir.second << ")";
+}
+
+void printSpaces(ostream&out,int n);
+void mysplit(const string &s,string &s1,string &s2);
+string untilChar(const string&s,char c);
+
+// Simple triple of three public values a, b and c.
+template<class A,class B,class C>
+class tri
+{
+public:
+  A a;
+  B b;
+  C c;
+  // Members are default-initialized.
+  tri() {}
+  // Member-wise construction from the three given values.
+  tri(const A&_a,const B&_b,const C&_c)
+    : a(_a),b(_b),c(_c) {}
+};
+
+// Two triples are equal when all three components compare equal.
+template<class A,class B,class C>
+bool operator==(const tri<A,B,C>&x,const tri<A,B,C>&y)
+{
+  if( !(x.a==y.a) )
+    return false;
+  if( !(x.b==y.b) )
+    return false;
+  return x.c==y.c;
+}
+
+// Lexicographic ordering on (a,b,c), using only operator< of the components.
+template<class A,class B,class C>
+bool operator<(const tri<A,B,C>&x,const tri<A,B,C>&y)
+{
+  if( x.a<y.a ) return true;
+  if( y.a<x.a ) return false;
+
+  if( x.b<y.b ) return true;
+  if( y.b<x.b ) return false;
+
+  return x.c<y.c;
+}
+
+#endif