1 files changed, 700 insertions, 0 deletions
diff --git a/experimental/alignment-enabled/MGIZA/src/mkcls/KategProblemTest.cpp b/experimental/alignment-enabled/MGIZA/src/mkcls/KategProblemTest.cpp
new file mode 100644
index 0000000..8c76ce5
--- /dev/null
+++ b/experimental/alignment-enabled/MGIZA/src/mkcls/KategProblemTest.cpp
@@ -0,0 +1,700 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001  Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, 
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 
+USA.
+
+*/
+
+
+
+
+#include "KategProblemTest.h"
+
+#include "ProblemTest.h"
+#include "HCOptimization.h"
+#include "TAOptimization.h"
+#include "RRTOptimization.h"
+#include "GDAOptimization.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string>
+#include <strstream>
+
+typedef pair<string,string> PSS;
+
+#define NEW_SENTENCE_END "mkcls-mapped-dollar-symbol-$"
+
+#ifdef NeXT
+char *strdup(char *a)
+{
+  char *p = (char *)malloc(strlen(a)+1);
+  strcpy(p,a);
+  return p;
+}
+
+#endif
+
+
+void writeClasses(Array<Kategory> &katOfWord,KategProblem &problem,ostream &to)
+{	    
+  for(int i=0;i<katOfWord.size();i++)
+    {
+      if( strcmp(problem.getString(i),"$") )
+	if( strcmp(problem.getString(i),"mkcls-mapped-dollar-symbol-$")==0 )
+	  to << "$" << "\t" << katOfWord[i] << endl;
+	else
+	  to << problem.getString(i) << "\t" << katOfWord[i] << endl;
+    }
+}
+
+
+void mysplit(const string &s,string &s1,string &s2)
+{
+  unsigned int i=0;
+  for(;i<s.length();i++)if( s[i]==' ' || s[i]=='\t' || s[i]==' ')break;
+  s1=s.substr(0,i);
+  for(;i<s.length();i++)if( !(s[i]==' ' || s[i]=='\t' || s[i]==' ') )break;  
+  s2=s.substr(i,s.length()-i);
+  
+  iassert(s1.size());
+  iassert(s2.size());
+}
+
+
+
+int fromCatFile(KategProblem *p,const char *fname,bool verb)
+{
+  leda_h_array<string,int> translation(-1);
+  int maxCat=2;
+  ifstream in(fname);
+  if(!in)
+    {
+      cerr << "Error: File '" << fname << "' cannot be opened.\n";
+      exit(1);
+    }
+  for(int i=0;i<p->wordFreq.nWords;i++)
+    (p->initLike)[i]= -1;
+  
+  
+  translation["1"]=1;
+  translation["0"]=0;
+
+  
+  string s;
+  while( getline(in,s) ) 
+    {
+      string str,categ;
+      mysplit(s,str,categ);
+      int i=p->words->binary_locate(str);
+      if(i>=0 && (*(p->words))[i]==str )
+	{
+	  
+	  if( translation[categ]==-1 )
+	    translation[categ]=maxCat++;
+	  int cat=translation[categ];
+	  if( (p->initLike)[i]!= -1 )
+	    cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n";
+	  (p->initLike)[i]=cat;
+	}
+      else
+	cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n";
+    }
+  
+  if( verboseMode )
+    cout << "We have " << maxCat << " read non-empty categories" 
+      " (with words from the corpus).\n";
+  
+  if(maxCat>p->katFreq.nKats)
+    {
+      cerr << "Error: Not enough categories reserved (only " 
+	   << p->katFreq.nKats << ", but i need " << maxCat << ").\n";
+      exit(1);
+    }
+  
+  
+  int i=p->words->binary_locate("$");
+  if( i>=0 &&  (*(p->words))[i]=="$" )
+    (p->initLike)[i]=0;
+  else
+    if( verboseMode )
+      cerr << "Warning: No '$' in vocabulary!\n";
+  
+  
+  int errors=0;
+  for(i=0;i<p->wordFreq.nWords;i++)
+    if((p->initLike)[i]== -1 )
+      {
+	if( verb ) cerr << "Error: I don't know the category of word " << i 
+	     << " (" << (*(p->words))[i] << ") " << ".\n";
+	errors=1;
+      }
+  return errors;
+}
+
+
+
+KategProblem *makeKategProblem(const leda_h_array<PSS,FreqType>&cTbl,const leda_set<string>&setVokabular, int maxClass,int initialisierung,
+			 int auswertung,int nachbarschaft,int minWordFrequency)
+{
+  
+  int nwrd=0;
+  leda_array<string>&sVok = *new leda_array<string>(setVokabular.size());
+  string s;
+  unsigned int ctr=0;
+  forall_set(leda_set<string>,s,setVokabular)
+    {
+      if( verboseMode>2 )
+	cout << "mkcls:Wort " << ctr << " " << s << endl;
+      sVok[ctr++]=s;
+    }
+  for(unsigned int z=0;z<ctr-1;z++)
+    iassert( sVok[z]<sVok[z+1] );
+  sVok.sort();
+
+  if( verboseMode>2 )
+    cout << "*****Vocabulary: " << sVok;
+  
+  unsigned int vokSize=sVok.size();
+  massert(vokSize==ctr); massert(vokSize==setVokabular.size());
+  if(verboseMode)
+    {cout << "Size of vocabulary: " << vokSize << "\n";cout.flush();}
+  
+  KategProblem *k = new KategProblem(vokSize,maxClass,initialisierung,
+				     auswertung,nachbarschaft,minWordFrequency);
+  KategProblemWBC &w=k->wordFreq;
+  k->words=&sVok;
+  
+  Array<int> after(vokSize,0);
+  Array<int> before(vokSize,0);
+  
+  
+  nwrd=0;
+  {
+    PSS s;
+    forall_defined_h2(PSS,FreqType,s,cTbl)
+      {
+	const string&ss1=s.first;
+	const string&ss2=s.second;
+	if( ss2.length()&&(ss1!="$" || ss2!="$") )
+	  {
+	    int i1=sVok.binary_search(ss1);
+	    int i2=sVok.binary_search(ss2);
+	    iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
+	    after[i1]++;
+	    before[i2]++;
+	  }
+	if( verboseMode&&((nwrd++)%10000==0) ) 
+	  {cout<<"Statistiken-1 " << nwrd<< ".      \r";cout.flush();}
+      }
+  }
+  
+  for(unsigned int i=0;i<vokSize;i++)
+    {
+      w.setAfterWords(i,after[i]);
+      w.setBeforeWords(i,before[i]);
+    }
+  
+  
+  {
+    nwrd=0;
+    PSS s;
+    forall_defined_h2(PSS,FreqType,s,cTbl)
+      {
+	const string&ss1=s.first;
+	const string&ss2=s.second;
+	FreqType p=cTbl[s];
+	if( ss2.length()&&(ss1!="$" || ss2!="$") )
+	  {
+	    int i1=sVok.binary_search(ss1);
+	    int i2=sVok.binary_search(ss2);
+	    iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
+	    w.setFreq(i1,i2,p);
+	    if( verboseMode>2 )
+	      cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << "   " 
+		   << ss2 << ":" << i2 << "   " << p << endl;
+	  }
+ 	if( verboseMode&&((nwrd++)%10000==0) ) 
+	  {cout<<"Statistiken-2 " <<nwrd<< ".    \r";cout.flush();}
+     }
+  }
+  
+  w.testFull();
+  if(verboseMode){cout << "Datenintegritaet getestet.\n";cout.flush();}
+  return k;
+}
+
+KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung,
+			 int auswertung,int nachbarschaft,int minWordFrequency)
+{
+  ifstream file(str);
+  if(!file)return 0;
+  leda_set<string> setVokabular;
+  leda_h_array<PSS,FreqType> cTbl;
+  double c=0;
+  if( verboseMode )cout << "NGRFILE: " << str << endl;
+  string s1,s2;
+  while(file >> c >> s1 >> s2)
+    {
+      if( s1.length()==0||s2.length()==0 )
+	{
+	  cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl;
+	  return 0;
+	}
+      if( c==0 )
+	{
+	  cerr << "Count ist 0 " << s1 << " " << s2 << endl;
+	  return 0;
+	}
+      cTbl[pair<string,string>(s1,s2)]=(FreqType)c;
+      setVokabular.insert(s1);
+      setVokabular.insert(s2);
+      if( verboseMode>1 )
+	cout << "R: " << s1 << " " << s2 << " " << c << endl;
+      c=0;
+    }
+  
+  return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
+}
+
+
+
+	
+
+	
+
+
+KategProblem *fromKModel(const char *str,int maxClass,int initialisierung,
+			 int auswertung,int nachbarschaft,int minWordFrequency)
+{
+  string oldText,text,line;
+  ifstream f(str);
+  if( !f )
+    {
+      cerr << "ERROR: can not open file " << str << ".\n";
+      return 0;
+    }
+  
+  leda_set<string> setVokabular;
+  leda_h_array<PSS,FreqType> cTbl(0);
+  oldText="$";
+  while(1)
+    {
+      getline(f,line);
+      if(f.fail() && !f.bad() && !f.eof())
+	{
+	  cerr << "WARNING: strange characters in stream (getline) " << endl;f.clear();
+	}
+      if(!f)break;
+      
+      istrstream f2(line.c_str());
+      while( 1 )
+	{
+	  f2 >> text;
+	  if(f2.fail() && !f2.bad() && !f2.eof())
+	    {
+	      cerr << "WARNING: strange characters in stream (>>) !\n";
+	      f2.clear(ios::failbit);
+	    }
+	  if(!f2){break;}
+	  
+	    
+	    
+	    
+	    
+	    
+	  if( text == "$" )
+	    text = "mkcls-mapped-dollar-symbol-$";
+	  if( !setVokabular.member(text) )setVokabular.insert(text);
+	  cTbl[pair<string,string>(oldText,text)]++;
+	  oldText=text;
+	}
+      text="$";
+      if( !setVokabular.member(text) )setVokabular.insert(text);
+      cTbl[pair<string,string>(oldText,text)]++;
+      oldText=text;
+    }
+  return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
+}
+ 
+
+
+
+
+void KategProblemSetParameters(KategProblem &p)
+{
+  if( p.katwahl()==K_BEST )
+    {
+      TAOptimization::defaultAnnRate=0.7;
+      RRTOptimization::defaultAnnRate=0.95;
+      GDAOptimization::defaultAlpha=0.05;
+      if( verboseMode )
+	cout << "Parameter-setting like W-DET-BEST\n";
+    }
+  else
+    {
+      TAOptimization::defaultAnnRate=0.4;
+      RRTOptimization::defaultAnnRate=0.6;
+      GDAOptimization::defaultAlpha=0.0125;
+      if( verboseMode )
+	cout << "Parameter-setting like W-DET-DET\n";
+    }
+}
+
+
+
+
+KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue,
+			int auswertung,int nachbarschaft,float relInit)
+{
+  KategProblem &k=
+    *new KategProblem(ANZ_WORD,ANZ_CLS,initValue,auswertung,nachbarschaft);
+  KategProblemWBC &w=k.wordFreq;
+  Array<int> after(ANZ_WORD,0);
+  Array<int> before(ANZ_WORD,0);
+  Array<FreqArray> twoD(ANZ_WORD);
+	int i;
+  for(i=0;i<ANZ_WORD;i++) twoD[i].init(ANZ_WORD,0); 
+  
+  for(i=0;i<ANZ_WORD;i++)
+    {
+      massert(after[i]==0);
+      massert(before[i]==0);
+      for(int j=0;j<ANZ_WORD;j++)
+	{
+	  massert(twoD[i][j]==0);
+	}
+    }
+  for(i=0;i<ANZ_WORD*ANZ_WORD*relInit;i++)
+    {
+      int x=randomInt(ANZ_WORD);
+      int y=randomInt(ANZ_WORD);
+      if(twoD[x][y]==0)
+	{
+	  after[x]++;
+	  before[y]++;
+	}
+      twoD[x][y]+=randomInt(10)+1;
+    }
+  for(i=0;i<ANZ_WORD;i++)
+    {
+      w.setAfterWords(i,after[i]);
+      w.setBeforeWords(i,before[i]);
+    }
+  
+  for(i=0;i<ANZ_WORD;i++)
+    {
+      for(int j=0;j<ANZ_WORD;j++)
+	if( twoD[i][j] )
+	    w.setFreq(i,j,twoD[i][j]);
+    }
+  w.testFull();
+  return k;
+}
+
+
+
+
+char *makeTitle(KategProblem &problem,int verfahren)
+{
+  char x[1024];
+  switch(verfahren)
+    {
+    case HC_OPT:
+      strcpy(x,"HC   ");
+      break;
+    case SA_OPT:
+      strcpy(x,"SA     ");
+      break;
+    case TA_OPT:
+      strcpy(x,"TA     ");
+      break;
+    case GDA_OPT:
+      strcpy(x,"GDA    ");
+      break;
+    case RRT_OPT:
+      strcpy(x,"RRT    ");
+      break;
+    }
+  problem.makeTitle(x+strlen(x));
+  return strdup(x);
+}
+
+
+
+
+#define MAX_MULTIPLE 10  
+
+Array<KategProblem *> &_izrOptimization(Array<KategProblem *> &probs,
+int anzprob,double timeForOneRed,double maxClock,Array<Kategory> &katOfWord,
+int anzIter,int verfahren)
+{
+  massert(anzprob>1);
+  massert(probs[0]->wordFreq.mindestAnzahl<=1);
+  KategProblem *p0=probs[0];
+
+  int nWords=p0->wordFreq.nWords;
+  int nKats=p0->katFreq.nKats;
+  int minimumNumberOfWords = max(1,int(nWords*0.95));
+
+  int indexOfDurchschnitt;
+  Array<int> newWords(nWords);
+  int useAnzprob=anzprob;
+  do
+    {
+      int w,k;
+      indexOfDurchschnitt=0;
+      for(w=0;w<nWords;w++)
+	newWords[w]=-1;
+      for(k=0;k<useAnzprob;k++)
+	{
+	  massert(probs[k]->wordFreq.nWords==nWords);
+	  probs[k]->makeKats();
+	}
+      
+      for(w=0;w<nWords;w++)
+	{
+	  if( newWords[w]==-1 )
+	    {
+	      
+	      
+	      
+	      leda_set<int> durchschnitt=(*p0->kats)[p0->katOfWord(w)];
+	      for(k=1;k<useAnzprob;k++)
+	      durchschnitt = durchschnitt & (*probs[k]->kats)[probs[k]->katOfWord(w)];
+	      
+	      
+	      int _anzInDurchschnitt=0;
+	      int nr=0;
+	      forall_set(leda_set<int>,nr,durchschnitt)
+		{
+		  _anzInDurchschnitt++;
+		  newWords[nr]=indexOfDurchschnitt;
+		}
+	      if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 )
+		{
+		  cout << "- (";
+		    forall_set(leda_set<int>,nr,durchschnitt)
+		    {
+		      cout << p0->getString(nr);
+		      if( p0->wordFreq.n1(nr)==1 )
+			cout << "* ";
+		      else
+			cout << " ";
+		    }
+		  cout << ")\n";
+		}
+	      
+	      
+	      
+	      
+	      for(k=0;k<useAnzprob;k++)
+		{
+		  durchschnitt = durchschnitt - (*probs[k]->kats)[probs[k]->katOfWord(w)];
+		}
+	      indexOfDurchschnitt++;
+	    }
+	}
+      
+    if(indexOfDurchschnitt>=minimumNumberOfWords)
+	{
+	  if(useAnzprob==1)
+	    {
+	      cout << "useAnzProb==1 => mysterious.\n";
+	      break;	
+	    }
+	  useAnzprob--;
+	}
+    }
+  while(indexOfDurchschnitt>=minimumNumberOfWords);
+  
+  
+  Array<KategProblem *> &neu=*new Array<KategProblem *>(MAX_MULTIPLE*anzprob,(KategProblem *)0);
+  qsort(probs.getPointerToData(),useAnzprob,sizeof(KategProblem *),compareProblem);
+  massert(useAnzprob<=probs.size());
+  double startTime=clockSec();
+  int i, numberOfNew;
+  for(numberOfNew=0; (clockSec()-startTime<timeForOneRed) 
+                         || (numberOfNew < anzprob) ; numberOfNew++)
+    {
+      int w;
+      if( numberOfNew==anzprob*MAX_MULTIPLE-1 )
+	break;
+      KategProblem *p 
+	= neu[numberOfNew] 
+	  = new KategProblem(indexOfDurchschnitt,nKats-2,
+		          p0->initialisierung,p0->auswertung,p0->nachbarschaft);
+
+      for(w=0;w<indexOfDurchschnitt;w++)
+	{
+	  p->wordFreq.setAfterWords(w,5);
+	  p->wordFreq.setBeforeWords(w,5);
+	}
+      for(w=0;w<nWords;w++)
+	{
+	  Array<OneFreq> &after=p0->wordFreq.after[w];
+	  int size=after.size();
+	  for(i=0;i<size;i++)
+	    p->wordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n);
+	}
+      p->wordFreq.testFull(1);
+      
+      
+      
+      
+      
+      
+      p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words());
+      double w1=0.0,w2=0.0;
+      if(numberOfNew<useAnzprob)
+	{
+	  
+	  for(i=0;i<nWords;i++)
+	    (p->initLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i);
+	  p->_initialize(5);
+	  HCOptimization hc(*p,-1);
+	  if(verboseMode)
+	    {
+	      w1=p->nicevalue();
+	      cout << "from old category system:" << w1 << endl;
+	    }
+	  hc.minimize(-1);
+	  if(verboseMode)
+	    {
+	      w2=p->nicevalue();
+	      if(w2<w1)
+		cout << "improvement: " << w1-w2 << endl;
+	    }
+	}
+      else
+	{
+	  p->_initialize(1);
+	  double mean;
+	  StatVar end,laufzeit,start;
+	  solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start);
+	  w2=p->value();
+	  if(verboseMode)
+	      cout << "new category system: " << w2 << " (" << p->nicevalue() 
+		<< ") Zeit: " << clockSec() << "\n";
+	}
+    }
+	int p;
+  for(p=0;p<probs.size();p++)
+    {
+      if( probs[p] )
+	delete probs[p];
+    }
+  qsort(neu.getPointerToData(),numberOfNew,sizeof(Problem *),compareProblem);
+  massert(numberOfNew<=neu.size());
+  if( verboseMode )
+    cout << "Iterierte Zustandsraum-Reduktion: " << indexOfDurchschnitt 
+      << " words. costs: " << neu[0]->value() << " " 
+      << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: " 
+      << clockSec() << endl;
+  if( indexOfDurchschnitt<=nKats 
+      || (clockSec()>maxClock&&maxClock) )
+    {
+      if( clockSec()>maxClock&&maxClock )
+	cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n";
+      for(i=0;i<nWords;i++)
+	katOfWord[i]=neu[0]->katOfWord(newWords[i]);
+      return neu;
+    }
+  else
+    {
+      Array<Kategory> &newKatOfWord=
+       *(new Array<Kategory>(neu[0]->wordFreq.nWords,-1));
+      Array<KategProblem *> &erg=_izrOptimization(neu,anzprob,timeForOneRed,
+						  maxClock,newKatOfWord,
+						  anzIter+1,verfahren);
+      for(i=0;i<nWords;i++)
+	katOfWord[i]=newKatOfWord[newWords[i]];
+      return erg;
+    }
+}
+
+
+
+
+KategProblem *izrOptimization(KategProblem &p,int minN,int firstN,
+			      double clockForOneRed,double maxClock,int verfahren)
+{
+  Array<Kategory> katOfWord(p.wordFreq.nWords,-1);
+  int startN;
+  if( clockForOneRed<=0 )
+    startN=firstN;
+  else
+    startN=1000;
+  Array<KategProblem *> probs(startN);
+  double val1=0.0,val2=0.0;
+  double endTime=-1;
+
+  double startTime=clockSec();
+  int i;
+  for(i=0;i<startN;i++)
+    {
+      StatVar end,laufzeit,start;
+      double mean;
+      probs[i] = (KategProblem *)((KategProblem *)p.makeEqualProblem());
+      solveProblem(0,*(probs[i]),1,-1,verfahren,mean,end,laufzeit,start);
+      if( i==minN-1 )
+	endTime = clockSec();
+      if( i>=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) )
+	break;
+    }
+  if( endTime<0 )
+    endTime=clockSec();
+  massert(i>=firstN);
+
+  qsort(probs.getPointerToData(),i,sizeof(KategProblem *),compareProblem);
+  massert(i<=probs.size());
+  if( clockForOneRed<=0 )
+    {
+      clockForOneRed=endTime-startTime;
+      if( verboseMode )
+	cout << "time for one reduction: " << clockForOneRed << endl;
+    }
+  _izrOptimization(probs,minN,clockForOneRed,maxClock,katOfWord,0,verfahren);
+  
+  KategProblem *n=(KategProblem *)(p.makeEqualProblem());
+  n->initLike= katOfWord;
+  n->_initialize(5);
+  if( verboseMode )
+    val1=n->value();
+  HCOptimization hc(*n,-1);
+  hc.minimize(-1);
+  val2=n->value();
+  if( verboseMode )
+    cout << "last improvement: " << val2-val1 << "\n";
+  cout << "final costs: " << val2 << " " << n->nicevalue() << endl;
+  if(PrintBestTo)
+    n->dumpOn(*PrintBestTo);
+  return n;
+}
+
+
+
+
+
+
+
+
+
+
+