Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'experimental/alignment-enabled/MGIZA/src/mkcls/KategProblemTest.cpp')
-rw-r--r--experimental/alignment-enabled/MGIZA/src/mkcls/KategProblemTest.cpp700
1 files changed, 700 insertions, 0 deletions
diff --git a/experimental/alignment-enabled/MGIZA/src/mkcls/KategProblemTest.cpp b/experimental/alignment-enabled/MGIZA/src/mkcls/KategProblemTest.cpp
new file mode 100644
index 0000000..8c76ce5
--- /dev/null
+++ b/experimental/alignment-enabled/MGIZA/src/mkcls/KategProblemTest.cpp
@@ -0,0 +1,700 @@
+/*
+
+Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och
+
+mkcls - a program for making word classes .
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+
+
+
+
+#include "KategProblemTest.h"
+
+#include "ProblemTest.h"
+#include "HCOptimization.h"
+#include "TAOptimization.h"
+#include "RRTOptimization.h"
+#include "GDAOptimization.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string>
+#include <strstream>
+
+typedef pair<string,string> PSS;
+
+#define NEW_SENTENCE_END "mkcls-mapped-dollar-symbol-$"
+
+#ifdef NeXT
+char *strdup(char *a)
+{
+ char *p = (char *)malloc(strlen(a)+1);
+ strcpy(p,a);
+ return p;
+}
+
+#endif
+
+
+void writeClasses(Array<Kategory> &katOfWord,KategProblem &problem,ostream &to)
+{
+ for(int i=0;i<katOfWord.size();i++)
+ {
+ if( strcmp(problem.getString(i),"$") )
+ if( strcmp(problem.getString(i),"mkcls-mapped-dollar-symbol-$")==0 )
+ to << "$" << "\t" << katOfWord[i] << endl;
+ else
+ to << problem.getString(i) << "\t" << katOfWord[i] << endl;
+ }
+}
+
+
+void mysplit(const string &s,string &s1,string &s2)
+{
+ unsigned int i=0;
+ for(;i<s.length();i++)if( s[i]==' ' || s[i]=='\t' || s[i]==' ')break;
+ s1=s.substr(0,i);
+ for(;i<s.length();i++)if( !(s[i]==' ' || s[i]=='\t' || s[i]==' ') )break;
+ s2=s.substr(i,s.length()-i);
+
+ iassert(s1.size());
+ iassert(s2.size());
+}
+
+
+
+int fromCatFile(KategProblem *p,const char *fname,bool verb)
+{
+ leda_h_array<string,int> translation(-1);
+ int maxCat=2;
+ ifstream in(fname);
+ if(!in)
+ {
+ cerr << "Error: File '" << fname << "' cannot be opened.\n";
+ exit(1);
+ }
+ for(int i=0;i<p->wordFreq.nWords;i++)
+ (p->initLike)[i]= -1;
+
+
+ translation["1"]=1;
+ translation["0"]=0;
+
+
+ string s;
+ while( getline(in,s) )
+ {
+ string str,categ;
+ mysplit(s,str,categ);
+ int i=p->words->binary_locate(str);
+ if(i>=0 && (*(p->words))[i]==str )
+ {
+
+ if( translation[categ]==-1 )
+ translation[categ]=maxCat++;
+ int cat=translation[categ];
+ if( (p->initLike)[i]!= -1 )
+ cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n";
+ (p->initLike)[i]=cat;
+ }
+ else
+ cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n";
+ }
+
+ if( verboseMode )
+ cout << "We have " << maxCat << " read non-empty categories"
+ " (with words from the corpus).\n";
+
+ if(maxCat>p->katFreq.nKats)
+ {
+ cerr << "Error: Not enough categories reserved (only "
+ << p->katFreq.nKats << ", but i need " << maxCat << ").\n";
+ exit(1);
+ }
+
+
+ int i=p->words->binary_locate("$");
+ if( i>=0 && (*(p->words))[i]=="$" )
+ (p->initLike)[i]=0;
+ else
+ if( verboseMode )
+ cerr << "Warning: No '$' in vocabulary!\n";
+
+
+ int errors=0;
+ for(i=0;i<p->wordFreq.nWords;i++)
+ if((p->initLike)[i]== -1 )
+ {
+ if( verb ) cerr << "Error: I don't know the category of word " << i
+ << " (" << (*(p->words))[i] << ") " << ".\n";
+ errors=1;
+ }
+ return errors;
+}
+
+
+
+KategProblem *makeKategProblem(const leda_h_array<PSS,FreqType>&cTbl,const leda_set<string>&setVokabular, int maxClass,int initialisierung,
+ int auswertung,int nachbarschaft,int minWordFrequency)
+{
+
+ int nwrd=0;
+ leda_array<string>&sVok = *new leda_array<string>(setVokabular.size());
+ string s;
+ unsigned int ctr=0;
+ forall_set(leda_set<string>,s,setVokabular)
+ {
+ if( verboseMode>2 )
+ cout << "mkcls:Wort " << ctr << " " << s << endl;
+ sVok[ctr++]=s;
+ }
+ for(unsigned int z=0;z<ctr-1;z++)
+ iassert( sVok[z]<sVok[z+1] );
+ sVok.sort();
+
+ if( verboseMode>2 )
+ cout << "*****Vocabulary: " << sVok;
+
+ unsigned int vokSize=sVok.size();
+ massert(vokSize==ctr); massert(vokSize==setVokabular.size());
+ if(verboseMode)
+ {cout << "Size of vocabulary: " << vokSize << "\n";cout.flush();}
+
+ KategProblem *k = new KategProblem(vokSize,maxClass,initialisierung,
+ auswertung,nachbarschaft,minWordFrequency);
+ KategProblemWBC &w=k->wordFreq;
+ k->words=&sVok;
+
+ Array<int> after(vokSize,0);
+ Array<int> before(vokSize,0);
+
+
+ nwrd=0;
+ {
+ PSS s;
+ forall_defined_h2(PSS,FreqType,s,cTbl)
+ {
+ const string&ss1=s.first;
+ const string&ss2=s.second;
+ if( ss2.length()&&(ss1!="$" || ss2!="$") )
+ {
+ int i1=sVok.binary_search(ss1);
+ int i2=sVok.binary_search(ss2);
+ iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
+ after[i1]++;
+ before[i2]++;
+ }
+ if( verboseMode&&((nwrd++)%10000==0) )
+ {cout<<"Statistiken-1 " << nwrd<< ". \r";cout.flush();}
+ }
+ }
+
+ for(unsigned int i=0;i<vokSize;i++)
+ {
+ w.setAfterWords(i,after[i]);
+ w.setBeforeWords(i,before[i]);
+ }
+
+
+ {
+ nwrd=0;
+ PSS s;
+ forall_defined_h2(PSS,FreqType,s,cTbl)
+ {
+ const string&ss1=s.first;
+ const string&ss2=s.second;
+ FreqType p=cTbl[s];
+ if( ss2.length()&&(ss1!="$" || ss2!="$") )
+ {
+ int i1=sVok.binary_search(ss1);
+ int i2=sVok.binary_search(ss2);
+ iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
+ w.setFreq(i1,i2,p);
+ if( verboseMode>2 )
+ cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << " "
+ << ss2 << ":" << i2 << " " << p << endl;
+ }
+ if( verboseMode&&((nwrd++)%10000==0) )
+ {cout<<"Statistiken-2 " <<nwrd<< ". \r";cout.flush();}
+ }
+ }
+
+ w.testFull();
+ if(verboseMode){cout << "Datenintegritaet getestet.\n";cout.flush();}
+ return k;
+}
+
+KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung,
+ int auswertung,int nachbarschaft,int minWordFrequency)
+{
+ ifstream file(str);
+ if(!file)return 0;
+ leda_set<string> setVokabular;
+ leda_h_array<PSS,FreqType> cTbl;
+ double c=0;
+ if( verboseMode )cout << "NGRFILE: " << str << endl;
+ string s1,s2;
+ while(file >> c >> s1 >> s2)
+ {
+ if( s1.length()==0||s2.length()==0 )
+ {
+ cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl;
+ return 0;
+ }
+ if( c==0 )
+ {
+ cerr << "Count ist 0 " << s1 << " " << s2 << endl;
+ return 0;
+ }
+ cTbl[pair<string,string>(s1,s2)]=(FreqType)c;
+ setVokabular.insert(s1);
+ setVokabular.insert(s2);
+ if( verboseMode>1 )
+ cout << "R: " << s1 << " " << s2 << " " << c << endl;
+ c=0;
+ }
+
+ return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
+}
+
+
+
+
+
+
+
+
+KategProblem *fromKModel(const char *str,int maxClass,int initialisierung,
+ int auswertung,int nachbarschaft,int minWordFrequency)
+{
+ string oldText,text,line;
+ ifstream f(str);
+ if( !f )
+ {
+ cerr << "ERROR: can not open file " << str << ".\n";
+ return 0;
+ }
+
+ leda_set<string> setVokabular;
+ leda_h_array<PSS,FreqType> cTbl(0);
+ oldText="$";
+ while(1)
+ {
+ getline(f,line);
+ if(f.fail() && !f.bad() && !f.eof())
+ {
+ cerr << "WARNING: strange characters in stream (getline) " << endl;f.clear();
+ }
+ if(!f)break;
+
+ istrstream f2(line.c_str());
+ while( 1 )
+ {
+ f2 >> text;
+ if(f2.fail() && !f2.bad() && !f2.eof())
+ {
+ cerr << "WARNING: strange characters in stream (>>) !\n";
+ f2.clear(ios::failbit);
+ }
+ if(!f2){break;}
+
+
+
+
+
+
+ if( text == "$" )
+ text = "mkcls-mapped-dollar-symbol-$";
+ if( !setVokabular.member(text) )setVokabular.insert(text);
+ cTbl[pair<string,string>(oldText,text)]++;
+ oldText=text;
+ }
+ text="$";
+ if( !setVokabular.member(text) )setVokabular.insert(text);
+ cTbl[pair<string,string>(oldText,text)]++;
+ oldText=text;
+ }
+ return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
+}
+
+
+
+
+
+void KategProblemSetParameters(KategProblem &p)
+{
+ if( p.katwahl()==K_BEST )
+ {
+ TAOptimization::defaultAnnRate=0.7;
+ RRTOptimization::defaultAnnRate=0.95;
+ GDAOptimization::defaultAlpha=0.05;
+ if( verboseMode )
+ cout << "Parameter-setting like W-DET-BEST\n";
+ }
+ else
+ {
+ TAOptimization::defaultAnnRate=0.4;
+ RRTOptimization::defaultAnnRate=0.6;
+ GDAOptimization::defaultAlpha=0.0125;
+ if( verboseMode )
+ cout << "Parameter-setting like W-DET-DET\n";
+ }
+}
+
+
+
+
+KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue,
+ int auswertung,int nachbarschaft,float relInit)
+{
+ KategProblem &k=
+ *new KategProblem(ANZ_WORD,ANZ_CLS,initValue,auswertung,nachbarschaft);
+ KategProblemWBC &w=k.wordFreq;
+ Array<int> after(ANZ_WORD,0);
+ Array<int> before(ANZ_WORD,0);
+ Array<FreqArray> twoD(ANZ_WORD);
+ int i;
+ for(i=0;i<ANZ_WORD;i++) twoD[i].init(ANZ_WORD,0);
+
+ for(i=0;i<ANZ_WORD;i++)
+ {
+ massert(after[i]==0);
+ massert(before[i]==0);
+ for(int j=0;j<ANZ_WORD;j++)
+ {
+ massert(twoD[i][j]==0);
+ }
+ }
+ for(i=0;i<ANZ_WORD*ANZ_WORD*relInit;i++)
+ {
+ int x=randomInt(ANZ_WORD);
+ int y=randomInt(ANZ_WORD);
+ if(twoD[x][y]==0)
+ {
+ after[x]++;
+ before[y]++;
+ }
+ twoD[x][y]+=randomInt(10)+1;
+ }
+ for(i=0;i<ANZ_WORD;i++)
+ {
+ w.setAfterWords(i,after[i]);
+ w.setBeforeWords(i,before[i]);
+ }
+
+ for(i=0;i<ANZ_WORD;i++)
+ {
+ for(int j=0;j<ANZ_WORD;j++)
+ if( twoD[i][j] )
+ w.setFreq(i,j,twoD[i][j]);
+ }
+ w.testFull();
+ return k;
+}
+
+
+
+
+char *makeTitle(KategProblem &problem,int verfahren)
+{
+ char x[1024];
+ switch(verfahren)
+ {
+ case HC_OPT:
+ strcpy(x,"HC ");
+ break;
+ case SA_OPT:
+ strcpy(x,"SA ");
+ break;
+ case TA_OPT:
+ strcpy(x,"TA ");
+ break;
+ case GDA_OPT:
+ strcpy(x,"GDA ");
+ break;
+ case RRT_OPT:
+ strcpy(x,"RRT ");
+ break;
+ }
+ problem.makeTitle(x+strlen(x));
+ return strdup(x);
+}
+
+
+
+
+#define MAX_MULTIPLE 10
+
+Array<KategProblem *> &_izrOptimization(Array<KategProblem *> &probs,
+int anzprob,double timeForOneRed,double maxClock,Array<Kategory> &katOfWord,
+int anzIter,int verfahren)
+{
+ massert(anzprob>1);
+ massert(probs[0]->wordFreq.mindestAnzahl<=1);
+ KategProblem *p0=probs[0];
+
+ int nWords=p0->wordFreq.nWords;
+ int nKats=p0->katFreq.nKats;
+ int minimumNumberOfWords = max(1,int(nWords*0.95));
+
+ int indexOfDurchschnitt;
+ Array<int> newWords(nWords);
+ int useAnzprob=anzprob;
+ do
+ {
+ int w,k;
+ indexOfDurchschnitt=0;
+ for(w=0;w<nWords;w++)
+ newWords[w]=-1;
+ for(k=0;k<useAnzprob;k++)
+ {
+ massert(probs[k]->wordFreq.nWords==nWords);
+ probs[k]->makeKats();
+ }
+
+ for(w=0;w<nWords;w++)
+ {
+ if( newWords[w]==-1 )
+ {
+
+
+
+ leda_set<int> durchschnitt=(*p0->kats)[p0->katOfWord(w)];
+ for(k=1;k<useAnzprob;k++)
+ durchschnitt = durchschnitt & (*probs[k]->kats)[probs[k]->katOfWord(w)];
+
+
+ int _anzInDurchschnitt=0;
+ int nr=0;
+ forall_set(leda_set<int>,nr,durchschnitt)
+ {
+ _anzInDurchschnitt++;
+ newWords[nr]=indexOfDurchschnitt;
+ }
+ if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 )
+ {
+ cout << "- (";
+ forall_set(leda_set<int>,nr,durchschnitt)
+ {
+ cout << p0->getString(nr);
+ if( p0->wordFreq.n1(nr)==1 )
+ cout << "* ";
+ else
+ cout << " ";
+ }
+ cout << ")\n";
+ }
+
+
+
+
+ for(k=0;k<useAnzprob;k++)
+ {
+ durchschnitt = durchschnitt - (*probs[k]->kats)[probs[k]->katOfWord(w)];
+ }
+ indexOfDurchschnitt++;
+ }
+ }
+
+ if(indexOfDurchschnitt>=minimumNumberOfWords)
+ {
+ if(useAnzprob==1)
+ {
+ cout << "useAnzProb==1 => mysterious.\n";
+ break;
+ }
+ useAnzprob--;
+ }
+ }
+ while(indexOfDurchschnitt>=minimumNumberOfWords);
+
+
+ Array<KategProblem *> &neu=*new Array<KategProblem *>(MAX_MULTIPLE*anzprob,(KategProblem *)0);
+ qsort(probs.getPointerToData(),useAnzprob,sizeof(KategProblem *),compareProblem);
+ massert(useAnzprob<=probs.size());
+ double startTime=clockSec();
+ int i, numberOfNew;
+ for(numberOfNew=0; (clockSec()-startTime<timeForOneRed)
+ || (numberOfNew < anzprob) ; numberOfNew++)
+ {
+ int w;
+ if( numberOfNew==anzprob*MAX_MULTIPLE-1 )
+ break;
+ KategProblem *p
+ = neu[numberOfNew]
+ = new KategProblem(indexOfDurchschnitt,nKats-2,
+ p0->initialisierung,p0->auswertung,p0->nachbarschaft);
+
+ for(w=0;w<indexOfDurchschnitt;w++)
+ {
+ p->wordFreq.setAfterWords(w,5);
+ p->wordFreq.setBeforeWords(w,5);
+ }
+ for(w=0;w<nWords;w++)
+ {
+ Array<OneFreq> &after=p0->wordFreq.after[w];
+ int size=after.size();
+ for(i=0;i<size;i++)
+ p->wordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n);
+ }
+ p->wordFreq.testFull(1);
+
+
+
+
+
+
+ p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words());
+ double w1=0.0,w2=0.0;
+ if(numberOfNew<useAnzprob)
+ {
+
+ for(i=0;i<nWords;i++)
+ (p->initLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i);
+ p->_initialize(5);
+ HCOptimization hc(*p,-1);
+ if(verboseMode)
+ {
+ w1=p->nicevalue();
+ cout << "from old category system:" << w1 << endl;
+ }
+ hc.minimize(-1);
+ if(verboseMode)
+ {
+ w2=p->nicevalue();
+ if(w2<w1)
+ cout << "improvement: " << w1-w2 << endl;
+ }
+ }
+ else
+ {
+ p->_initialize(1);
+ double mean;
+ StatVar end,laufzeit,start;
+ solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start);
+ w2=p->value();
+ if(verboseMode)
+ cout << "new category system: " << w2 << " (" << p->nicevalue()
+ << ") Zeit: " << clockSec() << "\n";
+ }
+ }
+ int p;
+ for(p=0;p<probs.size();p++)
+ {
+ if( probs[p] )
+ delete probs[p];
+ }
+ qsort(neu.getPointerToData(),numberOfNew,sizeof(Problem *),compareProblem);
+ massert(numberOfNew<=neu.size());
+ if( verboseMode )
+ cout << "Iterierte Zustandsraum-Reduktion: " << indexOfDurchschnitt
+ << " words. costs: " << neu[0]->value() << " "
+ << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: "
+ << clockSec() << endl;
+ if( indexOfDurchschnitt<=nKats
+ || (clockSec()>maxClock&&maxClock) )
+ {
+ if( clockSec()>maxClock&&maxClock )
+ cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n";
+ for(i=0;i<nWords;i++)
+ katOfWord[i]=neu[0]->katOfWord(newWords[i]);
+ return neu;
+ }
+ else
+ {
+ Array<Kategory> &newKatOfWord=
+ *(new Array<Kategory>(neu[0]->wordFreq.nWords,-1));
+ Array<KategProblem *> &erg=_izrOptimization(neu,anzprob,timeForOneRed,
+ maxClock,newKatOfWord,
+ anzIter+1,verfahren);
+ for(i=0;i<nWords;i++)
+ katOfWord[i]=newKatOfWord[newWords[i]];
+ return erg;
+ }
+}
+
+
+
+
+KategProblem *izrOptimization(KategProblem &p,int minN,int firstN,
+ double clockForOneRed,double maxClock,int verfahren)
+{
+ Array<Kategory> katOfWord(p.wordFreq.nWords,-1);
+ int startN;
+ if( clockForOneRed<=0 )
+ startN=firstN;
+ else
+ startN=1000;
+ Array<KategProblem *> probs(startN);
+ double val1=0.0,val2=0.0;
+ double endTime=-1;
+
+ double startTime=clockSec();
+ int i;
+ for(i=0;i<startN;i++)
+ {
+ StatVar end,laufzeit,start;
+ double mean;
+ probs[i] = (KategProblem *)((KategProblem *)p.makeEqualProblem());
+ solveProblem(0,*(probs[i]),1,-1,verfahren,mean,end,laufzeit,start);
+ if( i==minN-1 )
+ endTime = clockSec();
+ if( i>=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) )
+ break;
+ }
+ if( endTime<0 )
+ endTime=clockSec();
+ massert(i>=firstN);
+
+ qsort(probs.getPointerToData(),i,sizeof(KategProblem *),compareProblem);
+ massert(i<=probs.size());
+ if( clockForOneRed<=0 )
+ {
+ clockForOneRed=endTime-startTime;
+ if( verboseMode )
+ cout << "time for one reduction: " << clockForOneRed << endl;
+ }
+ _izrOptimization(probs,minN,clockForOneRed,maxClock,katOfWord,0,verfahren);
+
+ KategProblem *n=(KategProblem *)(p.makeEqualProblem());
+ n->initLike= katOfWord;
+ n->_initialize(5);
+ if( verboseMode )
+ val1=n->value();
+ HCOptimization hc(*n,-1);
+ hc.minimize(-1);
+ val2=n->value();
+ if( verboseMode )
+ cout << "last improvement: " << val2-val1 << "\n";
+ cout << "final costs: " << val2 << " " << n->nicevalue() << endl;
+ if(PrintBestTo)
+ n->dumpOn(*PrintBestTo);
+ return n;
+}
+
+
+
+
+
+
+
+
+
+
+