diff options
Diffstat (limited to 'mgizapp/src/mkcls/KategProblemWBC.cpp')
-rw-r--r-- | mgizapp/src/mkcls/KategProblemWBC.cpp | 344 |
1 files changed, 344 insertions, 0 deletions
diff --git a/mgizapp/src/mkcls/KategProblemWBC.cpp b/mgizapp/src/mkcls/KategProblemWBC.cpp new file mode 100644 index 0000000..1a0d439 --- /dev/null +++ b/mgizapp/src/mkcls/KategProblemWBC.cpp @@ -0,0 +1,344 @@ +/* + +Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och + +mkcls - a program for making word classes . + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +*/ + + + + +#include <stdlib.h> +#include "KategProblem.h" + +static int oneFreqCompareSteigend(const void *p,const void *j) +{ +#ifdef FREQTYPE_DOUBLE + if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) ) + return -1; + if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) ) + return +1; + else + return 0; +#else + return ((OneFreq *)p)->n - ((OneFreq *)j)->n; +#endif +} +static int oneFreqCompareFallend(const void *p,const void *j) +{ +#ifdef FREQTYPE_DOUBLE + if( (((OneFreq *)p)->n > ((OneFreq *)j)->n) ) + return -1; + if( (((OneFreq *)p)->n < ((OneFreq *)j)->n) ) + return +1; + else + return 0; +#else + return -((OneFreq *)p)->n + ((OneFreq *)j)->n; +#endif +} + + +KategProblemWBC::KategProblemWBC(int n,int minw) +: _n1(n,0),_n2(n,0),with_h_of_words(0),afterFilled(n,0),beforeFilled(n,0),filled(0),fixedWord(n,-1),absteigend(0),nWords(n),nTranspWords(0), + mindestAnzahl(minw),after(n),before(n),minIndex(n,-1),maxIndex(n,-1) + +{ +} + +KategProblemWBC::~KategProblemWBC() + +{ + massert( after.size()==nWords); + if( absteigend ) + delete absteigend; +} + +void KategProblemWBC::init(int specialFixedWord) +{ + + nTranspWords=0; + int i; + for(i=0;i<_n1.size();i++) + { + if( (_n1[i]<mindestAnzahl && _n2[i]<mindestAnzahl && minIndex[i]<=1) ||i==specialFixedWord ) + { + + if(!( fixedWord[i]==1 || fixedWord[i]== -1)) + cerr << "mkcls:KategProblemWBC::init::ERROR: " << i << " " << fixedWord[i] << endl; + fixedWord[i]=1; + } + else if(fixedWord[i]<0) + nTranspWords++; + } + if( absteigend==0 ) + absteigend= &(getSortedList(0)); + + + + + + if(verboseMode && nTranspWords!=_n1.size()-1 ) + cout << "Es sind: " <<nTranspWords<<" transportierbar.\n"; +} + +void KategProblemWBC::set_h_of_words(double s) + +{ + with_h_of_words=1; + h_of_words = -s; +} + +double KategProblemWBC::get_h_of_words() + +{ + if( with_h_of_words ) + return -h_of_words; + else + { + h_of_words=0; + for(int i=0;i<nWords;i++) + h_of_words+=0.5*(kat_h(_n2[i])+kat_h(_n1[i])); + with_h_of_words=1; + return -h_of_words; + } +} + + +void KategProblemWBC::setAfterWords(int w,int anzahl) + +{ + OneFreq o; + o.w=-1; + o.n=0; + afterFilled[w]=0; + after[w].init(anzahl,o,1); +} +void KategProblemWBC::setBeforeWords(int w,int anzahl) + +{ + OneFreq o; + o.w=-1; + o.n=0; + beforeFilled[w]=0; + before[w].init(anzahl,o,1); +} + + +void KategProblemWBC::setFreq(int w1,int w2,FreqType anzahl) + +{ + OneFreq o; + o.n=anzahl; + + o.w=w2; + after[w1][afterFilled[w1]++]=o; + _n1[w1]+=anzahl; + o.w=w1; + before[w2][beforeFilled[w2]++]=o; + _n2[w2]+=anzahl; +} + +void KategProblemWBC::addFreq(int w1,int w2,FreqType anzahl) + +{ + OneFreq o; + o.n=anzahl; + int pos=-1,i; + for(i=0;i<afterFilled[w1];i++) + if(after[w1][i].w==w2) + pos=i; + + if(pos==-1) + { + o.w=w2; + after[w1][afterFilled[w1]++]=o; + } + else + after[w1][pos].n+=anzahl; + _n1[w1]+=anzahl; + + pos=-1; + for(i=0;i<beforeFilled[w2];i++) + if(before[w2][i].w==w1) + pos=i; + if(pos==-1) + { + o.w=w1; + before[w2][beforeFilled[w2]++]=o; + } + else + before[w2][pos].n+=anzahl; + _n2[w2]+=anzahl; +} + + +short KategProblemWBC::testFull(int doIt) + +{ + int enaNom=0; + int afterFilledSum=0,beforeFilledSum=0; + int ret=1,i; + for(i=0;i<nWords;i++) + { + if( n1(i)==1 && n2(i)==1 ) + enaNom++; + afterFilledSum+=afterFilled[i]; + beforeFilledSum+=beforeFilled[i]; + if(afterFilled[i]!=after[i].size()) + { + ret=0; + if( doIt ) + after[i].resize(afterFilled[i]); + } + if(beforeFilled[i]!=before[i].size()) + { + ret=0; + if( doIt ) + before[i].resize(beforeFilled[i]); + } + + } + if( ret==0 && !doIt ) + { + cerr << "Error: Unfilled word bigram statistics.\n"; + exit(1); + } + else + filled=1; + if( verboseMode>1 ) + { + cout << "MEAN(|L(w)|+|R(w)|)=" << (beforeFilledSum/(float)nWords) + +(afterFilledSum/(float)nWords) << endl; + cout << "Hapaslegomena: " << enaNom << endl; + } + int symmetrisch=1; + for(i=0;i<nWords;i++) + { + int j; + massert(before[i].size()==beforeFilled[i]); + massert( after[i].size()== afterFilled[i]); + FreqType sum=0; + for(j=0;j<after[i].size();j++) + sum+=after[i][j].n; + massert( sum==_n1[i] ); + sum=0; + for(j=0;j<before[i].size();j++) + sum+=before[i][j].n; + massert(sum==_n2[i]); + if(_n1[i]!=_n2[i]) + { + symmetrisch=0; + if( verboseMode>1 ) + cout << "Asymmetrie: " << i << " " << _n1[i] << " " << _n2[i] << endl; + } + + } + if(verboseMode && symmetrisch==0) + cout << "Warning: word bigram statistic is not symmetric " + "(this is possibly an error)\n"; + return ret; +} + +Array<Word> &KategProblemWBC::getSortedList(int steigend) + +{ + int siz=_n2.size(),i; + massert(filled); + Array<Word> &sortedList =*new Array<Word>(siz); + Array<OneFreq> list(siz); + int pos=0; + for(i=0;i<siz;i++) + { + if( fixedWord[i]<0 ) + { + list[pos].w=i; + list[pos].n=_n1[i]; + pos++; + } + } + int anzFree=pos; + for(i=0;i<siz;i++) + { + if( fixedWord[i]>=0 ) + { + list[pos].w=i; + list[pos].n=_n1[i]; + pos++; + } + } + massert(pos==siz); + if(steigend ) + qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareSteigend); + else + qsort(list.getPointerToData(),anzFree,sizeof(OneFreq),oneFreqCompareFallend); + massert( anzFree<=list.size() ); + + for(i=0;i<siz;i++) + { + sortedList[i]=list[i].w; + massert(steigend || i==0 || i>=anzFree || list[i-1].n>=list[i].n ); + massert((!steigend) || i==0 || i>=anzFree || list[i-1].n<=list[i].n ); + } + return sortedList; +} + +FreqType KategProblemWBC::numberOfWords() + +{ + FreqType n1=0,n2=0; + for(int i=0;i<_n1.size();i++) + { + n1+=_n1[i]; + n2+=_n2[i]; + } + #ifndef FREQTYPE_DOUBLE + massert(n1==n2); + #endif + return n1; +} + +void KategProblemWBC::setDollar(int n) + +{ + if( fixedWord[n]<0 ) + nTranspWords--; + fixedWord[n]=0; +} + +void KategProblemWBC::initializeIndex(const leda_array<string>&words,char firstChar,int unten,int oben,bool noHapas) +{ + int n=0; + int i; + massert(-1<unten);massert(unten<oben); + if( verboseMode ) + cout << "InitializeIndex: " << firstChar << " u:" << unten << " o:" << oben << " " << noHapas << endl; + over_array(words,i) + { + if( words[i][0]==firstChar && (noHapas || ((short)(n1(i)+0.0001))>=mindestAnzahl || ((short)(n2(i)+0.0001))>=mindestAnzahl) ) + { + minIndex[i]=unten; + maxIndex[i]=oben; + n++; + } + } + if( verboseMode ) + cout << "InitializeIndex gefunden fuer " << n << " Woerter.\n"; +} + |