diff options
Diffstat (limited to 'mkcls-v2/KategProblem.h')
-rw-r--r-- | mkcls-v2/KategProblem.h | 439 |
1 files changed, 439 insertions, 0 deletions
diff --git a/mkcls-v2/KategProblem.h b/mkcls-v2/KategProblem.h new file mode 100644 index 0000000..e5a5a46 --- /dev/null +++ b/mkcls-v2/KategProblem.h @@ -0,0 +1,439 @@ +/* + +Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och + +mkcls - a program for making word classes . + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +*/ + + + + + + +#ifndef KATEG_OPT_H +#define KATEG_OPT_H +#include <string> + +#include <stdlib.h> +#include "Problem.h" + +extern double rhoLo; + +typedef int Kategory; +typedef int Word; + + + +#ifdef FREQTYPE_DOUBLE +typedef double FreqType; +#else +typedef int FreqType; +#endif + + +#include "KategProblemWBC.h" + + +#include "KategProblemKBC.h" + + +enum { + INIT_RAN=1, + INIT_AIO=2, + INIT_LWRW=3, + INIT_FREQ=4, + INIT_OTHER=5 + }; + + +enum { + W_RAN=(8|16), + W_DET_DECR=(16), + W_DET_INCR =(32) +}; +#define CHOOSE_WORD (8|16|32) + + +enum { + K_DET=(64), + K_RAN=(128), + K_BEST=(64|128) +}; +#define CHOOSE_KAT (64|128) + + +enum { + CRITERION_ML=0, + CRITERION_LO=1, + CRITERION_MY=2 +}; + + + +class NWG +{ + private: + Array<FreqType> freq; + + Array<int> timeOfFreq; + + + + + int curTime; + public: + NWG(int n); + void init(); + + int anzNot0; + + + Array<int> not0; + + int word; + + inline void addFreq(int C,FreqType n); + + void sort(); + + FreqType getFreq(int i) + { + if( timeOfFreq[i]==curTime ) + return freq[i]; + else + return 0; + }; +}; + +inline void NWG::addFreq(int g,FreqType n) +{ + if(timeOfFreq[g]==curTime) + freq[g]+=n; + else + { + timeOfFreq[g]=curTime; + freq[g]=n; + not0[anzNot0++]=g; + } +} + + + +struct KategProblemChange : public ProblemChange +{ + void *operator new(size_t size); + void operator delete(void *ptr,size_t size); + + int word; + int toKat; + int fromKat; +}; + +class KategProblem : public Problem +{ + private: + double kat_h_full(int n); + double kat_h_full(double n); + double kat_h_part(int n); + double kat_h_part(double n); + double sigmaVerfaelschung; + short katWasEmpty; + + + + int nwgWord; + + NWG nwg; + NWG ngw; + FreqType nww; + + int ursprung,ziel; + + Array<int> _katOfWord; + + int _maxComp,_maxCompVal; + + double nmo_my(int i,int j); + double nmo(int i,int j); + + + double nmo_lo(int i,int j,int &e0,int &e1); + + + void putWord(int word,int to); + + + void fastPutWord(int word,int to); + + + void setKatOfWord(int w,int k) +{ + if( !(wordFreq.fixedWord[w]==k||wordFreq.fixedWord[w]==-1||k==-1) ) + { + cout << "mkcls::setKatOfWord::ERROR: " << w << " " << k << " " << wordFreq.fixedWord[w] << " " << (*words)[w] << endl; + } + _katOfWord[w]=k; + nwgWord=-1; +}; + + + void fillNWG(int w); + + + inline FreqType nstrich(int i,int j); + + + void vnstrich(int i,int j); + + + + protected: + virtual int _change(ProblemChange **p); + + + virtual void _doChange(ProblemChange &c); + + + virtual void _undoChange(ProblemChange &c); + + + virtual double _value(); + + + double _valueChange(KategProblemChange &k); + + + virtual void incrementDirection(); + + + virtual int maxDimensionVal(void) ; + + + virtual int maxDimension(void) ; + + +public: + leda_array<string> *words; +typedef leda_set<int> intSet; + +leda_array<intSet> *kats; + + KategProblemWBC wordFreq; + KategProblemKBC katFreq; + + Array<int> initLike; + + KategProblem(int aw,int mak,int _initialisierung,int _auswertung, + int _nachbarschaft,int minw=0); + + + virtual ~KategProblem(); + + + virtual void _initialize(int initTyp); + virtual void _initialize(int initTyp,int specialFixedWord); + + + virtual double valueChange(ProblemChange&c); + + + virtual Problem *makeEqualProblem(); + + + virtual double nicevalue(double value=1e100); + + + void makeKats(); + + + virtual void dumpOn(ostream &strm); + + + virtual void dumpInfos(ostream &strm); + + + + + + inline void katwahl(int k); + + + inline void wortwahl(int w); + + + + + + inline int katOfWord(int w); + + + inline short wortwahl(); + + + inline short katwahl() ; + + + virtual int maxNonBetterIterations(); + + + virtual int expectedNumberOfIterations(); + + + const char *getString(int i); + string getTheString(int i); + + + void makeTitle(char x[512]); + + + void fixInitLike(); + +}; + +inline int KategProblem::katOfWord(int w){return _katOfWord[w];}; +inline short KategProblem::wortwahl(){return nachbarschaft&CHOOSE_WORD;}; +inline short KategProblem::katwahl() {return nachbarschaft&CHOOSE_KAT;}; + +inline void KategProblem::katwahl(int k) + { + nachbarschaft = (nachbarschaft&(~CHOOSE_KAT)) | k; + if(k==K_BEST) + _maxCompVal=1; + else + _maxCompVal=katFreq.nKats-2; + }; + +inline void KategProblem::wortwahl(int w) + { + nachbarschaft = (nachbarschaft&(~CHOOSE_WORD)) | w; + }; + + + +inline FreqType KategProblem::nstrich(int i,int j) +{ + FreqType n=0; + + if( i==ursprung ) + n-=nwg.getFreq(j); + if( i==ziel ) + n+=nwg.getFreq(j); + + if( j==ursprung ) + n-=ngw.getFreq(i); + if( j==ziel ) + n+=ngw.getFreq(i); + + if( i==ursprung && j==ursprung ) + n+=nww; + if( i==ziel && j==ziel ) + n+=nww; + + if( i==ursprung && j==ziel ) + n-=nww; + if( i==ziel && j==ursprung ) + n-=nww; + + return n; +} + + + + + +#define MAX_H_TABLE 4000 +extern double h_table[],l_table[],hmy_table[],hmy_sigma; + + +inline double kat_mlog(double x) +{ + if(x<=1e-9) + return 0; + else + return log(x); +} + + +inline double kat_mlog(int s) +{ + if(s<=0) + return 0; + else if( s<MAX_H_TABLE ) + { + massert( s==0 || l_table[s]==log(s) ); + return l_table[s]; + } + else + return log((double)(s)); +} + + + +inline double kat_hlo(int n) +{ + return n*kat_mlog(n-1); +} + +inline double kat_hlo(double n) +{ + return n*kat_mlog(n-1); +} + + +inline double kat_h(int n) +{ + massert(n>=-1); + if(n<=0) + return 0; + else + if(n<MAX_H_TABLE) + { + massert(n==0||fabs(h_table[n]-n*log((double)n))<1e-8); + return h_table[n]; + } + else + return n*log((double)(n)); +} +inline double kat_h(double n) +{ + if(n<=1e-9) + return 0; + else + return n*log(n); +} + + +inline double kat_etaFkt(int _e0,int e1,int immer0,int cats) +{ + int e0 = _e0 - immer0; + int ePlus = cats*cats - _e0; + if(cats*cats-e0>1) + return e1*log( (ePlus-1.0)/(e0+1.0)*rhoLo ); + else + return 0; +} + +double mkat_h_full(int n,double tf); +double mkat_h_part(int n,double cf); + +int Hash(const string& s); + + +#endif + |