diff options
Diffstat (limited to 'mgizapp/src/mkcls/KategProblemTest.cpp')
-rw-r--r-- | mgizapp/src/mkcls/KategProblemTest.cpp | 832 |
1 files changed, 395 insertions, 437 deletions
diff --git a/mgizapp/src/mkcls/KategProblemTest.cpp b/mgizapp/src/mkcls/KategProblemTest.cpp index 3084a0b..ed78e6f 100644 --- a/mgizapp/src/mkcls/KategProblemTest.cpp +++ b/mgizapp/src/mkcls/KategProblemTest.cpp @@ -9,14 +9,14 @@ modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. -This program is distributed in the hope that it will be useful, +This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ @@ -53,26 +53,25 @@ char *strdup(char *a) void writeClasses(Array<Kategory> &katOfWord,KategProblem &problem,ostream &to) -{ - for(int i=0;i<katOfWord.size();i++) - { - if( strcmp(problem.getString(i),"$") ) - if( strcmp(problem.getString(i),"mkcls-mapped-dollar-symbol-$")==0 ) - to << "$" << "\t" << katOfWord[i] << endl; - else - to << problem.getString(i) << "\t" << katOfWord[i] << endl; - } +{ + for(int i=0; i<katOfWord.size(); i++) { + if( strcmp(problem.getString(i),"$") ) + if( strcmp(problem.getString(i),"mkcls-mapped-dollar-symbol-$")==0 ) + to << "$" << "\t" << katOfWord[i] << endl; + else + to << problem.getString(i) << "\t" << katOfWord[i] << endl; + } } void mysplit(const string &s,string &s1,string &s2) { unsigned int i=0; - for(;i<s.length();i++)if( s[i]==' ' || s[i]=='\t' || s[i]==' ')break; + for(; i<s.length(); i++)if( s[i]==' ' || s[i]=='\t' || s[i]==' ')break; s1=s.substr(0,i); - for(;i<s.length();i++)if( !(s[i]==' ' || s[i]=='\t' || s[i]==' ') )break; + for(; i<s.length(); i++)if( !(s[i]==' ' || s[i]=='\t' || s[i]==' ') )break; s2=s.substr(i,s.length()-i); - + iassert(s1.size()); iassert(s2.size()); } @@ -84,164 +83,163 @@ int fromCatFile(KategProblem *p,const char *fname,bool verb) leda_h_array<string,int> translation(-1); int maxCat=2; ifstream in(fname); - if(!in) - { - cerr << "Error: File '" << fname << "' cannot be opened.\n"; - exit(1); - } - for(int i=0;i<p->wordFreq.nWords;i++) + if(!in) { + cerr << "Error: File '" << fname << "' cannot be opened.\n"; + exit(1); + } + for(int i=0; i<p->wordFreq.nWords; i++) (p->initLike)[i]= -1; - - + + translation["1"]=1; translation["0"]=0; - + string s; - while( getline(in,s) ) - { - string str,categ; - mysplit(s,str,categ); - int i=p->words->binary_locate(str); - if(i>=0 && (*(p->words))[i]==str ) - { - - if( translation[categ]==-1 ) - translation[categ]=maxCat++; - int cat=translation[categ]; - if( (p->initLike)[i]!= -1 ) - cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n"; - (p->initLike)[i]=cat; - } - else - cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n"; - } - + while( getline(in,s) ) { + string str,categ; + mysplit(s,str,categ); + int i=p->words->binary_locate(str); + if(i>=0 && (*(p->words))[i]==str ) { + + if( translation[categ]==-1 ) + translation[categ]=maxCat++; + int cat=translation[categ]; + if( (p->initLike)[i]!= -1 ) + cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n"; + (p->initLike)[i]=cat; + } else + cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n"; + } + if( verboseMode ) - cout << "We have " << maxCat << " read non-empty categories" - " (with words from the corpus).\n"; - - if(maxCat>p->katFreq.nKats) - { - cerr << "Error: Not enough categories reserved (only " - << p->katFreq.nKats << ", but i need " << maxCat << ").\n"; - exit(1); - } - - + cout << "We have " << maxCat << " read non-empty categories" + " (with words from the corpus).\n"; + + if(maxCat>p->katFreq.nKats) { + cerr << "Error: Not enough categories reserved (only " + << p->katFreq.nKats << ", but i need " << maxCat << ").\n"; + exit(1); + } + + int i=p->words->binary_locate("$"); if( i>=0 && (*(p->words))[i]=="$" ) (p->initLike)[i]=0; - else - if( verboseMode ) - cerr << "Warning: No '$' in vocabulary!\n"; - - + else if( verboseMode ) + cerr << "Warning: No '$' in vocabulary!\n"; + + int errors=0; - for(i=0;i<p->wordFreq.nWords;i++) - if((p->initLike)[i]== -1 ) - { - if( verb ) cerr << "Error: I don't know the category of word " << i - << " (" << (*(p->words))[i] << ") " << ".\n"; - errors=1; - } + for(i=0; i<p->wordFreq.nWords; i++) + if((p->initLike)[i]== -1 ) { + if( verb ) cerr << "Error: I don't know the category of word " << i + << " (" << (*(p->words))[i] << ") " << ".\n"; + errors=1; + } return errors; } KategProblem *makeKategProblem(const leda_h_array<PSS,FreqType>&cTbl,const leda_set<string>&setVokabular, int maxClass,int initialisierung, - int auswertung,int nachbarschaft,int minWordFrequency) + int auswertung,int nachbarschaft,int minWordFrequency) { - + int nwrd=0; leda_array<string>&sVok = *new leda_array<string>(setVokabular.size()); string s; unsigned int ctr=0; - forall_set(leda_set<string>,s,setVokabular) - { - if( verboseMode>2 ) - cout << "mkcls:Wort " << ctr << " " << s << endl; - sVok[ctr++]=s; - } - for(unsigned int z=0;z<ctr-1;z++) + forall_set(leda_set<string>,s,setVokabular) { + if( verboseMode>2 ) + cout << "mkcls:Wort " << ctr << " " << s << endl; + sVok[ctr++]=s; + } + for(unsigned int z=0; z<ctr-1; z++) iassert( sVok[z]<sVok[z+1] ); sVok.sort(); if( verboseMode>2 ) cout << "*****Vocabulary: " << sVok; - + unsigned int vokSize=sVok.size(); - massert(vokSize==ctr); massert(vokSize==setVokabular.size()); - if(verboseMode) - {cout << "Size of vocabulary: " << vokSize << "\n";cout.flush();} - + massert(vokSize==ctr); + massert(vokSize==setVokabular.size()); + if(verboseMode) { + cout << "Size of vocabulary: " << vokSize << "\n"; + cout.flush(); + } + KategProblem *k = new KategProblem(vokSize,maxClass,initialisierung, - auswertung,nachbarschaft,minWordFrequency); + auswertung,nachbarschaft,minWordFrequency); KategProblemWBC &w=k->wordFreq; k->words=&sVok; - + Array<int> after(vokSize,0); Array<int> before(vokSize,0); - - + + nwrd=0; { PSS s; - forall_defined_h2(PSS,FreqType,s,cTbl) - { - const string&ss1=s.first; - const string&ss2=s.second; - if( ss2.length()&&(ss1!="$" || ss2!="$") ) - { - int i1=sVok.binary_search(ss1); - int i2=sVok.binary_search(ss2); - iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 ); - after[i1]++; - before[i2]++; - } - if( verboseMode&&((nwrd++)%10000==0) ) - {cout<<"Statistiken-1 " << nwrd<< ". \r";cout.flush();} + forall_defined_h2(PSS,FreqType,s,cTbl) { + const string&ss1=s.first; + const string&ss2=s.second; + if( ss2.length()&&(ss1!="$" || ss2!="$") ) { + int i1=sVok.binary_search(ss1); + int i2=sVok.binary_search(ss2); + iassert( sVok[i1] == ss1 ); + iassert( sVok[i2] == ss2 ); + after[i1]++; + before[i2]++; + } + if( verboseMode&&((nwrd++)%10000==0) ) { + cout<<"Statistiken-1 " << nwrd<< ". \r"; + cout.flush(); } - } - - for(unsigned int i=0;i<vokSize;i++) - { - w.setAfterWords(i,after[i]); - w.setBeforeWords(i,before[i]); } - - + } + + for(unsigned int i=0; i<vokSize; i++) { + w.setAfterWords(i,after[i]); + w.setBeforeWords(i,before[i]); + } + + { nwrd=0; PSS s; - forall_defined_h2(PSS,FreqType,s,cTbl) - { - const string&ss1=s.first; - const string&ss2=s.second; - FreqType p=cTbl[s]; - if( ss2.length()&&(ss1!="$" || ss2!="$") ) - { - int i1=sVok.binary_search(ss1); - int i2=sVok.binary_search(ss2); - iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 ); - w.setFreq(i1,i2,p); - if( verboseMode>2 ) - cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << " " - << ss2 << ":" << i2 << " " << p << endl; - } - if( verboseMode&&((nwrd++)%10000==0) ) - {cout<<"Statistiken-2 " <<nwrd<< ". \r";cout.flush();} - } + forall_defined_h2(PSS,FreqType,s,cTbl) { + const string&ss1=s.first; + const string&ss2=s.second; + FreqType p=cTbl[s]; + if( ss2.length()&&(ss1!="$" || ss2!="$") ) { + int i1=sVok.binary_search(ss1); + int i2=sVok.binary_search(ss2); + iassert( sVok[i1] == ss1 ); + iassert( sVok[i2] == ss2 ); + w.setFreq(i1,i2,p); + if( verboseMode>2 ) + cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << " " + << ss2 << ":" << i2 << " " << p << endl; + } + if( verboseMode&&((nwrd++)%10000==0) ) { + cout<<"Statistiken-2 " <<nwrd<< ". \r"; + cout.flush(); + } + } } - + w.testFull(); - if(verboseMode){cout << "Datenintegritaet getestet.\n";cout.flush();} + if(verboseMode) { + cout << "Datenintegritaet getestet.\n"; + cout.flush(); + } return k; } KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung, - int auswertung,int nachbarschaft,int minWordFrequency) + int auswertung,int nachbarschaft,int minWordFrequency) { ifstream file(str); if(!file)return 0; @@ -250,118 +248,110 @@ KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung, double c=0; if( verboseMode )cout << "NGRFILE: " << str << endl; string s1,s2; - while(file >> c >> s1 >> s2) - { - if( s1.length()==0||s2.length()==0 ) - { - cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl; - return 0; - } - if( c==0 ) - { - cerr << "Count ist 0 " << s1 << " " << s2 << endl; - return 0; - } - cTbl[pair<string,string>(s1,s2)]=(FreqType)c; - setVokabular.insert(s1); - setVokabular.insert(s2); - if( verboseMode>1 ) - cout << "R: " << s1 << " " << s2 << " " << c << endl; - c=0; + while(file >> c >> s1 >> s2) { + if( s1.length()==0||s2.length()==0 ) { + cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl; + return 0; + } + if( c==0 ) { + cerr << "Count ist 0 " << s1 << " " << s2 << endl; + return 0; } - + cTbl[pair<string,string>(s1,s2)]=(FreqType)c; + setVokabular.insert(s1); + setVokabular.insert(s2); + if( verboseMode>1 ) + cout << "R: " << s1 << " " << s2 << " " << c << endl; + c=0; + } + return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency); } - - + + KategProblem *fromKModel(const char *str,int maxClass,int initialisierung, - int auswertung,int nachbarschaft,int minWordFrequency) + int auswertung,int nachbarschaft,int minWordFrequency) { string oldText,text,line; ifstream f(str); - if( !f ) - { - cerr << "ERROR: can not open file " << str << ".\n"; - return 0; - } - + if( !f ) { + cerr << "ERROR: can not open file " << str << ".\n"; + return 0; + } + leda_set<string> setVokabular; leda_h_array<PSS,FreqType> cTbl(0); oldText="$"; - while(1) - { - getline(f,line); - if(f.fail() && !f.bad() && !f.eof()) - { - cerr << "WARNING: strange characters in stream (getline) " << endl;f.clear(); - } - if(!f)break; - - istrstream f2(line.c_str()); - while( 1 ) - { - f2 >> text; - if(f2.fail() && !f2.bad() && !f2.eof()) - { - cerr << "WARNING: strange characters in stream (>>) !\n"; - f2.clear(ios::failbit); - } - if(!f2){break;} - - - - - - - if( text == "$" ) - text = "mkcls-mapped-dollar-symbol-$"; - if( !setVokabular.member(text) )setVokabular.insert(text); - cTbl[pair<string,string>(oldText,text)]++; - oldText=text; - } - text="$"; + while(1) { + getline(f,line); + if(f.fail() && !f.bad() && !f.eof()) { + cerr << "WARNING: strange characters in stream (getline) " << endl; + f.clear(); + } + if(!f)break; + + istrstream f2(line.c_str()); + while( 1 ) { + f2 >> text; + if(f2.fail() && !f2.bad() && !f2.eof()) { + cerr << "WARNING: strange characters in stream (>>) !\n"; + f2.clear(ios::failbit); + } + if(!f2) { + break; + } + + + + + + + if( text == "$" ) + text = "mkcls-mapped-dollar-symbol-$"; if( !setVokabular.member(text) )setVokabular.insert(text); cTbl[pair<string,string>(oldText,text)]++; oldText=text; } + text="$"; + if( !setVokabular.member(text) )setVokabular.insert(text); + cTbl[pair<string,string>(oldText,text)]++; + oldText=text; + } return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency); } - + void KategProblemSetParameters(KategProblem &p) { - if( p.katwahl()==K_BEST ) - { - TAOptimization::defaultAnnRate=0.7; - RRTOptimization::defaultAnnRate=0.95; - GDAOptimization::defaultAlpha=0.05; - if( verboseMode ) - cout << "Parameter-setting like W-DET-BEST\n"; - } - else - { - TAOptimization::defaultAnnRate=0.4; - RRTOptimization::defaultAnnRate=0.6; - GDAOptimization::defaultAlpha=0.0125; - if( verboseMode ) - cout << "Parameter-setting like W-DET-DET\n"; - } + if( p.katwahl()==K_BEST ) { + TAOptimization::defaultAnnRate=0.7; + RRTOptimization::defaultAnnRate=0.95; + GDAOptimization::defaultAlpha=0.05; + if( verboseMode ) + cout << "Parameter-setting like W-DET-BEST\n"; + } else { + TAOptimization::defaultAnnRate=0.4; + RRTOptimization::defaultAnnRate=0.6; + GDAOptimization::defaultAlpha=0.0125; + if( verboseMode ) + cout << "Parameter-setting like W-DET-DET\n"; + } } KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue, - int auswertung,int nachbarschaft,float relInit) + int auswertung,int nachbarschaft,float relInit) { KategProblem &k= *new KategProblem(ANZ_WORD,ANZ_CLS,initValue,auswertung,nachbarschaft); @@ -369,41 +359,35 @@ KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue, Array<int> after(ANZ_WORD,0); Array<int> before(ANZ_WORD,0); Array<FreqArray> twoD(ANZ_WORD); - int i; - for(i=0;i<ANZ_WORD;i++) twoD[i].init(ANZ_WORD,0); - - for(i=0;i<ANZ_WORD;i++) - { - massert(after[i]==0); - massert(before[i]==0); - for(int j=0;j<ANZ_WORD;j++) - { - massert(twoD[i][j]==0); - } - } - for(i=0;i<ANZ_WORD*ANZ_WORD*relInit;i++) - { - int x=randomInt(ANZ_WORD); - int y=randomInt(ANZ_WORD); - if(twoD[x][y]==0) - { - after[x]++; - before[y]++; - } - twoD[x][y]+=randomInt(10)+1; - } - for(i=0;i<ANZ_WORD;i++) - { - w.setAfterWords(i,after[i]); - w.setBeforeWords(i,before[i]); + int i; + for(i=0; i<ANZ_WORD; i++) twoD[i].init(ANZ_WORD,0); + + for(i=0; i<ANZ_WORD; i++) { + massert(after[i]==0); + massert(before[i]==0); + for(int j=0; j<ANZ_WORD; j++) { + massert(twoD[i][j]==0); } - - for(i=0;i<ANZ_WORD;i++) - { - for(int j=0;j<ANZ_WORD;j++) - if( twoD[i][j] ) - w.setFreq(i,j,twoD[i][j]); + } + for(i=0; i<ANZ_WORD*ANZ_WORD*relInit; i++) { + int x=randomInt(ANZ_WORD); + int y=randomInt(ANZ_WORD); + if(twoD[x][y]==0) { + after[x]++; + before[y]++; } + twoD[x][y]+=randomInt(10)+1; + } + for(i=0; i<ANZ_WORD; i++) { + w.setAfterWords(i,after[i]); + w.setBeforeWords(i,before[i]); + } + + for(i=0; i<ANZ_WORD; i++) { + for(int j=0; j<ANZ_WORD; j++) + if( twoD[i][j] ) + w.setFreq(i,j,twoD[i][j]); + } w.testFull(); return k; } @@ -414,24 +398,23 @@ KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue, char *makeTitle(KategProblem &problem,int verfahren) { char x[1024]; - switch(verfahren) - { - case HC_OPT: - strcpy(x,"HC "); - break; - case SA_OPT: - strcpy(x,"SA "); - break; - case TA_OPT: - strcpy(x,"TA "); - break; - case GDA_OPT: - strcpy(x,"GDA "); - break; - case RRT_OPT: - strcpy(x,"RRT "); - break; - } + switch(verfahren) { + case HC_OPT: + strcpy(x,"HC "); + break; + case SA_OPT: + strcpy(x,"SA "); + break; + case TA_OPT: + strcpy(x,"TA "); + break; + case GDA_OPT: + strcpy(x,"GDA "); + break; + case RRT_OPT: + strcpy(x,"RRT "); + break; + } problem.makeTitle(x+strlen(x)); return strdup(x); } @@ -439,11 +422,11 @@ char *makeTitle(KategProblem &problem,int verfahren) -#define MAX_MULTIPLE 10 +#define MAX_MULTIPLE 10 Array<KategProblem *> &_izrOptimization(Array<KategProblem *> &probs, -int anzprob,double timeForOneRed,double maxClock,Array<Kategory> &katOfWord, -int anzIter,int verfahren) + int anzprob,double timeForOneRed,double maxClock,Array<Kategory> &katOfWord, + int anzIter,int verfahren) { massert(anzprob>1); massert(probs[0]->wordFreq.mindestAnzahl<=1); @@ -456,184 +439,161 @@ int anzIter,int verfahren) int indexOfDurchschnitt; Array<int> newWords(nWords); int useAnzprob=anzprob; - do - { - int w,k; - indexOfDurchschnitt=0; - for(w=0;w<nWords;w++) - newWords[w]=-1; - for(k=0;k<useAnzprob;k++) - { - massert(probs[k]->wordFreq.nWords==nWords); - probs[k]->makeKats(); - } - - for(w=0;w<nWords;w++) - { - if( newWords[w]==-1 ) - { - - - - leda_set<int> durchschnitt=(*p0->kats)[p0->katOfWord(w)]; - for(k=1;k<useAnzprob;k++) - durchschnitt = durchschnitt & (*probs[k]->kats)[probs[k]->katOfWord(w)]; - - - int _anzInDurchschnitt=0; - int nr=0; - forall_set(leda_set<int>,nr,durchschnitt) - { - _anzInDurchschnitt++; - newWords[nr]=indexOfDurchschnitt; - } - if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 ) - { - cout << "- ("; - forall_set(leda_set<int>,nr,durchschnitt) - { - cout << p0->getString(nr); - if( p0->wordFreq.n1(nr)==1 ) - cout << "* "; - else - cout << " "; - } - cout << ")\n"; - } - - - - - for(k=0;k<useAnzprob;k++) - { - durchschnitt = durchschnitt - (*probs[k]->kats)[probs[k]->katOfWord(w)]; - } - indexOfDurchschnitt++; - } - } - - if(indexOfDurchschnitt>=minimumNumberOfWords) - { - if(useAnzprob==1) - { - cout << "useAnzProb==1 => mysterious.\n"; - break; - } - useAnzprob--; - } + do { + int w,k; + indexOfDurchschnitt=0; + for(w=0; w<nWords; w++) + newWords[w]=-1; + for(k=0; k<useAnzprob; k++) { + massert(probs[k]->wordFreq.nWords==nWords); + probs[k]->makeKats(); + } + + for(w=0; w<nWords; w++) { + if( newWords[w]==-1 ) { + + + + leda_set<int> durchschnitt=(*p0->kats)[p0->katOfWord(w)]; + for(k=1; k<useAnzprob; k++) + durchschnitt = durchschnitt & (*probs[k]->kats)[probs[k]->katOfWord(w)]; + + + int _anzInDurchschnitt=0; + int nr=0; + forall_set(leda_set<int>,nr,durchschnitt) { + _anzInDurchschnitt++; + newWords[nr]=indexOfDurchschnitt; + } + if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 ) { + cout << "- ("; + forall_set(leda_set<int>,nr,durchschnitt) { + cout << p0->getString(nr); + if( p0->wordFreq.n1(nr)==1 ) + cout << "* "; + else + cout << " "; + } + cout << ")\n"; + } + + + + + for(k=0; k<useAnzprob; k++) { + durchschnitt = durchschnitt - (*probs[k]->kats)[probs[k]->katOfWord(w)]; + } + indexOfDurchschnitt++; + } + } + + if(indexOfDurchschnitt>=minimumNumberOfWords) { + if(useAnzprob==1) { + cout << "useAnzProb==1 => mysterious.\n"; + break; + } + useAnzprob--; } - while(indexOfDurchschnitt>=minimumNumberOfWords); - - + } while(indexOfDurchschnitt>=minimumNumberOfWords); + + Array<KategProblem *> &neu=*new Array<KategProblem *>(MAX_MULTIPLE*anzprob,(KategProblem *)0); qsort(probs.getPointerToData(),useAnzprob,sizeof(KategProblem *),compareProblem); massert(useAnzprob<=probs.size()); double startTime=clockSec(); int i, numberOfNew; - for(numberOfNew=0; (clockSec()-startTime<timeForOneRed) - || (numberOfNew < anzprob) ; numberOfNew++) - { - int w; - if( numberOfNew==anzprob*MAX_MULTIPLE-1 ) - break; - KategProblem *p - = neu[numberOfNew] - = new KategProblem(indexOfDurchschnitt,nKats-2, - p0->initialisierung,p0->auswertung,p0->nachbarschaft); - - for(w=0;w<indexOfDurchschnitt;w++) - { - p->wordFreq.setAfterWords(w,5); - p->wordFreq.setBeforeWords(w,5); - } - for(w=0;w<nWords;w++) - { - Array<OneFreq> &after=p0->wordFreq.after[w]; - int size=after.size(); - for(i=0;i<size;i++) - p->wordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n); - } - p->wordFreq.testFull(1); - - - - - - - p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words()); - double w1=0.0,w2=0.0; - if(numberOfNew<useAnzprob) - { - - for(i=0;i<nWords;i++) - (p->initLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i); - p->_initialize(5); - HCOptimization hc(*p,-1); - if(verboseMode) - { - w1=p->nicevalue(); - cout << "from old category system:" << w1 << endl; - } - hc.minimize(-1); - if(verboseMode) - { - w2=p->nicevalue(); - if(w2<w1) - cout << "improvement: " << w1-w2 << endl; - } - } - else - { - p->_initialize(1); - double mean; - StatVar end,laufzeit,start; - solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start); - w2=p->value(); - if(verboseMode) - cout << "new category system: " << w2 << " (" << p->nicevalue() - << ") Zeit: " << clockSec() << "\n"; - } + for(numberOfNew=0; (clockSec()-startTime<timeForOneRed) + || (numberOfNew < anzprob) ; numberOfNew++) { + int w; + if( numberOfNew==anzprob*MAX_MULTIPLE-1 ) + break; + KategProblem *p + = neu[numberOfNew] + = new KategProblem(indexOfDurchschnitt,nKats-2, + p0->initialisierung,p0->auswertung,p0->nachbarschaft); + + for(w=0; w<indexOfDurchschnitt; w++) { + p->wordFreq.setAfterWords(w,5); + p->wordFreq.setBeforeWords(w,5); + } + for(w=0; w<nWords; w++) { + Array<OneFreq> &after=p0->wordFreq.after[w]; + int size=after.size(); + for(i=0; i<size; i++) + p->wordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n); } - int p; - for(p=0;p<probs.size();p++) - { - if( probs[p] ) - delete probs[p]; + p->wordFreq.testFull(1); + + + + + + + p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words()); + double w1=0.0,w2=0.0; + if(numberOfNew<useAnzprob) { + + for(i=0; i<nWords; i++) + (p->initLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i); + p->_initialize(5); + HCOptimization hc(*p,-1); + if(verboseMode) { + w1=p->nicevalue(); + cout << "from old category system:" << w1 << endl; + } + hc.minimize(-1); + if(verboseMode) { + w2=p->nicevalue(); + if(w2<w1) + cout << "improvement: " << w1-w2 << endl; + } + } else { + p->_initialize(1); + double mean; + StatVar end,laufzeit,start; + solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start); + w2=p->value(); + if(verboseMode) + cout << "new category system: " << w2 << " (" << p->nicevalue() + << ") Zeit: " << clockSec() << "\n"; } + } + int p; + for(p=0; p<probs.size(); p++) { + if( probs[p] ) + delete probs[p]; + } qsort(neu.getPointerToData(),numberOfNew,sizeof(Problem *),compareProblem); massert(numberOfNew<=neu.size()); if( verboseMode ) - cout << "Iterierte Zustandsraum-Reduktion: " << indexOfDurchschnitt - << " words. costs: " << neu[0]->value() << " " - << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: " - << clockSec() << endl; - if( indexOfDurchschnitt<=nKats - || (clockSec()>maxClock&&maxClock) ) - { - if( clockSec()>maxClock&&maxClock ) - cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n"; - for(i=0;i<nWords;i++) - katOfWord[i]=neu[0]->katOfWord(newWords[i]); - return neu; - } - else - { - Array<Kategory> &newKatOfWord= - *(new Array<Kategory>(neu[0]->wordFreq.nWords,-1)); - Array<KategProblem *> &erg=_izrOptimization(neu,anzprob,timeForOneRed, - maxClock,newKatOfWord, - anzIter+1,verfahren); - for(i=0;i<nWords;i++) - katOfWord[i]=newKatOfWord[newWords[i]]; - return erg; - } + cout << "Iterierte Zustandsraum-Reduktion: " << indexOfDurchschnitt + << " words. costs: " << neu[0]->value() << " " + << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: " + << clockSec() << endl; + if( indexOfDurchschnitt<=nKats + || (clockSec()>maxClock&&maxClock) ) { + if( clockSec()>maxClock&&maxClock ) + cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n"; + for(i=0; i<nWords; i++) + katOfWord[i]=neu[0]->katOfWord(newWords[i]); + return neu; + } else { + Array<Kategory> &newKatOfWord= + *(new Array<Kategory>(neu[0]->wordFreq.nWords,-1)); + Array<KategProblem *> &erg=_izrOptimization(neu,anzprob,timeForOneRed, + maxClock,newKatOfWord, + anzIter+1,verfahren); + for(i=0; i<nWords; i++) + katOfWord[i]=newKatOfWord[newWords[i]]; + return erg; + } } KategProblem *izrOptimization(KategProblem &p,int minN,int firstN, - double clockForOneRed,double maxClock,int verfahren) + double clockForOneRed,double maxClock,int verfahren) { Array<Kategory> katOfWord(p.wordFreq.nWords,-1); int startN; @@ -647,31 +607,29 @@ KategProblem *izrOptimization(KategProblem &p,int minN,int firstN, double startTime=clockSec(); int i; - for(i=0;i<startN;i++) - { - StatVar end,laufzeit,start; - double mean; - probs[i] = (KategProblem *)((KategProblem *)p.makeEqualProblem()); - solveProblem(0,*(probs[i]),1,-1,verfahren,mean,end,laufzeit,start); - if( i==minN-1 ) - endTime = clockSec(); - if( i>=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) ) - break; - } + for(i=0; i<startN; i++) { + StatVar end,laufzeit,start; + double mean; + probs[i] = (KategProblem *)((KategProblem *)p.makeEqualProblem()); + solveProblem(0,*(probs[i]),1,-1,verfahren,mean,end,laufzeit,start); + if( i==minN-1 ) + endTime = clockSec(); + if( i>=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) ) + break; + } if( endTime<0 ) endTime=clockSec(); massert(i>=firstN); qsort(probs.getPointerToData(),i,sizeof(KategProblem *),compareProblem); massert(i<=probs.size()); - if( clockForOneRed<=0 ) - { - clockForOneRed=endTime-startTime; - if( verboseMode ) - cout << "time for one reduction: " << clockForOneRed << endl; - } + if( clockForOneRed<=0 ) { + clockForOneRed=endTime-startTime; + if( verboseMode ) + cout << "time for one reduction: " << clockForOneRed << endl; + } _izrOptimization(probs,minN,clockForOneRed,maxClock,katOfWord,0,verfahren); - + KategProblem *n=(KategProblem *)(p.makeEqualProblem()); n->initLike= katOfWord; n->_initialize(5); |