Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'mgizapp/src/mkcls/KategProblemTest.cpp')
-rw-r--r--mgizapp/src/mkcls/KategProblemTest.cpp832
1 files changed, 395 insertions, 437 deletions
diff --git a/mgizapp/src/mkcls/KategProblemTest.cpp b/mgizapp/src/mkcls/KategProblemTest.cpp
index 3084a0b..ed78e6f 100644
--- a/mgizapp/src/mkcls/KategProblemTest.cpp
+++ b/mgizapp/src/mkcls/KategProblemTest.cpp
@@ -9,14 +9,14 @@ modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
-This program is distributed in the hope that it will be useful,
+This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
@@ -53,26 +53,25 @@ char *strdup(char *a)
void writeClasses(Array<Kategory> &katOfWord,KategProblem &problem,ostream &to)
-{
- for(int i=0;i<katOfWord.size();i++)
- {
- if( strcmp(problem.getString(i),"$") )
- if( strcmp(problem.getString(i),"mkcls-mapped-dollar-symbol-$")==0 )
- to << "$" << "\t" << katOfWord[i] << endl;
- else
- to << problem.getString(i) << "\t" << katOfWord[i] << endl;
- }
+{
+ for(int i=0; i<katOfWord.size(); i++) {
+ if( strcmp(problem.getString(i),"$") )
+ if( strcmp(problem.getString(i),"mkcls-mapped-dollar-symbol-$")==0 )
+ to << "$" << "\t" << katOfWord[i] << endl;
+ else
+ to << problem.getString(i) << "\t" << katOfWord[i] << endl;
+ }
}
void mysplit(const string &s,string &s1,string &s2)
{
unsigned int i=0;
- for(;i<s.length();i++)if( s[i]==' ' || s[i]=='\t' || s[i]==' ')break;
+ for(; i<s.length(); i++)if( s[i]==' ' || s[i]=='\t' || s[i]==' ')break;
s1=s.substr(0,i);
- for(;i<s.length();i++)if( !(s[i]==' ' || s[i]=='\t' || s[i]==' ') )break;
+ for(; i<s.length(); i++)if( !(s[i]==' ' || s[i]=='\t' || s[i]==' ') )break;
s2=s.substr(i,s.length()-i);
-
+
iassert(s1.size());
iassert(s2.size());
}
@@ -84,164 +83,163 @@ int fromCatFile(KategProblem *p,const char *fname,bool verb)
leda_h_array<string,int> translation(-1);
int maxCat=2;
ifstream in(fname);
- if(!in)
- {
- cerr << "Error: File '" << fname << "' cannot be opened.\n";
- exit(1);
- }
- for(int i=0;i<p->wordFreq.nWords;i++)
+ if(!in) {
+ cerr << "Error: File '" << fname << "' cannot be opened.\n";
+ exit(1);
+ }
+ for(int i=0; i<p->wordFreq.nWords; i++)
(p->initLike)[i]= -1;
-
-
+
+
translation["1"]=1;
translation["0"]=0;
-
+
string s;
- while( getline(in,s) )
- {
- string str,categ;
- mysplit(s,str,categ);
- int i=p->words->binary_locate(str);
- if(i>=0 && (*(p->words))[i]==str )
- {
-
- if( translation[categ]==-1 )
- translation[categ]=maxCat++;
- int cat=translation[categ];
- if( (p->initLike)[i]!= -1 )
- cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n";
- (p->initLike)[i]=cat;
- }
- else
- cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n";
- }
-
+ while( getline(in,s) ) {
+ string str,categ;
+ mysplit(s,str,categ);
+ int i=p->words->binary_locate(str);
+ if(i>=0 && (*(p->words))[i]==str ) {
+
+ if( translation[categ]==-1 )
+ translation[categ]=maxCat++;
+ int cat=translation[categ];
+ if( (p->initLike)[i]!= -1 )
+ cerr << "Warning: Word '" << ((*(p->words))[i])<< "' is already in a category.\n";
+ (p->initLike)[i]=cat;
+ } else
+ cerr << "Warning: Word '" << str << "' " << i << " is not in training corpus.\n";
+ }
+
if( verboseMode )
- cout << "We have " << maxCat << " read non-empty categories"
- " (with words from the corpus).\n";
-
- if(maxCat>p->katFreq.nKats)
- {
- cerr << "Error: Not enough categories reserved (only "
- << p->katFreq.nKats << ", but i need " << maxCat << ").\n";
- exit(1);
- }
-
-
+ cout << "We have " << maxCat << " read non-empty categories"
+ " (with words from the corpus).\n";
+
+ if(maxCat>p->katFreq.nKats) {
+ cerr << "Error: Not enough categories reserved (only "
+ << p->katFreq.nKats << ", but i need " << maxCat << ").\n";
+ exit(1);
+ }
+
+
int i=p->words->binary_locate("$");
if( i>=0 && (*(p->words))[i]=="$" )
(p->initLike)[i]=0;
- else
- if( verboseMode )
- cerr << "Warning: No '$' in vocabulary!\n";
-
-
+ else if( verboseMode )
+ cerr << "Warning: No '$' in vocabulary!\n";
+
+
int errors=0;
- for(i=0;i<p->wordFreq.nWords;i++)
- if((p->initLike)[i]== -1 )
- {
- if( verb ) cerr << "Error: I don't know the category of word " << i
- << " (" << (*(p->words))[i] << ") " << ".\n";
- errors=1;
- }
+ for(i=0; i<p->wordFreq.nWords; i++)
+ if((p->initLike)[i]== -1 ) {
+ if( verb ) cerr << "Error: I don't know the category of word " << i
+ << " (" << (*(p->words))[i] << ") " << ".\n";
+ errors=1;
+ }
return errors;
}
KategProblem *makeKategProblem(const leda_h_array<PSS,FreqType>&cTbl,const leda_set<string>&setVokabular, int maxClass,int initialisierung,
- int auswertung,int nachbarschaft,int minWordFrequency)
+ int auswertung,int nachbarschaft,int minWordFrequency)
{
-
+
int nwrd=0;
leda_array<string>&sVok = *new leda_array<string>(setVokabular.size());
string s;
unsigned int ctr=0;
- forall_set(leda_set<string>,s,setVokabular)
- {
- if( verboseMode>2 )
- cout << "mkcls:Wort " << ctr << " " << s << endl;
- sVok[ctr++]=s;
- }
- for(unsigned int z=0;z<ctr-1;z++)
+ forall_set(leda_set<string>,s,setVokabular) {
+ if( verboseMode>2 )
+ cout << "mkcls:Wort " << ctr << " " << s << endl;
+ sVok[ctr++]=s;
+ }
+ for(unsigned int z=0; z<ctr-1; z++)
iassert( sVok[z]<sVok[z+1] );
sVok.sort();
if( verboseMode>2 )
cout << "*****Vocabulary: " << sVok;
-
+
unsigned int vokSize=sVok.size();
- massert(vokSize==ctr); massert(vokSize==setVokabular.size());
- if(verboseMode)
- {cout << "Size of vocabulary: " << vokSize << "\n";cout.flush();}
-
+ massert(vokSize==ctr);
+ massert(vokSize==setVokabular.size());
+ if(verboseMode) {
+ cout << "Size of vocabulary: " << vokSize << "\n";
+ cout.flush();
+ }
+
KategProblem *k = new KategProblem(vokSize,maxClass,initialisierung,
- auswertung,nachbarschaft,minWordFrequency);
+ auswertung,nachbarschaft,minWordFrequency);
KategProblemWBC &w=k->wordFreq;
k->words=&sVok;
-
+
Array<int> after(vokSize,0);
Array<int> before(vokSize,0);
-
-
+
+
nwrd=0;
{
PSS s;
- forall_defined_h2(PSS,FreqType,s,cTbl)
- {
- const string&ss1=s.first;
- const string&ss2=s.second;
- if( ss2.length()&&(ss1!="$" || ss2!="$") )
- {
- int i1=sVok.binary_search(ss1);
- int i2=sVok.binary_search(ss2);
- iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
- after[i1]++;
- before[i2]++;
- }
- if( verboseMode&&((nwrd++)%10000==0) )
- {cout<<"Statistiken-1 " << nwrd<< ". \r";cout.flush();}
+ forall_defined_h2(PSS,FreqType,s,cTbl) {
+ const string&ss1=s.first;
+ const string&ss2=s.second;
+ if( ss2.length()&&(ss1!="$" || ss2!="$") ) {
+ int i1=sVok.binary_search(ss1);
+ int i2=sVok.binary_search(ss2);
+ iassert( sVok[i1] == ss1 );
+ iassert( sVok[i2] == ss2 );
+ after[i1]++;
+ before[i2]++;
+ }
+ if( verboseMode&&((nwrd++)%10000==0) ) {
+ cout<<"Statistiken-1 " << nwrd<< ". \r";
+ cout.flush();
}
- }
-
- for(unsigned int i=0;i<vokSize;i++)
- {
- w.setAfterWords(i,after[i]);
- w.setBeforeWords(i,before[i]);
}
-
-
+ }
+
+ for(unsigned int i=0; i<vokSize; i++) {
+ w.setAfterWords(i,after[i]);
+ w.setBeforeWords(i,before[i]);
+ }
+
+
{
nwrd=0;
PSS s;
- forall_defined_h2(PSS,FreqType,s,cTbl)
- {
- const string&ss1=s.first;
- const string&ss2=s.second;
- FreqType p=cTbl[s];
- if( ss2.length()&&(ss1!="$" || ss2!="$") )
- {
- int i1=sVok.binary_search(ss1);
- int i2=sVok.binary_search(ss2);
- iassert( sVok[i1] == ss1 );iassert( sVok[i2] == ss2 );
- w.setFreq(i1,i2,p);
- if( verboseMode>2 )
- cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << " "
- << ss2 << ":" << i2 << " " << p << endl;
- }
- if( verboseMode&&((nwrd++)%10000==0) )
- {cout<<"Statistiken-2 " <<nwrd<< ". \r";cout.flush();}
- }
+ forall_defined_h2(PSS,FreqType,s,cTbl) {
+ const string&ss1=s.first;
+ const string&ss2=s.second;
+ FreqType p=cTbl[s];
+ if( ss2.length()&&(ss1!="$" || ss2!="$") ) {
+ int i1=sVok.binary_search(ss1);
+ int i2=sVok.binary_search(ss2);
+ iassert( sVok[i1] == ss1 );
+ iassert( sVok[i2] == ss2 );
+ w.setFreq(i1,i2,p);
+ if( verboseMode>2 )
+ cout << "BIGRAMM-HAEUF: " << ss1 << ":" << i1 << " "
+ << ss2 << ":" << i2 << " " << p << endl;
+ }
+ if( verboseMode&&((nwrd++)%10000==0) ) {
+ cout<<"Statistiken-2 " <<nwrd<< ". \r";
+ cout.flush();
+ }
+ }
}
-
+
w.testFull();
- if(verboseMode){cout << "Datenintegritaet getestet.\n";cout.flush();}
+ if(verboseMode) {
+ cout << "Datenintegritaet getestet.\n";
+ cout.flush();
+ }
return k;
}
KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung,
- int auswertung,int nachbarschaft,int minWordFrequency)
+ int auswertung,int nachbarschaft,int minWordFrequency)
{
ifstream file(str);
if(!file)return 0;
@@ -250,118 +248,110 @@ KategProblem *fromNgrFile(const char *str,int maxClass,int initialisierung,
double c=0;
if( verboseMode )cout << "NGRFILE: " << str << endl;
string s1,s2;
- while(file >> c >> s1 >> s2)
- {
- if( s1.length()==0||s2.length()==0 )
- {
- cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl;
- return 0;
- }
- if( c==0 )
- {
- cerr << "Count ist 0 " << s1 << " " << s2 << endl;
- return 0;
- }
- cTbl[pair<string,string>(s1,s2)]=(FreqType)c;
- setVokabular.insert(s1);
- setVokabular.insert(s2);
- if( verboseMode>1 )
- cout << "R: " << s1 << " " << s2 << " " << c << endl;
- c=0;
+ while(file >> c >> s1 >> s2) {
+ if( s1.length()==0||s2.length()==0 ) {
+ cerr << "ERROR: strings are zero: " << s1.length() <<" " << s1 <<" " << s2.length()<<" " << s2 << endl;
+ return 0;
+ }
+ if( c==0 ) {
+ cerr << "Count ist 0 " << s1 << " " << s2 << endl;
+ return 0;
}
-
+ cTbl[pair<string,string>(s1,s2)]=(FreqType)c;
+ setVokabular.insert(s1);
+ setVokabular.insert(s2);
+ if( verboseMode>1 )
+ cout << "R: " << s1 << " " << s2 << " " << c << endl;
+ c=0;
+ }
+
return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
}
-
-
+
+
KategProblem *fromKModel(const char *str,int maxClass,int initialisierung,
- int auswertung,int nachbarschaft,int minWordFrequency)
+ int auswertung,int nachbarschaft,int minWordFrequency)
{
string oldText,text,line;
ifstream f(str);
- if( !f )
- {
- cerr << "ERROR: can not open file " << str << ".\n";
- return 0;
- }
-
+ if( !f ) {
+ cerr << "ERROR: can not open file " << str << ".\n";
+ return 0;
+ }
+
leda_set<string> setVokabular;
leda_h_array<PSS,FreqType> cTbl(0);
oldText="$";
- while(1)
- {
- getline(f,line);
- if(f.fail() && !f.bad() && !f.eof())
- {
- cerr << "WARNING: strange characters in stream (getline) " << endl;f.clear();
- }
- if(!f)break;
-
- istrstream f2(line.c_str());
- while( 1 )
- {
- f2 >> text;
- if(f2.fail() && !f2.bad() && !f2.eof())
- {
- cerr << "WARNING: strange characters in stream (>>) !\n";
- f2.clear(ios::failbit);
- }
- if(!f2){break;}
-
-
-
-
-
-
- if( text == "$" )
- text = "mkcls-mapped-dollar-symbol-$";
- if( !setVokabular.member(text) )setVokabular.insert(text);
- cTbl[pair<string,string>(oldText,text)]++;
- oldText=text;
- }
- text="$";
+ while(1) {
+ getline(f,line);
+ if(f.fail() && !f.bad() && !f.eof()) {
+ cerr << "WARNING: strange characters in stream (getline) " << endl;
+ f.clear();
+ }
+ if(!f)break;
+
+ istrstream f2(line.c_str());
+ while( 1 ) {
+ f2 >> text;
+ if(f2.fail() && !f2.bad() && !f2.eof()) {
+ cerr << "WARNING: strange characters in stream (>>) !\n";
+ f2.clear(ios::failbit);
+ }
+ if(!f2) {
+ break;
+ }
+
+
+
+
+
+
+ if( text == "$" )
+ text = "mkcls-mapped-dollar-symbol-$";
if( !setVokabular.member(text) )setVokabular.insert(text);
cTbl[pair<string,string>(oldText,text)]++;
oldText=text;
}
+ text="$";
+ if( !setVokabular.member(text) )setVokabular.insert(text);
+ cTbl[pair<string,string>(oldText,text)]++;
+ oldText=text;
+ }
return makeKategProblem(cTbl,setVokabular,maxClass,initialisierung,auswertung,nachbarschaft,minWordFrequency);
}
-
+
void KategProblemSetParameters(KategProblem &p)
{
- if( p.katwahl()==K_BEST )
- {
- TAOptimization::defaultAnnRate=0.7;
- RRTOptimization::defaultAnnRate=0.95;
- GDAOptimization::defaultAlpha=0.05;
- if( verboseMode )
- cout << "Parameter-setting like W-DET-BEST\n";
- }
- else
- {
- TAOptimization::defaultAnnRate=0.4;
- RRTOptimization::defaultAnnRate=0.6;
- GDAOptimization::defaultAlpha=0.0125;
- if( verboseMode )
- cout << "Parameter-setting like W-DET-DET\n";
- }
+ if( p.katwahl()==K_BEST ) {
+ TAOptimization::defaultAnnRate=0.7;
+ RRTOptimization::defaultAnnRate=0.95;
+ GDAOptimization::defaultAlpha=0.05;
+ if( verboseMode )
+ cout << "Parameter-setting like W-DET-BEST\n";
+ } else {
+ TAOptimization::defaultAnnRate=0.4;
+ RRTOptimization::defaultAnnRate=0.6;
+ GDAOptimization::defaultAlpha=0.0125;
+ if( verboseMode )
+ cout << "Parameter-setting like W-DET-DET\n";
+ }
}
KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue,
- int auswertung,int nachbarschaft,float relInit)
+ int auswertung,int nachbarschaft,float relInit)
{
KategProblem &k=
*new KategProblem(ANZ_WORD,ANZ_CLS,initValue,auswertung,nachbarschaft);
@@ -369,41 +359,35 @@ KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue,
Array<int> after(ANZ_WORD,0);
Array<int> before(ANZ_WORD,0);
Array<FreqArray> twoD(ANZ_WORD);
- int i;
- for(i=0;i<ANZ_WORD;i++) twoD[i].init(ANZ_WORD,0);
-
- for(i=0;i<ANZ_WORD;i++)
- {
- massert(after[i]==0);
- massert(before[i]==0);
- for(int j=0;j<ANZ_WORD;j++)
- {
- massert(twoD[i][j]==0);
- }
- }
- for(i=0;i<ANZ_WORD*ANZ_WORD*relInit;i++)
- {
- int x=randomInt(ANZ_WORD);
- int y=randomInt(ANZ_WORD);
- if(twoD[x][y]==0)
- {
- after[x]++;
- before[y]++;
- }
- twoD[x][y]+=randomInt(10)+1;
- }
- for(i=0;i<ANZ_WORD;i++)
- {
- w.setAfterWords(i,after[i]);
- w.setBeforeWords(i,before[i]);
+ int i;
+ for(i=0; i<ANZ_WORD; i++) twoD[i].init(ANZ_WORD,0);
+
+ for(i=0; i<ANZ_WORD; i++) {
+ massert(after[i]==0);
+ massert(before[i]==0);
+ for(int j=0; j<ANZ_WORD; j++) {
+ massert(twoD[i][j]==0);
}
-
- for(i=0;i<ANZ_WORD;i++)
- {
- for(int j=0;j<ANZ_WORD;j++)
- if( twoD[i][j] )
- w.setFreq(i,j,twoD[i][j]);
+ }
+ for(i=0; i<ANZ_WORD*ANZ_WORD*relInit; i++) {
+ int x=randomInt(ANZ_WORD);
+ int y=randomInt(ANZ_WORD);
+ if(twoD[x][y]==0) {
+ after[x]++;
+ before[y]++;
}
+ twoD[x][y]+=randomInt(10)+1;
+ }
+ for(i=0; i<ANZ_WORD; i++) {
+ w.setAfterWords(i,after[i]);
+ w.setBeforeWords(i,before[i]);
+ }
+
+ for(i=0; i<ANZ_WORD; i++) {
+ for(int j=0; j<ANZ_WORD; j++)
+ if( twoD[i][j] )
+ w.setFreq(i,j,twoD[i][j]);
+ }
w.testFull();
return k;
}
@@ -414,24 +398,23 @@ KategProblem &makRandom(int ANZ_WORD,int ANZ_CLS,int initValue,
char *makeTitle(KategProblem &problem,int verfahren)
{
char x[1024];
- switch(verfahren)
- {
- case HC_OPT:
- strcpy(x,"HC ");
- break;
- case SA_OPT:
- strcpy(x,"SA ");
- break;
- case TA_OPT:
- strcpy(x,"TA ");
- break;
- case GDA_OPT:
- strcpy(x,"GDA ");
- break;
- case RRT_OPT:
- strcpy(x,"RRT ");
- break;
- }
+ switch(verfahren) {
+ case HC_OPT:
+ strcpy(x,"HC ");
+ break;
+ case SA_OPT:
+ strcpy(x,"SA ");
+ break;
+ case TA_OPT:
+ strcpy(x,"TA ");
+ break;
+ case GDA_OPT:
+ strcpy(x,"GDA ");
+ break;
+ case RRT_OPT:
+ strcpy(x,"RRT ");
+ break;
+ }
problem.makeTitle(x+strlen(x));
return strdup(x);
}
@@ -439,11 +422,11 @@ char *makeTitle(KategProblem &problem,int verfahren)
-#define MAX_MULTIPLE 10
+#define MAX_MULTIPLE 10
Array<KategProblem *> &_izrOptimization(Array<KategProblem *> &probs,
-int anzprob,double timeForOneRed,double maxClock,Array<Kategory> &katOfWord,
-int anzIter,int verfahren)
+ int anzprob,double timeForOneRed,double maxClock,Array<Kategory> &katOfWord,
+ int anzIter,int verfahren)
{
massert(anzprob>1);
massert(probs[0]->wordFreq.mindestAnzahl<=1);
@@ -456,184 +439,161 @@ int anzIter,int verfahren)
int indexOfDurchschnitt;
Array<int> newWords(nWords);
int useAnzprob=anzprob;
- do
- {
- int w,k;
- indexOfDurchschnitt=0;
- for(w=0;w<nWords;w++)
- newWords[w]=-1;
- for(k=0;k<useAnzprob;k++)
- {
- massert(probs[k]->wordFreq.nWords==nWords);
- probs[k]->makeKats();
- }
-
- for(w=0;w<nWords;w++)
- {
- if( newWords[w]==-1 )
- {
-
-
-
- leda_set<int> durchschnitt=(*p0->kats)[p0->katOfWord(w)];
- for(k=1;k<useAnzprob;k++)
- durchschnitt = durchschnitt & (*probs[k]->kats)[probs[k]->katOfWord(w)];
-
-
- int _anzInDurchschnitt=0;
- int nr=0;
- forall_set(leda_set<int>,nr,durchschnitt)
- {
- _anzInDurchschnitt++;
- newWords[nr]=indexOfDurchschnitt;
- }
- if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 )
- {
- cout << "- (";
- forall_set(leda_set<int>,nr,durchschnitt)
- {
- cout << p0->getString(nr);
- if( p0->wordFreq.n1(nr)==1 )
- cout << "* ";
- else
- cout << " ";
- }
- cout << ")\n";
- }
-
-
-
-
- for(k=0;k<useAnzprob;k++)
- {
- durchschnitt = durchschnitt - (*probs[k]->kats)[probs[k]->katOfWord(w)];
- }
- indexOfDurchschnitt++;
- }
- }
-
- if(indexOfDurchschnitt>=minimumNumberOfWords)
- {
- if(useAnzprob==1)
- {
- cout << "useAnzProb==1 => mysterious.\n";
- break;
- }
- useAnzprob--;
- }
+ do {
+ int w,k;
+ indexOfDurchschnitt=0;
+ for(w=0; w<nWords; w++)
+ newWords[w]=-1;
+ for(k=0; k<useAnzprob; k++) {
+ massert(probs[k]->wordFreq.nWords==nWords);
+ probs[k]->makeKats();
+ }
+
+ for(w=0; w<nWords; w++) {
+ if( newWords[w]==-1 ) {
+
+
+
+ leda_set<int> durchschnitt=(*p0->kats)[p0->katOfWord(w)];
+ for(k=1; k<useAnzprob; k++)
+ durchschnitt = durchschnitt & (*probs[k]->kats)[probs[k]->katOfWord(w)];
+
+
+ int _anzInDurchschnitt=0;
+ int nr=0;
+ forall_set(leda_set<int>,nr,durchschnitt) {
+ _anzInDurchschnitt++;
+ newWords[nr]=indexOfDurchschnitt;
+ }
+ if( verboseMode && _anzInDurchschnitt>1 && anzIter==0 ) {
+ cout << "- (";
+ forall_set(leda_set<int>,nr,durchschnitt) {
+ cout << p0->getString(nr);
+ if( p0->wordFreq.n1(nr)==1 )
+ cout << "* ";
+ else
+ cout << " ";
+ }
+ cout << ")\n";
+ }
+
+
+
+
+ for(k=0; k<useAnzprob; k++) {
+ durchschnitt = durchschnitt - (*probs[k]->kats)[probs[k]->katOfWord(w)];
+ }
+ indexOfDurchschnitt++;
+ }
+ }
+
+ if(indexOfDurchschnitt>=minimumNumberOfWords) {
+ if(useAnzprob==1) {
+ cout << "useAnzProb==1 => mysterious.\n";
+ break;
+ }
+ useAnzprob--;
}
- while(indexOfDurchschnitt>=minimumNumberOfWords);
-
-
+ } while(indexOfDurchschnitt>=minimumNumberOfWords);
+
+
Array<KategProblem *> &neu=*new Array<KategProblem *>(MAX_MULTIPLE*anzprob,(KategProblem *)0);
qsort(probs.getPointerToData(),useAnzprob,sizeof(KategProblem *),compareProblem);
massert(useAnzprob<=probs.size());
double startTime=clockSec();
int i, numberOfNew;
- for(numberOfNew=0; (clockSec()-startTime<timeForOneRed)
- || (numberOfNew < anzprob) ; numberOfNew++)
- {
- int w;
- if( numberOfNew==anzprob*MAX_MULTIPLE-1 )
- break;
- KategProblem *p
- = neu[numberOfNew]
- = new KategProblem(indexOfDurchschnitt,nKats-2,
- p0->initialisierung,p0->auswertung,p0->nachbarschaft);
-
- for(w=0;w<indexOfDurchschnitt;w++)
- {
- p->wordFreq.setAfterWords(w,5);
- p->wordFreq.setBeforeWords(w,5);
- }
- for(w=0;w<nWords;w++)
- {
- Array<OneFreq> &after=p0->wordFreq.after[w];
- int size=after.size();
- for(i=0;i<size;i++)
- p->wordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n);
- }
- p->wordFreq.testFull(1);
-
-
-
-
-
-
- p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words());
- double w1=0.0,w2=0.0;
- if(numberOfNew<useAnzprob)
- {
-
- for(i=0;i<nWords;i++)
- (p->initLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i);
- p->_initialize(5);
- HCOptimization hc(*p,-1);
- if(verboseMode)
- {
- w1=p->nicevalue();
- cout << "from old category system:" << w1 << endl;
- }
- hc.minimize(-1);
- if(verboseMode)
- {
- w2=p->nicevalue();
- if(w2<w1)
- cout << "improvement: " << w1-w2 << endl;
- }
- }
- else
- {
- p->_initialize(1);
- double mean;
- StatVar end,laufzeit,start;
- solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start);
- w2=p->value();
- if(verboseMode)
- cout << "new category system: " << w2 << " (" << p->nicevalue()
- << ") Zeit: " << clockSec() << "\n";
- }
+ for(numberOfNew=0; (clockSec()-startTime<timeForOneRed)
+ || (numberOfNew < anzprob) ; numberOfNew++) {
+ int w;
+ if( numberOfNew==anzprob*MAX_MULTIPLE-1 )
+ break;
+ KategProblem *p
+ = neu[numberOfNew]
+ = new KategProblem(indexOfDurchschnitt,nKats-2,
+ p0->initialisierung,p0->auswertung,p0->nachbarschaft);
+
+ for(w=0; w<indexOfDurchschnitt; w++) {
+ p->wordFreq.setAfterWords(w,5);
+ p->wordFreq.setBeforeWords(w,5);
+ }
+ for(w=0; w<nWords; w++) {
+ Array<OneFreq> &after=p0->wordFreq.after[w];
+ int size=after.size();
+ for(i=0; i<size; i++)
+ p->wordFreq.addFreq(newWords[w],newWords[after[i].w],after[i].n);
}
- int p;
- for(p=0;p<probs.size();p++)
- {
- if( probs[p] )
- delete probs[p];
+ p->wordFreq.testFull(1);
+
+
+
+
+
+
+ p->wordFreq.set_h_of_words(p0->wordFreq.get_h_of_words());
+ double w1=0.0,w2=0.0;
+ if(numberOfNew<useAnzprob) {
+
+ for(i=0; i<nWords; i++)
+ (p->initLike)[newWords[i]]=probs[numberOfNew]->katOfWord(i);
+ p->_initialize(5);
+ HCOptimization hc(*p,-1);
+ if(verboseMode) {
+ w1=p->nicevalue();
+ cout << "from old category system:" << w1 << endl;
+ }
+ hc.minimize(-1);
+ if(verboseMode) {
+ w2=p->nicevalue();
+ if(w2<w1)
+ cout << "improvement: " << w1-w2 << endl;
+ }
+ } else {
+ p->_initialize(1);
+ double mean;
+ StatVar end,laufzeit,start;
+ solveProblem(0,*p,1,-1,verfahren,mean,end,laufzeit,start);
+ w2=p->value();
+ if(verboseMode)
+ cout << "new category system: " << w2 << " (" << p->nicevalue()
+ << ") Zeit: " << clockSec() << "\n";
}
+ }
+ int p;
+ for(p=0; p<probs.size(); p++) {
+ if( probs[p] )
+ delete probs[p];
+ }
qsort(neu.getPointerToData(),numberOfNew,sizeof(Problem *),compareProblem);
massert(numberOfNew<=neu.size());
if( verboseMode )
- cout << "Iterierte Zustandsraum-Reduktion: " << indexOfDurchschnitt
- << " words. costs: " << neu[0]->value() << " "
- << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: "
- << clockSec() << endl;
- if( indexOfDurchschnitt<=nKats
- || (clockSec()>maxClock&&maxClock) )
- {
- if( clockSec()>maxClock&&maxClock )
- cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n";
- for(i=0;i<nWords;i++)
- katOfWord[i]=neu[0]->katOfWord(newWords[i]);
- return neu;
- }
- else
- {
- Array<Kategory> &newKatOfWord=
- *(new Array<Kategory>(neu[0]->wordFreq.nWords,-1));
- Array<KategProblem *> &erg=_izrOptimization(neu,anzprob,timeForOneRed,
- maxClock,newKatOfWord,
- anzIter+1,verfahren);
- for(i=0;i<nWords;i++)
- katOfWord[i]=newKatOfWord[newWords[i]];
- return erg;
- }
+ cout << "Iterierte Zustandsraum-Reduktion: " << indexOfDurchschnitt
+ << " words. costs: " << neu[0]->value() << " "
+ << neu[0]->nicevalue() << " (" << numberOfNew-anzprob << ")" << "time: "
+ << clockSec() << endl;
+ if( indexOfDurchschnitt<=nKats
+ || (clockSec()>maxClock&&maxClock) ) {
+ if( clockSec()>maxClock&&maxClock )
+ cout << "STOP (time limit: " << (clockSec()-maxClock) << " s)\n";
+ for(i=0; i<nWords; i++)
+ katOfWord[i]=neu[0]->katOfWord(newWords[i]);
+ return neu;
+ } else {
+ Array<Kategory> &newKatOfWord=
+ *(new Array<Kategory>(neu[0]->wordFreq.nWords,-1));
+ Array<KategProblem *> &erg=_izrOptimization(neu,anzprob,timeForOneRed,
+ maxClock,newKatOfWord,
+ anzIter+1,verfahren);
+ for(i=0; i<nWords; i++)
+ katOfWord[i]=newKatOfWord[newWords[i]];
+ return erg;
+ }
}
KategProblem *izrOptimization(KategProblem &p,int minN,int firstN,
- double clockForOneRed,double maxClock,int verfahren)
+ double clockForOneRed,double maxClock,int verfahren)
{
Array<Kategory> katOfWord(p.wordFreq.nWords,-1);
int startN;
@@ -647,31 +607,29 @@ KategProblem *izrOptimization(KategProblem &p,int minN,int firstN,
double startTime=clockSec();
int i;
- for(i=0;i<startN;i++)
- {
- StatVar end,laufzeit,start;
- double mean;
- probs[i] = (KategProblem *)((KategProblem *)p.makeEqualProblem());
- solveProblem(0,*(probs[i]),1,-1,verfahren,mean,end,laufzeit,start);
- if( i==minN-1 )
- endTime = clockSec();
- if( i>=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) )
- break;
- }
+ for(i=0; i<startN; i++) {
+ StatVar end,laufzeit,start;
+ double mean;
+ probs[i] = (KategProblem *)((KategProblem *)p.makeEqualProblem());
+ solveProblem(0,*(probs[i]),1,-1,verfahren,mean,end,laufzeit,start);
+ if( i==minN-1 )
+ endTime = clockSec();
+ if( i>=firstN-1 && (startTime+clockForOneRed>clockSec() || i==999) )
+ break;
+ }
if( endTime<0 )
endTime=clockSec();
massert(i>=firstN);
qsort(probs.getPointerToData(),i,sizeof(KategProblem *),compareProblem);
massert(i<=probs.size());
- if( clockForOneRed<=0 )
- {
- clockForOneRed=endTime-startTime;
- if( verboseMode )
- cout << "time for one reduction: " << clockForOneRed << endl;
- }
+ if( clockForOneRed<=0 ) {
+ clockForOneRed=endTime-startTime;
+ if( verboseMode )
+ cout << "time for one reduction: " << clockForOneRed << endl;
+ }
_izrOptimization(probs,minN,clockForOneRed,maxClock,katOfWord,0,verfahren);
-
+
KategProblem *n=(KategProblem *)(p.makeEqualProblem());
n->initLike= katOfWord;
n->_initialize(5);