/* Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och mkcls - a program for making word classes . This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include "general.h" #include "KategProblem.h" #include "KategProblemTest.h" #include "ProblemTest.h" #include "TAOptimization.h" #include "GDAOptimization.h" #include "RRTOptimization.h" #include "SAOptimization.h" #include "HCOptimization.h" double SigmaVerfaelschung=5.0; int OneWithHapas=1; char *hapaxInitName=0; static int nLaeufe=1,nLaeufeReduce=3; static int optimizeParameterAnzahl=10; static int IterOptVerf=TA_OPT; static int MaxIterOptSteps= -1; static int MaxSecs=0; static int InitValue=INIT_RAN; static int Criterion=CRITERION_ML; static int Wwahl=W_DET_DECR; static int Kwahl=K_BEST; static int NumberCategories=100; static int MinWordFrequency=0; static int IterOptSet=0; static KategProblem *p = 0; char korpusName[1024]="train"; int korpusIsText=1; char *FileForOther=0; void printUsage(int r) { cout << "mkcls - a program for making word classes: Usage: \n" " mkcls [-nnum] [-ptrain] [-Vfile] opt\n" "-V output classes (Default: no file)\n" "-n number of optimization runs (Default: 1); larger number => better results\n" "-p filename of training corpus (Default: 'train')\n" "Example:\n" " mkcls -c80 -n10 -pin -Vout opt\n" " (generates 80 classes for the corpus 'in' and writes the classes in 'out')\n" "Literature: \n" " Franz Josef Och: »Maximum-Likelihood-Schätzung von Wortkategorien mit Verfahren\n" " der kombinatorischen Optimierung« Studienarbeit, Universität Erlangen-Nürnberg,\n" " Germany,1995. \n"; exit(r); } void makeIterOpt() { double maxTime=clockSec()+MaxSecs; if(MaxSecs==0)maxTime=0; double mean; StatVar end,laufzeit,init; solveProblem(1+(PrintBestTo!=0),*p,nLaeufe,MaxIterOptSteps,IterOptVerf, mean,end,laufzeit,init,maxTime); if( verboseMode>1 ) p->dumpOn(cout); } void makeIzrOpt() { double maxTime=clockSec()+MaxSecs; if(MaxSecs==0)maxTime=0; izrOptimization(*p,nLaeufeReduce,nLaeufeReduce,0,maxTime,IterOptVerf); } int makeMetaOpt(int argc,char **argv) { int ret=0; if(argc==4 || argc==3) { int typ=0; if( argc==4 ) { sscanf(argv[3],"%d",&typ); assert(typ>0 && typ<=11 ); } if( isdigit(argv[2][0]) ) { int a; sscanf(argv[2],"%d",&a); switch(a) { case 1: SAOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,1); break; case 2: SAOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,2); break; case 3: SAOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,10); break; case 4: TAOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,1); break; case 5: TAOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,10); break; case 6: RRTOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,1); break; case 7: RRTOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,10); break; case 8: GDAOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,1); break; default: cerr << "Error: Wrong number of parameter (" << argv[2] << ").\n"; printUsage(1); } } else { if(strcasecmp(argv[2],"gda")==0) { GDAOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,typ); } else if(strcasecmp(argv[2],"ta")==0) { TAOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,typ); } else if(strcasecmp(argv[2],"rrt")==0) { RRTOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,typ); } else if(strcasecmp(argv[2],"sa")==0) { SAOptimization::optimizeValue(*p,nLaeufe, optimizeParameterAnzahl,typ); } else { cerr << "Error: unknown algorithm" << argv[2] << endl; printUsage(1); } } } else { cerr << "Error: wrong number of arguments: " << argc << endl; printUsage(1); } return ret; } void setVerfahren(char *p) { if(strcasecmp(p,"rrt")==0 ) IterOptVerf=RRT_OPT; else if(strcasecmp(p,"ta")==0) IterOptVerf=TA_OPT; else if(strcasecmp(p,"gda")==0) IterOptVerf=GDA_OPT; else if(strcasecmp(p,"sa")==0) IterOptVerf=SA_OPT; else if(strcasecmp(p,"hc")==0) IterOptVerf=HC_OPT; else { cerr << "Error: Unknown iterativ-optimizing algorithm '" << p << "'.\n"; printUsage(1); } } void setInitValue(char *iv,char *fileForOther) { if(strcasecmp(iv,"ran")==0 ) InitValue=INIT_RAN; else if(strcasecmp(iv,"aio")==0) InitValue=INIT_AIO; else if(strcasecmp(iv,"gda")==0) InitValue=INIT_LWRW; else if(strcasecmp(iv,"freq")==0) InitValue=INIT_FREQ; else if(strcasecmp(iv,"other")==0) { InitValue=INIT_OTHER; FileForOther=strdup(fileForOther); } else { cerr << "Error: Unknown initialization '" << p << "'.\n";; printUsage(1); } } void setWwahl(const char *ww) { if(strcasecmp(ww,"ran")==0 ) Wwahl=W_RAN; else if(strcasecmp(ww,"det")==0) Wwahl=W_DET_DECR; else if(strcasecmp(ww,"incr")==0) Wwahl=W_DET_INCR; else { cerr << "Error: Unknown word-selection '" << ww << "'.\n";; printUsage(1); } } void setKwahl(const char *kw) { if( strcasecmp(kw,"det")==0 ) Kwahl=K_DET; else if(strcasecmp(kw,"ran")==0 ) Kwahl=K_RAN; else if(strcasecmp(kw,"best")==0) Kwahl=K_BEST; else { cerr << "Error: Unknown category-selection '" << kw << "'.\n"; printUsage(1); } } void setParameter(const char *nr1,const char *nr2) { int n1; float n2; sscanf(nr1,"%d",&n1); sscanf(nr2,"%f",&n2); IterOptSet=1; switch(n1) { case 1: SAOptimization::defaultAnfAnnRate=n2; if(verboseMode)cout << "Parameter gamma_0 (SA) set to " << SAOptimization::defaultAnfAnnRate << endl; iassert(0<=SAOptimization::defaultAnfAnnRate&& SAOptimization::defaultAnfAnnRate<=1); break; case 2: SAOptimization::defaultEndAnnRate=n2; if(verboseMode)cout << "Parameter gamma_e (SA) set to " << SAOptimization::defaultEndAnnRate << endl; iassert(0<=SAOptimization::defaultEndAnnRate &&SAOptimization::defaultEndAnnRate<=1); break; case 3: SAOptimization::defaultMultiple=n2; if(verboseMode)cout << "Parameter nu_e (SA) set to " << SAOptimization::defaultMultiple << endl; iassert( SAOptimization::defaultMultiple>0 ); break; case 4: TAOptimization::defaultAnnRate=n2; if(verboseMode)cout << "Parameter gamma_{TA} set to " << TAOptimization::defaultAnnRate << endl; iassert(0<=TAOptimization::defaultAnnRate &&TAOptimization::defaultAnnRate<=1); break; case 5: TAOptimization::defaultMultiple=n2; if(verboseMode)cout << "Parameter nu_{TA} set to " << TAOptimization::defaultMultiple << endl; iassert( TAOptimization::defaultMultiple>0 ); break; case 6: RRTOptimization::defaultAnnRate=n2; if(verboseMode)cout << "Parameter gamma_{RRT} set to " << RRTOptimization::defaultAnnRate << endl; iassert(0<=RRTOptimization::defaultAnnRate && RRTOptimization::defaultAnnRate<=1); break; case 7: RRTOptimization::defaultMultiple=n2; if(verboseMode)cout << "Parameter nu_{RRT} set to " << RRTOptimization::defaultMultiple << endl; iassert( RRTOptimization::defaultMultiple>0 ); break; case 8: GDAOptimization::defaultAlpha=n2; if(verboseMode)cout << "Parameter alpha set to " << GDAOptimization::defaultAlpha << endl; iassert(0<=GDAOptimization::defaultAlpha && GDAOptimization::defaultAlpha<1 ); break; default: cerr << "Error: Wrong parameter number " << nr1 << " " << n1 << endl; printUsage(1); } } void setKorpusName(const char *s) { strcpy(korpusName,s); } void setHapaxInitName(const char *s) { hapaxInitName=strdup(s); } void setKorpus() { if( korpusIsText ) { if( (p=fromKModel(korpusName,NumberCategories,InitValue,Criterion,Wwahl|Kwahl, MinWordFrequency))==0) { cerr << "Error: Could not read the file '" << korpusName << "'.\n"; printUsage(1); } } else { if( (p=fromNgrFile(korpusName,NumberCategories,InitValue,Criterion,Wwahl|Kwahl, MinWordFrequency))==0) { cerr << "Error: Could not read the file '" << korpusName << "'.\n"; printUsage(1); } p->wordFreq.initializeIndex(*(p->words),'1',2,1+NumberCategories/2,!OneWithHapas); p->wordFreq.initializeIndex(*(p->words),'2',2+NumberCategories/2,1+NumberCategories,OneWithHapas); } if( IterOptSet==0 ) KategProblemSetParameters(*p); } int main(int argc,char **argv) { double startTime=clockSec(); zufallSeed(); while( argc>1 && argv[1][0]=='-' ) { switch(argv[1][1]) { case 'v': sscanf(argv[1]+2,"%d",&verboseMode); iassert(verboseMode>=0); break; case 'O': sscanf(argv[1]+2,"%d",&OneWithHapas); cout << "OneWithHapas: " << OneWithHapas << endl; break; case 'n': sscanf(argv[1]+2,"%d",&nLaeufe); nLaeufeReduce=nLaeufe; iassert( nLaeufe>=1 ); break; case 'l': Criterion=1; if( argv[1][2] ) { sscanf(argv[1]+2,"%lf",&rhoLo); if( verboseMode ) cout << "Parameter rho (for LO) set to" << rhoLo << ".\n"; iassert(0<=rhoLo && rhoLo<=1); } if( verboseMode ) cout << "Criterion LO used.\n"; break; case 'y': Criterion=2; if( argv[1][2] ) { sscanf(argv[1]+2,"%lf",&SigmaVerfaelschung); if( verboseMode ) cout << "Parameter rho (for LO) set to" << SigmaVerfaelschung << ".\n"; iassert(0=2); break; case 'm': sscanf(argv[1]+2,"%d",&MinWordFrequency); break; case 'e': setParameter(argv[1]+2,argv[2]); argv++,argc--; break; case 'a': setVerfahren(argv[1]+2); break; case 'r': { int s; sscanf(argv[1]+2,"%d",&s); zufallSeed(s); } break; case 'V': if(argv[1][2]) { char str[1024]; strcpy(str,argv[1]+2); PrintBestTo=new ofstream(str); strcat(str,".cats"); PrintBestTo2=new ofstream(str); } else cout << "AUSGABE auf cout\n"; break; case 'M': sscanf(argv[1]+2,"%d",&MaxIterOptSteps); break; case 's': sscanf(argv[1]+2,"%d",&MaxSecs); break; case 'N': sscanf(argv[1]+2,"%d",&optimizeParameterAnzahl); break; case 'o': GraphOutput = new ofstream(argv[1]+2); if( GraphOutput==0 ) cerr << "Warning: Open failed for file '" << argv[1]+2 << "'.\n"; break; default: cerr << "Fehlerhafte Option: " << argv[1] << endl; printUsage(1); } argv++; argc--; } setKorpus(); if( FileForOther ) { fromCatFile(p,FileForOther); p->initialisierung=InitValue; p->_initialize(InitValue); } if( hapaxInitName ) { fromCatFile(p,hapaxInitName,0); p->fixInitLike(); } double start2Time=clockSec(); if(argc>=2 && strcasecmp(argv[1],"opt")==0 ) makeIterOpt(); else if(argc>=2 && strcasecmp(argv[1],"meta-opt")==0) makeMetaOpt(argc,argv); else if(argc>=2 && strcasecmp(argv[1],"izr-opt")==0) makeIzrOpt(); else { makeIterOpt(); } if( verboseMode ) { cout << " full-time: " << clockSec()-startTime << endl; cout << "optimize-time: " << clockSec()-start2Time << endl; } return 0; }