diff options
Diffstat (limited to 'mkcls-v2/mkcls.cpp')
-rw-r--r-- | mkcls-v2/mkcls.cpp | 618 |
1 files changed, 618 insertions, 0 deletions
diff --git a/mkcls-v2/mkcls.cpp b/mkcls-v2/mkcls.cpp new file mode 100644 index 0000000..90ebfde --- /dev/null +++ b/mkcls-v2/mkcls.cpp @@ -0,0 +1,618 @@ +/* + +Copyright (C) 1997,1998,1999,2000,2001 Franz Josef Och + +mkcls - a program for making word classes . + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +*/ + + + + +#include <stdio.h> +#include <iostream> +#include <stdlib.h> +#include <ctype.h> +#include "general.h" + +#include "KategProblem.h" +#include "KategProblemTest.h" + +#include "ProblemTest.h" +#include "TAOptimization.h" +#include "GDAOptimization.h" +#include "RRTOptimization.h" +#include "SAOptimization.h" +#include "HCOptimization.h" + + +double SigmaVerfaelschung=5.0; +int OneWithHapas=1; +char *hapaxInitName=0; + + + + + +static int nLaeufe=1,nLaeufeReduce=3; + + +static int optimizeParameterAnzahl=10; + + +static int IterOptVerf=TA_OPT; + + +static int MaxIterOptSteps= -1; + + +static int MaxSecs=0; + + + + + +static int InitValue=INIT_RAN; + + +static int Criterion=CRITERION_ML; + + +static int Wwahl=W_DET_DECR; + + +static int Kwahl=K_BEST; + + +static int NumberCategories=100; + + +static int MinWordFrequency=0; + + +static int IterOptSet=0; + + +static KategProblem *p = 0; + + +char korpusName[1024]="train"; +int korpusIsText=1; + + +char *FileForOther=0; + +void printUsage(int r) +{ + cout << + "mkcls - a program for making word classes: Usage: \n" + " mkcls [-nnum] [-ptrain] [-Vfile] opt\n" + + + + + + + "-V output classes (Default: no file)\n" + + + "-n number of optimization runs (Default: 1); larger number => better results\n" + + "-p filename of training corpus (Default: 'train')\n" + + + + + + + + + "Example:\n" + " mkcls -c80 -n10 -pin -Vout opt\n" + " (generates 80 classes for the corpus 'in' and writes the classes in 'out')\n" + "Literature: \n" + " Franz Josef Och: »Maximum-Likelihood-Schätzung von Wortkategorien mit Verfahren\n" + " der kombinatorischen Optimierung« Studienarbeit, Universität Erlangen-Nürnberg,\n" + " Germany,1995. \n"; + exit(r); +} + + + + + + + +void makeIterOpt() +{ + double maxTime=clockSec()+MaxSecs; + if(MaxSecs==0)maxTime=0; + double mean; + StatVar end,laufzeit,init; + solveProblem(1+(PrintBestTo!=0),*p,nLaeufe,MaxIterOptSteps,IterOptVerf, + mean,end,laufzeit,init,maxTime); + if( verboseMode>1 ) + p->dumpOn(cout); +} + + + +void makeIzrOpt() +{ + double maxTime=clockSec()+MaxSecs; + if(MaxSecs==0)maxTime=0; + izrOptimization(*p,nLaeufeReduce,nLaeufeReduce,0,maxTime,IterOptVerf); +} + + + +int makeMetaOpt(int argc,char **argv) +{ + int ret=0; + + if(argc==4 || argc==3) + { + int typ=0; + if( argc==4 ) + { + sscanf(argv[3],"%d",&typ); + assert(typ>0 && typ<=11 ); + } + if( isdigit(argv[2][0]) ) + { + int a; + sscanf(argv[2],"%d",&a); + switch(a) + { + case 1: + SAOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,1); + break; + case 2: + SAOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,2); + break; + case 3: + SAOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,10); + break; + case 4: + TAOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,1); + break; + case 5: + TAOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,10); + break; + case 6: + RRTOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,1); + break; + case 7: + RRTOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,10); + break; + case 8: + GDAOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,1); + break; + default: + cerr << "Error: Wrong number of parameter (" << argv[2] + << ").\n"; + printUsage(1); + } + } + else + { + if(strcasecmp(argv[2],"gda")==0) + { + GDAOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,typ); + } + else if(strcasecmp(argv[2],"ta")==0) + { + TAOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,typ); + } + else if(strcasecmp(argv[2],"rrt")==0) + { + RRTOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,typ); + } + else if(strcasecmp(argv[2],"sa")==0) + { + SAOptimization::optimizeValue(*p,nLaeufe, + optimizeParameterAnzahl,typ); + } + + + + + else + { + cerr << "Error: unknown algorithm" << argv[2] << endl; + printUsage(1); + } + } + } + else + { + cerr << "Error: wrong number of arguments: " << argc << endl; + printUsage(1); + } + return ret; +} + + + + + + + + + + +void setVerfahren(char *p) +{ + if(strcasecmp(p,"rrt")==0 ) + IterOptVerf=RRT_OPT; + else if(strcasecmp(p,"ta")==0) + IterOptVerf=TA_OPT; + else if(strcasecmp(p,"gda")==0) + IterOptVerf=GDA_OPT; + else if(strcasecmp(p,"sa")==0) + IterOptVerf=SA_OPT; + else if(strcasecmp(p,"hc")==0) + IterOptVerf=HC_OPT; + else + { + cerr << "Error: Unknown iterativ-optimizing algorithm '" << p << "'.\n"; + printUsage(1); + } +} + + + +void setInitValue(char *iv,char *fileForOther) +{ + if(strcasecmp(iv,"ran")==0 ) + InitValue=INIT_RAN; + else if(strcasecmp(iv,"aio")==0) + InitValue=INIT_AIO; + else if(strcasecmp(iv,"gda")==0) + InitValue=INIT_LWRW; + else if(strcasecmp(iv,"freq")==0) + InitValue=INIT_FREQ; + else if(strcasecmp(iv,"other")==0) + { + InitValue=INIT_OTHER; + FileForOther=strdup(fileForOther); + } + else + { + cerr << "Error: Unknown initialization '" << p << "'.\n";; + printUsage(1); + } +} + + +void setWwahl(const char *ww) +{ + if(strcasecmp(ww,"ran")==0 ) + Wwahl=W_RAN; + else if(strcasecmp(ww,"det")==0) + Wwahl=W_DET_DECR; + else if(strcasecmp(ww,"incr")==0) + Wwahl=W_DET_INCR; + else + { + cerr << "Error: Unknown word-selection '" << ww << "'.\n";; + printUsage(1); + } +} + + +void setKwahl(const char *kw) +{ + if( strcasecmp(kw,"det")==0 ) + Kwahl=K_DET; + else if(strcasecmp(kw,"ran")==0 ) + Kwahl=K_RAN; + else if(strcasecmp(kw,"best")==0) + Kwahl=K_BEST; + else + { + cerr << "Error: Unknown category-selection '" << kw << "'.\n"; + printUsage(1); + } +} + + +void setParameter(const char *nr1,const char *nr2) +{ + int n1; + float n2; + sscanf(nr1,"%d",&n1); + sscanf(nr2,"%f",&n2); + IterOptSet=1; + switch(n1) + { + case 1: + SAOptimization::defaultAnfAnnRate=n2; + if(verboseMode)cout << "Parameter gamma_0 (SA) set to " + << SAOptimization::defaultAnfAnnRate << endl; + iassert(0<=SAOptimization::defaultAnfAnnRate&& + SAOptimization::defaultAnfAnnRate<=1); + break; + case 2: + SAOptimization::defaultEndAnnRate=n2; + if(verboseMode)cout << "Parameter gamma_e (SA) set to " + << SAOptimization::defaultEndAnnRate << endl; + iassert(0<=SAOptimization::defaultEndAnnRate + &&SAOptimization::defaultEndAnnRate<=1); + break; + case 3: + SAOptimization::defaultMultiple=n2; + if(verboseMode)cout << "Parameter nu_e (SA) set to " + << SAOptimization::defaultMultiple << endl; + iassert( SAOptimization::defaultMultiple>0 ); + break; + case 4: + TAOptimization::defaultAnnRate=n2; + if(verboseMode)cout << "Parameter gamma_{TA} set to " + << TAOptimization::defaultAnnRate << endl; + iassert(0<=TAOptimization::defaultAnnRate + &&TAOptimization::defaultAnnRate<=1); + break; + case 5: + TAOptimization::defaultMultiple=n2; + if(verboseMode)cout << "Parameter nu_{TA} set to " + << TAOptimization::defaultMultiple << endl; + iassert( TAOptimization::defaultMultiple>0 ); + break; + case 6: + RRTOptimization::defaultAnnRate=n2; + if(verboseMode)cout << "Parameter gamma_{RRT} set to " + << RRTOptimization::defaultAnnRate << endl; + iassert(0<=RRTOptimization::defaultAnnRate + && RRTOptimization::defaultAnnRate<=1); + break; + case 7: + RRTOptimization::defaultMultiple=n2; + if(verboseMode)cout << "Parameter nu_{RRT} set to " + << RRTOptimization::defaultMultiple << endl; + iassert( RRTOptimization::defaultMultiple>0 ); + break; + case 8: + GDAOptimization::defaultAlpha=n2; + if(verboseMode)cout << "Parameter alpha set to " + << GDAOptimization::defaultAlpha << endl; + iassert(0<=GDAOptimization::defaultAlpha + && GDAOptimization::defaultAlpha<1 ); + break; + default: + cerr << "Error: Wrong parameter number " << nr1 << " " << n1 << endl; + printUsage(1); + } +} + + + +void setKorpusName(const char *s) +{ + strcpy(korpusName,s); +} + +void setHapaxInitName(const char *s) +{ + hapaxInitName=strdup(s); +} + +void setKorpus() +{ + if( korpusIsText ) + { + if( (p=fromKModel(korpusName,NumberCategories,InitValue,Criterion,Wwahl|Kwahl, + MinWordFrequency))==0) + { + cerr << "Error: Could not read the file '" << korpusName << "'.\n"; + printUsage(1); + } + } + else + { + if( (p=fromNgrFile(korpusName,NumberCategories,InitValue,Criterion,Wwahl|Kwahl, + MinWordFrequency))==0) + { + cerr << "Error: Could not read the file '" << korpusName << "'.\n"; + printUsage(1); + } + p->wordFreq.initializeIndex(*(p->words),'1',2,1+NumberCategories/2,!OneWithHapas); + p->wordFreq.initializeIndex(*(p->words),'2',2+NumberCategories/2,1+NumberCategories,OneWithHapas); + } + if( IterOptSet==0 ) + KategProblemSetParameters(*p); +} + + + + + + +int main(int argc,char **argv) +{ + double startTime=clockSec(); + zufallSeed(); + while( argc>1 && argv[1][0]=='-' ) + { + + switch(argv[1][1]) + { + case 'v': + sscanf(argv[1]+2,"%d",&verboseMode); + iassert(verboseMode>=0); + break; + case 'O': + sscanf(argv[1]+2,"%d",&OneWithHapas); + cout << "OneWithHapas: " << OneWithHapas << endl; + break; + case 'n': + sscanf(argv[1]+2,"%d",&nLaeufe); + nLaeufeReduce=nLaeufe; + iassert( nLaeufe>=1 ); + break; + case 'l': + Criterion=1; + if( argv[1][2] ) + { + sscanf(argv[1]+2,"%lf",&rhoLo); + if( verboseMode ) + cout << "Parameter rho (for LO) set to" << rhoLo << ".\n"; + iassert(0<=rhoLo && rhoLo<=1); + } + if( verboseMode ) + cout << "Criterion LO used.\n"; + break; + case 'y': + Criterion=2; + if( argv[1][2] ) + { + sscanf(argv[1]+2,"%lf",&SigmaVerfaelschung); + if( verboseMode ) + cout << "Parameter rho (for LO) set to" << SigmaVerfaelschung << ".\n"; + iassert(0<SigmaVerfaelschung); + } + if( verboseMode ) + cout << "My special criterion used.\n"; + break; + case 'p': + setKorpusName(argv[1]+2); + assert(argv[2]&&argv[2][0]!='-' || argv[2][0]!='i'); + break; + case 'P': + setKorpusName(argv[1]+2); + korpusIsText=0; + assert(argv[2]&&argv[2][0]!='-' || argv[2][0]!='i'); + break; + case 'i': + setInitValue(argv[1]+2,argv[2]); + if( InitValue==INIT_OTHER ) + argv++,argc--; + break; + case 'h': + setHapaxInitName(argv[1]+2); + break; + case 'k': + setKwahl(argv[1]+2); + break; + case 'w': + setWwahl(argv[1]+2); + break; + case 'c': + sscanf(argv[1]+2,"%d",&NumberCategories); + iassert(NumberCategories>=2); + break; + case 'm': + sscanf(argv[1]+2,"%d",&MinWordFrequency); + break; + case 'e': + setParameter(argv[1]+2,argv[2]); + argv++,argc--; + break; + case 'a': + setVerfahren(argv[1]+2); + break; + case 'r': + { + int s; + sscanf(argv[1]+2,"%d",&s); + zufallSeed(s); + } + break; + case 'V': + if(argv[1][2]) + { + char str[1024]; + strcpy(str,argv[1]+2); + PrintBestTo=new ofstream(str); + strcat(str,".cats"); + PrintBestTo2=new ofstream(str); + } + else + cout << "AUSGABE auf cout\n"; + break; + case 'M': + sscanf(argv[1]+2,"%d",&MaxIterOptSteps); + break; + case 's': + sscanf(argv[1]+2,"%d",&MaxSecs); + break; + case 'N': + sscanf(argv[1]+2,"%d",&optimizeParameterAnzahl); + break; + case 'o': + GraphOutput = new ofstream(argv[1]+2); + if( GraphOutput==0 ) + cerr << "Warning: Open failed for file '" << argv[1]+2 << "'.\n"; + break; + default: + cerr << "Fehlerhafte Option: " << argv[1] << endl; + printUsage(1); + } + argv++; + argc--; + } + + + setKorpus(); + if( FileForOther ) + { + fromCatFile(p,FileForOther); + p->initialisierung=InitValue; + p->_initialize(InitValue); + } + + if( hapaxInitName ) + { + fromCatFile(p,hapaxInitName,0); + p->fixInitLike(); + } + + double start2Time=clockSec(); + + if(argc>=2 && strcasecmp(argv[1],"opt")==0 ) + makeIterOpt(); + else if(argc>=2 && strcasecmp(argv[1],"meta-opt")==0) + makeMetaOpt(argc,argv); + else if(argc>=2 && strcasecmp(argv[1],"izr-opt")==0) + makeIzrOpt(); + + + else + { + makeIterOpt(); + } + + if( verboseMode ) + { + cout << " full-time: " << clockSec()-startTime << endl; + cout << "optimize-time: " << clockSec()-start2Time << endl; + } + return 0; +} + |