diff options
Diffstat (limited to 'irstlm/src/dictionary.h')
-rw-r--r-- | irstlm/src/dictionary.h | 186 |
1 files changed, 0 insertions, 186 deletions
/******************************************************************************
 IrstLM: IRST Language Model Toolkit
 Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

******************************************************************************/

#ifndef MF_DICTIONARY_H
#define MF_DICTIONARY_H

#include <cstdio>
#include <cstring>
#include <iostream>

#define MAX_WORD 100
#define LOAD_FACTOR 5

#ifndef GROWTH_STEP
#define GROWTH_STEP 100000
#endif

#ifndef DICT_INITSIZE
#define DICT_INITSIZE 100000
#endif

//Begin of sentence symbol
#ifndef BOS_
#define BOS_ "<s>"
#endif


//End of sentence symbol
#ifndef EOS_
#define EOS_ "</s>"
#endif

//Out-Of-Vocabulary symbol
#ifndef OOV_
#define OOV_ "_unk_"
#endif

//! One row of the dictionary entry table.
typedef struct{
  char *word;  //!< word string
  int code;    //!< integer code of the word
  int freq;    //!< frequency counter of the word
}dict_entry;

class strstack;
class htable;

//! Word dictionary: a table of dict_entry rows with per-word and total
//! frequency counters, plus an optional additional dictionary (oovlex).
//! Construction, destruction, lookup and I/O are defined outside this header.
class dictionary{
  strstack   *st;  //!< stack of strings
  dict_entry *tb;  //!< entry table
  htable    *htb;  //!< hash table
  int          n;  //!< number of entries
  int          N;  //!< total frequency
  int        lim;  //!< limit of entries
  int   oov_code;  //!< code assigned to oov words
  char*       is;  //!< interruption symbol list
  char       ifl;  //!< increment flag
  int        dubv; //!< dictionary size upper bound
  int in_oov_lex;  //!< flag
  int oov_lex_code; //!< dictionary
  char* oov_str;    //!< oov string

 public:

  friend class dictionary_iter;

  dictionary* oovlex; //!< additional dictionary

  //! Returns the dictionary size upper bound.
  inline int dub(){return dubv;}
  //! Sets the dictionary size upper bound and returns the new value.
  inline int dub(int value){return (dubv=value);}

  // The three accessors below return pointers to string literals. The
  // non-const return type is kept for existing callers; the const_cast makes
  // the conversion explicit (the implicit one is ill-formed since C++11).
  // Callers must never write through the returned pointer.

  //! Returns the out-of-vocabulary symbol (OOV_).
  inline char *OOV(){return const_cast<char*>(OOV_);}
  //! Returns the begin-of-sentence symbol (BOS_).
  inline char *BoS(){return const_cast<char*>(BOS_);}
  //! Returns the end-of-sentence symbol (EOS_).
  inline char *EoS(){return const_cast<char*>(EOS_);}

  //! Gets or sets the OOV code.
  //! A non-negative v is stored as the new code; a negative v (the default)
  //! leaves it unchanged. Returns the code in effect after the call.
  inline int oovcode(int v=-1){return oov_code=(v>=0?v:oov_code);}

  //! Gets or sets the interruption symbol list.
  //! With isymb==NULL the current list is returned untouched. Otherwise the
  //! dictionary stores a private copy of isymb and returns a pointer to that
  //! copy, so the caller's buffer may be modified or freed afterwards.
  //! NOTE(review): the destructor is defined outside this header; confirm it
  //! releases `is` with delete[].
  inline char *intsymb(char* isymb=NULL){
    if (isymb==NULL) return is;
    // Room for every character plus the terminating NUL.
    char *copy=new char[strlen(isymb)+1];
    strcpy(copy,isymb);
    // Release the previous list only after copying, so that passing the
    // currently stored pointer back in remains valid.
    delete [] is;
    is=copy;
    return is;
  }

  //! Returns the increment flag.
  inline int incflag(){return ifl;}
  //! Sets the increment flag (stored in a char) and returns the stored value.
  inline int incflag(int v){return ifl=v;}
  //! Returns the number of entries of the additional dictionary, or 0 if none.
  inline int oovlexsize(){return oovlex?oovlex->n:0;}
  inline int inoovlex(){return in_oov_lex;}
  inline int oovlexcode(){return oov_lex_code;}


  //! Returns 1 when w survives formatting with "%s" into a MAX_WORD-sized
  //! buffer unchanged, 0 otherwise. Words of MAX_WORD characters or more do
  //! not fit and therefore yield 0.
  int isprintable(char* w){
    char buffer[MAX_WORD];
    // Bounded formatting: never writes past the end of buffer.
    int written=snprintf(buffer,sizeof(buffer),"%s",w);
    if (written<0 || written>=static_cast<int>(sizeof(buffer))) return 0;
    return strcmp(w,buffer)==0;
  }

  //! Encodes the OOV symbol, reports the resulting code on std::cerr and
  //! stores it as the OOV code.
  inline void genoovcode(){
    int c=encode(OOV());
    std::cerr << "OOV code is "<< c << std::endl;
    oovcode(c);
  }

  //! Gets or (re)loads the additional dictionary.
  //! With fname==NULL the current pointer is returned. Otherwise any existing
  //! additional dictionary is deleted and a new one is constructed from fname
  //! with DICT_INITSIZE as its size argument.
  inline dictionary* oovlexp(char *fname=NULL){
    if (fname==NULL) return oovlex;
    delete oovlex; // deleting a null pointer is a no-op
    oovlex=new dictionary(fname,DICT_INITSIZE);
    return oovlex;
  }

  //! Sets the frequency of the current OOV code to (int)(oovrate*totfreq())
  //! and returns the stored frequency.
  inline int setoovrate(double oovrate){
    encode(OOV()); //be sure OOV code exists
    int oovfreq=(int)(oovrate * totfreq());
    std::cerr << "setting OOV rate to: " << oovrate << " -- freq= " << oovfreq << std::endl;
    return freq(oovcode(),oovfreq);
  }


  //! Adds value to the frequency of code and to the total frequency;
  //! returns the new frequency of code. code is not range-checked.
  inline int incfreq(int code,int value){N+=value;return tb[code].freq+=value;}

  //! Multiplies the frequency of code by value (truncated to int), keeps the
  //! total frequency consistent, and returns the new frequency of code.
  inline int multfreq(int code,double value){
    int scaled=(int)(value * tb[code].freq);
    N+=scaled-tb[code].freq;
    return tb[code].freq=scaled;
  }

  //! Gets or sets the frequency of code.
  //! A non-negative value replaces the stored frequency and adjusts the total
  //! accordingly; a negative value (the default) only reads.
  inline int freq(int code,int value=-1){
    if (value>=0){
      N+=value-tb[code].freq;
      tb[code].freq=value;
    }
    return tb[code].freq;
  }

  //! Returns the total frequency.
  inline int totfreq(){return N;}

  void grow();
  //dictionary(int size=400,char* isym=NULL,char* oovlex=NULL);
  dictionary(char *filename=NULL,int size=DICT_INITSIZE,char* isymb=NULL,char* oovlex=NULL);
  dictionary(dictionary* d);

  ~dictionary();
  void generate(char *filename);
  void load(char *filename);
  void save(char *filename,int freqflag=0);
  void load(std::istream& fd);
  void save(std::ostream& fd);

  //! Returns the number of entries.
  int size(){return n;}
  int getcode(const char *w);
  int encode(const char *w);
  char *decode(int c);
  void stat();

  //! Resets every entry's frequency and the total frequency to zero.
  void cleanfreq(){
    for (int i=0;i<n;i++){
      tb[i].freq=0;
    }
    N=0;
  }

};

//! Iterator-style accessor over the entries of a dictionary; the member
//! functions are defined outside this header.
class dictionary_iter {
 public:
  dictionary_iter(dictionary *dict);
  dict_entry* next();
 private:
  dictionary* m_dict;
};

#endif