/****************************************************************************** IrstLM: IRST Language Model Toolkit Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ #include #include #include #include "mempool.h" #include "htable.h" #include "dictionary.h" #include "index.h" using namespace std; dictionary::dictionary(char *filename,int size,char* isymb,char* oovlexfile){ // unitialized memory if (oovlexfile!=NULL) oovlex=new dictionary(oovlexfile,size,isymb,NULL); else oovlex=(dictionary *)NULL; htb = new htable(size/LOAD_FACTOR); tb = new dict_entry[size]; st = new strstack(size * 10); for (int i=0;i> setw(100) >> buffer; inp.close(); if ((strncmp(buffer,"dict",4)==0) || (strncmp(buffer,"DICT",4)==0)) load(filename); else generate(filename); cerr << "loaded \n"; } void dictionary::generate(char *filename){ char buffer[MAX_WORD]; int k; ifstream inp(filename,ios::in); if (!inp){ cerr << "cannot open " << filename << "\n"; exit(1); } cerr << "dict:"; ifl=1; k=0; while (inp >> setw(MAX_WORD) >> buffer){ if (strlen(buffer)==(MAX_WORD-1)){ cerr << "dictionary: a too long word was read (" << buffer << ")\n"; }; if (strlen(buffer)==0){ cerr << "zero lenght word!\n"; continue; } //if (is && (strlen(buffer)==1) && !index(is,buffer[0])) if (is && (strlen(buffer)==1) && (index(is,buffer[0])!=NULL)) continue; //skip over the interruption symbol incfreq(encode(buffer),1); if (!(++k % 1000000)) cerr << "."; } ifl=0; cerr << "\n"; inp.close(); } void dictionary::load(char* filename){ char header[100]; char buffer[MAX_WORD]; char *addr; int freqflag=0; ifstream inp(filename,ios::in); if (!inp){ cerr << "\ncannot open " << filename << "\n"; exit(1); } cerr << "dict:"; inp.getline(header,100); if (strncmp(header,"DICT",4)==0) freqflag=1; else if (strncmp(header,"dict",4)!=0){ cerr << "\ndictionary file " << filename << " has a wrong header\n"; exit(1); } while (inp >> setw(MAX_WORD) >> buffer){ if (strlen(buffer)==(MAX_WORD-1)){ cerr << "\ndictionary: a too long word was read (" << buffer << ")\n"; exit(1); }; tb[n].word=st->push(buffer); tb[n].code=n; if (freqflag) inp >> tb[n].freq; else tb[n].freq=0; if ((addr=htb->search((char *)&tb[n].word,HT_ENTER))) if (addr!=(char *)&tb[n].word){ cerr << "dictionary::loadtxt wrong entry was found (" << buffer << ") in position " << n << "\n"; exit(1); } N+=tb[n].freq; if (strcmp(buffer,OOV())==0) oov_code=n; if (++n==lim) grow(); } inp.close(); } void dictionary::load(std::istream& inp){ char buffer[MAX_WORD]; char *addr; int size; inp >> size; for (int i=0;i> setw(MAX_WORD) >> buffer; if (strlen(buffer)==MAX_WORD-1){ cerr << "\ndictionary::load found word exceeding max length (" << MAX_WORD << ")" << buffer << "\n"; exit(1); }; tb[n].word=st->push(buffer); tb[n].code=n; inp >> tb[n].freq; N+=tb[n].freq; if ((addr=htb->search((char *)&tb[n].word,HT_ENTER))) if (addr!=(char *)&tb[n].word){ cerr << "dictionary::loadtxt wrong entry was found (" << buffer << ") in position " << n << "\n"; exit(1); } if (strcmp(tb[n].word,OOV())==0) oov_code=n; if (++n==lim) grow(); } inp.getline(buffer,MAX_WORD-1); } void dictionary::save(std::ostream& out){ out << n << "\n"; for (int i=0;ifreq-ae->freq; } dictionary::dictionary(dictionary* d){ //transfer values n=d->n; //total entries N=d->N; //total frequency lim=d->lim; //limit of entries oov_code=-1; //code od oov must be re-defined ifl=0; //increment flag=0; dubv=d->dubv; //dictionary upperbound transferred in_oov_lex=0; //does not copy oovlex; //creates a sorted copy of the table tb = new dict_entry[lim]; htb = new htable(lim/LOAD_FACTOR); st = new strstack(lim * 10); for (int i=0;itb[i].code; tb[i].freq=d->tb[i].freq; tb[i].word=st->push(d->tb[i].word); } //sort all entries according to frequency cerr << "sorting dictionary ..."; qsort(tb,n,sizeof(dict_entry),cmpdictentry); cerr << "done\n"; for (int i=0;ioov_code==tb[i].code) oov_code=i; tb[i].code=i; htb->search((char *)&tb[i].word,HT_ENTER); }; } dictionary::~dictionary(){ delete htb; delete st; delete [] tb; } void dictionary::stat(){ cout << "dictionary class statistics\n"; cout << "size " << n << " used memory " << (lim * sizeof(int) + htb->used() + st->used())/1024 << " Kb\n"; } void dictionary::grow(){ delete htb; cerr << "+\b"; dict_entry *tb2=new dict_entry[lim+GROWTH_STEP]; memcpy(tb2,tb,sizeof(dict_entry) * lim ); delete [] tb; tb=tb2; htb=new htable((lim+GROWTH_STEP)/LOAD_FACTOR); for (int i=0;isearch((char *)&tb[i].word,HT_ENTER); for (int i=lim;isearch((char *)&w,HT_FIND); if (ptr==NULL) return -1; return ptr->code; } int dictionary::encode(const char *w){ //case of strange characters if (strlen(w)==0){cerr << "0";w=OOV();} dict_entry* ptr; if ((ptr=(dict_entry *)htb->search((char *)&w,HT_FIND))!=NULL) return ptr->code; else{ if (!ifl){ //do not extend dictionary if (oov_code==-1){ //did not use OOV yet cerr << "starting to use OOV words [" << w << "]\n"; tb[n].word=st->push(OOV()); htb->search((char *)&tb[n].word,HT_ENTER); tb[n].code=n; tb[n].freq=0; oov_code=n; if (++n==lim) grow(); } //if there is an oov lexicon, check if this word belongs to dict_entry* oovptr; if (oovlex){ if ((oovptr=(dict_entry *)oovlex->htb->search((char *)&w,HT_FIND))!=NULL){ in_oov_lex=1; oov_lex_code=oovptr->code; }else in_oov_lex=0; } return encode(OOV()); } else{ //extend dictionary tb[n].word=st->push((char *)w); htb->search((char *)&tb[n].word,HT_ENTER); tb[n].code=n; tb[n].freq=0; if (++n==lim) grow(); return n-1; } } } char *dictionary::decode(int c){ if (c>=0 && c < n) return tb[c].word; else{ cerr << "decode: code out of boundary\n"; return OOV(); } } dictionary_iter::dictionary_iter(dictionary *dict) : m_dict(dict) { m_dict->htb->scan(HT_INIT); } dict_entry* dictionary_iter::next() { return (dict_entry*)m_dict->htb->scan(HT_CONT); } /* main(int argc,char **argv){ dictionary d(argv[1],40000); d.stat(); cout << "ROMA" << d.decode(0) << "\n"; cout << "ROMA:" << d.encode("ROMA") << "\n"; d.save(argv[2]); } */