Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'irstlm/src/dictionary.cpp')
-rw-r--r--irstlm/src/dictionary.cpp418
1 files changed, 0 insertions, 418 deletions
diff --git a/irstlm/src/dictionary.cpp b/irstlm/src/dictionary.cpp
deleted file mode 100644
index ab53116a2..000000000
--- a/irstlm/src/dictionary.cpp
+++ /dev/null
@@ -1,418 +0,0 @@
-/******************************************************************************
- IrstLM: IRST Language Model Toolkit
- Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this library; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-******************************************************************************/
-
-#include <iomanip>
-#include <iostream>
-#include <fstream>
-#include "mempool.h"
-#include "htable.h"
-#include "dictionary.h"
-#include "index.h"
-
-using namespace std;
-
-dictionary::dictionary(char *filename,int size,char* isymb,char* oovlexfile){
-
- // unitialized memory
- if (oovlexfile!=NULL)
- oovlex=new dictionary(oovlexfile,size,isymb,NULL);
- else
- oovlex=(dictionary *)NULL;
-
- htb = new htable(size/LOAD_FACTOR);
- tb = new dict_entry[size];
- st = new strstack(size * 10);
-
- for (int i=0;i<size;i++) tb[i].freq=0;
-
- is=(char*) NULL;
- intsymb(isymb);
-
- oov_code = -1;
- in_oov_lex=0;
- n = 0;
- N = 0;
- dubv = 0;
- lim = size;
- ifl=0; //increment flag
-
- if (filename==NULL) return;
-
- std::ifstream inp(filename,ios::in);
-
- if (!inp){
- cerr << "cannot open " << filename << "\n";
- exit(1);
- }
-
- char buffer[100];
-
- inp >> setw(100) >> buffer;
-
- inp.close();
-
- if ((strncmp(buffer,"dict",4)==0) ||
- (strncmp(buffer,"DICT",4)==0))
- load(filename);
- else
- generate(filename);
-
- cerr << "loaded \n";
-
-
-}
-
-
-
-void dictionary::generate(char *filename){
-
- char buffer[MAX_WORD];
- int k;
-
- ifstream inp(filename,ios::in);
-
- if (!inp){
- cerr << "cannot open " << filename << "\n";
- exit(1);
- }
-
- cerr << "dict:";
-
- ifl=1; k=0;
- while (inp >> setw(MAX_WORD) >> buffer){
-
- if (strlen(buffer)==(MAX_WORD-1)){
- cerr << "dictionary: a too long word was read ("
- << buffer << ")\n";
- };
-
-
- if (strlen(buffer)==0){
- cerr << "zero lenght word!\n";
- continue;
- }
-
- //if (is && (strlen(buffer)==1) && !index(is,buffer[0]))
- if (is && (strlen(buffer)==1) && (index(is,buffer[0])!=NULL))
- continue; //skip over the interruption symbol
-
- incfreq(encode(buffer),1);
-
- if (!(++k % 1000000)) cerr << ".";
- }
- ifl=0;
- cerr << "\n";
-
- inp.close();
-
-}
-
-void dictionary::load(char* filename){
- char header[100];
- char buffer[MAX_WORD];
- char *addr;
- int freqflag=0;
-
- ifstream inp(filename,ios::in);
-
- if (!inp){
- cerr << "\ncannot open " << filename << "\n";
- exit(1);
- }
-
- cerr << "dict:";
-
- inp.getline(header,100);
- if (strncmp(header,"DICT",4)==0)
- freqflag=1;
- else
- if (strncmp(header,"dict",4)!=0){
- cerr << "\ndictionary file " << filename << " has a wrong header\n";
- exit(1);
- }
-
-
- while (inp >> setw(MAX_WORD) >> buffer){
-
- if (strlen(buffer)==(MAX_WORD-1)){
- cerr << "\ndictionary: a too long word was read ("
- << buffer << ")\n";
- };
-
- tb[n].word=st->push(buffer);
- tb[n].code=n;
-
- if (freqflag)
- inp >> tb[n].freq;
- else
- tb[n].freq=0;
-
- if ((addr=htb->search((char *)&tb[n].word,HT_ENTER)))
- if (addr!=(char *)&tb[n].word){
- cerr << "dictionary::loadtxt wrong entry was found ("
- << buffer << ") in position " << n << "\n";
- exit(1);
- }
-
- N+=tb[n].freq;
-
- if (strcmp(buffer,OOV())==0) oov_code=n;
-
- if (++n==lim) grow();
-
- }
-
- inp.close();
-}
-
-
-void dictionary::load(std::istream& inp){
-
- char buffer[MAX_WORD];
- char *addr;
- int size;
-
- inp >> size;
-
- for (int i=0;i<size;i++){
-
- inp >> buffer;
-
- tb[n].word=st->push(buffer);
- tb[n].code=n;
- inp >> tb[n].freq;
- N+=tb[n].freq;
-
- if ((addr=htb->search((char *)&tb[n].word,HT_ENTER)))
- if (addr!=(char *)&tb[n].word){
- cerr << "dictionary::loadtxt wrong entry was found ("
- << buffer << ") in position " << n << "\n";
- exit(1);
- }
-
- if (strcmp(tb[n].word,OOV())==0)
- oov_code=n;
-
- if (++n==lim) grow();
- }
- inp.getline(buffer,MAX_WORD-1);
-}
-
-void dictionary::save(std::ostream& out){
- out << n << "\n";
- for (int i=0;i<n;i++)
- out << tb[i].word << " " << tb[i].freq << "\n";
-}
-
-
-int cmpdictentry(const void *a,const void *b){
- dict_entry *ae=(dict_entry *)a;
- dict_entry *be=(dict_entry *)b;
- return be->freq-ae->freq;
-}
-
-dictionary::dictionary(dictionary* d){
-
- //transfer values
-
- n=d->n; //total entries
- N=d->N; //total frequency
- lim=d->lim; //limit of entries
- oov_code=-1; //code od oov must be re-defined
- ifl=0; //increment flag=0;
- dubv=d->dubv; //dictionary upperbound transferred
- in_oov_lex=0; //does not copy oovlex;
-
-
- //creates a sorted copy of the table
-
- tb = new dict_entry[lim];
- htb = new htable(lim/LOAD_FACTOR);
- st = new strstack(lim * 10);
-
- for (int i=0;i<n;i++){
- tb[i].code=d->tb[i].code;
- tb[i].freq=d->tb[i].freq;
- tb[i].word=st->push(d->tb[i].word);
- }
-
- //sort all entries according to frequency
- cerr << "sorting dictionary ...";
- qsort(tb,n,sizeof(dict_entry),cmpdictentry);
- cerr << "done\n";
-
- for (int i=0;i<n;i++){
-
- //eventually re-assign oov code
- if (d->oov_code==tb[i].code) oov_code=i;
-
- tb[i].code=i;
- htb->search((char *)&tb[i].word,HT_ENTER);
- };
-
-}
-
-
-
-dictionary::~dictionary(){
- delete htb;
- delete st;
- delete [] tb;
-}
-
-void dictionary::stat(){
- cout << "dictionary class statistics\n";
- cout << "size " << n
- << " used memory "
- << (lim * sizeof(int) +
- htb->used() +
- st->used())/1024 << " Kb\n";
-}
-
-void dictionary::grow(){
-
- delete htb;
-
- cerr << "+\b";
-
- dict_entry *tb2=new dict_entry[lim+GROWTH_STEP];
-
- memcpy(tb2,tb,sizeof(dict_entry) * lim );
-
- delete [] tb; tb=tb2;
-
- htb=new htable((lim+GROWTH_STEP)/LOAD_FACTOR);
-
- for (int i=0;i<lim;i++)
-
- htb->search((char *)&tb[i].word,HT_ENTER);
-
- for (int i=lim;i<lim+GROWTH_STEP;i++) tb[i].freq=0;
-
- lim+=GROWTH_STEP;
-
-
-}
-
-void dictionary::save(char *filename,int freqflag){
-
- std::ofstream out(filename,ios::out);
-
- if (!out){
- cerr << "cannot open " << filename << "\n";
- }
-
- // header
- if (freqflag)
- out << "DICTIONARY 0 " << n << "\n";
- else
- out << "dictionary 0 " << n << "\n";
-
- for (int i=0;i<n;i++){
- out << tb[i].word;
- if (freqflag)
- out << " " << tb[i].freq;
- out << "\n";
- }
-
- out.close();
-}
-
-
-int dictionary::getcode(const char *w){
- dict_entry* ptr=(dict_entry *)htb->search((char *)&w,HT_FIND);
- if (ptr==NULL) return -1;
- return ptr->code;
-}
-
-int dictionary::encode(const char *w){
-
- //case of strange characters
- if (strlen(w)==0){cerr << "0";w=OOV();}
-
- dict_entry* ptr;
-
- if ((ptr=(dict_entry *)htb->search((char *)&w,HT_FIND))!=NULL)
- return ptr->code;
- else{
- if (!ifl){ //do not extend dictionary
- if (oov_code==-1){ //did not use OOV yet
- cerr << "starting to use OOV words [" << w << "]\n";
- tb[n].word=st->push(OOV());
- htb->search((char *)&tb[n].word,HT_ENTER);
- tb[n].code=n;
- tb[n].freq=0;
- oov_code=n;
- if (++n==lim) grow();
- }
- //if there is an oov lexicon, check if this word belongs to
- dict_entry* oovptr;
- if (oovlex){
- if ((oovptr=(dict_entry *)oovlex->htb->search((char *)&w,HT_FIND))!=NULL){
- in_oov_lex=1;
- oov_lex_code=oovptr->code;
- }else
- in_oov_lex=0;
- }
- return encode(OOV());
- }
- else{ //extend dictionary
- tb[n].word=st->push((char *)w);
- htb->search((char *)&tb[n].word,HT_ENTER);
- tb[n].code=n;
- tb[n].freq=0;
- if (++n==lim) grow();
- return n-1;
- }
- }
-}
-
-
-char *dictionary::decode(int c){
- if (c>=0 && c < n)
- return tb[c].word;
- else{
- cerr << "decode: code out of boundary\n";
- return OOV();
- }
-}
-
-
-dictionary_iter::dictionary_iter(dictionary *dict) : m_dict(dict) {
- m_dict->htb->scan(HT_INIT);
-}
-
-dict_entry* dictionary_iter::next() {
- return (dict_entry*)m_dict->htb->scan(HT_CONT);
-}
-
-
-
-
-
-/*
-main(int argc,char **argv){
- dictionary d(argv[1],40000);
- d.stat();
- cout << "ROMA" << d.decode(0) << "\n";
- cout << "ROMA:" << d.encode("ROMA") << "\n";
- d.save(argv[2]);
-}
-*/