diff options
author | Hieu Hoang <hieu@hoang.co.uk> | 2013-11-25 13:56:37 +0400 |
---|---|---|
committer | Hieu Hoang <hieu@hoang.co.uk> | 2013-11-25 13:56:37 +0400 |
commit | a146dbec8f0391e247db1ae4c9b7af5c225436f9 (patch) | |
tree | 1fa97934675448cdcffb26b4737887d551822a39 /Src |
initial add of salm to github
Diffstat (limited to 'Src')
55 files changed, 10193 insertions, 0 deletions
diff --git a/Src/IndexSA/IndexSA.cpp b/Src/IndexSA/IndexSA.cpp new file mode 100755 index 0000000..3013d4c --- /dev/null +++ b/Src/IndexSA/IndexSA.cpp @@ -0,0 +1,58 @@ +/** +* Main function to index a corpus according to its suffix array +* Revision: $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "stdio.h" +#include "stdlib.h" + +#include <cstring> +#include <string> +#include <iostream> +#include <fstream> +#include "_MonoCorpus.h" +#include "salm_shared.h" + +using namespace std; + +IndexType * corpus; //because the compare function needs to see this, make it global +TextLenType actualCorpusSize; + +int main(int argc, char* argv[]){ + + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:"); + fprintf(stderr,"\n%s fileNameStem [existingIDVocFileName]\n",argv[0]); + + exit(0); + } + + C_MonoCorpus corpus; + + char vocFileName[1024]; + sprintf(vocFileName, "%s.id_voc", argv[1]); + + if(argc==2){ //no existing vocabulary given + cerr<<"Initialize vocabulary file: "<<vocFileName<<endl; + corpus.initializeVocabulary(argv[1]); + corpus.loadCorpusAndSort(argv[1], vocFileName, true); + } + else{ + if(strcmp(vocFileName, argv[2])!=0){ + cerr<<"Error! 
ExistingIDVocFileName has to be called: "<<vocFileName<<" and cover all the words in the corpus."<<endl; + exit(-1); + } + corpus.loadCorpusAndSort(argv[1], argv[2], false); + } + + corpus.output(argv[1]); + + return 0; +} + diff --git a/Src/IndexSA/IndexSA.cpp~ b/Src/IndexSA/IndexSA.cpp~ new file mode 100755 index 0000000..d8ad043 --- /dev/null +++ b/Src/IndexSA/IndexSA.cpp~ @@ -0,0 +1,57 @@ +/** +* Main function to index a corpus according to its suffix array +* Revision: $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include "_MonoCorpus.h" +#include "salm_shared.h" + +using namespace std; + +IndexType * corpus; //because the compare function needs to see this, make it global +TextLenType actualCorpusSize; + +int main(int argc, char* argv[]){ + + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:"); + fprintf(stderr,"\n%s fileNameStem [existingIDVocFileName]\n",argv[0]); + + exit(0); + } + + C_MonoCorpus corpus; + + char vocFileName[1024]; + sprintf(vocFileName, "%s.id_voc", argv[1]); + + if(argc==2){ //no existing vocabulary given + cerr<<"Initialize vocabulary file: "<<vocFileName<<endl; + corpus.initializeVocabulary(argv[1]); + corpus.loadCorpusAndSort(argv[1], vocFileName, true); + } + else{ + if(strcmp(vocFileName, argv[2])!=0){ + cerr<<"Error! 
ExistingIDVocFileName has to be called: "<<vocFileName<<" and cover all the words in the corpus."<<endl; + exit(-1); + } + corpus.loadCorpusAndSort(argv[1], argv[2], false); + } + + corpus.output(argv[1]); + + return 0; +} + diff --git a/Src/IndexSA/_MonoCorpus.cpp b/Src/IndexSA/_MonoCorpus.cpp new file mode 100755 index 0000000..ab53813 --- /dev/null +++ b/Src/IndexSA/_MonoCorpus.cpp @@ -0,0 +1,440 @@ +/** +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "_MonoCorpus.h" +#include "malloc.h" +#include "time.h" + +#include <fstream> +#include <iostream> +#include <cstring> +#include <string> +#include <algorithm> + +using namespace std; + +extern IndexType * corpus; +extern TextLenType actualCorpusSize; + +bool operator<(const C_SuffixPointer& a, const C_SuffixPointer& b) +{ + bool stillEqual = true; + TextLenType currentPosOfA = a.pointer; + TextLenType currentPosOfB = b.pointer; + + if(currentPosOfA==currentPosOfB){ + return false; + } + + while(stillEqual){ + if(corpus[currentPosOfA]<corpus[currentPosOfB]){ + return true; + } + + if(corpus[currentPosOfA]>corpus[currentPosOfB]){ + return false; + } + + //then still equal at these two positions + currentPosOfA++; + currentPosOfB++; + + if(currentPosOfA>=actualCorpusSize){ + currentPosOfA=0; + } + + if(currentPosOfB>=actualCorpusSize){ + currentPosOfB=0; + } + } + + //equal + return false; +} + + +C_SuffixPointer::C_SuffixPointer() +{ + +} + +//copy constructor +C_SuffixPointer::C_SuffixPointer(const C_SuffixPointer & obj) +{ + this->pointer = obj.pointer; +} + +C_SuffixPointer::~C_SuffixPointer() +{ + +} + + +C_SuffixPointer::C_SuffixPointer(TextLenType pointer) +{ + this->pointer = pointer; +} +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_MonoCorpus::C_MonoCorpus() +{ + this->currentPosInCorpus = 0; + 
this->maxVocIdFromCorpus = 0; +} + +C_MonoCorpus::~C_MonoCorpus() +{ + free(corpus); + free(this->suffix); + free(this->offsetList); +} + + +/** +* Initialize an IDVocabulary file +**/ +void C_MonoCorpus::initializeVocabulary(char *fileNameStem) +{ + C_IDVocabulary tmpVoc; + tmpVoc.addingReservedWords(); + + char vocFileName[1024]; + sprintf(vocFileName, "%s.id_voc", fileNameStem); + + tmpVoc.outputToFile(vocFileName); +} + + +void C_MonoCorpus::loadCorpusAndSort(const char *fileName, const char * idVocFileName, bool vocNeedsToBeUpdated) +{ + IndexType id = 0; + + //load vocabulary + this->voc = new C_IDVocabulary(idVocFileName); + this->vocNeedsToBeUpdated = vocNeedsToBeUpdated; + + this->vocIdForSentIdPlaceHolder = this->voc->returnId(C_String("_SENT_ID_PLACEHOLDER_")); + if(this->vocIdForSentIdPlaceHolder==0){ + cerr<<"ID vocabulary does not have the type _SENT_ID_PLACEHOLDER_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentIdPlaceHolder>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentIdPlaceHolder; + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"ID vocabulary does not have the type _SENTENCE_START_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentStart>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentStart; + } + + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + if(this->vocIdForSentEnd==0){ + cerr<<"ID vocabulary does not have the type _END_OF_SENTENCE_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentEnd>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentEnd; + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"ID vocabulary 
does not have the type _END_OF_CORPUS_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForCorpusEnd>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForCorpusEnd; + } + + ifstream textStream1; + textStream1.open(fileName); + + if(textStream1==NULL){ + fprintf(stderr,"Text %s does not exist. Exit!\n",fileName); + exit(-1); + } + + long ltime1, ltime2; + time( <ime1 ); + + string aLine; + unsigned int sentNumber = 1; + unsigned int sentLen = 0; + unsigned int corpusSize = 0; + + char * thisToken; + char delimit[] =" \t\r\n"; + + //first, scan the corpus to estimate the size and check if each line is shorter than 256 words + getline(textStream1, aLine); + while(!textStream1.eof()){ + + if(aLine.length()>0){ + sentLen = 0; + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + if(this->vocNeedsToBeUpdated){ + id = this->voc->getId(C_String(thisToken)); + } + else{ //the provided vocabulary should cover all the words in this corpus + id = this->voc->returnId(C_String(thisToken)); + + if(id==0){ //word does not exist + cerr<<"Vocabulary: "<<idVocFileName<<" does not cover all the words in the corpus!"<<endl; + cerr<<"Word: "<<thisToken<<" does not exist in the voc!\n"; + exit(-1); + } + } + + + + sentLen++; + + if(id>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = id; + } + + if(sentLen>=256){ + cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. Please cut it short first!\n"; + exit(-1); + } + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + corpusSize+=sentLen; + + sentLen = 0; + sentNumber++; + } + else{ + cerr<<"Warning: sentence "<<sentNumber<< " is empty. 
Ignore this message if this is the last sentence.\n"; + } + getline(textStream1, aLine); + } + + sentNumber--; + unsigned int estimatedSize = corpusSize+3*sentNumber+1000; //with some redundancy + cerr<<sentNumber<<" sentences and "<<corpusSize<<" words in corpus\n"; + cerr<<"Reserve "<<estimatedSize*2<<" bytes in RAM for sorting\n"; + textStream1.close(); + + + //second pass, convert the corpus into vocIDs and create suffix array + ifstream textStream2; + textStream2.open(fileName); + + this->allocateMem(estimatedSize); + this->currentPosInCorpus = 0; + sentNumber = 1; + + getline(textStream2, aLine); + while(!textStream2.eof()){ + + if(aLine.length()>0){ + sentLen = 0; + + //add sentId + //offset at this position will store the acutal sentence length + corpus[this->currentPosInCorpus]=this->vocIdForSentIdPlaceHolder; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->currentPosInCorpus++; + + //add <s> + sentLen++; //not real sentence length, but to keep track of offset + corpus[this->currentPosInCorpus]=this->vocIdForSentStart; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen; + this->currentPosInCorpus++; + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + id = this->voc->returnId(C_String(thisToken)); + if(id==0){ + cerr<<"Word \""<<thisToken<<"\" is not listed in the IDVocabulary.\n"; + exit(-1); + } + + sentLen++; + + if(id>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = id; + } + + corpus[this->currentPosInCorpus]=id; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen; + this->currentPosInCorpus++; + + if(sentLen>=256){ + cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. 
Please cut it short first!\n"; + exit(-1); + } + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + + //add <sentEnd> + corpus[this->currentPosInCorpus]=this->vocIdForSentEnd; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) (sentLen + 1); + this->offsetList[this->currentPosInCorpus - sentLen - 1] = (unsigned char) (sentLen-1); //write the sentLen to sent begin correspond to <sentId> + this->currentPosInCorpus++; + + sentLen = 0; + sentNumber++; + } + else{ + cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this if this is the last sentence.\n"; + } + + aLine[0]=0; + getline(textStream2, aLine); + } + textStream2.close(); + + //add <endOfCorpus> to the end of data + corpus[this->currentPosInCorpus]=this->vocIdForCorpusEnd; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) 0; + this->currentPosInCorpus++; + + actualCorpusSize = this->currentPosInCorpus; + + time( <ime2 ); + cerr<<"\nCorpus loaded in: "<<ltime2-ltime1<<" seconds."<<endl; + cerr<<"Total "<<sentNumber-1<<" sentences loaded.\n"; + + //replace the sentId place holder to actual sentId + time( <ime1 ); + cerr<<"Inserting sentence IDs into the corpus...\n"; + IndexType sentId = this->maxVocIdFromCorpus+1; + for(TextLenType i=0;i<actualCorpusSize;i++){ + if(corpus[i]==this->vocIdForSentIdPlaceHolder){ + corpus[i]=sentId; + sentId++; + } + } + time( <ime2 ); + cerr<<"\nSentence IDs inserted in: "<<ltime2-ltime1<<" seconds."<<endl; + + //sorting + time( <ime1 ); + cerr<<"Sorting the suffix...\n"; + sort(this->suffix, this->suffix+actualCorpusSize); + time( <ime2 ); + cerr<<"\nCorpus sorted in: "<<ltime2-ltime1<<" seconds."<<endl; + cerr<<"Done."<<endl; + +} + +void C_MonoCorpus::allocateMem(TextLenType corpusSize) +{ + corpus = (IndexType *) 
malloc(sizeof(IndexType)*corpusSize); + + if(corpus==0){ + cerr<<"Failed to allocate memory for corpus. Quit!\n"; + exit(-1); + } + + this->suffix = (C_SuffixPointer *) malloc(sizeof(C_SuffixPointer)*corpusSize); + if(this->suffix==0){ + cerr<<"Failed to allocate memory for suffix. Quit!\n"; + exit(-1); + } + + this->offsetList = (unsigned char *) malloc(sizeof(unsigned char)*corpusSize); + if(this->offsetList==0){ + cerr<<"Failed to allocate memory for offset. Quit!\n"; + exit(-1); + } + +} + + +void C_MonoCorpus::outputCorpus(char *filename) +{ + cerr<<"Writing corpus to file: "<<filename<<endl; + ofstream textOutStream; + textOutStream.open(filename, ios::binary); + + //first, write down the corpus size + textOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0; i<actualCorpusSize;i++){ + textOutStream.write((char *)&(corpus[i]), sizeof(IndexType)); + } + + textOutStream.close(); + +} + +void C_MonoCorpus::outputOffset(char *filename) +{ + cerr<<"Writing offset to file: "<<filename<<endl; + + ofstream offsetOutStream; + offsetOutStream.open(filename, ios::binary); + + //first, write down the corpus size + offsetOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0; i<actualCorpusSize; i++){ + offsetOutStream.write((char *)& (this->offsetList[i]), sizeof(unsigned char)); + } + offsetOutStream.close(); +} + +void C_MonoCorpus::outputSuffix(char *filename) +{ + cerr<<"Writing suffix information to file: "<<filename<<endl; + + ofstream saOutStream; + saOutStream.open(filename, ios::binary); + + //first, write down the corpus size + saOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0;i<actualCorpusSize; i++){ + saOutStream.write((char *) & (this->suffix[i].pointer), sizeof(TextLenType)); + } + + saOutStream.close(); +} + +void C_MonoCorpus::output(char *filename) +{ + char outputVocFileName[1024]; + char outputCorpusFileName[1024]; + char 
outputOffsetFileName[1024]; + char outputSuffixFileName[1024]; + + + if(this->vocNeedsToBeUpdated){ + sprintf(outputVocFileName, "%s.id_voc", filename); + this->voc->outputToFile(outputVocFileName); + } + + sprintf(outputCorpusFileName, "%s.sa_corpus", filename); + sprintf(outputOffsetFileName, "%s.sa_offset", filename); + sprintf(outputSuffixFileName, "%s.sa_suffix", filename); + + + this->outputCorpus(outputCorpusFileName); + this->outputOffset(outputOffsetFileName); + this->outputSuffix(outputSuffixFileName); +} + diff --git a/Src/IndexSA/_MonoCorpus.cpp~ b/Src/IndexSA/_MonoCorpus.cpp~ new file mode 100755 index 0000000..3e3a29b --- /dev/null +++ b/Src/IndexSA/_MonoCorpus.cpp~ @@ -0,0 +1,439 @@ +/** +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "_MonoCorpus.h" +#include "malloc.h" +#include "time.h" + +#include <fstream> +#include <iostream> +#include <string> +#include <algorithm> + +using namespace std; + +extern IndexType * corpus; +extern TextLenType actualCorpusSize; + +bool operator<(const C_SuffixPointer& a, const C_SuffixPointer& b) +{ + bool stillEqual = true; + TextLenType currentPosOfA = a.pointer; + TextLenType currentPosOfB = b.pointer; + + if(currentPosOfA==currentPosOfB){ + return false; + } + + while(stillEqual){ + if(corpus[currentPosOfA]<corpus[currentPosOfB]){ + return true; + } + + if(corpus[currentPosOfA]>corpus[currentPosOfB]){ + return false; + } + + //then still equal at these two positions + currentPosOfA++; + currentPosOfB++; + + if(currentPosOfA>=actualCorpusSize){ + currentPosOfA=0; + } + + if(currentPosOfB>=actualCorpusSize){ + currentPosOfB=0; + } + } + + //equal + return false; +} + + +C_SuffixPointer::C_SuffixPointer() +{ + +} + +//copy constructor +C_SuffixPointer::C_SuffixPointer(const C_SuffixPointer & obj) +{ + this->pointer = obj.pointer; +} + +C_SuffixPointer::~C_SuffixPointer() +{ + +} + + +C_SuffixPointer::C_SuffixPointer(TextLenType pointer) +{ 
+ this->pointer = pointer; +} +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_MonoCorpus::C_MonoCorpus() +{ + this->currentPosInCorpus = 0; + this->maxVocIdFromCorpus = 0; +} + +C_MonoCorpus::~C_MonoCorpus() +{ + free(corpus); + free(this->suffix); + free(this->offsetList); +} + + +/** +* Initialize an IDVocabulary file +**/ +void C_MonoCorpus::initializeVocabulary(char *fileNameStem) +{ + C_IDVocabulary tmpVoc; + tmpVoc.addingReservedWords(); + + char vocFileName[1024]; + sprintf(vocFileName, "%s.id_voc", fileNameStem); + + tmpVoc.outputToFile(vocFileName); +} + + +void C_MonoCorpus::loadCorpusAndSort(const char *fileName, const char * idVocFileName, bool vocNeedsToBeUpdated) +{ + IndexType id = 0; + + //load vocabulary + this->voc = new C_IDVocabulary(idVocFileName); + this->vocNeedsToBeUpdated = vocNeedsToBeUpdated; + + this->vocIdForSentIdPlaceHolder = this->voc->returnId(C_String("_SENT_ID_PLACEHOLDER_")); + if(this->vocIdForSentIdPlaceHolder==0){ + cerr<<"ID vocabulary does not have the type _SENT_ID_PLACEHOLDER_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentIdPlaceHolder>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentIdPlaceHolder; + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"ID vocabulary does not have the type _SENTENCE_START_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentStart>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentStart; + } + + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + if(this->vocIdForSentEnd==0){ + cerr<<"ID vocabulary does not have the type _END_OF_SENTENCE_, error!\n Add this word to the universal vocabulary and try 
again!\n"; + exit(-1); + } + if(this->vocIdForSentEnd>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentEnd; + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"ID vocabulary does not have the type _END_OF_CORPUS_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForCorpusEnd>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForCorpusEnd; + } + + ifstream textStream1; + textStream1.open(fileName); + + if(textStream1==NULL){ + fprintf(stderr,"Text %s does not exist. Exit!\n",fileName); + exit(-1); + } + + long ltime1, ltime2; + time( <ime1 ); + + string aLine; + unsigned int sentNumber = 1; + unsigned int sentLen = 0; + unsigned int corpusSize = 0; + + char * thisToken; + char delimit[] =" \t\r\n"; + + //first, scan the corpus to estimate the size and check if each line is shorter than 256 words + getline(textStream1, aLine); + while(!textStream1.eof()){ + + if(aLine.length()>0){ + sentLen = 0; + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + if(this->vocNeedsToBeUpdated){ + id = this->voc->getId(C_String(thisToken)); + } + else{ //the provided vocabulary should cover all the words in this corpus + id = this->voc->returnId(C_String(thisToken)); + + if(id==0){ //word does not exist + cerr<<"Vocabulary: "<<idVocFileName<<" does not cover all the words in the corpus!"<<endl; + cerr<<"Word: "<<thisToken<<" does not exist in the voc!\n"; + exit(-1); + } + } + + + + sentLen++; + + if(id>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = id; + } + + if(sentLen>=256){ + cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. 
Please cut it short first!\n"; + exit(-1); + } + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + corpusSize+=sentLen; + + sentLen = 0; + sentNumber++; + } + else{ + cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this message if this is the last sentence.\n"; + } + getline(textStream1, aLine); + } + + sentNumber--; + unsigned int estimatedSize = corpusSize+3*sentNumber+1000; //with some redundancy + cerr<<sentNumber<<" sentences and "<<corpusSize<<" words in corpus\n"; + cerr<<"Reserve "<<estimatedSize*2<<" bytes in RAM for sorting\n"; + textStream1.close(); + + + //second pass, convert the corpus into vocIDs and create suffix array + ifstream textStream2; + textStream2.open(fileName); + + this->allocateMem(estimatedSize); + this->currentPosInCorpus = 0; + sentNumber = 1; + + getline(textStream2, aLine); + while(!textStream2.eof()){ + + if(aLine.length()>0){ + sentLen = 0; + + //add sentId + //offset at this position will store the acutal sentence length + corpus[this->currentPosInCorpus]=this->vocIdForSentIdPlaceHolder; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->currentPosInCorpus++; + + //add <s> + sentLen++; //not real sentence length, but to keep track of offset + corpus[this->currentPosInCorpus]=this->vocIdForSentStart; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen; + this->currentPosInCorpus++; + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + id = this->voc->returnId(C_String(thisToken)); + if(id==0){ + cerr<<"Word \""<<thisToken<<"\" is not listed in the IDVocabulary.\n"; + exit(-1); + } + + sentLen++; + + if(id>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = id; + } + + corpus[this->currentPosInCorpus]=id; + 
this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen; + this->currentPosInCorpus++; + + if(sentLen>=256){ + cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. Please cut it short first!\n"; + exit(-1); + } + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + + //add <sentEnd> + corpus[this->currentPosInCorpus]=this->vocIdForSentEnd; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) (sentLen + 1); + this->offsetList[this->currentPosInCorpus - sentLen - 1] = (unsigned char) (sentLen-1); //write the sentLen to sent begin correspond to <sentId> + this->currentPosInCorpus++; + + sentLen = 0; + sentNumber++; + } + else{ + cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this if this is the last sentence.\n"; + } + + aLine[0]=0; + getline(textStream2, aLine); + } + textStream2.close(); + + //add <endOfCorpus> to the end of data + corpus[this->currentPosInCorpus]=this->vocIdForCorpusEnd; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) 0; + this->currentPosInCorpus++; + + actualCorpusSize = this->currentPosInCorpus; + + time( <ime2 ); + cerr<<"\nCorpus loaded in: "<<ltime2-ltime1<<" seconds."<<endl; + cerr<<"Total "<<sentNumber-1<<" sentences loaded.\n"; + + //replace the sentId place holder to actual sentId + time( <ime1 ); + cerr<<"Inserting sentence IDs into the corpus...\n"; + IndexType sentId = this->maxVocIdFromCorpus+1; + for(TextLenType i=0;i<actualCorpusSize;i++){ + if(corpus[i]==this->vocIdForSentIdPlaceHolder){ + corpus[i]=sentId; + sentId++; + } + } + time( <ime2 ); + cerr<<"\nSentence IDs inserted in: "<<ltime2-ltime1<<" seconds."<<endl; + + //sorting + time( <ime1 
); + cerr<<"Sorting the suffix...\n"; + sort(this->suffix, this->suffix+actualCorpusSize); + time( <ime2 ); + cerr<<"\nCorpus sorted in: "<<ltime2-ltime1<<" seconds."<<endl; + cerr<<"Done."<<endl; + +} + +void C_MonoCorpus::allocateMem(TextLenType corpusSize) +{ + corpus = (IndexType *) malloc(sizeof(IndexType)*corpusSize); + + if(corpus==0){ + cerr<<"Failed to allocate memory for corpus. Quit!\n"; + exit(-1); + } + + this->suffix = (C_SuffixPointer *) malloc(sizeof(C_SuffixPointer)*corpusSize); + if(this->suffix==0){ + cerr<<"Failed to allocate memory for suffix. Quit!\n"; + exit(-1); + } + + this->offsetList = (unsigned char *) malloc(sizeof(unsigned char)*corpusSize); + if(this->offsetList==0){ + cerr<<"Failed to allocate memory for offset. Quit!\n"; + exit(-1); + } + +} + + +void C_MonoCorpus::outputCorpus(char *filename) +{ + cerr<<"Writing corpus to file: "<<filename<<endl; + ofstream textOutStream; + textOutStream.open(filename, ios::binary); + + //first, write down the corpus size + textOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0; i<actualCorpusSize;i++){ + textOutStream.write((char *)&(corpus[i]), sizeof(IndexType)); + } + + textOutStream.close(); + +} + +void C_MonoCorpus::outputOffset(char *filename) +{ + cerr<<"Writing offset to file: "<<filename<<endl; + + ofstream offsetOutStream; + offsetOutStream.open(filename, ios::binary); + + //first, write down the corpus size + offsetOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0; i<actualCorpusSize; i++){ + offsetOutStream.write((char *)& (this->offsetList[i]), sizeof(unsigned char)); + } + offsetOutStream.close(); +} + +void C_MonoCorpus::outputSuffix(char *filename) +{ + cerr<<"Writing suffix information to file: "<<filename<<endl; + + ofstream saOutStream; + saOutStream.open(filename, ios::binary); + + //first, write down the corpus size + saOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + 
for(TextLenType i=0;i<actualCorpusSize; i++){ + saOutStream.write((char *) & (this->suffix[i].pointer), sizeof(TextLenType)); + } + + saOutStream.close(); +} + +void C_MonoCorpus::output(char *filename) +{ + char outputVocFileName[1024]; + char outputCorpusFileName[1024]; + char outputOffsetFileName[1024]; + char outputSuffixFileName[1024]; + + + if(this->vocNeedsToBeUpdated){ + sprintf(outputVocFileName, "%s.id_voc", filename); + this->voc->outputToFile(outputVocFileName); + } + + sprintf(outputCorpusFileName, "%s.sa_corpus", filename); + sprintf(outputOffsetFileName, "%s.sa_offset", filename); + sprintf(outputSuffixFileName, "%s.sa_suffix", filename); + + + this->outputCorpus(outputCorpusFileName); + this->outputOffset(outputOffsetFileName); + this->outputSuffix(outputSuffixFileName); +} + diff --git a/Src/IndexSA/_MonoCorpus.h b/Src/IndexSA/_MonoCorpus.h new file mode 100755 index 0000000..4c834b0 --- /dev/null +++ b/Src/IndexSA/_MonoCorpus.h @@ -0,0 +1,60 @@ +#if !defined(__MonoCorpus__H__INCLUDED_) +#define __MonoCorpus__H__INCLUDED_ + +#include "_IDVocabulary.h" +#include "salm_shared.h" + +/** +* \ingroup index +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +* Defines the wrapper class for the comparing function +**/ +class C_SuffixPointer +{ +public: + C_SuffixPointer(const C_SuffixPointer &); + C_SuffixPointer(); + ~C_SuffixPointer(); + C_SuffixPointer(TextLenType pointer); + TextLenType pointer; +}; + +/** +* \ingroup index +* Monolingual corpus class for loading the corpus from file, sort it according to the suffix array order +* and convert it to the binary format for suffix array applications +**/ +class C_MonoCorpus +{ +public: + void initializeVocabulary(char * fileNameStem); + void output(char * filename); + void loadCorpusAndSort(const char * fileName, const char * idVocFileName, bool vocNeedsToBeUpdated); + + C_MonoCorpus(); + virtual ~C_MonoCorpus(); + +private: + IndexType 
maxVocIdFromCorpus; + void outputSuffix(char * filename); + void outputOffset(char * filename); + void outputCorpus(char * filename); + + IndexType vocIdForSentIdPlaceHolder; + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + + TextLenType currentPosInCorpus; + void allocateMem(TextLenType corpusSize); + + C_SuffixPointer * suffix; + unsigned char * offsetList; + C_IDVocabulary * voc; + + bool vocNeedsToBeUpdated; + +}; + +#endif // !defined(__MonoCorpus__H__INCLUDED_) diff --git a/Src/SALM-API-Description.txt b/Src/SALM-API-Description.txt new file mode 100755 index 0000000..c36f60c --- /dev/null +++ b/Src/SALM-API-Description.txt @@ -0,0 +1,24 @@ +/**
+* \defgroup index Indexing the corpus
+* \defgroup search Search Applications
+* \defgroup scan Scan Applications
+* \defgroup lm Suffix Array Language Model
+* \defgroup utils Utilities
+*
+* \mainpage SALM API Documentation
+* Author: <a href=mailto:joy+salm@cs.cmu.edu > Ying (Joy) Zhang </a>
+* \section intro Introduction
+*
+* There are three main modules in <a href=http://projectile.is.cs.cmu.edu/research/public/tools/salm/salm.htm > SALM </a>: Indexing, Searching and Scanning.
+* To start, use IndexSA to index the corpus according to its suffix array.
+* This is the first step for all applications.
+* Once the corpus is indexed, we can use SALM to perform all kinds of interesting processing on this corpus.
+* \section search Applications based on searching the corpus
+* These applications search for the occurrences of an n-gram or all the embedded n-grams of a sentence in the corpus.
+* \section scan Applications based on scanning the corpus
+* These applications scan through the corpus in linear time and collect information such as the type/token frequency of the n-grams in the data.
+* \section lm Suffix Array Language Model
+* An online language model based on the suffix array indexing. Suffix array language model can use arbitrarily long history and very large corpus.
+* \section utils Utilities
+* Utility functions such as updating the universal ID vocabulary after observing a new corpus.
+**/
diff --git a/Src/Shared/_IDVocabulary.cpp b/Src/Shared/_IDVocabulary.cpp new file mode 100755 index 0000000..a34b043 --- /dev/null +++ b/Src/Shared/_IDVocabulary.cpp @@ -0,0 +1,219 @@ +/** +* _IDVocabulary.cpp: implementation of the C_IDVocabulary class. +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + + +#include "_IDVocabulary.h" +#include <fstream> +#include <iostream> +#include <cstring> +#include <memory.h> +#include <stdlib.h> + +using namespace std; + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_IDVocabulary::C_IDVocabulary() +{ + this->maxIdInVoc = 0; +} + +C_IDVocabulary::C_IDVocabulary(const char * fileName) +{ + + this->maxIdInVoc = 0; + + this->loadFromFile(fileName); +} + +C_IDVocabulary::~C_IDVocabulary() +{ + +} + +/// Return the vocID of word "text" if it exist in the vocabulary +/// Otherwise return 0 +IndexType C_IDVocabulary::returnId(C_String text) +{ + IndexType id; + + map<C_String, IndexType, ltstr>::iterator iterText2Id; + iterText2Id = this->text2id.find(text); + + if(iterText2Id==this->text2id.end()){ //this word does not exist in the voc yet, return ID for <unk> + id = 0; + } + else{ + id = iterText2Id->second; + } + + return id; +} + +/// Return the text of the word given its vocID +/// return <UNK> if specified vocID does not exist +C_String C_IDVocabulary::getText(IndexType id) +{ + map<IndexType, C_String>::iterator iterId2Text; + iterId2Text = this->id2text.find(id); + + if(iterId2Text==this->id2text.end()){ + return C_String("<UNK>"); + } + + return iterId2Text->second; +} + +IndexType C_IDVocabulary::getSize() +{ + return this->text2id.size(); +} + + +/// Load the vocabulary file into memory +/// The format of the vocabulary file is: +/// word vocID +// in each line. 
+void C_IDVocabulary::loadFromFile(const char *fileName) +{ + + ifstream existingVocFile; + existingVocFile.open(fileName); + + if(!existingVocFile){ + cerr<<"Can not open existing vocabulary file "<<fileName<<endl; + exit(0); + } + + cerr<<"Loading existing vocabulary file: "<<fileName<<endl; + + char aLine[1024]; + char * aToken; + char delimit[] = " \t\r\n"; + IndexType vocId = 0; + + while(!existingVocFile.eof()){ + existingVocFile.getline(aLine, 1024, '\n'); + + if(strlen(aLine)>0){ //a meaningful word, esp for the last line during reading file + vector<C_String> tokensInLine; + + aToken = strtok(aLine, delimit); + while( aToken != NULL ) { + tokensInLine.push_back(C_String(aToken)); + aToken = strtok( NULL, delimit); + } + + if(tokensInLine.size()!=2){ + cerr<<"Not valid format for Vocabulary: "<<aLine<<endl; + } + + vocId = atoi(tokensInLine[1].toString()); + + if(vocId>this->maxIdInVoc){ + this->maxIdInVoc = vocId; + } + + this->text2id.insert(make_pair(tokensInLine[0], vocId)); + this->id2text.insert(make_pair(vocId, tokensInLine[0] )); + + } + + aLine[0]=0; + } + cerr<<"Total "<<this->text2id.size()<<" word types loaded\n"; + cerr<<"Max VocID="<<this->maxIdInVoc<<endl; +} + +/// Return the maximum ID from all words in the vocabulary +/// Usually equals to the size of the vocabulary if the vocabulary is created from this corpus only. 
+/// If the vocabulary includes words from other corpora and the vocabulary only lists words in this corpus, +/// then max voc ID could be different from the vocabulary size +IndexType C_IDVocabulary::returnMaxID() +{ + return this->maxIdInVoc; +} + +IndexType C_IDVocabulary::returnNullWordID() +{ + return 0; +} + +/** +* Output the vocabulary to a file +**/ +void C_IDVocabulary::outputToFile(char *filename) +{ + + ofstream outputVocFile; + outputVocFile.open(filename); + + if(!outputVocFile){ + cerr<<"Can not open "<<filename<<" to write vocabulary\n"; + exit(-1); + } + + map<C_String, IndexType, ltstr>::iterator iterText2Id; + + iterText2Id = this->text2id.begin(); + while(iterText2Id!=this->text2id.end()){ + outputVocFile<<iterText2Id->first.toString()<<"\t"<<iterText2Id->second<<endl; + iterText2Id++; + } + + outputVocFile.close(); +} + +/// Reserver vocID 0-NUMBER_OF_RESERVED_WORDS_IN_VOC for special words that might be useful for applications +/// Here we reserved 5 words: +/// _SENT_ID_PLACEHOLDER_ 1 +/// _END_OF_SENTENCE_ 2 +/// _TOO_LONG_TOKEN_ 3 +/// _SENTENCE_START_ 4 +/// _END_OF_CORPUS_ 5 +/// You can add other special words to the list as long as the assignment of vocID and its interpretation is consistent between application and indexing +void C_IDVocabulary::addingReservedWords() +{ + this->insertWord(C_String("_SENT_ID_PLACEHOLDER_"), 1); + this->insertWord(C_String("_END_OF_SENTENCE_"), 2); + this->insertWord(C_String("_TOO_LONG_TOKEN_"), 3); + this->insertWord(C_String("_SENTENCE_START_"), 4); + this->insertWord(C_String("_END_OF_CORPUS_"), 5); + + char reservedWord[20]; + for(int i=6; i<=NUMBER_OF_RESERVED_WORDS_IN_VOC; i++){ + memset(reservedWord, 0, 20); + sprintf(reservedWord, "_RESERVED_WORDS_%d", i); + this->insertWord(C_String(reservedWord), i); + } +} + +void C_IDVocabulary::insertWord(C_String text, IndexType id) +{ + this->text2id.insert(make_pair(text, id)); + this->id2text.insert(make_pair(id, text)); + +} + +/** +* Check if the word 
already exist in the voc, +* if so, return the vocID of the word, +* otherwise assign an ID to this word and insert it into the voc +**/ +IndexType C_IDVocabulary::getId(C_String text) +{ + IndexType id = this->returnId(text); + if(id==0){ + this->maxIdInVoc++; + this->insertWord(text, this->maxIdInVoc); + return this->maxIdInVoc; + } + + //else, already exist + return id; +} diff --git a/Src/Shared/_IDVocabulary.cpp~ b/Src/Shared/_IDVocabulary.cpp~ new file mode 100755 index 0000000..d5e6a14 --- /dev/null +++ b/Src/Shared/_IDVocabulary.cpp~ @@ -0,0 +1,218 @@ +/** +* _IDVocabulary.cpp: implementation of the C_IDVocabulary class. +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + + +#include "_IDVocabulary.h" +#include <fstream> +#include <iostream> +#include <cstring> +#include <memory.h> + +using namespace std; + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_IDVocabulary::C_IDVocabulary() +{ + this->maxIdInVoc = 0; +} + +C_IDVocabulary::C_IDVocabulary(const char * fileName) +{ + + this->maxIdInVoc = 0; + + this->loadFromFile(fileName); +} + +C_IDVocabulary::~C_IDVocabulary() +{ + +} + +/// Return the vocID of word "text" if it exist in the vocabulary +/// Otherwise return 0 +IndexType C_IDVocabulary::returnId(C_String text) +{ + IndexType id; + + map<C_String, IndexType, ltstr>::iterator iterText2Id; + iterText2Id = this->text2id.find(text); + + if(iterText2Id==this->text2id.end()){ //this word does not exist in the voc yet, return ID for <unk> + id = 0; + } + else{ + id = iterText2Id->second; + } + + return id; +} + +/// Return the text of the word given its vocID +/// return <UNK> if specified vocID does not exist +C_String C_IDVocabulary::getText(IndexType id) +{ + map<IndexType, C_String>::iterator iterId2Text; + iterId2Text = this->id2text.find(id); + + 
if(iterId2Text==this->id2text.end()){ + return C_String("<UNK>"); + } + + return iterId2Text->second; +} + +IndexType C_IDVocabulary::getSize() +{ + return this->text2id.size(); +} + + +/// Load the vocabulary file into memory +/// The format of the vocabulary file is: +/// word vocID +// in each line. +void C_IDVocabulary::loadFromFile(const char *fileName) +{ + + ifstream existingVocFile; + existingVocFile.open(fileName); + + if(!existingVocFile){ + cerr<<"Can not open existing vocabulary file "<<fileName<<endl; + exit(0); + } + + cerr<<"Loading existing vocabulary file: "<<fileName<<endl; + + char aLine[1024]; + char * aToken; + char delimit[] = " \t\r\n"; + IndexType vocId = 0; + + while(!existingVocFile.eof()){ + existingVocFile.getline(aLine, 1024, '\n'); + + if(strlen(aLine)>0){ //a meaningful word, esp for the last line during reading file + vector<C_String> tokensInLine; + + aToken = strtok(aLine, delimit); + while( aToken != NULL ) { + tokensInLine.push_back(C_String(aToken)); + aToken = strtok( NULL, delimit); + } + + if(tokensInLine.size()!=2){ + cerr<<"Not valid format for Vocabulary: "<<aLine<<endl; + } + + vocId = atoi(tokensInLine[1].toString()); + + if(vocId>this->maxIdInVoc){ + this->maxIdInVoc = vocId; + } + + this->text2id.insert(make_pair(tokensInLine[0], vocId)); + this->id2text.insert(make_pair(vocId, tokensInLine[0] )); + + } + + aLine[0]=0; + } + cerr<<"Total "<<this->text2id.size()<<" word types loaded\n"; + cerr<<"Max VocID="<<this->maxIdInVoc<<endl; +} + +/// Return the maximum ID from all words in the vocabulary +/// Usually equals to the size of the vocabulary if the vocabulary is created from this corpus only. 
+/// If the vocabulary includes words from other corpora and the vocabulary only lists words in this corpus, +/// then max voc ID could be different from the vocabulary size +IndexType C_IDVocabulary::returnMaxID() +{ + return this->maxIdInVoc; +} + +IndexType C_IDVocabulary::returnNullWordID() +{ + return 0; +} + +/** +* Output the vocabulary to a file +**/ +void C_IDVocabulary::outputToFile(char *filename) +{ + + ofstream outputVocFile; + outputVocFile.open(filename); + + if(!outputVocFile){ + cerr<<"Can not open "<<filename<<" to write vocabulary\n"; + exit(-1); + } + + map<C_String, IndexType, ltstr>::iterator iterText2Id; + + iterText2Id = this->text2id.begin(); + while(iterText2Id!=this->text2id.end()){ + outputVocFile<<iterText2Id->first.toString()<<"\t"<<iterText2Id->second<<endl; + iterText2Id++; + } + + outputVocFile.close(); +} + +/// Reserver vocID 0-NUMBER_OF_RESERVED_WORDS_IN_VOC for special words that might be useful for applications +/// Here we reserved 5 words: +/// _SENT_ID_PLACEHOLDER_ 1 +/// _END_OF_SENTENCE_ 2 +/// _TOO_LONG_TOKEN_ 3 +/// _SENTENCE_START_ 4 +/// _END_OF_CORPUS_ 5 +/// You can add other special words to the list as long as the assignment of vocID and its interpretation is consistent between application and indexing +void C_IDVocabulary::addingReservedWords() +{ + this->insertWord(C_String("_SENT_ID_PLACEHOLDER_"), 1); + this->insertWord(C_String("_END_OF_SENTENCE_"), 2); + this->insertWord(C_String("_TOO_LONG_TOKEN_"), 3); + this->insertWord(C_String("_SENTENCE_START_"), 4); + this->insertWord(C_String("_END_OF_CORPUS_"), 5); + + char reservedWord[20]; + for(int i=6; i<=NUMBER_OF_RESERVED_WORDS_IN_VOC; i++){ + memset(reservedWord, 0, 20); + sprintf(reservedWord, "_RESERVED_WORDS_%d", i); + this->insertWord(C_String(reservedWord), i); + } +} + +void C_IDVocabulary::insertWord(C_String text, IndexType id) +{ + this->text2id.insert(make_pair(text, id)); + this->id2text.insert(make_pair(id, text)); + +} + +/** +* Check if the word 
already exist in the voc, +* if so, return the vocID of the word, +* otherwise assign an ID to this word and insert it into the voc +**/ +IndexType C_IDVocabulary::getId(C_String text) +{ + IndexType id = this->returnId(text); + if(id==0){ + this->maxIdInVoc++; + this->insertWord(text, this->maxIdInVoc); + return this->maxIdInVoc; + } + + //else, already exist + return id; +} diff --git a/Src/Shared/_IDVocabulary.h b/Src/Shared/_IDVocabulary.h new file mode 100755 index 0000000..fa50add --- /dev/null +++ b/Src/Shared/_IDVocabulary.h @@ -0,0 +1,55 @@ +#if !defined(__IDVocabulary_H__INCLUDED_) +#define __IDVocabulary_H__INCLUDED_ + +#include "_String.h" +#include <string> +#include <map> +#include <vector> +#include "salm_shared.h" + +using namespace std; + + +struct ltstr +{ + bool operator()(C_String s1, C_String s2) const + { + return s1<s2; + } +}; + +/** +* Vocabulary class +* Mapping between words and their IDs +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_IDVocabulary +{ + +public: + ///Return the ID of word "text", if the word does not exist, add the word into the voc and return the newly assigned ID + IndexType getId(C_String text); + + void addingReservedWords(); + void outputToFile(char * filename); + IndexType returnNullWordID(); + IndexType returnMaxID(); + IndexType returnId(C_String text); + + IndexType getSize(); + C_String getText(IndexType); + + C_IDVocabulary(); + C_IDVocabulary(const char * fileName); + virtual ~C_IDVocabulary(); + +private: + void insertWord(C_String text, IndexType id); + void loadFromFile(const char * fileName); + IndexType maxIdInVoc; + map<C_String, IndexType, ltstr> text2id; + map<IndexType, C_String> id2text; +}; + +#endif // !defined(__IDVocabulary_H__INCLUDED_) diff --git a/Src/Shared/_String.cpp b/Src/Shared/_String.cpp new file mode 100755 index 0000000..75ba8e8 --- /dev/null +++ b/Src/Shared/_String.cpp @@ -0,0 +1,253 @@ +/** +* _String.cpp: 
implementation of the C_String class. +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include "_String.h" +#include "malloc.h" +#include "string.h" +#include "stdio.h" +#include "stdlib.h" + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_String::C_String() +{ + this->content = (char *) malloc(sizeof(char)); + this->content[0]='\0'; + this->hasContent = true; +} + +void C_String::freeContent() +{ + if(this->hasContent){ + this->hasContent = false; + free(this->content); + } +} + +C_String::~C_String() +{ + this->freeContent(); +} + +/** +* Copy constructor from a char string +**/ +C_String::C_String(char * str1) +{ + + this->content = (char *) malloc(sizeof(char)*strlen(str1)+1); + if(this->content==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + strcpy(this->content, str1); + + this->hasContent = true; +} + + +C_String::C_String(C_String const &strObj1) +{ + this->hasContent = false; + copy(strObj1); +} + +C_String::C_String(const C_String & obj1, const C_String & obj2) +{ + this->freeContent(); + + int len1 = strlen(obj1.content); + int len2 = strlen(obj2.content); + + int fullLen = len1+len2; + this->content = (char *) malloc(sizeof(char)*len1 + sizeof(char)*len2 + 1); + + if(this->content==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + char * pointer = (char*) this->content; + strcpy(pointer, obj1.content); //copy first part + pointer += len1; + strcpy(pointer, obj2.content); //copy second part + + this->content[fullLen]='\0'; + + this->hasContent = true; +} + +void C_String::operator=(const C_String &strObj2) +{ + copy(strObj2); +} + +void C_String::copy(const C_String &strObj) +{ + this->freeContent(); + + this->content = (char *) malloc(sizeof(char)*strlen(strObj.content)+1); + if(this->content==NULL){ + 
fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + strcpy(this->content, strObj.content); + this->hasContent = true; +} + +void C_String::copy(const C_String &strObj, int copyLen) +{ + this->freeContent(); + + this->content = (char *) malloc(sizeof(char)*(copyLen+1) ); + if(this->content==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + for(int i=0;i<copyLen;i++){ + this->content[i]=strObj.getCharAtPos(i); + } + + this->content[copyLen]='\0'; + + this->hasContent = true; + +} + +void C_String::print2stream(FILE *stream) +{ + fprintf(stream, content); +} + + +int C_String::length() const +{ + if(this->hasContent){ + return strlen(this->content); + } + + return 0; +} + +bool C_String::operator==(const C_String &obj1) const +{ + if(strcmp(this->content, obj1.content)==0){ + return true; + }; + + return false; +} + +bool C_String::operator!=(const C_String &obj1) const +{ + if(strcmp(this->content, obj1.content)!=0){ + return true; + }; + + return false; +} + +bool C_String::operator<(const C_String &obj1) const +{ + if(strcmp(this->content, obj1.content)<0){ + return true; + }; + + return false; +} + +char * C_String::toString() const +{ + return this->content; +} + +void C_String::clear() +{ + this->freeContent(); + + this->content = (char *) malloc(sizeof(char)); + this->content[0]='\0'; + this->hasContent = true; +} + + +char C_String::getCharAtPos(int pos) const +{ + if(pos>=this->length()){ + fprintf(stderr,"Can not get char at pos %d, out of bound! 
Exit.\n", pos); + exit(0); + } + + return this->content[pos]; +} + + +void C_String::appending(const C_String &obj) +{ + int len1 = 0; + + if(this->hasContent){ + len1 = strlen(this->content); + } + + int len2 = strlen(obj.content); + + int fullLen = len1+len2; + + char * newContent = (char *) malloc(sizeof(char)*fullLen + 1); + + if(newContent==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + char * pointer = newContent; + if(this->hasContent){ + strcpy(pointer, content); //copy first part + pointer += len1; + } + + strcpy(pointer, obj.content); //copy second part + newContent[fullLen]='\0'; + + //free old content + this->freeContent(); + + //point to new content + this->content = newContent; + + this->hasContent = true; +} + +void C_String::appending(const char nextChar) +{ + int len1 = 0; + + if(this->hasContent){ + len1 = strlen(this->content); + } + + int fullLen = len1+1; + + char * newContent = (char *) malloc(sizeof(char)*fullLen + 1); + + if(newContent==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + strcpy(newContent, content); //copy first part + + newContent[len1]=nextChar; //copy second part + newContent[fullLen]='\0'; + + //free old content + this->freeContent(); + + //point to new content + this->content = newContent; + + this->hasContent = true; +} diff --git a/Src/Shared/_String.h b/Src/Shared/_String.h new file mode 100755 index 0000000..d8f633d --- /dev/null +++ b/Src/Shared/_String.h @@ -0,0 +1,45 @@ +#if !defined(__STRING_H__INCLUDED_) +#define __STRING_H__INCLUDED_ + +/** +* Definition of class C_String +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +#include "stdio.h" + +class C_String +{ +public: + + char getCharAtPos(int) const; + void clear(); + char * toString() const; + int length() const; + void print2stream(FILE *); + + C_String(const C_String & obj1, const C_String & obj2); + C_String(C_String const&); + C_String(char *); + 
C_String(); + + bool operator==(const C_String &) const; + bool operator!=(const C_String &) const; + bool operator<(const C_String &) const; + void operator=(const C_String &strObj2); + + void appending(const C_String & obj); + void appending(const char nextChar); + + virtual ~C_String(); + +private: + void freeContent(); + void copy(const C_String &); + void copy(const C_String &strObj, int copyLen); + + bool hasContent; + char * content; +}; + +#endif // !defined(__STRING_H__INCLUDED_) diff --git a/Src/Shared/salm_shared.h b/Src/Shared/salm_shared.h new file mode 100755 index 0000000..2c0e186 --- /dev/null +++ b/Src/Shared/salm_shared.h @@ -0,0 +1,36 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +#if !defined(_SA_common_h) +#define _SA_common_h + +#include "math.h" + +typedef unsigned int IndexType; +typedef unsigned int TextLenType; +typedef unsigned short int SearchLenType; + +//constants +const int SIZE_ONE_READ = 16384; //when loading the data, each I/O read in SIZE_ONE_READ data points +const int MAX_TOKEN_LEN = 1024; //length of the longest word + +const int NUMBER_OF_RESERVED_WORDS_IN_VOC = 100; + +/// for language modeling +const double SALM_PROB_UNK = 0.00000000023283064365386962890625; // 1/4G +const double SALM_LOG_PROB_UNK = log(SALM_PROB_UNK); +const double SALM_LOG_0 = -20; + +/** +* \ingroup scan +**/ +typedef struct s_nGramScanningInfoElement +{ + IndexType vocId; + TextLenType freqThreshForOutput; + TextLenType freqSoFar; +}S_nGramScanningInfoElement; + +#endif + diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp new file mode 100755 index 0000000..ab2915d --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp @@ -0,0 +1,63 @@ +#include "stdio.h" +#include "stdlib.h" +#include "time.h" +#include 
"_SuffixArrayLanguageModel.h" +#include <iostream> +#include <sstream> +#include <string> +#include <cstring> + +using namespace std; + +/** +* A simple example of using the C_SuffixArrayLanguageModel class to calculate the LM prob of input sentences +* +* Revision $Rev: 3816 $ +* Last Modified $LastChangedDate: 2007-07-06 14:36:11 -0400 (Fri, 06 Jul 2007) $ +**/ +int main(int argc, char * argv[]){ + if(argc<2){ + cerr<<"\nUsage:\n\t"<<argv[0]<<" configurationFileName < sentences\n"; + exit(0); + } + + C_SuffixArrayLanguageModel salm(argv[1]); + + long ltime1, ltime2; + time( <ime1 ); + + string aWord; + char aLine[10240]; + while(!cin.eof()){ + cin.getline(aLine, 10240, '\n'); + + if(strlen(aLine)>0){ + istringstream inputLine(aLine, istringstream::in); + LMState lmState = salm.beginOfSentenceState(); + + LMState nextState; + double logProb = 0; + + while(! inputLine.eof()){ + inputLine>>aWord; + if(aWord.length()>0){ + IndexType vocId = salm.returnVocId(C_String((char *) aWord.c_str())); + logProb+=salm.logProb(lmState, vocId, nextState); + lmState = nextState; + } + aWord=""; + } + + logProb+=salm.logProbEnd(lmState); + cout<<"LogProb="<<logProb<<endl; + + } + + aLine[0]=0; + } + + time( <ime2 ); + cerr<<"\n"<<ltime2-ltime1<<" seconds spent."<<endl; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~ new file mode 100755 index 0000000..95e7993 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~ @@ -0,0 +1,62 @@ +#include "stdio.h" +#include "stdlib.h" +#include "time.h" +#include "_SuffixArrayLanguageModel.h" +#include <iostream> +#include <sstream> +#include <string> + +using namespace std; + +/** +* A simple example of using the C_SuffixArrayLanguageModel class to calculate the LM prob of input sentences +* +* Revision $Rev: 3816 $ +* Last Modified $LastChangedDate: 
2007-07-06 14:36:11 -0400 (Fri, 06 Jul 2007) $ +**/ +int main(int argc, char * argv[]){ + if(argc<2){ + cerr<<"\nUsage:\n\t"<<argv[0]<<" configurationFileName < sentences\n"; + exit(0); + } + + C_SuffixArrayLanguageModel salm(argv[1]); + + long ltime1, ltime2; + time( <ime1 ); + + string aWord; + char aLine[10240]; + while(!cin.eof()){ + cin.getline(aLine, 10240, '\n'); + + if(strlen(aLine)>0){ + istringstream inputLine(aLine, istringstream::in); + LMState lmState = salm.beginOfSentenceState(); + + LMState nextState; + double logProb = 0; + + while(! inputLine.eof()){ + inputLine>>aWord;
+ if(aWord.length()>0){ + IndexType vocId = salm.returnVocId(C_String((char *) aWord.c_str())); + logProb+=salm.logProb(lmState, vocId, nextState); + lmState = nextState; + }
+ aWord=""; + } + + logProb+=salm.logProbEnd(lmState); + cout<<"LogProb="<<logProb<<endl; + + } + + aLine[0]=0; + } + + time( <ime2 ); + cerr<<"\n"<<ltime2-ltime1<<" seconds spent."<<endl; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt new file mode 100755 index 0000000..17cd5a8 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt @@ -0,0 +1,5 @@ +June 27, 2007
+
+Working branch of applying KN smoothing in LM.
+Not finished yet.
+Do not distribute!
\ No newline at end of file diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp new file mode 100755 index 0000000..583b222 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp @@ -0,0 +1,1113 @@ +/** +* Revision $Rev: 3665 $ +* $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "_SuffixArrayLanguageModel.h" +#include <iostream> +#include <fstream> +#include <set> + +#include "math.h" + +using namespace std; + +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel() +{ + +} + +C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel() +{ + +} + + +/** +* Construct the suffix array language model object +* Take the configuration filename as the parameter for the constructor +* +* The configuration file is of the following format for each line: +* +* Keyword<tab>value +* <p> +* Note: keywords are all case sensitive. +* <ul> +* <li> <b>CORPUS</b> filename of the corpus for LM training. It should be the same as used in IndexSA +* <li> <b>N</b> Highest order of n considered for n-gram LM. Default value = <i>5</i> +* <li> <b>SMOOTHING_STRATEGY</b> Smoothing strategy. +* <ul> +* <li> <i>k</i> : default value. 
Modified Kneser-Ney Smoothing @see <a href=http://acl.ldc.upenn.edu/P/P96/P96-1041.pdf> An Empirical Study of Smoothing Techniques for Language Modeling </a> +* <li> <i>g</i> : Good-Turing discounting @see <a href=http://l2r.cs.uiuc.edu/~danr/Teaching/CS598-05/Papers/Gale-Sampson-smoothgoodturing.pdf> Good Turing without Tears</a> +* </ul> +* <li> <b>INTERPOLATION_STRATEGY</b> : Interpolation strategy +* <ul> +* <li> <i>e</i> : Probability of the next word predicted by histories of different orders are equally interpolated +* <li> <i>m</i> : Use the maximum conditional probability from all different order of history as the probability for the next word +* <li> <i>i</i> : Use deleted interpolation based on heuristics developed by IBM +* </ul> +* <li> <b>MAX_FREQ_DISC</b>: <br> +* <i>default</i>=50<br> +* If the frequency of an n-gram is lower than this value and SMOOTHING is set, discounting will be applied. <br> +* If this value is set to 0 or negative values, smoothing/discounting will not be used. <br> +* <li> <b>PURGE_CACHE</b>: Check entries in the cache after "PURGE_CACHE" number of sentences have been processed. Default = 100. +* <li> <b>FRESH_TIME</b>: Entries in the cache that are not used since "current time - FRESH_TIME" will be purged from the cache. Mesured in seconds of wall clock time. +** </ul> +* @param Configuration File Name +* @param corpusFileNameStem The training corpus filename used by IndexSA. +**/ +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName) +{ + + fstream cfgFile; + cfgFile.open(cfgFileName,ios::in); + + if(!cfgFile){ + fprintf(stderr,"Configuration file %s does not exist! 
quit!!\n", cfgFileName); + exit(-1); + } + + //----------------------------------------------------------------------------- + //reading parameters + char paraName[1024]; + char corpusFileNameStem[1024]; + + corpusFileNameStem[0]='\0'; + + //default values for member variables + this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob + this->smoothingStrategy = 'k'; //default smoothing strategy: modified Kneser-Ney smoothing + this->maxFreqForDiscounting = 50; //default, freq that is lower than this value will not be applied with discounting + this->maxN= 5; // default value; consider up to 5 words + + this->numberOfSentSeenToPurgeCache = 100; //default value, purge cache after processing 100 sentences + this->freshTime = 50; //entries in the cache that are older than 50 seconds are subject to purging + this->sentenceProcessedSoFar = 0; + this->typeOfBigrams = 0; + + while(!cfgFile.eof()){ + cfgFile>>paraName; + + if(strcmp(paraName,"CORPUS")==0){ + cfgFile>>corpusFileNameStem; + } + else if(strcmp(paraName, "SMOOTHING_STRATEGY")==0){ + cfgFile>>this->smoothingStrategy; + } + else if(strcmp(paraName,"N")==0){ + cfgFile>>this->maxN; + } + else if(strcmp(paraName,"MAX_FREQ_DISC")==0){ + cfgFile>>this->maxFreqForDiscounting; + } + else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){ + cfgFile>>this->interpolationStrategy; + } + else if(strcmp(paraName,"FRESH_TIME")==0){ + cfgFile>>this->freshTime; + } + else if(strcmp(paraName, "PURGE_CACHE")==0){ + cfgFile>>this->numberOfSentSeenToPurgeCache; + } + + paraName[0]=0; + + } + + + if(strlen(corpusFileNameStem)==0){ + cerr<<"CORPUS not specified in the configuration file! 
Quit!"<<endl; + exit(-1); + } + + + this->loadData_forSearch(corpusFileNameStem, false, true); //call the constructor of the super class + //corpusName, with vocabulary, no offset, + + + this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + + //initialize the scanning list + for(int i=0;i<this->maxN;i++){ + this->nGramScanningList[i].freqSoFar=0; + this->nGramScanningList[i].vocId = 0; + this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n"; + exit(0); + } + + if(this->maxFreqForDiscounting<=0){ + this->applyDiscounting = false; + } + else{ + if(this->maxFreqForDiscounting<3){ + cerr<<"MAX_FREQ_DISC has to be at least 3!"<<endl; + exit(-1); + } + + this->applyDiscounting = true; + this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map + } + +} + +/** +* Set strategy to interploate the conditional probabilities of next word given different order of histories +* 'e' for equal weighted interpolation of unigram, bigram, trigram... 
probabiblities +* 'm' for using the maximum probabilty from all histories and use this value as P(next word | history) +* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable +**/ +void C_SuffixArrayLanguageModel::setParam_interpolationStrategy(char interpolationStrategy) +{ + this->interpolationStrategy = interpolationStrategy; +} + +/** +* Set the value for parameter :numberOfSentSeenToPurgeCache +* LM will purge the entries in the cache that have not been used in 'freshTime' +**/ +void C_SuffixArrayLanguageModel::setParam_numberOfSentSeenToPurgeCache(int numberOfSentSeenToPurgeCache) +{ + this->numberOfSentSeenToPurgeCache = numberOfSentSeenToPurgeCache; +} + +/** +* Set the value for parameter: freshTime +* LM will purge the entries in the cache that have not been used in 'freshTime' +**/ +void C_SuffixArrayLanguageModel::setParam_freshTime(long freshTime) +{ + this->freshTime = freshTime; +} + +/** +* Similar to the function in C_SuffixArrayScanningBase +* Scan the corpus to obtain count of counts information +* and construct the discounting using Good-Turing smoothing +* Also, estimate the Y, D1, D2, D3+ values as needed for the modified Kneser-Ney smoothing +**/ +void C_SuffixArrayLanguageModel::constructDiscountingMap() +{ + unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting); + this->typeOfBigrams = 0; + + if(countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. 
Exit\n"; + exit(0); + } + + for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){ + countOfCountsTable[c]=0; + } + + + int i,j; + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( + (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match + + this->nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + + + + for(j=i;j<this->maxN;j++){ + + + if(this->nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + if(j==1){ //a new bigram type, this information is important for KN-smoothing + this->typeOfBigrams++; + } + + + freqSoFar = this->nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (j+1)-gram with freq freqSoFar + countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = 
this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + this->nGramScanningList[j].freqSoFar = 0; + } + else{ + this->nGramScanningList[j].freqSoFar = 1; + } + + this->nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(this->nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ + + if(i==1){ + this->typeOfBigrams++; + } + + freqSoFar = this->nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (i+1)-gram with freq freqSoFar + countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + } + + //now, use Good-Turing discounting to create frequency mapping + //still assign N*Freq table for simplicity, even though that for each N, only maxFreq-1 freq type will be discounted + this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting); + + for(i=0;i<this->maxN;i++){ + //for (i+1)-gram + + unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting; + double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting; + + for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq + //for all (freq+1) ngrams + 
if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists + discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]); + } + else{ + discountingMapForThisN[freq] = -1; + } + } + + discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency + } + + + //estimate the Y, D1, D2 and D3+ values for each order of n. + //these values will be used for KN-smoothing to estimate the gamma, the discounting factor + this->Y = (double *) malloc(sizeof(double) * this->maxN); + this->D1 = (double *) malloc(sizeof(double) * this->maxN); + this->D2 = (double *) malloc(sizeof(double) * this->maxN); + this->D3plus = (double *) malloc(sizeof(double) * this->maxN); + + for(i=0;i<this->maxN;i++){ + unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting; + double n1 = ccTableForThisN[0]; //number of n-gram types that have freq equals 1 + double n2 = ccTableForThisN[1]; //number of n-gram types that have freq equals 2; + double n3 = ccTableForThisN[2]; //number of n-gram types that have freq equals 3; + double n4 = ccTableForThisN[3]; //number of n-gram types that have freq equals 4; + + this->Y[i] = n1/(n1+2*n2); //for (i+1)-gram + this->D1[i] = 1-2*Y[i]*n2/n1; + this->D2[i] = 2-3*Y[i]*n3/n2; + this->D3plus[i] = 3 - 4*Y[i]*n4/n3; + } + + free(countOfCountsTable); +} + +///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1] +///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord +///only need to get freq(w_n | history) of different history +///return in freq table, freq(history+Wn, history) for all the matched n +///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history +/// 3-gram freq, freq of 3-gram history +///freqTable should have length of 2*n +///return the longest match with this updated n-gram +void 
C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + vector<IndexType> nGram; + + if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk> + if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram + currentMatchStart++; + currentMatchLen--; + } + + for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){ + nGram.push_back(this->corpus_list[pos]); + } + } + + nGram.push_back(nextWord); + + int sentLen = nGram.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram); + + int startPosForNgram; + int startPosForLongestMatchingWithNextWord; + int cellIndexForLongestMatchingWithNextWord; + + bool stillMatched = true; + bool atLeastOneMatched = false; + + int indexForNgram; + + unsigned int totalOccurrences; + unsigned int totalOccurrencesOfHistory; + + //for unigram + indexForNgram = sentLen - 1; + if(table[indexForNgram].found){ + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + if(this->smoothingStrategy=='g'){ //if use Good-Turing for discounting + freqTable[0] = this->discountFreq_GT(1, totalOccurrences); + } + else{ + freqTable[0] = totalOccurrences; + } + + freqTable[1] = this->corpusSize; + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = sentLen-1; + atLeastOneMatched = true; + } + else{ + stillMatched = false; + } + + int n=2; //considering 2-gram and longer n-gram now + startPosForNgram = sentLen - 2; + while((stillMatched)&&(startPosForNgram>=0)){ + + indexForNgram = (n-1) * sentLen + startPosForNgram; + int indexForHistory = (n-2) * sentLen + startPosForNgram; + + if(table[indexForNgram].found){ + + 
totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1; + + + if(this->applyDiscounting){ + freqTable[2*n-2] = this->discountFreq_GT(n, totalOccurrences); + } + else{ + freqTable[2*n-2] = (double)totalOccurrences; + } + + freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history + + if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = startPosForNgram; + } + } + else{ + stillMatched = false; + } + + startPosForNgram--; + n++; + } + + if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord' + updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA]; + updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord); + } + else{ + updatedMatchingStart = (TextLenType) -1; + updatedMatchingLen = 0; + } + + free(table); + +} + + +void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, S_ContextTypeInfo * contextTypeInfo, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + vector<IndexType> nGram; + + if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk> + if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram + currentMatchStart++; + currentMatchLen--; + } + + for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){ + nGram.push_back(this->corpus_list[pos]); + } + } + + nGram.push_back(nextWord); + + int sentLen = nGram.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = 
this->constructNgramSearchTable4SentWithLCP(nGram); + + int startPosForNgram; + int startPosForLongestMatchingWithNextWord; + int cellIndexForLongestMatchingWithNextWord; + + bool stillMatched = true; + bool atLeastOneMatched = false; + + int indexForNgram; + + unsigned int totalOccurrences; + unsigned int totalOccurrencesOfHistory; + + //for unigram + indexForNgram = sentLen - 1; + if(table[indexForNgram].found){ + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + + freqTable[0] = totalOccurrences; + freqTable[1] = this->corpusSize; + + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = sentLen-1; + atLeastOneMatched = true; + } + else{ + stillMatched = false; + } + + int n=2; //considering 2-gram and longer n-gram now for token freq + startPosForNgram = sentLen - n; + while((stillMatched)&&(startPosForNgram>=0)){ + + indexForNgram = (n-1) * sentLen + startPosForNgram; + int indexForHistory = (n-2) * sentLen + startPosForNgram; + + if(table[indexForNgram].found){ + + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1; + + + freqTable[2*n-2] = (double)totalOccurrences; + freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history + + if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = startPosForNgram; + } + } + else{ + stillMatched = false; + } + + startPosForNgram--; + n++; + } + + if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord' + updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA]; + updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord); + } + else{ + 
updatedMatchingStart = (TextLenType) -1; + updatedMatchingLen = 0; + } + + + //estimate the context type information which will be used for KN-smoothing + for(n=2;n<=sentLen;n++){ + startPosForNgram = sentLen - n; + TextLenType w_in2_i1_startPos_in_SA = 0; + TextLenType w_in2_i1_endPos_in_SA = 0; + + if(n>2){ + int indexForW_in2_i1 = (n-3) * sentLen + startPosForNgram + 1; //the location information for w_{i-n+2}^{i-1} of length n-2 + w_in2_i1_startPos_in_SA = table[indexForW_in2_i1].startPosInSA; + w_in2_i1_endPos_in_SA = table[indexForW_in2_i1].endingPosInSA; + } + + int indexForW_in1_i1 = (n-2) * sentLen + startPosForNgram; //the location information of w_{i-n+1}^{i-1} of length n-1 + + this->scanCorpusForContextTypeInfo(n, nextWord, + w_in2_i1_startPos_in_SA, w_in2_i1_endPos_in_SA, + table[indexForW_in1_i1].startPosInSA, table[indexForW_in1_i1].endingPosInSA, + contextTypeInfo[n-1]); + } + + free(table); + + +} + +///given observedFreq of n-gram, return discounted freq using Good-Turing smoothing +double C_SuffixArrayLanguageModel::discountFreq_GT(int n, unsigned int observedFreq) +{ + if(n>=this->maxN){ //do not discount + return (double) observedFreq; + } + + if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq + return (double) observedFreq; + } + + //else, check the discount map + double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1]; + + if(discountedFreq>0){ + return discountedFreq; + } + + //else, no discounting + return (double) observedFreq; +} + + +///Start a new sentence now, clear up the sentence LM state +///Increase the count of 'sentenceProcessedSoFar' +///If LM has processed 'numberOfSentSeenToPurgeCache' sentences +///it is time to check if old entries in the cache should be cleaned +LMState C_SuffixArrayLanguageModel::beginOfSentenceState() +{ + long currentTime; + time(¤tTime); + + this->resetLmStates(); + this->initialLmState(); + + this->sentenceProcessedSoFar++; + 
+ if(this->sentenceProcessedSoFar==this->numberOfSentSeenToPurgeCache){ + //purge the cache + this->purgeCache(currentTime-this->freshTime); + + this->sentenceProcessedSoFar = 0; + } + + return 0; +} + +void C_SuffixArrayLanguageModel::initialLmState() +{ + //add sentence start + S_LMStateInfo sentStartNode; + sentStartNode.posInCorpus = 1; //if corpus is indexed correctly position 1 should be <s> + sentStartNode.len = 1; + + this->allLMStates.push_back(sentStartNode); + this->lmStateInfo2Id.insert(make_pair(sentStartNode, 0)); +} + +void C_SuffixArrayLanguageModel::resetLmStates() +{ + this->buffer.clear(); + this->allLMStates.clear(); + this->lmStateInfo2Id.clear(); +} + +/** +* Purge entries in the cache that are not visited after "lastVisitedTime" +* @param lastVisitedTime Entries in the cache that are older than 'lastVisitedTime' parameter will be purged +**/ +void C_SuffixArrayLanguageModel::purgeCache(long lastVisitedTime) +{ + //cerr<<this->cached_sa_access.size()<<" entries in cache, purged to "; + + map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key>::iterator iter1,iter2; + + iter1 = this->cached_sa_access.begin(); + + while(iter1!=this->cached_sa_access.end()){ + iter2=iter1; + iter2++; + + if(iter1->second.lastTimedUsed<lastVisitedTime){ + this->cached_sa_access.erase(iter1); + } + + iter1=iter2; + } + //cerr<<this->cached_sa_access.size()<<" entries"<<endl; +} + +/** +* Given the current history (as represented by the 'lmState' +* caculate the log prob of nextWord given this history P(nextword|history) +* and return the updated language model state with next word appended to the history +* @param lmState Current language model state +* @param nextWord The vocId of the next word (the word to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState) +{ + + //first check 
if we have already seen this before + map< pair<LMState, IndexType>, S_BufferedLmInfo>::iterator iterBuffer; + iterBuffer = this->buffer.find( make_pair( lmState, nextWord) ); + + if(iterBuffer==this->buffer.end()){ //we haven't seen this lmState+word yet + //search for it in the corpus + S_LMStateInfo lmStateInfo = this->allLMStates[lmState]; + TextLenType updatedMatchingStart; + unsigned char updatedMatchingLen; + + double logProb = this->logProbOfNgramFromCorpusInfo(lmStateInfo.posInCorpus, lmStateInfo.len, nextWord, updatedMatchingStart, updatedMatchingLen); + + + S_LMStateInfo updatedLmStateInfo; + updatedLmStateInfo.posInCorpus = updatedMatchingStart; + updatedLmStateInfo.len = updatedMatchingLen; + + int updatedLmStateId; + map<S_LMStateInfo, int, lt_lmStateInfo>::iterator iterLmStateInfo2Id; + iterLmStateInfo2Id = this->lmStateInfo2Id.find(updatedLmStateInfo); + if(iterLmStateInfo2Id==this->lmStateInfo2Id.end()){ //this updated lm state does not exist yet + this->allLMStates.push_back(updatedLmStateInfo); + updatedLmStateId = this->allLMStates.size()-1; + this->lmStateInfo2Id.insert(make_pair(updatedLmStateInfo, updatedLmStateId)); + } + else{ + updatedLmStateId = iterLmStateInfo2Id->second; + } + + //buffer this + S_BufferedLmInfo bufferedLmInfo; + bufferedLmInfo.logProb = logProb; + bufferedLmInfo.nextState = updatedLmStateId; + + this->buffer.insert(make_pair( make_pair(lmState, nextWord), bufferedLmInfo)); + + //updated next state + nextState = updatedLmStateId; + + return logProb; + } + + nextState = iterBuffer->second.nextState; + + return iterBuffer->second.logProb; +} + + +/** +* Given the history as lmState and append a phrase as a vector of IndexType, +* calculate the LM prob and update the lm state +* @param lmState Current language model state +* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double 
C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState) +{ + double logProb = 0; + for(int i=0;i<phrase.size();i++){ + logProb+=this->logProb(lmState, phrase[i], nextState); + lmState = nextState; + } + + return logProb; +} + +/** +* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s> +**/ +double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState) +{ + LMState dummyNextState; + return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState); +} + +/** +* Extend the current matched n-gram with next word, calculate the prob and update the updated range +* the n-gram is represented by its position in the suffix array and the length +* @param currentMatchStart Starting position of the current matched n-gram in corpus +* @param currentMatchLen Length of the matched n-gram \ +* @param nextWord Vocabulary ID of the next word (the word to be predicted) +* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus +* @param &updatedMatchingLen The length of the extended n-gram +**/ +double C_SuffixArrayLanguageModel::logProbOfNgramFromCorpusInfo(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + long currentTime; + time(¤tTime); + + double logProb; + + //first check if information is already in cache + S_CachedSA_Access_Key accessKey; + accessKey.currentMatchStart = currentMatchStart; + accessKey.currentMatchLen = currentMatchLen; + accessKey.nextWord = nextWord; + + map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key>::iterator iter_cached_sa_access; + + iter_cached_sa_access = this->cached_sa_access.find(accessKey); + + if(iter_cached_sa_access==this->cached_sa_access.end()){ //information not in cache yet + double * freqTable = 
(double *) malloc(sizeof(double)*2*(this->maxN)); + memset(freqTable, 0, 2*this->maxN*sizeof(double)); + + S_ContextTypeInfo * contextTypeInfo = (S_ContextTypeInfo *) malloc(sizeof(S_ContextTypeInfo)*this->maxN); + + switch(this->smoothingStrategy){ + case 'k': //for Modified Kneser-Ney smoothing + + this->calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, contextTypeInfo, updatedMatchingStart, updatedMatchingLen); + logProb = this->calcLogProb_kneserNeySmoothing(freqTable, contextTypeInfo); + break; + default: //all other cases including 'g' (Good-Turing smoothing) + this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen); + logProb = this->calcLogProb(freqTable); + } + + free(freqTable); + free(contextTypeInfo); + + //insert the info into the cache + S_Cached_SA_Access_Info accessInfo; + accessInfo.updatedMatchingStart = updatedMatchingStart; + accessInfo.updatedMatchingLen = updatedMatchingLen; + accessInfo.logProb = logProb; + accessInfo.lastTimedUsed = currentTime; + + this->cached_sa_access.insert(make_pair(accessKey, accessInfo)); + + return logProb; + } + + //otherwise, already exist in the cache, just update the last touched time + updatedMatchingStart = iter_cached_sa_access->second.updatedMatchingStart; + updatedMatchingLen = iter_cached_sa_access->second.updatedMatchingLen; + logProb = iter_cached_sa_access->second.logProb; + + return logProb; +} + +double C_SuffixArrayLanguageModel::calcLogProb(double *freq) +{ + switch(this->interpolationStrategy){ + case 'e': + return this->calcLogProb_equalWeightedInterpolation(freq); + break; + case 'i': + return this->calcLogProb_ibmHeuristicInterpolation(freq); + break; + case 'm': + return this->calcLogProb_maxProbInterpolation(freq); + break; + default: + cerr<<"Unknown interpolation strategy!\n"; + exit(0); + } +} + +double 
C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq) +{ + double prob = 0.0; + + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + prob+=freq[2*i]/freq[2*i+1]; + } + else{ + stillMatched = false; + } + + i++; + } + + return log(prob/(double)this->maxN); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq) +{ + double prob = 0.0; + if(freq[0]==0){ //unknown word + return SALM_LOG_PROB_UNK; + } + + double remainingWeightSum = 1.0; + + //find the first non-zero match + int i = this->maxN - 1; + + while(freq[2*i]==0){ //will stop for sure because freq[0]!=0 + i--; + } + + for(int j=i;j>=0;j--){ + //for (j+1)-gram + double historyFreq = freq[2*j+1]; + double logHistoryFreq = log(historyFreq); + if(logHistoryFreq>1){ + logHistoryFreq = 1.0; //cap it to 1 + } + + double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history + double adjustedWeights = remainingWeightSum * reliability; + + prob+=adjustedWeights * freq[2*i]/freq[2*i+1]; + + remainingWeightSum -= adjustedWeights; + } + + return log(prob); +} + +double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq) +{ + double maxProb = 0.0; + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + double prob=freq[2*i]/freq[2*i+1]; + + if(prob>maxProb){ + maxProb = prob; + } + } + else{ + stillMatched = false; + } + + i++; + } + + return log(maxProb); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +/** +* Follow the implementation described in page 23 of Chen & Goodman tech report (section 4.1.6 and 4.1.7) +* Use notation described in James 2000 pp3 for MODKN-COUNT +**/ +double C_SuffixArrayLanguageModel::calcLogProb_kneserNeySmoothing(double *freq, S_ContextTypeInfo * contextTypeFreq) +{ + 
double prob = 0.0; + int i; + + if(freq[0]>0){ + contextTypeFreq[i]. + } + + //unknown word + return SALM_LOG_PROB_UNK; +} + + +IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord) +{ + return this->voc->returnId(aWord); +} + + +/** +* Scan corpus to collect important context-type information needed for KN-smoothing +* Knowing where n-gram w_(i-n+2)^(i-1) occurs, scan corpus for N_{1+}(dot w_{i-n+2}^i) +* and N_{1+}(dot w_{i-n+2}^{i-1} dot) +* Also, collect type freq of n-grams w_{i-n+1}^{i-1} that occur exactly 1, 2 and 3+ times +* to estimate the discounting factor gammar +* +* @see Chen & Goodman 1998 page 19-20 for detailed description +* +* @param n order of n-gram +* @param w_in1 VocId of w<sub>i-n+1</sub> +* @param w_i VocId of w<sub>i</sub>, the next word to be predicted +* @param leftBoundaryOfSaRangeFor_w_in2_i1 +* @param rightBoundaryOfSaRangeFor_w_in2_i1 [leftBoundaryOfSaRangeFor_w_in2_i1, rightBoundaryOfSaRangeFor_w_in2_i1] is the range of suffix array positions that correspond to the locations of phrase w<sub>i-n+2</sub><sup>i-1</sup> +* @param leftBoundaryOfSaRangeFor_w_in1 +* @param rigthBoundaryOfSaRangeFor_w_i1 [leftBoundaryOfSaRangeFor_w_in1, rigthBoundaryOfSaRangeFor_w_i1] is the range of suffix array positions that correspond to the locations of phrase w<sub>i-n+1</sub><sup>i-1</sup> +* @return S_ContextTypeInfo containing the context type information +**/ +void C_SuffixArrayLanguageModel::scanCorpusForContextTypeInfo(int n, IndexType w_i, TextLenType leftBoundaryOfSaRangeFor_w_in2_i1, TextLenType rightBoundaryOfSaRangeFor_w_in2_i1, TextLenType leftBoundaryOfSaRangeFor_w_in1_i1, TextLenType rigthBoundaryOfSaRangeFor_w_in1_i1, S_ContextTypeInfo & result) +{ + + TextLenType i; + TextLenType posInCorpus; + IndexType nextWordInCorpus; + int n1 = n-1; //this value will be used frequently here + + //first scan the corpus for all the word types that follow w_{i-n+1}^{i-1} + //to collect N1(w_in1^i1 dot) N2, and N3+ info needed + 
result.N1_w_in1_i1_dot = 0; + result.N2_w_in1_i1_dot = 0; + result.N3plus_w_in1_i1_dot = 0; + + int freqOfCurrentType = -1; //freq of 'dot' with current type + IndexType currentNextWordType = 0; + for(i=leftBoundaryOfSaRangeFor_w_in1_i1;i<=rigthBoundaryOfSaRangeFor_w_in1_i1;i++){ + posInCorpus = this->suffix_list[i] + n1; + //suffix_list[i] is the position of w_{i-n+1} in the corpus + //suffix_list[i]+n-1 is hte position of the word (the dot in the equation) that follows w_{i-n+1}^{i-1} + + nextWordInCorpus = this->corpus_list[posInCorpus]; + freqOfCurrentType++; + if(nextWordInCorpus!=currentNextWordType){ + + if(freqOfCurrentType==1){ + result.N1_w_in1_i1_dot++; + } + else if(freqOfCurrentType==2){ + result.N2_w_in1_i1_dot++; + } + else{ //freq of this type is >=3 + result.N3plus_w_in1_i1_dot++; + } + + currentNextWordType = nextWordInCorpus; + freqOfCurrentType=0; + } + } + + //for the last type in the range + freqOfCurrentType++; + + if(freqOfCurrentType==1){ + result.N1_w_in1_i1_dot++; + } + else if(freqOfCurrentType==2){ + result.N2_w_in1_i1_dot++; + } + else{ //freq of this type is >=3 + result.N3plus_w_in1_i1_dot++; + } + + + //step 2, scan the corpus for N_{1+}(dot w_{i-n+2}^{i}) and N_{1+}(dot w_{i-n+2}^{i-1} dot) + IndexType precedingWord; + IndexType followingWord; + if(n==2){ //the special case + result.N1plus_dot_w_in2_i1_dot = this->typeOfBigrams; + + //check if we have the N_1+(dot w_i) information already + map<IndexType, unsigned int>::iterator iterTypeFreqPrecedingWord; + iterTypeFreqPrecedingWord = this->typeFreqPrecedingWord.find(w_i); + + if(iterTypeFreqPrecedingWord==this->typeFreqPrecedingWord.end()){ //does not exist yet + TextLenType startPosInSA = this->level1Buckets[w_i].first; + TextLenType endPosInSA = this->level1Buckets[w_i].last; + + set<IndexType> wordTypePrecedesW_i; + for(i=startPosInSA;i<=endPosInSA;i++){ + posInCorpus = this->suffix_list[i] - 1; + precedingWord = this->corpus_list[posInCorpus]; + + 
wordTypePrecedesW_i.insert(precedingWord); + } + + result.N1plus_dot_w_in2_i = (double) wordTypePrecedesW_i.size(); + + //and save this for future references + this->typeFreqPrecedingWord.insert(make_pair(w_i, wordTypePrecedesW_i.size())); + } + else{ //already has the information in typeFreqPrecedingWord + result.N1plus_dot_w_in2_i = (double) (iterTypeFreqPrecedingWord->second); + } + } + else{ + set<IndexType> wordTypesPrecedesW_in2_i; + set< pair<IndexType, IndexType> > wordTypesSurroundW_in2_i1; + + for(i=leftBoundaryOfSaRangeFor_w_in2_i1;i<=rightBoundaryOfSaRangeFor_w_in2_i1;i++){ + posInCorpus = this->suffix_list[i] -1; //pos of preceding word (w_{i-n+1}) in the corpus + precedingWord = this->corpus_list[posInCorpus]; + + posInCorpus+=n1; //pos of following word w_i in the corpus + followingWord = this->corpus_list[posInCorpus]; + + pair<IndexType, IndexType> tmpPair = make_pair(precedingWord, followingWord); + + //if w_i equals next word, add the preceding word to set + if(followingWord==w_i){ + wordTypesPrecedesW_in2_i.insert(precedingWord); + } + + //add the pair to set + wordTypesSurroundW_in2_i1.insert(tmpPair); + + } + + + result.N1plus_dot_w_in2_i = wordTypesPrecedesW_in2_i.size(); + result.N1plus_dot_w_in2_i1_dot = wordTypesSurroundW_in2_i1.size(); + } + + result.valid = true; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h new file mode 100755 index 0000000..9f9155a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h @@ -0,0 +1,210 @@ +#if ! 
defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__) +#define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__ + + +#include "_SuffixArraySearchApplicationBase.h" +#include "salm_shared.h" +#include "time.h" + +/** +* \ingroup lm +* Context type information needed in KN-smoothing +**/ +typedef struct s_contextTypeInfo{ + double N1plus_dot_w_in2_i; //Goodman and Chen 98, eq 23 + double N1plus_dot_w_in2_i1_dot; + double N1_w_in1_i1_dot; //Goodman and Chen 98, eq 19 + double N2_w_in1_i1_dot; + double N3plus_w_in1_i1_dot; + bool valid; +}S_ContextTypeInfo; + + +/** +* \ingroup lm +**/ +typedef unsigned int LMState; + + +/** +* \ingroup lm +**/ +typedef struct s_lmStateInfo{ + TextLenType posInCorpus; + unsigned char len; +}S_LMStateInfo; + +/** +* \ingroup lm +**/ +typedef struct s_bufferedLmInfo{ + int nextState; + double logProb; +}S_BufferedLmInfo; + + +/** +* \ingroup lm +**/ +struct lt_lmStateInfo +{ + bool operator()(S_LMStateInfo a, S_LMStateInfo b) const{ + if(a.posInCorpus<b.posInCorpus){ + return true; + } + + if(a.posInCorpus>b.posInCorpus){ + return false; + } + + if(a.len<b.len){ + return true; + } + + return false; + } +}; + + +/** +* \ingroup lm +* structure for elements in the cache for accessing the suffix array for LM prob +**/ +typedef struct s_cached_SA_access_key{ + TextLenType currentMatchStart; + unsigned char currentMatchLen; + IndexType nextWord; +}S_CachedSA_Access_Key; + +typedef struct s_cached_SA_access_info{ + TextLenType updatedMatchingStart; + unsigned char updatedMatchingLen; + double logProb; + long lastTimedUsed; +}S_Cached_SA_Access_Info; + +struct lt_s_cached_SA_access_key +{ + bool operator()(S_CachedSA_Access_Key a, S_CachedSA_Access_Key b) const{ + if(a.currentMatchStart<b.currentMatchStart){ + return true; + } + + if(a.currentMatchStart>b.currentMatchStart){ + return false; + } + + if(a.currentMatchLen<b.currentMatchLen){ + return true; + } + + if(a.currentMatchLen>b.currentMatchLen){ + return false; + } + + 
if(a.nextWord<b.nextWord){ + return true; + } + + return false; + } +}; + + +/** +* \ingroup lm +* C_SuffixArrayLanguageModel inherit the C_SuffixArraySearchApplicationBase class and C_SuffixArrayScanningBase +* to provide functionalities of estimating the likelihood of a sentence given an indexed training corpus +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase +{ + +public: + IndexType returnVocId(C_String aWord); + + /// At the beginning of a sentence, return the LMState and reset the cache + LMState beginOfSentenceState(); + + /// Calculate the log prob of a word predicted by the history LM state + double logProb(LMState lmState, IndexType nextWord, LMState & nextState); + + /// The log prob of a phrase extending the history as a LMState + double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState); + + /// End of sentence + double logProbEnd(LMState lmState); + + /// Constructors + C_SuffixArrayLanguageModel(const char * cfgFileName); + C_SuffixArrayLanguageModel(); + ~C_SuffixArrayLanguageModel(); + + +private: + void scanCorpusForContextTypeInfo(int n, IndexType w_i, TextLenType leftBoundaryOfSaRangeFor_w_in2_i1, TextLenType rightBoundaryOfSaRangeFor_w_in2_i1, TextLenType leftBoundaryOfSaRangeFor_w_in1_i1, TextLenType rigthBoundaryOfSaRangeFor_w_in1_i1, S_ContextTypeInfo & result); + + void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + void calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, S_ContextTypeInfo * contextTypeInfo, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + + 
//Log prob calculation + double logProbOfNgramFromCorpusInfo(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + double calcLogProb(double *freq); + double calcLogProb_equalWeightedInterpolation(double *freq); + double calcLogProb_ibmHeuristicInterpolation(double *freq); + double calcLogProb_maxProbInterpolation(double * freq); + double calcLogProb_kneserNeySmoothing(double *freq, S_ContextTypeInfo * contextTypeFreq); + + ///parameter and settings + ///set the interploation strategy + void setParam_interpolationStrategy(char interpolationStrategy); + + ///set the number of sentences processed by the LM before purging the cache + void setParam_numberOfSentSeenToPurgeCache(int numberOfSentSeenToPurgeCache); + + ///set the fresh time thresh for the cache entries + void setParam_freshTime(long freshTime); + + char smoothingStrategy; + char interpolationStrategy; + int maxN; + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + + + ///Discounting + void constructDiscountingMap(); + double discountFreq_GT(int n, unsigned int observedFreq); + + double * Y; // following the notation of Chen&Goodman 98, Eq. 
26 + double * D1; + double * D2; + double * D3plus; + double typeOfBigrams; //will be needed for KN-smoothing + + double *discountingMap; + bool applyDiscounting; + int maxFreqForDiscounting; + S_nGramScanningInfoElement * nGramScanningList; + map<IndexType, unsigned int> typeFreqPrecedingWord; + + ///LM State and related functions + void resetLmStates(); + void initialLmState(); + map< pair<LMState, IndexType>, S_BufferedLmInfo> buffer; + vector<S_LMStateInfo> allLMStates; + map<S_LMStateInfo, int, lt_lmStateInfo> lmStateInfo2Id; + + //caching information for SA access + unsigned int sentenceProcessedSoFar; + long freshTime; + unsigned int numberOfSentSeenToPurgeCache; + map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key> cached_sa_access; + void purgeCache(long lastVisitedTime); + +}; + +#endif diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp new file mode 100755 index 0000000..0a94ff0 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp @@ -0,0 +1,691 @@ +/** +* Revision $Rev: 3815 $ +* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $ +**/ + +#include "_SuffixArrayLanguageModel.h" +#include <iostream> +#include <fstream> +#include <stdlib.h> +#include <memory.h> +#include <cstring> + +#include "math.h" + +using namespace std; + + +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel() +{ + +} + +C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel() +{ + +} + + +/** +* Construct the suffix array language model object +* Using the training data corpusFileNameStem that has been indexed by IndexSA +* Consider at most maxN-gram in language modeling +* For frequencies that are lower than maxFreqForDiscounting, use Good-Turing for discounting +* If maxFreqForDiscounting is set to be 0 or negative value, then discounting is 
turned off. Use MLE to estimate the probability of a word given history +* @param cfgFileName Configuration file that specifies the value of parameters for SALM +* +* Each line in the configuration file is a Keyword Value pair. Legal keywords are: +* CORPUS : corpusFileNameStem The training corpus filename used by IndexSA. Must be specified! +* N : Highest order of n considered for n-gram LM estimation, default value = 5 +* MAX_FREQ_DISC : When Good-Turing discounting is used, n-grams which have frequencies higher than this value will not be discounted. Negative value will disable the discounting. default value = -1. +* INTERPOLATION_STRATEGY : Set strategy to interploate the conditional probabilities of next word given different order of histories +* 'e' default. Equal weighted interpolation of unigram, bigram, trigram... probabiblities +* 'm' for using the maximum probabilty from all histories and use this value as P(next word | history) +* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable +**/ +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName) +{ + + fstream cfgFile; + cfgFile.open(cfgFileName,ios::in); + + if(!cfgFile){ + fprintf(stderr,"Configuration file does not exist! 
quit!!\n"); + exit(0); + } + + //----------------------------------------------------------------------------- + //reading parameters + char paraName[1024]; + char corpusFileNameStem[1024]; + corpusFileNameStem[0]=0; + this->maxFreqForDiscounting=-1; + + this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob + this->maxN = 5; // default value; consider up to 5 words + + while(!cfgFile.eof()){ + cfgFile>>paraName; + + if(strcmp(paraName,"CORPUS")==0){ + cfgFile>>corpusFileNameStem; + } + else if(strcmp(paraName,"N")==0){ + cfgFile>>this->maxN; + } + else if(strcmp(paraName,"MAX_FREQ_DISC")==0){ + cfgFile>>maxFreqForDiscounting; + } + else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){ + cfgFile>>this->interpolationStrategy; + } + + paraName[0]=0; + + } + + //load corpus and suffix array + if(strlen(corpusFileNameStem)==0){ + cerr<<"CORPUS need to be specified in the configuration file. This should be the corpus name used for LM.\n"; + exit(-1); + } + this->loadData_forSearch(corpusFileNameStem, false, true); //call the constructor of the super class to load suffix array for corpusName, with vocabulary, no offset, + + + //if apply discounting construct the discounting map + if(this->maxFreqForDiscounting<=0){ + this->applyDiscounting = false; + } + else{ + this->applyDiscounting = true; + this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. 
Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n"; + exit(0); + } + + this->interpolationStrategy = 'e'; //default: interpolation strategy: equally weighted n-gram conditional prob + +} + + +/** +* Similar to the function in C_SuffixArrayScanningBase +* Scan the corpus to obtain count of counts information +* and construct the discounting using Good-Turing smoothing +**/ +void C_SuffixArrayLanguageModel::constructDiscountingMap() +{ + int i,j; + unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting); + + if(countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. Exit\n"; + exit(0); + } + + //initialize count of counts table + for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){ + countOfCountsTable[c]=0; + } + + //initialize the scanning list + S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + for(i=0;i<this->maxN;i++){ + nGramScanningList[i].freqSoFar=0; + nGramScanningList[i].vocId = 0; + nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( 
+ (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==nGramScanningList[i].vocId)){ //still match + + nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + + for(j=i;j<this->maxN;j++){ + + + if(nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + freqSoFar = nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (j+1)-gram with freq freqSoFar + countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + nGramScanningList[j].freqSoFar = 0; + } + else{ + nGramScanningList[j].freqSoFar = 1; + } + + nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + 
if(validNgramUpSoFar){ + + freqSoFar = nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (i+1)-gram with freq freqSoFar + countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + } + + //now, use Good-Turing discounting to create frequency mapping + //still assign N*Freq table for simplicity, even though that for each N, only maxFreq-1 freq type will be discounted + this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting); + + for(i=0;i<this->maxN;i++){ + //for (i+1)-gram + + unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting; + double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting; + + for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq + //for all (freq+1) ngrams + if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists + discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]); + } + else{ + discountingMapForThisN[freq] = -1; + } + } + + discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency + } + + + free(countOfCountsTable); + +} + +///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1] +///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord +///only need to get freq(w_n | history) of different history +///return in freq table, freq(history+Wn, history) for all the matched n +///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history +/// 3-gram freq, freq of 3-gram history +///freqTable should have length of 2*n +///return the longest match with this updated n-gram +void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType 
currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + vector<IndexType> nGram; + + if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk> + if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram + currentMatchStart++; + currentMatchLen--; + } + + for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){ + nGram.push_back(this->corpus_list[pos]); + } + } + + nGram.push_back(nextWord); + + int sentLen = nGram.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram); + + int startPosForNgram; + int startPosForLongestMatchingWithNextWord; + int cellIndexForLongestMatchingWithNextWord; + + bool stillMatched = true; + bool atLeastOneMatched = false; + + int indexForNgram; + + unsigned int totalOccurrences; + unsigned int totalOccurrencesOfHistory; + + //for unigram + indexForNgram = sentLen - 1; + if(table[indexForNgram].found){ + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + if(this->applyDiscounting){ + freqTable[0] = this->discountFreq(1, totalOccurrences); + } + else{ + freqTable[0] = totalOccurrences; + } + + freqTable[1] = this->corpusSize; + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = sentLen-1; + atLeastOneMatched = true; + } + else{ + stillMatched = false; + } + + int n=2; //considering 2-gram and longer n-gram now + startPosForNgram = sentLen - 2; + while((stillMatched)&&(startPosForNgram>=0)){ + + indexForNgram = (n-1) * sentLen + startPosForNgram; + int indexForHistory = (n-2) * sentLen + startPosForNgram; + + if(table[indexForNgram].found){ + + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + totalOccurrencesOfHistory = 
table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1; + + + if(this->applyDiscounting){ + freqTable[2*n-2] = this->discountFreq(n, totalOccurrences); + } + else{ + freqTable[2*n-2] = (double)totalOccurrences; + } + + freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history + + if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = startPosForNgram; + } + } + else{ + stillMatched = false; + } + + startPosForNgram--; + n++; + } + + if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord' + updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA]; + updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord); + } + else{ + updatedMatchingStart = (TextLenType) -1; + updatedMatchingLen = 0; + } + + free(table); + +} + + +//given observedFreq of n-gram, return discounted freq using Good-Turing smoothing +double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq) +{ + if(n>=this->maxN){ //do not discount + return (double) observedFreq; + } + + if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq + return (double) observedFreq; + } + + //else, check the discount map + double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1]; + + if(discountedFreq>0){ + return discountedFreq; + } + + //else, no discounting + return (double) observedFreq; +} + + +///Start a new sentence now, clear up the sentence LM state +LMState C_SuffixArrayLanguageModel::beginOfSentenceState() +{ + + this->resetLmStates(); + this->initialLmState(); + + return 0; +} + +void C_SuffixArrayLanguageModel::initialLmState() +{ + //add sentence start + S_LMStateInfo sentStartNode; + sentStartNode.locationInCorpus.posInCorpus = 1; //if corpus is 
indexed correctly position 1 should be <s> + sentStartNode.locationInCorpus.len = 1; + sentStartNode.cachedNextWordExtension.clear(); + + this->allLMStates.push_back(sentStartNode); + this->ngramLocation2LmStateId.insert(make_pair(sentStartNode.locationInCorpus, 0)); +} + +void C_SuffixArrayLanguageModel::resetLmStates() +{ + this->allLMStates.clear(); + this->ngramLocation2LmStateId.clear(); +} + + +/** +* Given the current history (as represented by the 'lmState' +* caculate the log prob of nextWord given this history P(nextword|history) +* and return the updated language model state with next word appended to the history +* @param lmState Current language model state +* @param nextWord The vocId of the next word (the word to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState) +{ + if(lmState>=this->allLMStates.size()){ + cerr<<"Invalid LM State: "<<lmState<<endl; + exit(-1); + } + + //first check if we have already seen this 'nextWord' before + map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache; + iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find( nextWord ); + + if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){ //we haven't seen this lmState+word yet + + //search for it in the corpus + S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus; + S_NgramLocationInCorpus updatedNgramLocation; + + double logProb = this->logProbFromFreq( + correspondingNgramLocation.posInCorpus, + correspondingNgramLocation.len, + nextWord, + updatedNgramLocation.posInCorpus, + updatedNgramLocation.len); + + //caching the logprob of 'nextword' given the lmState + int updatedLmStateId; + map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId; + 
iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation); + if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){ //this updated lm state does not exist yet + S_LMStateInfo newLmStateNode; + + newLmStateNode.locationInCorpus = updatedNgramLocation; + newLmStateNode.cachedNextWordExtension.clear(); + + this->allLMStates.push_back(newLmStateNode); + updatedLmStateId = this->allLMStates.size() -1 ; + this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId)); + } + else{ + updatedLmStateId = iterNgramLocation2LmStateId->second; + } + + //cache this + S_CachedLmInfo cachedLmInfo; + cachedLmInfo.logProb = logProb; + cachedLmInfo.nextState = updatedLmStateId; + + this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo)); + + //updated next state + nextState = updatedLmStateId; + + return logProb; + } + + nextState = iterNextWordExtensionCache->second.nextState; + + return iterNextWordExtensionCache->second.logProb; +} + + +/** +* Given the history as lmState and append a phrase as a vector of IndexType, +* calculate the LM prob and update the lm state +* Modification suggested by Erik Peterson (eepter@cs.cmu.edu) to check the size of phrase. +* For cases where phrase is empty, i.e. phrase.size()==0, nextState will not be updated correctly and may cause problems in the calling function. 
+ * @param lmState Current language model state +* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState) +{ + double logProb = 0; + + if (phrase.size() == 0) { + nextState = lmState; + return logProb; + } + + for(int i=0;i<phrase.size();i++){ + logProb+=this->logProb(lmState, phrase[i], nextState); + lmState = nextState; + } + + return logProb; +} + +/** +* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s> +**/ +double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState) +{ + LMState dummyNextState; + return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState); +} + +/** +* Extend the current matched n-gram with next word, calculate the prob and update the updated range +* the n-gram is represented by its position in the suffix array and the length +* @param currentMatchStart Starting position of the current matched n-gram in corpus +* @param currentMatchLen Length of the matched n-gram \ +* @param nextWord Vocabulary ID of the next word (the word to be predicted) +* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus +* @param &updatedMatchingLen The length of the extended n-gram +**/ +double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + + double logProb; + + double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN)); + memset(freqTable, 0, 2*this->maxN*sizeof(double)); + + this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, 
freqTable, updatedMatchingStart, updatedMatchingLen); + + logProb = this->calcLogProb(freqTable); + + free(freqTable); + + return logProb; + +} + +double C_SuffixArrayLanguageModel::calcLogProb(double *freq) +{ + switch(this->interpolationStrategy){ + case 'e': + return this->calcLogProb_equalWeightedInterpolation(freq); + break; + case 'i': + return this->calcLogProb_ibmHeuristicInterpolation(freq); + break; + case 'm': + return this->calcLogProb_maxProbInterpolation(freq); + break; + default: + cerr<<"Unknown interpolation strategy!\n"; + exit(0); + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq) +{ + double prob = 0.0; + + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + prob+=freq[2*i]/freq[2*i+1]; + } + else{ + stillMatched = false; + } + + i++; + } + + return log(prob/(double)this->maxN); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq) +{ + double prob = 0.0; + if(freq[0]==0){ //unknown word + return SALM_LOG_PROB_UNK; + } + + double remainingWeightSum = 1.0; + + //find the first non-zero match + int i = this->maxN - 1; + + while(freq[2*i]==0){ //will stop for sure because freq[0]!=0 + i--; + } + + for(int j=i;j>=0;j--){ + //for (j+1)-gram + double historyFreq = freq[2*j+1]; + double logHistoryFreq = log(historyFreq); + if(logHistoryFreq>1){ + logHistoryFreq = 1.0; //cap it to 1 + } + + double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history + double adjustedWeights = remainingWeightSum * reliability; + + prob+=adjustedWeights * freq[2*i]/freq[2*i+1]; + + remainingWeightSum -= adjustedWeights; + } + + return log(prob); +} + +double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq) +{ + double maxProb = 0.0; + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + 
while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + double prob=freq[2*i]/freq[2*i+1]; + + if(prob>maxProb){ + maxProb = prob; + } + } + else{ + stillMatched = false; + } + + i++; + } + + return log(maxProb); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord) +{ + return this->voc->returnId(aWord); +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ new file mode 100755 index 0000000..5241621 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ @@ -0,0 +1,690 @@ +/** +* Revision $Rev: 3815 $ +* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $ +**/ + +#include "_SuffixArrayLanguageModel.h" +#include <iostream> +#include <fstream> +#include <stdlib.h> +#include <memory.h> + +#include "math.h" + +using namespace std; + + +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel() +{ + +} + +C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel() +{ + +} + + +/** +* Construct the suffix array language model object +* Using the training data corpusFileNameStem that has been indexed by IndexSA +* Consider at most maxN-gram in language modeling +* For frequencies that are lower than maxFreqForDiscounting, use Good-Turing for discounting +* If maxFreqForDiscounting is set to be 0 or negative value, then discounting is turned off. Use MLE to estimate the probability of a word given history +* @param cfgFileName Configuration file that specifies the value of parameters for SALM +* +* Each line in the configuration file is a Keyword Value pair. Legal keywords are: +* CORPUS : corpusFileNameStem The training corpus filename used by IndexSA. Must be specified! 
+* N : Highest order of n considered for n-gram LM estimation, default value = 5 +* MAX_FREQ_DISC : When Good-Turing discounting is used, n-grams which have frequencies higher than this value will not be discounted. Negative value will disable the discounting. default value = -1. +* INTERPOLATION_STRATEGY : Set strategy to interploate the conditional probabilities of next word given different order of histories +* 'e' default. Equal weighted interpolation of unigram, bigram, trigram... probabiblities +* 'm' for using the maximum probabilty from all histories and use this value as P(next word | history) +* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable +**/ +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName) +{ + + fstream cfgFile; + cfgFile.open(cfgFileName,ios::in); + + if(!cfgFile){ + fprintf(stderr,"Configuration file does not exist! quit!!\n"); + exit(0); + } + + //----------------------------------------------------------------------------- + //reading parameters + char paraName[1024]; + char corpusFileNameStem[1024]; + corpusFileNameStem[0]=0; + this->maxFreqForDiscounting=-1; + + this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob + this->maxN = 5; // default value; consider up to 5 words + + while(!cfgFile.eof()){ + cfgFile>>paraName; + + if(strcmp(paraName,"CORPUS")==0){ + cfgFile>>corpusFileNameStem; + } + else if(strcmp(paraName,"N")==0){ + cfgFile>>this->maxN; + } + else if(strcmp(paraName,"MAX_FREQ_DISC")==0){ + cfgFile>>maxFreqForDiscounting; + } + else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){ + cfgFile>>this->interpolationStrategy; + } + + paraName[0]=0; + + } + + //load corpus and suffix array + if(strlen(corpusFileNameStem)==0){ + cerr<<"CORPUS need to be specified in the configuration file. 
This should be the corpus name used for LM.\n"; + exit(-1); + } + this->loadData_forSearch(corpusFileNameStem, false, true); //call the constructor of the super class to load suffix array for corpusName, with vocabulary, no offset, + + + //if apply discounting construct the discounting map + if(this->maxFreqForDiscounting<=0){ + this->applyDiscounting = false; + } + else{ + this->applyDiscounting = true; + this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n"; + exit(0); + } + + this->interpolationStrategy = 'e'; //default: interpolation strategy: equally weighted n-gram conditional prob + +} + + +/** +* Similar to the function in C_SuffixArrayScanningBase +* Scan the corpus to obtain count of counts information +* and construct the discounting using Good-Turing smoothing +**/ +void C_SuffixArrayLanguageModel::constructDiscountingMap() +{ + int i,j; + unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting); + + if(countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. 
Exit\n"; + exit(0); + } + + //initialize count of counts table + for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){ + countOfCountsTable[c]=0; + } + + //initialize the scanning list + S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + for(i=0;i<this->maxN;i++){ + nGramScanningList[i].freqSoFar=0; + nGramScanningList[i].vocId = 0; + nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( + (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==nGramScanningList[i].vocId)){ //still match + + nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + + for(j=i;j<this->maxN;j++){ + + + if(nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + freqSoFar = nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count 
for (j+1)-gram with freq freqSoFar + countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + nGramScanningList[j].freqSoFar = 0; + } + else{ + nGramScanningList[j].freqSoFar = 1; + } + + nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ + + freqSoFar = nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (i+1)-gram with freq freqSoFar + countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + } + + //now, use Good-Turing discounting to create frequency mapping + //still assign N*Freq table for simplicity, even though that for each N, only maxFreq-1 freq type will be discounted + this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting); + + for(i=0;i<this->maxN;i++){ + //for (i+1)-gram + + unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting; + double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting; + + for(int 
freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq + //for all (freq+1) ngrams + if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists + discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]); + } + else{ + discountingMapForThisN[freq] = -1; + } + } + + discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency + } + + + free(countOfCountsTable); + +} + +///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1] +///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord +///only need to get freq(w_n | history) of different history +///return in freq table, freq(history+Wn, history) for all the matched n +///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history +/// 3-gram freq, freq of 3-gram history +///freqTable should have length of 2*n +///return the longest match with this updated n-gram +void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + vector<IndexType> nGram; + + if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk> + if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram + currentMatchStart++; + currentMatchLen--; + } + + for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){ + nGram.push_back(this->corpus_list[pos]); + } + } + + nGram.push_back(nextWord); + + int sentLen = nGram.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram); + + int startPosForNgram; + int startPosForLongestMatchingWithNextWord; + int 
cellIndexForLongestMatchingWithNextWord; + + bool stillMatched = true; + bool atLeastOneMatched = false; + + int indexForNgram; + + unsigned int totalOccurrences; + unsigned int totalOccurrencesOfHistory; + + //for unigram + indexForNgram = sentLen - 1; + if(table[indexForNgram].found){ + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + if(this->applyDiscounting){ + freqTable[0] = this->discountFreq(1, totalOccurrences); + } + else{ + freqTable[0] = totalOccurrences; + } + + freqTable[1] = this->corpusSize; + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = sentLen-1; + atLeastOneMatched = true; + } + else{ + stillMatched = false; + } + + int n=2; //considering 2-gram and longer n-gram now + startPosForNgram = sentLen - 2; + while((stillMatched)&&(startPosForNgram>=0)){ + + indexForNgram = (n-1) * sentLen + startPosForNgram; + int indexForHistory = (n-2) * sentLen + startPosForNgram; + + if(table[indexForNgram].found){ + + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1; + + + if(this->applyDiscounting){ + freqTable[2*n-2] = this->discountFreq(n, totalOccurrences); + } + else{ + freqTable[2*n-2] = (double)totalOccurrences; + } + + freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history + + if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = startPosForNgram; + } + } + else{ + stillMatched = false; + } + + startPosForNgram--; + n++; + } + + if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord' + updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA]; + updatedMatchingLen = (unsigned char) 
(sentLen - startPosForLongestMatchingWithNextWord); + } + else{ + updatedMatchingStart = (TextLenType) -1; + updatedMatchingLen = 0; + } + + free(table); + +} + + +//given observedFreq of n-gram, return discounted freq using Good-Turing smoothing +double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq) +{ + if(n>=this->maxN){ //do not discount + return (double) observedFreq; + } + + if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq + return (double) observedFreq; + } + + //else, check the discount map + double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1]; + + if(discountedFreq>0){ + return discountedFreq; + } + + //else, no discounting + return (double) observedFreq; +} + + +///Start a new sentence now, clear up the sentence LM state +LMState C_SuffixArrayLanguageModel::beginOfSentenceState() +{ + + this->resetLmStates(); + this->initialLmState(); + + return 0; +} + +void C_SuffixArrayLanguageModel::initialLmState() +{ + //add sentence start + S_LMStateInfo sentStartNode; + sentStartNode.locationInCorpus.posInCorpus = 1; //if corpus is indexed correctly position 1 should be <s> + sentStartNode.locationInCorpus.len = 1; + sentStartNode.cachedNextWordExtension.clear(); + + this->allLMStates.push_back(sentStartNode); + this->ngramLocation2LmStateId.insert(make_pair(sentStartNode.locationInCorpus, 0)); +} + +void C_SuffixArrayLanguageModel::resetLmStates() +{ + this->allLMStates.clear(); + this->ngramLocation2LmStateId.clear(); +} + + +/** +* Given the current history (as represented by the 'lmState' +* caculate the log prob of nextWord given this history P(nextword|history) +* and return the updated language model state with next word appended to the history +* @param lmState Current language model state +* @param nextWord The vocId of the next word (the word to be predicted) +* @param &nextState Returning the updated language model state when the next word is 
appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState) +{ + if(lmState>=this->allLMStates.size()){ + cerr<<"Invalid LM State: "<<lmState<<endl; + exit(-1); + } + + //first check if we have already seen this 'nextWord' before + map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache; + iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find( nextWord ); + + if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){ //we haven't seen this lmState+word yet + + //search for it in the corpus + S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus; + S_NgramLocationInCorpus updatedNgramLocation; + + double logProb = this->logProbFromFreq( + correspondingNgramLocation.posInCorpus, + correspondingNgramLocation.len, + nextWord, + updatedNgramLocation.posInCorpus, + updatedNgramLocation.len); + + //caching the logprob of 'nextword' given the lmState + int updatedLmStateId; + map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId; + iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation); + if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){ //this updated lm state does not exist yet + S_LMStateInfo newLmStateNode; + + newLmStateNode.locationInCorpus = updatedNgramLocation; + newLmStateNode.cachedNextWordExtension.clear(); + + this->allLMStates.push_back(newLmStateNode); + updatedLmStateId = this->allLMStates.size() -1 ; + this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId)); + } + else{ + updatedLmStateId = iterNgramLocation2LmStateId->second; + } + + //cache this + S_CachedLmInfo cachedLmInfo; + cachedLmInfo.logProb = logProb; + cachedLmInfo.nextState = updatedLmStateId; + + this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo)); + + 
//updated next state + nextState = updatedLmStateId; + + return logProb; + } + + nextState = iterNextWordExtensionCache->second.nextState; + + return iterNextWordExtensionCache->second.logProb; +} + + +/** +* Given the history as lmState and append a phrase as a vector of IndexType, +* calculate the LM prob and update the lm state +* Modification suggested by Erik Peterson (eepter@cs.cmu.edu) to check the size of phrase. +* For cases where phrase is empty, i.e. phrase.size()==0, nextState will not be updated correctly and may cause problems in the calling function. + * @param lmState Current language model state +* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState) +{ + double logProb = 0; + + if (phrase.size() == 0) { + nextState = lmState; + return logProb; + } + + for(int i=0;i<phrase.size();i++){ + logProb+=this->logProb(lmState, phrase[i], nextState); + lmState = nextState; + } + + return logProb; +} + +/** +* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s> +**/ +double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState) +{ + LMState dummyNextState; + return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState); +} + +/** +* Extend the current matched n-gram with next word, calculate the prob and update the updated range +* the n-gram is represented by its position in the suffix array and the length +* @param currentMatchStart Starting position of the current matched n-gram in corpus +* @param currentMatchLen Length of the matched n-gram \ +* @param nextWord Vocabulary ID of the next word (the word to be predicted) +* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its 
starting position in the corpus +* @param &updatedMatchingLen The length of the extended n-gram +**/ +double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + + double logProb; + + double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN)); + memset(freqTable, 0, 2*this->maxN*sizeof(double)); + + this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen); + + logProb = this->calcLogProb(freqTable); + + free(freqTable); + + return logProb; + +} + +double C_SuffixArrayLanguageModel::calcLogProb(double *freq) +{ + switch(this->interpolationStrategy){ + case 'e': + return this->calcLogProb_equalWeightedInterpolation(freq); + break; + case 'i': + return this->calcLogProb_ibmHeuristicInterpolation(freq); + break; + case 'm': + return this->calcLogProb_maxProbInterpolation(freq); + break; + default: + cerr<<"Unknown interpolation strategy!\n"; + exit(0); + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq) +{ + double prob = 0.0; + + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + prob+=freq[2*i]/freq[2*i+1]; + } + else{ + stillMatched = false; + } + + i++; + } + + return log(prob/(double)this->maxN); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq) +{ + double prob = 0.0; + if(freq[0]==0){ //unknown word + return SALM_LOG_PROB_UNK; + } + + double remainingWeightSum = 1.0; + + //find the first non-zero match + int i = this->maxN - 1; + + while(freq[2*i]==0){ //will stop for sure because freq[0]!=0 + i--; + } + + for(int j=i;j>=0;j--){ + //for (j+1)-gram + double historyFreq = freq[2*j+1]; + 
double logHistoryFreq = log(historyFreq); + if(logHistoryFreq>1){ + logHistoryFreq = 1.0; //cap it to 1 + } + + double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history + double adjustedWeights = remainingWeightSum * reliability; + + prob+=adjustedWeights * freq[2*i]/freq[2*i+1]; + + remainingWeightSum -= adjustedWeights; + } + + return log(prob); +} + +double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq) +{ + double maxProb = 0.0; + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + double prob=freq[2*i]/freq[2*i+1]; + + if(prob>maxProb){ + maxProb = prob; + } + } + else{ + stillMatched = false; + } + + i++; + } + + return log(maxProb); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord) +{ + return this->voc->returnId(aWord); +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h new file mode 100755 index 0000000..62427e5 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h @@ -0,0 +1,137 @@ +// Revision $Rev: 3794 $ +// Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ + +#if ! 
defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__) +#define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__ + + +#include "_SuffixArraySearchApplicationBase.h" +#include "salm_shared.h" + +/** +* \ingroup lm +**/ +typedef unsigned int LMState; + + +/** +* \ingroup lm +**/ +typedef struct s_cachedLmInfo{ + int nextState; + double logProb; +}S_CachedLmInfo; + +/** +* \ingroup lm +**/ +typedef struct s_NgramLocationInCorpus{ + TextLenType posInCorpus; + unsigned char len; +}S_NgramLocationInCorpus; + +/** +* \ingroup lm +**/ +typedef struct s_lmStateInfo{ + S_NgramLocationInCorpus locationInCorpus; + map<IndexType, S_CachedLmInfo> cachedNextWordExtension; //cached information of this LMState extended by the next word +}S_LMStateInfo; + +/** +* \ingroup lm +**/ +struct lt_ngramLocationInCorpus +{ + bool operator()(S_NgramLocationInCorpus a, S_NgramLocationInCorpus b) const{ + if(a.posInCorpus<b.posInCorpus){ + return true; + } + + if(a.posInCorpus>b.posInCorpus){ + return false; + } + + if(a.len<b.len){ + return true; + } + + return false; + } +}; + + +/** +* \ingroup lm +* C_SuffixArrayLanguageModel inherit the C_SuffixArraySearchApplicationBase class and C_SuffixArrayScanningBase +* to provide functionalities of estimating the likelihood of a sentence given an indexed training corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase +{ + +public: + IndexType returnVocId(C_String aWord); + + /// At the beginning of a sentence, return the LMState and reset the cache + LMState beginOfSentenceState(); + + /// Calculate the log prob of a word predicted by the history LM state + double logProb(LMState lmState, IndexType nextWord, LMState & nextState); + + /// The log prob of a phrase extending the history as a LMState + double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState); + + /// End of sentence + 
double logProbEnd(LMState lmState); + + ///set the interploation strategy + void setParam_interpolationStrategy(char interpolationStrategy); + + + C_SuffixArrayLanguageModel(const char * cfgFileName); + C_SuffixArrayLanguageModel(); + ~C_SuffixArrayLanguageModel(); + + +private: + + void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + + //Log prob calculation + double logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + double calcLogProb(double *freq); + double calcLogProb_equalWeightedInterpolation(double *freq); + double calcLogProb_ibmHeuristicInterpolation(double *freq); + double calcLogProb_maxProbInterpolation(double * freq); + + char interpolationStrategy; + int maxN; + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + + ///Discounting + void constructDiscountingMap(); + double *discountingMap; + double discountFreq(int n, unsigned int observedFreq); + bool applyDiscounting; + int maxFreqForDiscounting; + S_nGramScanningInfoElement * nGramScanningList; + + + ///LM State and related functions + void resetLmStates(); + void initialLmState(); + + //caching lm prob for each sentence + vector<S_LMStateInfo> allLMStates; + map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus> ngramLocation2LmStateId; + + + +}; + +#endif diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp new file mode 100755 index 0000000..d7c96a2 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp @@ -0,0 +1,34 @@ + +#include "_SuffixArrayScanningBase.h" +#include "stdio.h" +#include 
"stdlib.h" +#include <iostream> +#include <fstream> +#include <map> + +using namespace std; + +/** +* Given a corpus indexed by its suffix array, output the count-of-count information +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + //----------------------------------------------------------------------------- + if(argc<4){ + fprintf(stderr,"\nGiven an indexed corpus, output the count of counts for n-grams.\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem maxN maxFreq\n\n",argv[0]); + exit(0); + } + + unsigned int maxN = atoi(argv[2]); + unsigned int maxFreq = atoi(argv[3]); + + C_SuffixArrayScanningBase saObj(argv[1], maxN); + saObj.scanSuffixArrayForCountofCounts(maxFreq); + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp new file mode 100755 index 0000000..8e9544a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp @@ -0,0 +1,70 @@ +#include "_SuffixArrayScanningBase.h" +#include "stdio.h" +#include "stdlib.h" +#include <iostream> +#include <fstream> +#include <map> + +using namespace std; + +/** +* Output n-gram types that have frequencies equal or higher than specified +* +* +* CfgFile Format: +* n1<tab>freq thresh for output n1-gram +* n2<tab>freq thresh for output n2-gram +* ... ... ... 
+* n1<tab>freq thresh for output n1-gram +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + //----------------------------------------------------------------------------- + if(argc<3){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem cfgFile\n\n",argv[0]); + + fprintf(stderr,"\n\tCfgFile Format:"); + fprintf(stderr,"\n\t\tn1<tab>freq thresh for output n1-gram"); + fprintf(stderr,"\n\t\tn2<tab>freq thresh for output n2-gram"); + fprintf(stderr,"\n\t\t... ... ..."); + fprintf(stderr,"\n\t\tn1<tab>freq thresh for output n1-gram\n"); + + + exit(0); + } + + //processing the threshold file + map<int, unsigned int> threshMap; + map<int, unsigned int>::iterator iterThreshMap; + fstream threshFile; + threshFile.open(argv[2]); + int n; + int maxN = 0; + unsigned int thresh; + while(! 
threshFile.eof()){ + threshFile>>n>>thresh; + if(n>maxN){ + maxN=n; + } + iterThreshMap = threshMap.find(n); + if(iterThreshMap==threshMap.end()){ + threshMap.insert(make_pair(n,thresh)); //a little over-kill here, should have a well defined cfg file + } + } + + C_SuffixArrayScanningBase saObj(argv[1], maxN); + iterThreshMap = threshMap.begin(); + while(iterThreshMap!=threshMap.end()){ + saObj.setNgramOutputFreqThresh(iterThreshMap->first, iterThreshMap->second); + iterThreshMap++; + } + + saObj.scanSuffixArrayForHighFreqNgramType(); + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp new file mode 100755 index 0000000..35f9d3d --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp @@ -0,0 +1,32 @@ +#include "_SuffixArrayScanningBase.h" +#include "stdio.h" +#include "stdlib.h" +#include <iostream> +#include <fstream> +#include <map> + +using namespace std; + +/** +* Given an indexed corpus, output the type/token information of the n-grams in the corpus. 
+* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + //----------------------------------------------------------------------------- + if(argc<3){ + fprintf(stderr,"\nGiven an indexed corpus, output the type token information for n-grams.\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem maxN \n\n",argv[0]); + exit(0); + } + + unsigned int maxN = atoi(argv[2]); + + C_SuffixArrayScanningBase saObj(argv[1], maxN); + saObj.scanSuffixArrayForTypeToken(); + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp new file mode 100755 index 0000000..9050408 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp @@ -0,0 +1,338 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include "_SuffixArrayScanningBase.h" +#include <iostream> +#include <stdlib.h> + +using namespace std; + +C_SuffixArrayScanningBase::C_SuffixArrayScanningBase() +{ + this->countOfCountsTable = 0; //no memory has been allocated + this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough +} + +C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int maxN) +{ + this->countOfCountsTable = 0; //no memory has been allocated + this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough + + //load suffix array + this->loadData(filename, false, true, true); + + this->initializeForScanning(filename, maxN); +} + +void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered) +{ + this->maxFreqConsidered = maxFreqConsidered; +} + + +/** +* Initialize data 
structure needed for scanning after the suffix array has been loaded +**/ +void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN) +{ + this->maxN = maxN; + this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + this->countOfCountsTable = 0; //no memory has been allocated + + //initialize the scanning list + for(int i=0;i<this->maxN;i++){ + this->nGramScanningList[i].freqSoFar=0; + this->nGramScanningList[i].vocId = 0; + this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n"; + exit(0); + } +} + +C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase() +{ + free(this->nGramScanningList); + + if(this->countOfCountsTable!=0){ + free(this->countOfCountsTable); + } + +} + +void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh) +{ + if(n>this->maxN){ + cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl; + exit(0); + } + + this->nGramScanningList[n-1].freqThreshForOutput = freqThresh; +} + +void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType() +{ + this->scanSuffixArray('H'); + +} + +/// Count of counts is the number of n-gram types that occur a certain times in the corpus. 
+/// Count of counts is important information in LM smoothing +/// We scan the corpus for n-gram's type/token frequency and collect information for 1-gram, 2-gram,...and up to maxFreqConsidered-gram +void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered) +{ + this->maxFreqConsidered = maxFreqConsidered; + this->constructCountOfCountsTable(); + + //output the count of counts + cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl; + for(int i=0;i<this->maxN;i++){ + cout<<i+1<<endl; + + unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered; + for(int freq=0;freq<maxFreqConsidered;freq++){ + cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl; + } + } + +} + +///Check from 1-gram to maxN-gram for type-token information +///the process is similar to "scanSuffixArrayForHighFreqNgramType" +void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken() +{ + this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN); + this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN); + + //initialize + for(int n=0;n<maxN;n++){ + this->typeFreq[n]=0; + this->tokenFreq[n]=0; + } + + + //scan the suffix array + this->scanSuffixArray('T'); + + //output + cout<<"n\tType\tToken\n"; + for(int i=0;i<this->maxN;i++){ + cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl; + } +} + +/** +* Allocate memory for count-of-counts table and scan the corpus to fill in count of counts +* memory will be freed in the destructor +**/ +void C_SuffixArrayScanningBase::constructCountOfCountsTable() +{ + if(this->countOfCountsTable!=0){ //if there is already a count of counts table + free(this->countOfCountsTable); + } + + this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered); + + if(this->countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. 
Exit\n"; + exit(0); + } + + for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){ + this->countOfCountsTable[c]=0; + } + + this->scanSuffixArray('C'); + + +} + +/** +* Scan through the indexed corpus and according to the action type, +* perform actions accordingly when seeing a new n-gram type +**/ +void C_SuffixArrayScanningBase::scanSuffixArray(char actionType) +{ + + int i,j; + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( + (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match + + this->nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + C_String tmpPhrase; //for output high freq n-grams + + //prepare the prefix of the n-grams + if(actionType=='H'){ + //common i-gram + for(j=0;j<=i-1;j++){ + if(this->nGramScanningList[j].vocId==0){ //one of the word in the common i-gram is a NULL word, not a valid n-gram + validNgramUpSoFar = false; + } + tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId)); + tmpPhrase.appending(C_String(" ")); + } + } + + + for(j=i;j<this->maxN;j++){ + + + if(this->nGramScanningList[j].vocId==0){ //a NULL 
word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + switch(actionType){ + + case 'C': //count of counts + freqSoFar = this->nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){ + //increase the count for (j+1)-gram with freq freqSoFar + this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++; + } + break; + + case 'H': //output high-freq n-grams + tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId)); + tmpPhrase.appending(C_String(" ")); + + if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){ + cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl; + } + break; + + case 'T': //type-token statistics + if(this->nGramScanningList[j].freqSoFar>0){ + typeFreq[j]++; + } + + tokenFreq[j]+=this->nGramScanningList[j].freqSoFar; + + break; + default: + cerr<<"Unknown action!\n"; + exit(-1); + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + this->nGramScanningList[j].freqSoFar = 0; + } + else{ + this->nGramScanningList[j].freqSoFar = 1; + } + + this->nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + C_String finalTmpString; //for output 
high-freq n-gram type + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(this->nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ + switch(actionType){ + case 'C': //for count-of-counts + freqSoFar = this->nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){ + //increase the count for (i+1)-gram with freq freqSoFar + this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++; + } + break; + + case 'H': //for high-freq n-gram types + finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId)); + finalTmpString.appending(C_String(" ")); + if(this->nGramScanningList[i].freqSoFar>this->nGramScanningList[i].freqThreshForOutput){ + cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl; + } + break; + + case 'T': //for type-token statistics + if(this->nGramScanningList[i].freqSoFar>0){ + typeFreq[i]++; + } + + tokenFreq[i]+=this->nGramScanningList[i].freqSoFar; + break; + + default: + cerr<<"Unknown action!\n"; + exit(-1); + } + } + } + +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~ new file mode 100755 index 0000000..fd8bae8 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~ @@ -0,0 +1,338 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include "_SuffixArrayScanningBase.h" +#include <iostream> +#include <cstring> + +using namespace std; + +C_SuffixArrayScanningBase::C_SuffixArrayScanningBase() +{ + this->countOfCountsTable = 0; //no memory has been allocated + this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough +} + +C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int 
maxN) +{ + this->countOfCountsTable = 0; //no memory has been allocated + this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough + + //load suffix array + this->loadData(filename, false, true, true); + + this->initializeForScanning(filename, maxN); +} + +void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered) +{ + this->maxFreqConsidered = maxFreqConsidered; +} + + +/** +* Initialize data structure needed for scanning after the suffix array has been loaded +**/ +void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN) +{ + this->maxN = maxN; + this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + this->countOfCountsTable = 0; //no memory has been allocated + + //initialize the scanning list + for(int i=0;i<this->maxN;i++){ + this->nGramScanningList[i].freqSoFar=0; + this->nGramScanningList[i].vocId = 0; + this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. 
Critical error.\n"; + exit(0); + } +} + +C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase() +{ + free(this->nGramScanningList); + + if(this->countOfCountsTable!=0){ + free(this->countOfCountsTable); + } + +} + +void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh) +{ + if(n>this->maxN){ + cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl; + exit(0); + } + + this->nGramScanningList[n-1].freqThreshForOutput = freqThresh; +} + +void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType() +{ + this->scanSuffixArray('H'); + +} + +/// Count of counts is the number of n-gram types that occur a certain times in the corpus. +/// Count of counts is important information in LM smoothing +/// We scan the corpus for n-gram's type/token frequency and collect information for 1-gram, 2-gram,...and up to maxFreqConsidered-gram +void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered) +{ + this->maxFreqConsidered = maxFreqConsidered; + this->constructCountOfCountsTable(); + + //output the count of counts + cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl; + for(int i=0;i<this->maxN;i++){ + cout<<i+1<<endl; + + unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered; + for(int freq=0;freq<maxFreqConsidered;freq++){ + cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl; + } + } + +} + +///Check from 1-gram to maxN-gram for type-token information +///the process is similar to "scanSuffixArrayForHighFreqNgramType" +void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken() +{ + this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN); + this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN); + + //initialize + for(int n=0;n<maxN;n++){ + this->typeFreq[n]=0; + this->tokenFreq[n]=0; + } + + + //scan the suffix array + this->scanSuffixArray('T'); + + //output + cout<<"n\tType\tToken\n"; + for(int i=0;i<this->maxN;i++){ + 
cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl; + } +} + +/** +* Allocate memory for count-of-counts table and scan the corpus to fill in count of counts +* memory will be freed in the destructor +**/ +void C_SuffixArrayScanningBase::constructCountOfCountsTable() +{ + if(this->countOfCountsTable!=0){ //if there is already a count of counts table + free(this->countOfCountsTable); + } + + this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered); + + if(this->countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. Exit\n"; + exit(0); + } + + for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){ + this->countOfCountsTable[c]=0; + } + + this->scanSuffixArray('C'); + + +} + +/** +* Scan through the indexed corpus and according to the action type, +* perform actions accordingly when seeing a new n-gram type +**/ +void C_SuffixArrayScanningBase::scanSuffixArray(char actionType) +{ + + int i,j; + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( + (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match + + this->nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n 
because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + C_String tmpPhrase; //for output high freq n-grams + + //prepare the prefix of the n-grams + if(actionType=='H'){ + //common i-gram + for(j=0;j<=i-1;j++){ + if(this->nGramScanningList[j].vocId==0){ //one of the word in the common i-gram is a NULL word, not a valid n-gram + validNgramUpSoFar = false; + } + tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId)); + tmpPhrase.appending(C_String(" ")); + } + } + + + for(j=i;j<this->maxN;j++){ + + + if(this->nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + switch(actionType){ + + case 'C': //count of counts + freqSoFar = this->nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){ + //increase the count for (j+1)-gram with freq freqSoFar + this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++; + } + break; + + case 'H': //output high-freq n-grams + tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId)); + tmpPhrase.appending(C_String(" ")); + + if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){ + cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl; + } + break; + + case 'T': //type-token statistics + if(this->nGramScanningList[j].freqSoFar>0){ + typeFreq[j]++; + } + + tokenFreq[j]+=this->nGramScanningList[j].freqSoFar; + + break; + default: + cerr<<"Unknown action!\n"; + exit(-1); + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + 
if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + this->nGramScanningList[j].freqSoFar = 0; + } + else{ + this->nGramScanningList[j].freqSoFar = 1; + } + + this->nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + C_String finalTmpString; //for output high-freq n-gram type + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(this->nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ + switch(actionType){ + case 'C': //for count-of-counts + freqSoFar = this->nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){ + //increase the count for (i+1)-gram with freq freqSoFar + this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++; + } + break; + + case 'H': //for high-freq n-gram types + finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId)); + finalTmpString.appending(C_String(" ")); + if(this->nGramScanningList[i].freqSoFar>this->nGramScanningList[i].freqThreshForOutput){ + cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl; + } + break; + + case 'T': //for type-token statistics + if(this->nGramScanningList[i].freqSoFar>0){ + typeFreq[i]++; + } + + tokenFreq[i]+=this->nGramScanningList[i].freqSoFar; + break; + + default: + cerr<<"Unknown action!\n"; + exit(-1); + } + } + } + +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h 
b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h new file mode 100755 index 0000000..c517b72 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h @@ -0,0 +1,53 @@ +#if !defined (_HEADER_SUFFIX_ARRAY_SCANNING_BASE_CLASS_) +#define _HEADER_SUFFIX_ARRAY_SCANNING_BASE_CLASS_ + + +#include "_SuffixArrayApplicationBase.h" + + + + +/** +* \ingroup scan +* C_SuffixArrayScanningBase class provides functions to scan through an indexed corpus +* and output information such as the type/token frequency of the data +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_SuffixArrayScanningBase : public C_SuffixArrayApplicationBase +{ +public: + void setNgramOutputFreqThresh(int n, unsigned int freqThresh); + void scanSuffixArrayForHighFreqNgramType(); + void scanSuffixArrayForCountofCounts(int maxFreqConsidered); + void scanSuffixArrayForTypeToken(); + + C_SuffixArrayScanningBase(const char * filename, unsigned int maxN); + C_SuffixArrayScanningBase(); + ~C_SuffixArrayScanningBase(); + +protected: + void setParam_maxFreqConsidered(int maxFreqConsidered); + void constructCountOfCountsTable(); + void initializeForScanning(const char * filename, unsigned int maxN); + + int maxN; + int maxFreqConsidered; + + unsigned int * countOfCountsTable; + + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + +private: + void scanSuffixArray(char actionType); + + S_nGramScanningInfoElement * nGramScanningList; + + + unsigned int * typeFreq; + unsigned int * tokenFreq; +}; + +#endif diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp new file mode 100755 index 0000000..24b8cc4 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp @@ -0,0 +1,130 @@ 
+#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <fstream> +#include <vector> +#include <cstring> + +using namespace std; +int SHOW_DEBUG_INFO = 0; + +typedef struct s_ngram_freq_info{ + C_String ngramText; + vector<IndexType> ngram; + unsigned int freq; +}S_Ngram_Freq_Info; + +/** +* Given several corpora indexed by their suffix array, +* collect counts of n-grams in a list from all the corpora. +* This is useful when a corpus is very large, +* one can split the data into many chunks and sum up the n-gram frquencies. +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //check parameters + if(argc<2){ + cerr<<"\n-------------------------------------------"; + cerr<<"\nUsage:"; + cerr<<"\n\t"<<argv[0]<<" n-gram_list_filename < list of suffix arry used"; + cerr<<"\nNote:"; + cerr<<"\n\tn-gram_list_filename.id_voc must exist first."; + cerr<<"\n-------------------------------------------\n\n"; + + exit(0); + } + + //load vocabulary + char id_voc_filename[1024]; + sprintf(id_voc_filename, "%s.id_voc", argv[1]); + C_IDVocabulary voc(id_voc_filename); + + //load the n-gram list + vector<S_Ngram_Freq_Info> ngramList; + + ifstream NgramListFile; + NgramListFile.open(argv[1]); + char tmpString[4096]; + while(!NgramListFile.eof()){ + + NgramListFile.getline(tmpString, 4096, '\n'); + + if(strlen(tmpString)>0){ + S_Ngram_Freq_Info tmpNode; + tmpNode.ngramText = C_String(tmpString); + tmpNode.freq = 1; + tmpNode.ngram.clear(); + + //conver the n-gram as string to vocId + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + int pos = 0; + int inputLen = strlen(tmpString); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = tmpString[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = '\0'; + 
tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken))); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken))); + } + + ngramList.push_back(tmpNode); + } + tmpString[0]='\0'; + } + cerr<<"Total "<<ngramList.size()<<" ngrams loaded.\n"; + + //loop over all suffix array and collec the n-gram counts + char sa_filename[1024]; + while(! cin.eof()){ + cin>>sa_filename; + + if(strlen(sa_filename)>0){ + cerr<<"Considering "<<sa_filename<<endl; + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(sa_filename, true, true); + + for(int i=0; i<ngramList.size(); i++){ + unsigned int freq; + + freq = sa.freqOfExactPhraseMatch(ngramList[i].ngram); + + ngramList[i].freq+=freq; + } + } + + sa_filename[0]=0; + } + + + for(int m=0;m<ngramList.size();m++){ + cout<<ngramList[m].freq<<"\t"; + cout<<ngramList[m].ngramText.toString()<<"\n"; + } + + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~ new file mode 100755 index 0000000..492b770 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~ @@ -0,0 +1,129 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <fstream> +#include <vector> + +using namespace std; +int SHOW_DEBUG_INFO = 0; + +typedef struct s_ngram_freq_info{ + C_String ngramText; + vector<IndexType> ngram; + unsigned int freq; +}S_Ngram_Freq_Info; + +/** +* Given several corpora indexed by their suffix array, +* collect counts of n-grams in a list from all the corpora. 
+* This is useful when a corpus is very large, +* one can split the data into many chunks and sum up the n-gram frquencies. +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //check parameters + if(argc<2){ + cerr<<"\n-------------------------------------------"; + cerr<<"\nUsage:"; + cerr<<"\n\t"<<argv[0]<<" n-gram_list_filename < list of suffix arry used"; + cerr<<"\nNote:"; + cerr<<"\n\tn-gram_list_filename.id_voc must exist first."; + cerr<<"\n-------------------------------------------\n\n"; + + exit(0); + } + + //load vocabulary + char id_voc_filename[1024]; + sprintf(id_voc_filename, "%s.id_voc", argv[1]); + C_IDVocabulary voc(id_voc_filename); + + //load the n-gram list + vector<S_Ngram_Freq_Info> ngramList; + + ifstream NgramListFile; + NgramListFile.open(argv[1]); + char tmpString[4096]; + while(!NgramListFile.eof()){ + + NgramListFile.getline(tmpString, 4096, '\n'); + + if(strlen(tmpString)>0){ + S_Ngram_Freq_Info tmpNode; + tmpNode.ngramText = C_String(tmpString); + tmpNode.freq = 1; + tmpNode.ngram.clear(); + + //conver the n-gram as string to vocId + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + int pos = 0; + int inputLen = strlen(tmpString); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = tmpString[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = '\0'; + tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken))); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. 
Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken))); + } + + ngramList.push_back(tmpNode); + } + tmpString[0]='\0'; + } + cerr<<"Total "<<ngramList.size()<<" ngrams loaded.\n"; + + //loop over all suffix array and collec the n-gram counts + char sa_filename[1024]; + while(! cin.eof()){ + cin>>sa_filename; + + if(strlen(sa_filename)>0){ + cerr<<"Considering "<<sa_filename<<endl; + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(sa_filename, true, true); + + for(int i=0; i<ngramList.size(); i++){ + unsigned int freq; + + freq = sa.freqOfExactPhraseMatch(ngramList[i].ngram); + + ngramList[i].freq+=freq; + } + } + + sa_filename[0]=0; + } + + + for(int m=0;m<ngramList.size();m++){ + cout<<ngramList[m].freq<<"\t"; + cout<<ngramList[m].ngramText.toString()<<"\n"; + } + + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp new file mode 100755 index 0000000..9d47f3a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp @@ -0,0 +1,72 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <map> +#include <cstring> + +using namespace std; + +/** +* Given a corpus indexed by its suffix array, filter out the duplicated sentences in the data +* and output the unique sentences within. 
+* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem < original corpus > corpus with uniq sentences\n",argv[0]); + + exit(0); + } + + map< pair<TextLenType, int>, bool> duplicatedSentAlreadyOutput; + map< pair<TextLenType, int>, bool>::iterator iterDuplicatedSentAlreadyOutput; + + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(argv[1], false, true); + + unsigned long totalFilteredSent = 0; + + cerr<<"Filtering duplicated sentences:\n"; + char tmpString[4000]; + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + TextLenType freq = 0; + TextLenType firstOccurrence; + int sentLen; + + freq = sa.freqOfExactPhraseMatchAndFirstOccurrence(tmpString, firstOccurrence, sentLen); + + if(freq>1){ //freq is at least 1, because this is the same corpus + //then there are multiple occurrences of this sentence + //check if we have already output it + iterDuplicatedSentAlreadyOutput = duplicatedSentAlreadyOutput.find(make_pair(firstOccurrence, sentLen)); + + if(iterDuplicatedSentAlreadyOutput == duplicatedSentAlreadyOutput.end()){ //we haven't output it + cout<<tmpString<<endl; + duplicatedSentAlreadyOutput.insert(make_pair(make_pair(firstOccurrence, sentLen), true)); + } + else{ + //it has been output already, ignore it + totalFilteredSent++; + } + } + else{ //freq==1, no duplication + cout<<tmpString<<endl; + } + + } + } + + cerr<<"Total "<<totalFilteredSent<<" duplicated sentences are filtered\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~ new file mode 100755 index 0000000..1278b3f 
--- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~ @@ -0,0 +1,71 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <map> + +using namespace std; + +/** +* Given a corpus indexed by its suffix array, filter out the duplicated sentences in the data +* and output the unique sentences within. +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem < original corpus > corpus with uniq sentences\n",argv[0]); + + exit(0); + } + + map< pair<TextLenType, int>, bool> duplicatedSentAlreadyOutput; + map< pair<TextLenType, int>, bool>::iterator iterDuplicatedSentAlreadyOutput; + + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(argv[1], false, true); + + unsigned long totalFilteredSent = 0; + + cerr<<"Filtering duplicated sentences:\n"; + char tmpString[4000]; + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + TextLenType freq = 0; + TextLenType firstOccurrence; + int sentLen; + + freq = sa.freqOfExactPhraseMatchAndFirstOccurrence(tmpString, firstOccurrence, sentLen); + + if(freq>1){ //freq is at least 1, because this is the same corpus + //then there are multiple occurrences of this sentence + //check if we have already output it + iterDuplicatedSentAlreadyOutput = duplicatedSentAlreadyOutput.find(make_pair(firstOccurrence, sentLen)); + + if(iterDuplicatedSentAlreadyOutput == duplicatedSentAlreadyOutput.end()){ //we haven't output it + cout<<tmpString<<endl; + duplicatedSentAlreadyOutput.insert(make_pair(make_pair(firstOccurrence, sentLen), true)); + } + else{ + //it has been output already, ignore it 
+ totalFilteredSent++; + } + } + else{ //freq==1, no duplication + cout<<tmpString<<endl; + } + + } + } + + cerr<<"Total "<<totalFilteredSent<<" duplicated sentences are filtered\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp new file mode 100755 index 0000000..3daf337 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp @@ -0,0 +1,47 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <cstring> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + + +/** +* Application main functionL ExactNgramMatchingFreq +* Input from stdin ngrams with each line containing one n-gram +* Search the corpus for the occurrences of each n-gram and output their frequencies in the corpus +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem \n",argv[0]); + + exit(0); + } + + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(argv[1], false, true); //we need vocabulary, but do not need offset information here + + cerr<<"Input N-grams:\n"; + char tmpString[1000]; + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + TextLenType freq = 0; + freq = sa.freqOfExactPhraseMatch(tmpString); + cout<<freq<<": "<<tmpString<<endl; + } + } + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~ new file mode 100755 index 0000000..4c63c0b --- /dev/null +++ 
b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~ @@ -0,0 +1,46 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + + +/** +* Application main functionL ExactNgramMatchingFreq +* Input from stdin ngrams with each line containing one n-gram +* Search the corpus for the occurrences of each n-gram and output their frequencies in the corpus +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem \n",argv[0]); + + exit(0); + } + + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(argv[1], false, true); //we need vocabulary, but do not need offset information here + + cerr<<"Input N-grams:\n"; + char tmpString[1000]; + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + TextLenType freq = 0; + freq = sa.freqOfExactPhraseMatch(tmpString); + cout<<freq<<": "<<tmpString<<endl; + } + } + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp new file mode 100755 index 0000000..421e503 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp @@ -0,0 +1,85 @@ +#include "stdio.h" +#include "stdlib.h" +#include <vector> +#include <iostream> +#include <cstring> +#include "_SuffixArraySearchApplicationBase.h" + +using namespace std; + + +/** +* Return locations of all the embedded n-grams of a sentence in the indexed corpus +* +* Revison $Rev: 3794 $ +* Last modified: 
$LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + + //----------------------------------------------------------------------------- + //check arguments + if(argc<2){ + fprintf(stderr,"\n\nOutput locations of all the matched embedded n-grams of a sentence in an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem [highestFreq maxRet smallestUnit longestUnit] < list of sentences\n\n",argv[0]); + + exit(-1); + } + + + int highFreq; + int maxRet; + int smallestUnit; + int longestUnit; + + C_SuffixArraySearchApplicationBase saObj; + + saObj.loadData_forSearch(argv[1], false, false); + + if(argc>=6){ //if argument of highestFreq, maxRet, smallestUnits are set + highFreq = atoi(argv[2]); + maxRet = atoi(argv[3]); + smallestUnit = atoi(argv[4]); + longestUnit = atoi(argv[5]); + + saObj.setParam_highestFreqThresholdForReport(highFreq); + saObj.setParam_reportMaxOccurrenceOfOneNgram(maxRet); + saObj.setParam_shortestUnitToReport(smallestUnit); + saObj.setParam_longestUnitToReport(longestUnit); + } + + cerr<<"Input sentences:\n"; + + char sentence[10000]; + + while(!cin.eof()){ + cin.getline(sentence,10000,'\n'); + if(strlen(sentence)>0){ + + vector<C_String> sentAsCStringVector = saObj.convertCharStringToCStringVector(sentence); //for later display purpose + + + vector<S_phraseLocationElement> locations; + locations = saObj.findPhrasesInASentence(sentence); + + if(locations.size()==0){ + cout<<"Nothing can be found in the corpus.\n"; + } + else{ + for(int i=0;i<locations.size(); i++){ + cout<<"N-gram ["<<(int)locations[i].posStartInSrcSent<<", "<<(int)locations[i].posEndInSrcSent<<"]: "; + for(int j=locations[i].posStartInSrcSent; j<=locations[i].posEndInSrcSent; j++){ + cout<<sentAsCStringVector[j-1].toString()<<" "; + } + cout<<" found in corpus: "; + cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl; + } + } + cout<<endl; + } + 
} + + + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~ new file mode 100755 index 0000000..cd7a86a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~ @@ -0,0 +1,84 @@ +#include "stdio.h" +#include "stdlib.h" +#include <vector> +#include <iostream> +#include "_SuffixArraySearchApplicationBase.h" + +using namespace std; + + +/** +* Return locations of all the embedded n-grams of a sentence in the indexed corpus +* +* Revison $Rev: 3794 $ +* Last modified: $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + + //----------------------------------------------------------------------------- + //check arguments + if(argc<2){ + fprintf(stderr,"\n\nOutput locations of all the matched embedded n-grams of a sentence in an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem [highestFreq maxRet smallestUnit longestUnit] < list of sentences\n\n",argv[0]); + + exit(-1); + } + + + int highFreq; + int maxRet; + int smallestUnit; + int longestUnit; + + C_SuffixArraySearchApplicationBase saObj; + + saObj.loadData_forSearch(argv[1], false, false); + + if(argc>=6){ //if argument of highestFreq, maxRet, smallestUnits are set + highFreq = atoi(argv[2]); + maxRet = atoi(argv[3]); + smallestUnit = atoi(argv[4]); + longestUnit = atoi(argv[5]); + + saObj.setParam_highestFreqThresholdForReport(highFreq); + saObj.setParam_reportMaxOccurrenceOfOneNgram(maxRet); + saObj.setParam_shortestUnitToReport(smallestUnit); + saObj.setParam_longestUnitToReport(longestUnit); + } + + cerr<<"Input sentences:\n"; + + char sentence[10000]; + + while(!cin.eof()){ + cin.getline(sentence,10000,'\n'); + if(strlen(sentence)>0){ + + vector<C_String> sentAsCStringVector = 
saObj.convertCharStringToCStringVector(sentence); //for later display purpose + + + vector<S_phraseLocationElement> locations; + locations = saObj.findPhrasesInASentence(sentence); + + if(locations.size()==0){ + cout<<"Nothing can be found in the corpus.\n"; + } + else{ + for(int i=0;i<locations.size(); i++){ + cout<<"N-gram ["<<(int)locations[i].posStartInSrcSent<<", "<<(int)locations[i].posEndInSrcSent<<"]: "; + for(int j=locations[i].posStartInSrcSent; j<=locations[i].posEndInSrcSent; j++){ + cout<<sentAsCStringVector[j-1].toString()<<" "; + } + cout<<" found in corpus: "; + cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl; + } + } + cout<<endl; + } + } + + + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp new file mode 100755 index 0000000..deb8b81 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp @@ -0,0 +1,67 @@ +#include "stdio.h" +#include "stdlib.h" + +#include "_SuffixArraySearchApplicationBase.h" + +#include <vector> +#include <iostream> +#include <cstring> + +using namespace std; + +/** +* \ingroup search +* +* Locate an n-gram in the indexed corpus, return its locations as <sentId, offsetInSent> pairs +* SentID and offset are all 1-based +* +* Note: +* The offset of the n-gram in a sentence is represented as "char" in the returned structure S_SimplePhraseLocationElement +* To output it as a number, one needs to cast it to integer type for proper display +* +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nOutput all the locations of an n-gram in an indexed corpus\n"); + 
fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < list of n-grams\n\n",argv[0]); + + exit(-1); + } + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase saObj; + + //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false) + saObj.loadData_forSearch(argv[1], false, false); + + + cerr<<"Input N-grams:\n"; + char tmpString[10000]; + while(!cin.eof()){ + cin.getline(tmpString,10000,'\n'); + if(strlen(tmpString)>0){ + vector<S_SimplePhraseLocationElement> locations; + + locations = saObj.locateExactPhraseInCorpus(tmpString); + + if(locations.size()==0){ + cout<<"No occurrences found.\n"; + } + else{ + for(int i=0;i<locations.size(); i++){ + cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl; + } + } + cout<<endl; + } + } + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~ new file mode 100755 index 0000000..71097f9 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~ @@ -0,0 +1,66 @@ +#include "stdio.h" +#include "stdlib.h" + +#include "_SuffixArraySearchApplicationBase.h" + +#include <vector> +#include <iostream> + +using namespace std; + +/** +* \ingroup search +* +* Locate an n-gram in the indexed corpus, return its locations as <sentId, offsetInSent> pairs +* SentID and offset are all 1-based +* +* Note: +* The offset of the n-gram in a sentence is represented as "char" in the returned structure S_SimplePhraseLocationElement +* To output it as a number, one needs to cast it to integer type for proper display +* +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + 
//----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nOutput all the locations of an n-gram in an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < list of n-grams\n\n",argv[0]); + + exit(-1); + } + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase saObj; + + //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false) + saObj.loadData_forSearch(argv[1], false, false); + + + cerr<<"Input N-grams:\n"; + char tmpString[10000]; + while(!cin.eof()){ + cin.getline(tmpString,10000,'\n'); + if(strlen(tmpString)>0){ + vector<S_SimplePhraseLocationElement> locations; + + locations = saObj.locateExactPhraseInCorpus(tmpString); + + if(locations.size()==0){ + cout<<"No occurrences found.\n"; + } + else{ + for(int i=0;i<locations.size(); i++){ + cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl; + } + } + cout<<endl; + } + } + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp new file mode 100755 index 0000000..e614fdc --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp @@ -0,0 +1,132 @@ +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include <vector> +#include <cstring> + +#include "_SuffixArraySearchApplicationBase.h" + + +#include <time.h> +#include <stdio.h> +#include <map> + +using namespace std; + +/** +* Given the indexed training corpus, analyze the token/type matching ratio of the n-grams in the testing data. 
+* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + fprintf(stderr,"\nOutput the n-gram matching statistics of a testing data given an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + map<int, pair<int, unsigned long> > results4OneSent; + map<int, pair<int, unsigned long> >::iterator iterResult; + + vector<int> nGramTokenCountsInTest; + vector<int> nGramInTestMatched; + vector<double> nGramFreqInTrainMatched; + + int maxSentLen = 4086; + nGramTokenCountsInTest.reserve(maxSentLen); + nGramInTestMatched.reserve(maxSentLen); + nGramFreqInTrainMatched.reserve(maxSentLen); + + //initialize + for(int i=0;i<maxSentLen;i++){ + nGramTokenCountsInTest.push_back(0); + nGramInTestMatched.push_back(0); + nGramFreqInTrainMatched.push_back(0); + } + + char fileName[1000]; + char tmpString[10000]; + + strcpy(fileName, argv[1]); + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(fileName, false, true); + + fprintf(stderr,"Input sentences:\n"); + + long ltime1, ltime2; + + time( <ime1 ); + + int totalSentences = 0; + int matchedSentences = 0; + while(!cin.eof()){ + int sentLen; + cin.getline(tmpString,10000,'\n'); + + if(strlen(tmpString)>0){ + + totalSentences++; + + results4OneSent.clear(); + results4OneSent = SA.returnNGramMatchingStatForOneSent(tmpString, sentLen); + + if(sentLen>maxSentLen){ + cerr<<"Sentence too long, we can not handle it! 
Exit.\n"; + exit(0); + } + + for(int j=1;j<=sentLen;j++){ //j-gram + nGramTokenCountsInTest[j]+=(sentLen-j+1); //number of j-grams in the sentence; + } + + iterResult=results4OneSent.begin(); + while(iterResult!=results4OneSent.end()){ + + nGramInTestMatched[iterResult->first]+=iterResult->second.first; + nGramFreqInTrainMatched[iterResult->first]+=iterResult->second.second; + + if(iterResult->first==sentLen){ //a complete match + matchedSentences++; + } + + iterResult++; + } + } + + tmpString[0]=0; + + } + + int n = 1; + while(nGramInTestMatched[n]!=0){ + int matched = nGramInTestMatched[n]; + int totalInTest = nGramTokenCountsInTest[n]; + cout<<"N="<<n<<":\t"<<matched<<" / "<<totalInTest<<"\t"; + printf("%.1f\t", double(matched)/double(totalInTest)*100.0); + cout<<"OccInTrain= "<<nGramFreqInTrainMatched[n]<<endl; + + n++; + } + + cout<<"\nOut of "<<totalSentences<<" input sentences, "<<matchedSentences<<" can be found in the training data.\n";; + time( <ime2 ); + cout<<"Time cost:"<<ltime2-ltime2<<" seconds\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~ new file mode 100755 index 0000000..d33d3a9 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~ @@ -0,0 +1,131 @@ +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include <vector> + +#include "_SuffixArraySearchApplicationBase.h" + + +#include <time.h> +#include <stdio.h> +#include <map> + +using namespace std; + +/** +* Given the indexed training corpus, analyze the token/type matching ratio of the n-grams in the testing data. 
+* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + fprintf(stderr,"\nOutput the n-gram matching statistics of a testing data given an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + map<int, pair<int, unsigned long> > results4OneSent; + map<int, pair<int, unsigned long> >::iterator iterResult; + + vector<int> nGramTokenCountsInTest; + vector<int> nGramInTestMatched; + vector<double> nGramFreqInTrainMatched; + + int maxSentLen = 4086; + nGramTokenCountsInTest.reserve(maxSentLen); + nGramInTestMatched.reserve(maxSentLen); + nGramFreqInTrainMatched.reserve(maxSentLen); + + //initialize + for(int i=0;i<maxSentLen;i++){ + nGramTokenCountsInTest.push_back(0); + nGramInTestMatched.push_back(0); + nGramFreqInTrainMatched.push_back(0); + } + + char fileName[1000]; + char tmpString[10000]; + + strcpy(fileName, argv[1]); + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(fileName, false, true); + + fprintf(stderr,"Input sentences:\n"); + + long ltime1, ltime2; + + time( <ime1 ); + + int totalSentences = 0; + int matchedSentences = 0; + while(!cin.eof()){ + int sentLen; + cin.getline(tmpString,10000,'\n'); + + if(strlen(tmpString)>0){ + + totalSentences++; + + results4OneSent.clear(); + results4OneSent = SA.returnNGramMatchingStatForOneSent(tmpString, sentLen); + + if(sentLen>maxSentLen){ + cerr<<"Sentence too long, we can not handle it! 
Exit.\n"; + exit(0); + } + + for(int j=1;j<=sentLen;j++){ //j-gram + nGramTokenCountsInTest[j]+=(sentLen-j+1); //number of j-grams in the sentence; + } + + iterResult=results4OneSent.begin(); + while(iterResult!=results4OneSent.end()){ + + nGramInTestMatched[iterResult->first]+=iterResult->second.first; + nGramFreqInTrainMatched[iterResult->first]+=iterResult->second.second; + + if(iterResult->first==sentLen){ //a complete match + matchedSentences++; + } + + iterResult++; + } + } + + tmpString[0]=0; + + } + + int n = 1; + while(nGramInTestMatched[n]!=0){ + int matched = nGramInTestMatched[n]; + int totalInTest = nGramTokenCountsInTest[n]; + cout<<"N="<<n<<":\t"<<matched<<" / "<<totalInTest<<"\t"; + printf("%.1f\t", double(matched)/double(totalInTest)*100.0); + cout<<"OccInTrain= "<<nGramFreqInTrainMatched[n]<<endl; + + n++; + } + + cout<<"\nOut of "<<totalSentences<<" input sentences, "<<matchedSentences<<" can be found in the training data.\n";; + time( <ime2 ); + cout<<"Time cost:"<<ltime2-ltime2<<" seconds\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp new file mode 100755 index 0000000..ca12119 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp @@ -0,0 +1,50 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <cstring> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + +/** +* Given a corpus indexed by its suffix array, input a sentence from STDIN and output the frequencies of its embedded n-grams in the corpus. 
+* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + char tmpString[1000]; + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(argv[1], false, true); + + fprintf(stderr,"Input Sentences:\n"); + + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + SA.displayNgramMatchingFreq4Sent(tmpString); + } + } + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~ new file mode 100755 index 0000000..5e2433b --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~ @@ -0,0 +1,49 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + +/** +* Given a corpus indexed by its suffix array, input a sentence from STDIN and output the frequencies of its embedded n-grams in the corpus. 
+* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + char tmpString[1000]; + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(argv[1], false, true); + + fprintf(stderr,"Input Sentences:\n"); + + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + SA.displayNgramMatchingFreq4Sent(tmpString); + } + } + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp new file mode 100755 index 0000000..544a230 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp @@ -0,0 +1,144 @@ +#include "stdio.h" +#include "stdlib.h" +#include "float.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <cstring> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + +///Given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n> +///startingPosInSrcSent starts at 0, n is the n-gram length +void local_oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n) +{ + n = index / sentLen + 1; + posInSrcSent = index % sentLen; +} + +///Given the starting position in src sentence and the length of the n-gram +///calculate the index in the table +///posInSent starts at 0, 
n is the actual len of n-gram, starts at 1 +unsigned int local_twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen) +{ + unsigned int indexInTable = (n-1)*sentLen + posInSent; + + return indexInTable; +} + +/** +* Given a corpus indexed by its suffix array +* calcuate the non-compositionalities of the embedded n-grams in a testing sentence +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + char tmpString[1000]; + double bigN = 1000000; + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(argv[1], false, true); + + fprintf(stderr,"Input Sentences:\n"); + + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + + SA.displayNgramMatchingFreq4Sent(tmpString); + + printf("\n"); + + int sentLen; + + S_sentSearchTableElement * matchingTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen); + + //convert this to frequency table + double * freqTable = (double *) malloc (sizeof(double)*sentLen*sentLen); + + for(unsigned int i=0;i<(sentLen*sentLen);i++){ + //all the short n-grams should all exist and their frequency information should be in table now + unsigned int startPos, n; + double minNc; + int leftNWithMinNc; + + local_oneDimensionTableIndexToTwoDimension(i, sentLen, startPos, n); + + if(matchingTable[i].found){ + double freq = matchingTable[i].endingPosInSA - matchingTable[i].startPosInSA +1; + freqTable[i]=freq; + + + + //consider all splitting method + minNc = DBL_MAX; + + for(unsigned int 
leftN=1;leftN<n;leftN++){ + int index_left = local_twoDimensionIndexToOneDimensionTableIndex(startPos, leftN, sentLen); + int index_right = local_twoDimensionIndexToOneDimensionTableIndex(startPos+leftN, n-leftN, sentLen); + + double leftFreq = freqTable[index_left]; + double rightFreq = freqTable[index_right]; + + double nc = freq*bigN/(leftFreq*rightFreq); + + if(nc<minNc){ + minNc = nc; + leftNWithMinNc = leftN; + } + + } + } + else{ + freqTable[i]=0; + minNc = 0; + } + + if(startPos==0){ + printf("\n%d\t",n); + } + + if(n==1){ + printf("A\t"); //atom word, no way to break it + } + else{ + if(minNc>0){ + printf("%.1f[%d]\t", minNc, leftNWithMinNc); + } + else{ + printf("_\t"); + } + } + } + + printf("\n"); + + + free(matchingTable); + free(freqTable); + + + } + } + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~ new file mode 100755 index 0000000..294724e --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~ @@ -0,0 +1,145 @@ +#include "stdio.h" +#include "stdlib.h" +#include "float.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <cstring> + + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + +///Given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n> +///startingPosInSrcSent starts at 0, n is the n-gram length +void local_oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n) +{ + n = index / sentLen + 1; + posInSrcSent = index % sentLen; +} + +///Given the starting position in src sentence and the length of the n-gram +///calculate the index in the table +///posInSent starts at 0, n is the actual len of n-gram, starts at 1 
+unsigned int local_twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen) +{ + unsigned int indexInTable = (n-1)*sentLen + posInSent; + + return indexInTable; +} + +/** +* Given a corpus indexed by its suffix array +* calcuate the non-compositionalities of the embedded n-grams in a testing sentence +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + char tmpString[1000]; + double bigN = 1000000; + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(argv[1], false, true); + + fprintf(stderr,"Input Sentences:\n"); + + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + + SA.displayNgramMatchingFreq4Sent(tmpString); + + printf("\n"); + + int sentLen; + + S_sentSearchTableElement * matchingTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen); + + //convert this to frequency table + double * freqTable = (double *) malloc (sizeof(double)*sentLen*sentLen); + + for(unsigned int i=0;i<(sentLen*sentLen);i++){ + //all the short n-grams should all exist and their frequency information should be in table now + unsigned int startPos, n; + double minNc; + int leftNWithMinNc; + + local_oneDimensionTableIndexToTwoDimension(i, sentLen, startPos, n); + + if(matchingTable[i].found){ + double freq = matchingTable[i].endingPosInSA - matchingTable[i].startPosInSA +1; + freqTable[i]=freq; + + + + //consider all splitting method + minNc = DBL_MAX; + + for(unsigned int leftN=1;leftN<n;leftN++){ + int index_left 
= local_twoDimensionIndexToOneDimensionTableIndex(startPos, leftN, sentLen); + int index_right = local_twoDimensionIndexToOneDimensionTableIndex(startPos+leftN, n-leftN, sentLen); + + double leftFreq = freqTable[index_left]; + double rightFreq = freqTable[index_right]; + + double nc = freq*bigN/(leftFreq*rightFreq); + + if(nc<minNc){ + minNc = nc; + leftNWithMinNc = leftN; + } + + } + } + else{ + freqTable[i]=0; + minNc = 0; + } + + if(startPos==0){ + printf("\n%d\t",n); + } + + if(n==1){ + printf("A\t"); //atom word, no way to break it + } + else{ + if(minNc>0){ + printf("%.1f[%d]\t", minNc, leftNWithMinNc); + } + else{ + printf("_\t"); + } + } + } + + printf("\n"); + + + free(matchingTable); + free(freqTable); + + + } + } + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp new file mode 100755 index 0000000..9697f4a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp @@ -0,0 +1,178 @@ +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include <vector> + +#include "_String.h" +#include "_SuffixArraySearchApplicationBase.h" + +#include <time.h> +#include <stdio.h> +#include <map> +#include <cstring> + +using namespace std; + + +vector<C_String> convertTextToStringVector(const char * sentText) +{ + + vector<C_String> sentAsStringVect; + + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + + int pos = 0; + + int inputLen = strlen(sentText); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = sentText[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = '\0'; + sentAsStringVect.push_back(C_String(tmpToken)); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = 
thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + sentAsStringVect.push_back(C_String(tmpToken)); + } + + return sentAsStringVect; +} + +/** +* \ingroup search +* +* Given the training corpus indexed by its suffix array, +* output all the n-grams in a testing data that can be found in the training corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + fprintf(stderr,"\nOutput the matched n-gram types a testing data set given an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + map<C_String, double> matchedNgrams; + map<C_String, double>::iterator iterMatchedNgrams; + + + int maxSentLen = 4086; + + + char fileName[1000]; + char tmpString[10000]; + + strcpy(fileName, argv[1]); + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(fileName, false, true); + + cerr<<"Input sentences:\n"; + + long ltime1, ltime2; + + time( <ime1 ); + + int totalSentences = 0; + int matchedSentences = 0; + while(!cin.eof()){ + cin.getline(tmpString,10000,'\n'); + + if(strlen(tmpString)>0){ + vector<C_String> sentAsStringVector = convertTextToStringVector(tmpString); + + int sentLen; + S_sentSearchTableElement * freqTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen); + + if(sentLen!=sentAsStringVector.size()){ + cerr<<"Something wrong, can not proceed.!\n"; + exit(-1); + } + + + //go over the frequency table + for(int startPos = 0; 
startPos<sentLen; startPos++){ + C_String ngram; + bool stillMatching = true; + int n=1; + while(stillMatching & (n<=(sentLen-startPos)) ){ + + ngram.appending(sentAsStringVector[startPos+n-1]); + + int posInFreqTable = (n-1)*sentLen+startPos; + if(freqTable[posInFreqTable].found){ + double frequency = freqTable[posInFreqTable].endingPosInSA - freqTable[posInFreqTable].startPosInSA + 1; + + iterMatchedNgrams = matchedNgrams.find(ngram); + if(iterMatchedNgrams!=matchedNgrams.end()){ //exist already + iterMatchedNgrams->second=frequency; //frequency is not meaningful in this case, just use it because map need some values to be mapped to + } + else{ + matchedNgrams.insert(make_pair(ngram, frequency)); + } + } + else{ + stillMatching = false; + } + + + ngram.appending(C_String(" ")); + + n++; + } + } + + } + + tmpString[0]=0; + + } + + + //now output all the n-grams + iterMatchedNgrams = matchedNgrams.begin(); + while(iterMatchedNgrams != matchedNgrams.end()){ + cout<<(iterMatchedNgrams->first).toString()<<endl; + + iterMatchedNgrams++; + } + + + time( <ime2 ); + cerr<<"Time spent:"<<ltime2-ltime2<<" seconds\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~ new file mode 100755 index 0000000..5418db6 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~ @@ -0,0 +1,177 @@ +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include <vector> + +#include "_String.h" +#include "_SuffixArraySearchApplicationBase.h" + +#include <time.h> +#include <stdio.h> +#include <map> + +using namespace std; + + +vector<C_String> convertTextToStringVector(const char * sentText) +{ + + vector<C_String> sentAsStringVect; + + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + + int 
pos = 0; + + int inputLen = strlen(sentText); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = sentText[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = '\0'; + sentAsStringVect.push_back(C_String(tmpToken)); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + sentAsStringVect.push_back(C_String(tmpToken)); + } + + return sentAsStringVect; +} + +/** +* \ingroup search +* +* Given the training corpus indexed by its suffix array, +* output all the n-grams in a testing data that can be found in the training corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + fprintf(stderr,"\nOutput the matched n-gram types a testing data set given an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + map<C_String, double> matchedNgrams; + map<C_String, double>::iterator iterMatchedNgrams; + + + int maxSentLen = 4086; + + + char fileName[1000]; + char tmpString[10000]; + + strcpy(fileName, argv[1]); + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(fileName, false, true); + + cerr<<"Input sentences:\n"; + + long ltime1, ltime2; + + time( <ime1 ); + + int totalSentences = 0; + int matchedSentences = 0; + while(!cin.eof()){ + cin.getline(tmpString,10000,'\n'); + + if(strlen(tmpString)>0){ + 
vector<C_String> sentAsStringVector = convertTextToStringVector(tmpString); + + int sentLen; + S_sentSearchTableElement * freqTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen); + + if(sentLen!=sentAsStringVector.size()){ + cerr<<"Something wrong, can not proceed.!\n"; + exit(-1); + } + + + //go over the frequency table + for(int startPos = 0; startPos<sentLen; startPos++){ + C_String ngram; + bool stillMatching = true; + int n=1; + while(stillMatching & (n<=(sentLen-startPos)) ){ + + ngram.appending(sentAsStringVector[startPos+n-1]); + + int posInFreqTable = (n-1)*sentLen+startPos; + if(freqTable[posInFreqTable].found){ + double frequency = freqTable[posInFreqTable].endingPosInSA - freqTable[posInFreqTable].startPosInSA + 1; + + iterMatchedNgrams = matchedNgrams.find(ngram); + if(iterMatchedNgrams!=matchedNgrams.end()){ //exist already + iterMatchedNgrams->second=frequency; //frequency is not meaningful in this case, just use it because map need some values to be mapped to + } + else{ + matchedNgrams.insert(make_pair(ngram, frequency)); + } + } + else{ + stillMatching = false; + } + + + ngram.appending(C_String(" ")); + + n++; + } + } + + } + + tmpString[0]=0; + + } + + + //now output all the n-grams + iterMatchedNgrams = matchedNgrams.begin(); + while(iterMatchedNgrams != matchedNgrams.end()){ + cout<<(iterMatchedNgrams->first).toString()<<endl; + + iterMatchedNgrams++; + } + + + time( <ime2 ); + cerr<<"Time spent:"<<ltime2-ltime2<<" seconds\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp new file mode 100755 index 0000000..ebb2ed5 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp @@ -0,0 +1,754 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include 
"_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <stdlib.h> +#include <cstring> + +using namespace std; + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase() +{ + + this->reportMaxOccurrenceOfOneNgram = -1; + this->highestFreqThresholdForReport = -1; + this->shortestUnitToReport = 1; + this->longestUnitToReport = -1; //no constraint + + this->level1Buckets = NULL; + this->noLevel1Bucket = false; //by default, build level1 bucket + + this->noOffset = false; //by default, load offset +} + +C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase() +{ + +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter highestFreqThresholdForReport is set so that very high frequent n-grams such as unigram "the" is skipped +* high frequent n-grams occur too often in the corpus and their statistics can often be estimated offline. 
+* Default value = -1 (no effective threshold) +**/ +void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport) +{ + this->highestFreqThresholdForReport = highestFreqThresholdForReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter shortestUnitToReport is set so that short n-grams can be skipped to speed up the process +* Default value = 1 (no effective constraint) +**/ +void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport) +{ + this->shortestUnitToReport = shortestUnitToReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter longestUnitToReport is set to skip long n-gram matches +* +* Default value = -1 (no effective limit, output all the matched n-grams no matter how long they are) +**/ +void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport) +{ + this->longestUnitToReport = longestUnitToReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter reportMaxOccurrenceOfOneNgram is set to output information of only the "first" few occurrences of the matched n-gram +* Since the order is based on the order of the corresponding suffices in the corpus, +* the output occurrences are usually not the first few occurrences of the n-gram in the corpus +**/ +void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram) +{ + this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram; +} + + + +/** +* Load the indexed corpus, suffix array, offset and vocabulary into memory +* Note: if C_SuffixArraySearchApplicationBase will be used in the application to return the 
sentenceId/offset in sentence for the matched n-gram +* then noOffset needs to be set to be false (to load the offset) +**/ +void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset) +{ + + this->loadData(filename, noVoc, noOffset, false); //call the constructor of the super class, load data and build level1Bucket + + if(! this->noOffset){ + TextLenType lastSentId; + unsigned char tmpOffset; + this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset); + this->totalSentNum = lastSentId; + } + else{ + //we do not have offset information, simply travel to the sentence head + TextLenType pos = this->corpusSize-3; + while(this->corpus_list[pos]<this->sentIdStart){ //still actual words + pos--; + } + //at this position, it should be the <sentId> for the last sentence + this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1; + } + cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n"; + +} + + +///return 0 if w = text +///return 1 if w < text +///return 2 if w > text +///given that the prefix of lcp words are the same +char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText) +{ + + IndexType vocInText = this->corpus_list[posInText+lcp]; + + if(vocInWord == vocInText){ + return 0; + } + + if(vocInWord < vocInText){ + return 1; + } + + return 2; +} + +/** Utility function +* Convert an input sentence as char string into a vector of C_String objects +**/ +vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText) +{ + vector<C_String> sentAsStringVector; + + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + + int pos = 0; + + int inputLen = strlen(sentText); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = sentText[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = 
'\0'; + sentAsStringVector.push_back(C_String(tmpToken)); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + sentAsStringVector.push_back(C_String(tmpToken)); + } + + return sentAsStringVector; + +} + +/** +* Utility function: convert a sentence as a vector of C_String to a vector of vocIDs +**/ +vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector) +{ + if(this->noVocabulary){ + cerr<<"Vocabulary not available!\n"; + exit(-1); + } + + vector<IndexType> sentAsVocIdVector; + + for(int i=0;i<sentAsStringVector.size();i++){ + sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i])); + } + return sentAsVocIdVector; +} + + +/** +* Utility function: +* Convert a sentence as character string to a vector of vocIDs +**/ +vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText) +{ + vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText); + return this->convertCStringVectorToVocIdVector(sentAsCStringVector); +} + + +/** +* If know the range where the phrase is, search in this range for it +* position here are all positions in SA, not the positions in the textstring +* +* LCP indicates that all the suffixes in the range has the same prefix with LCP length with the proposed n-gram phrase +* only need to compare the "nextWord" at LCP+1 position +* +* return true if such phrase can be found inside the range, false if not +**/ +bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos) +{ + TextLenType leftPos, rightPos, middlePos; 
+ + //in case the phrase to be searched is beyond the bucket although the first LCP word is the same as this bucket + //e.g. range correspondes to [ab, ad], but we are searching for (aa) + //so first step is to make sure the lcp+next word is still in this range + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){ + //phrase+next word < text corresponding rangeStart, we could not find it inside this range + return false; + } + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){ + //phrase+next word > text corresponding to rangeEnd + return false; + } + + //now we are sure that text(SA[rangeStart]) <= phrase <= text(SA[rangeEnd]) + + + //search for left bound ( the pos in text which is the min(text>=w)) + //at any time, Left<w<=Right (actually Left<=w<=Right) + leftPos = rangeStartPos; + rightPos = rangeEndPos; + while( rightPos > (leftPos+1)){ //at the time when right = left +1, we should stop + + middlePos = (TextLenType)((leftPos + rightPos) / 2); + if(((leftPos + rightPos) % 2) != 0){ + middlePos++; //bias towards right + } + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){ + // phrase <= middlePos in Text, go left + rightPos = middlePos; + } + else{ + leftPos = middlePos; //word > middle, go right + } + + } + //in previous implementation, we can gurantee that Left<w, because we take rangeStartPos-- from original range + //here we can only guarantee that Left<=w, so need to check if Left==w at lcp + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){ + resultStartPos = leftPos; + } + else{ + resultStartPos = rightPos; + } + + //search for right bound ( the value which is the max(text<=w)) + //at any time, Left<w<=Right (actually Left<=w<=Right) + leftPos = rangeStartPos; + rightPos = rangeEndPos; + while( rightPos > (leftPos+1)){ //stop when right = left + 1 + middlePos = (TextLenType) ((leftPos + rightPos) 
/ 2 ); //bias towards left + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){ // phrase >= middlePos in Text, go right + leftPos = middlePos; + } + else{ + rightPos = middlePos; // ==1, phrase < middlePos + } + } + //in previous implementation, we can gurantee that w<Right, because we take rangeEndPos++ from original range + //here we can only guarantee that w<=Right, so need to check if Right==w at lcp + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){ + resultEndPos = rightPos; + } + else{ + resultEndPos = leftPos; + } + + if(resultEndPos>=resultStartPos){ + return true; + } + + return false; //could not find this phrase +} + +///memory allocated here, remember to free the memory when the table is not needed any more in the +///calling function +S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(sentText); + sentLen = sentInVocId.size(); + + return this->constructNgramSearchTable4SentWithLCP(sentInVocId); +} + + +///constructing the n-gram search table +///memory allocated here, remember to free the memory when the table is not needed any more in the +///calling function +/// +///faster than constructNgramSearchTable4Sent because the suffixes in the range given by n-1 gram can +///guaranteed to have the first n-1 words to be the same as the n-1 gram +///only needs to compare the following one word +/// +/// for a sentence as:w1, w2,.... 
+/// cell [i,j] in the table is for n-gram from w_(j-1)...w_(j+i-1), that is a +/// (i+1)-gram starting at position j+1 in sentence +S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement)); + + //for consistency, initialize all cells + for(int c=0;c<(sentLen*sentLen);c++){ + table[c].found = false; + table[c].startPosInSA = 0; + table[c].endingPosInSA = 0; + } + + TextLenType startPos, endPos; + + //initialize word level elements + for(int i=0;i<sentLen;i++){ + IndexType vocId = sentInVocId[i]; + //cout<<vocId<<" "; + if((vocId==0)||(vocId>=this->sentIdStart)){ //vocId ==0 means this word is OOV <unk>, if vocId>=sentIdStart means for this corpus, we don't know this word + table[i].found = false; + } + else{ + table[i].startPosInSA = this->level1Buckets[vocId].first; + table[i].endingPosInSA = this->level1Buckets[vocId].last; + + if(table[i].startPosInSA<=table[i].endingPosInSA){ + table[i].found = true; + } + else{ //because vocabulary is built on top of an existing voc, this corpus may not have all the occurrences of all the words in the voc + table[i].found = false; + } + } + } + + + //filling in the cells in the table row by row + //basically this means we start by looking for smaller units first + //if they are found, search for longer n-grams + for(int n=1;n<sentLen;n++){ //finding n+1 gram. 
when n=sentLen-1, we are search for the occurrence of the whole sent + int levelN_1_0 = (n - 1) * sentLen; //map from two dimensional position to one-dimension + int levelN_0 = n * sentLen; + for(int j=0;j<= (sentLen - 1 - n); j++){ //possible starting point for n+1 gram + //necessary conditions that this n+1 gram exist are: + //the two sub n-gram all exist in the corpus + if( table[levelN_1_0 + j].found && table[levelN_1_0 + j +1].found){ + IndexType nextWord = sentInVocId[j+n]; //the last word of the n+1 gram + + //n+1 gram has to be in the range of the n-gram in SA + startPos = table[levelN_1_0 + j].startPosInSA; + endPos = table[levelN_1_0 + j].endingPosInSA; + + TextLenType foundPosStart = 0; + TextLenType foundPosEnd = 0; + + //the prefix of n words of all suffixes between [startPos, endPos] is the same as the + //prefix of the n words in the proposed n+1 gram, no need to compare + //only need to compare the n+1 word, which is "nextWord" here + if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){ + table[levelN_0 + j].found = true; + table[levelN_0 + j].startPosInSA = foundPosStart; + table[levelN_0 + j].endingPosInSA = foundPosEnd; + } + else{ + table[levelN_0 + j].found = false; + } + + } + else{ + table[levelN_0 + j].found = false; + } + } + } + return table; +} + +void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(sent); + this->displayNgramMatchingFreq4Sent(sentInVocId); +} + +void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + + int i,j; + + //construct the n-gram search table + S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId); + + //show sentence + cout<<"\t"; + for(i=0;i<sentLen;i++){ + cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t"; + } + cout<<endl; + + //show 
frequency of each n-gram + i=0; + bool stillMatch = true; + while(stillMatch &&( i<sentLen)){ + cout<<i+1<<"\t"; + int startForRow = i*sentLen; + bool anyGood = false; + for(j=0;j<= (sentLen - 1 - i); j++){ + if(table[startForRow+j].found){ + //this is for regular case + if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){ //more than one occurrence + cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1; + anyGood = true; + } + else{ + cout<<"0"; + } + + } + else{ + cout<<"0"; + } + cout<<"\t"; + } + + stillMatch = anyGood; + cout<<endl; + i++; + } + + free(table); +} + +///given the pos of a word in corpus, return its offset in the sentence +///and the sentence ID +///offset has to be loaded +///we do not check it here for efficicency purposes +void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset) +{ + offset = this->offset_list[pos]; + sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1; + + offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus +} + +void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen) +{ + offset = this->offset_list[pos]; + sentLen = this->offset_list[pos-offset]; + sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1; + + offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus +} + +vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs) +{ + if(srcSentAsVocIDs.size()>255){ + cerr<<"Sorry, I prefer to handle sentences with less than 255 words. 
Please cut the sentence short and try it again.\n"; + exit(0); + } + + unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs); + + //Now, we know all the n-grams we are looking for + //output the results + vector<S_phraseLocationElement> allFoundNgrams; + S_phraseLocationElement tmpNode; + + int longestUnitToReportForThisSent = sentLen; + if(this->longestUnitToReport!=-1){ + //and if longestUnitToReport is shorter than sentLen + if(this->longestUnitToReport<sentLen){ + longestUnitToReportForThisSent = this->longestUnitToReport; + } + } + + for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){ + int firstPosInRow = r*sentLen; + for(unsigned char c=0; c<= (sentLen - 1 - r); c++){ + if(table[firstPosInRow + c].found){ //at this position the ngram was found + tmpNode.posStartInSrcSent = c + 1; //position starts from 1 + tmpNode.posEndInSrcSent = r + c + 1; + + //now for all ocurrences, find their sentId and realative positions + TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA; + TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA; + + if( (this->highestFreqThresholdForReport <= 0) || //no limit + ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport )) + ){ + // we don't want to retrieve high-freq n-gram which is very time consuming + //and meaningless for translation, such as 1M occurrences of "of the" in the corpus + + + if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){ + //and for each n-gram, report only a limited amount of occurrences + endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1; + } + + TextLenType sentId; + unsigned char posInSent; + for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; 
iterator++ ){ + this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent); + tmpNode.sentIdInCorpus = sentId; + tmpNode.posInSentInCorpus = posInSent; + + allFoundNgrams.push_back(tmpNode); + } + } + } + + } + } + + free(table); + + return allFoundNgrams; +} + +vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent) +{ + //use the vocabulary associated with this corpus to convert words to vocIDs + vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent); + + return this->findPhrasesInASentence(srcSentAsVocIDs); +} + + +bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd) +{ + int phraseLen = phrase.size(); + + //first check if there are any <unk> in the phrase + for(int i=0;i<phrase.size();i++){ + if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){ + return false; //return empty matching result + } + } + + TextLenType currentRangeStart, currentRangeEnd; + TextLenType narrowedRangeStart, narrowedRangeEnd; + IndexType vocId; + + //for word 1 + vocId = phrase[0]; + currentRangeStart = this->level1Buckets[vocId].first; + currentRangeEnd = this->level1Buckets[vocId].last; + + if(currentRangeStart>currentRangeEnd){ + return false; //even this 1-gram does not exist + } + + int posInPhrase = 1; + while( posInPhrase<phraseLen ){ + vocId = phrase[posInPhrase]; + bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd); + + if(! 
stillExist){ + return false; + } + + currentRangeStart = narrowedRangeStart; + currentRangeEnd = narrowedRangeEnd; + + posInPhrase++; + } + + //we find the range of matching phrase, now get the sentId + rangeStart = currentRangeStart; + rangeEnd = currentRangeEnd; + + return true; +} + +///similar to construct the freq table +///but only search for the exact phrase matching +///Important: because locateSentIdFromPos is called which requires the offset information +///Suffix array has to be initialized with offset loaded +///i.e. initilized with loadData_forSearch(corpusName, bool noVoc, noOffset=fase) +///otherwise the program will have segmentation fault +///SALM does not check if offset has been loaded already for efficiency reasons because locateSendIdFromPos() is called frequently +vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase) +{ + vector<S_SimplePhraseLocationElement> matchingResult; + + TextLenType rangeStart, rangeEnd; + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + //we find some match + S_SimplePhraseLocationElement tmpNode; + for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){ + this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus); + matchingResult.push_back(tmpNode); + } + } + + return matchingResult; +} + +vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->locateExactPhraseInCorpus(phraseAsVocIDs); +} + + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase) +{ + TextLenType rangeStart, rangeEnd; + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + return 
rangeEnd - rangeStart + 1; + } + + return 0; +} + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->freqOfExactPhraseMatch(phraseAsVocIDs); +} + + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen) +{ + TextLenType rangeStart, rangeEnd; + + sentLen = phrase.size(); + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + startPosInSA = rangeStart; + return rangeEnd - rangeStart + 1; + } + + return 0; +} + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen); +} + + +TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber() +{ + return this->totalSentNum; +} + +///given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n> +///startingPosInSrcSent starts at 0, n is the n-gram length +void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n) +{ + n = index / sentLen + 1; + posInSrcSent = index % sentLen; +} + +///given the starting position in src sentence and the length of the n-gram +///calculate the index in the table +///posInSent starts at 0, n is the actual len of n-gram, starts at 1 +unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned 
int n, unsigned int sentLen) +{ + unsigned int indexInTable = (n-1)*sentLen + posInSent; + + return indexInTable; +} + +///simple return how many n-grams are matched +unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent); + return this->numberOfMatcedNgram(sentInVocId); +} + +///simply return how many n-grams are matched +unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId); + + unsigned int totalMatched = 0; + + for(unsigned int i=0;i<(sentLen*sentLen);i++){ + if(table[i].found){ + totalMatched++; + } + } + + free(table); + return totalMatched; +} + + +map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent); + return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen); +} + +map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen) +{ + sentLen = sentInVocId.size(); + map<int, pair<int, unsigned long> > nGramMatched; + map<int, pair<int, unsigned long> >::iterator iterNGramMatched; + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId); + + for(int n = 1; n <= sentLen; n++){ + for(int startPos=0; startPos <= (sentLen - n); startPos++){ + int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen); + + if(table[indexInTable].found){ + + unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1; + iterNGramMatched = nGramMatched.find(n); + 
if(iterNGramMatched==nGramMatched.end()){//has not seen this before + nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) )); + } + else{ + iterNGramMatched->second.first++; + iterNGramMatched->second.second+=freqInTraining; + } + } + } + } + + free(table); + + return nGramMatched; +} + diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~ new file mode 100755 index 0000000..94d272c --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~ @@ -0,0 +1,753 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <stdlib.h> + +using namespace std; + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase() +{ + + this->reportMaxOccurrenceOfOneNgram = -1; + this->highestFreqThresholdForReport = -1; + this->shortestUnitToReport = 1; + this->longestUnitToReport = -1; //no constraint + + this->level1Buckets = NULL; + this->noLevel1Bucket = false; //by default, build level1 bucket + + this->noOffset = false; //by default, load offset +} + +C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase() +{ + +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter highestFreqThresholdForReport is set so that very high frequent n-grams such as unigram "the" is skipped +* high frequent n-grams occur too often in the corpus and their statistics can often be estimated offline. 
+* Default value = -1 (no effective threshold) +**/ +void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport) +{ + this->highestFreqThresholdForReport = highestFreqThresholdForReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter shortestUnitToReport is set so that short n-grams can be skipped to speed up the process +* Default value = 1 (no effective constraint) +**/ +void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport) +{ + this->shortestUnitToReport = shortestUnitToReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter longestUnitToReport is set to skip long n-gram matches +* +* Default value = -1 (no effective limit, output all the matched n-grams no matter how long they are) +**/ +void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport) +{ + this->longestUnitToReport = longestUnitToReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter reportMaxOccurrenceOfOneNgram is set to output information of only the "first" few occurrences of the matched n-gram +* Since the order is based on the order of the corresponding suffices in the corpus, +* the output occurrences are usually not the first few occurrences of the n-gram in the corpus +**/ +void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram) +{ + this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram; +} + + + +/** +* Load the indexed corpus, suffix array, offset and vocabulary into memory +* Note: if C_SuffixArraySearchApplicationBase will be used in the application to return the 
sentenceId/offset in sentence for the matched n-gram +* then noOffset needs to be set to be false (to load the offset) +**/ +void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset) +{ + + this->loadData(filename, noVoc, noOffset, false); //call the constructor of the super class, load data and build level1Bucket + + if(! this->noOffset){ + TextLenType lastSentId; + unsigned char tmpOffset; + this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset); + this->totalSentNum = lastSentId; + } + else{ + //we do not have offset information, simply travel to the sentence head + TextLenType pos = this->corpusSize-3; + while(this->corpus_list[pos]<this->sentIdStart){ //still actual words + pos--; + } + //at this position, it should be the <sentId> for the last sentence + this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1; + } + cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n"; + +} + + +///return 0 if w = text +///return 1 if w < text +///return 2 if w > text +///given that the prefix of lcp words are the same +char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText) +{ + + IndexType vocInText = this->corpus_list[posInText+lcp]; + + if(vocInWord == vocInText){ + return 0; + } + + if(vocInWord < vocInText){ + return 1; + } + + return 2; +} + +/** Utility function +* Convert an input sentence as char string into a vector of C_String objects +**/ +vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText) +{ + vector<C_String> sentAsStringVector; + + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + + int pos = 0; + + int inputLen = strlen(sentText); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = sentText[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = 
'\0'; + sentAsStringVector.push_back(C_String(tmpToken)); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + sentAsStringVector.push_back(C_String(tmpToken)); + } + + return sentAsStringVector; + +} + +/** +* Utility function: convert a sentence as a vector of C_String to a vector of vocIDs +**/ +vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector) +{ + if(this->noVocabulary){ + cerr<<"Vocabulary not available!\n"; + exit(-1); + } + + vector<IndexType> sentAsVocIdVector; + + for(int i=0;i<sentAsStringVector.size();i++){ + sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i])); + } + return sentAsVocIdVector; +} + + +/** +* Utility function: +* Convert a sentence as character string to a vector of vocIDs +**/ +vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText) +{ + vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText); + return this->convertCStringVectorToVocIdVector(sentAsCStringVector); +} + + +/** +* If know the range where the phrase is, search in this range for it +* position here are all positions in SA, not the positions in the textstring +* +* LCP indicates that all the suffixes in the range has the same prefix with LCP length with the proposed n-gram phrase +* only need to compare the "nextWord" at LCP+1 position +* +* return true if such phrase can be found inside the range, false if not +**/ +bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos) +{ + TextLenType leftPos, rightPos, middlePos; 
+ + //in case the phrase to be searched is beyond the bucket although the first LCP word is the same as this bucket + //e.g. range correspondes to [ab, ad], but we are searching for (aa) + //so first step is to make sure the lcp+next word is still in this range + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){ + //phrase+next word < text corresponding rangeStart, we could not find it inside this range + return false; + } + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){ + //phrase+next word > text corresponding to rangeEnd + return false; + } + + //now we are sure that text(SA[rangeStart]) <= phrase <= text(SA[rangeEnd]) + + + //search for left bound ( the pos in text which is the min(text>=w)) + //at any time, Left<w<=Right (actually Left<=w<=Right) + leftPos = rangeStartPos; + rightPos = rangeEndPos; + while( rightPos > (leftPos+1)){ //at the time when right = left +1, we should stop + + middlePos = (TextLenType)((leftPos + rightPos) / 2); + if(((leftPos + rightPos) % 2) != 0){ + middlePos++; //bias towards right + } + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){ + // phrase <= middlePos in Text, go left + rightPos = middlePos; + } + else{ + leftPos = middlePos; //word > middle, go right + } + + } + //in previous implementation, we can gurantee that Left<w, because we take rangeStartPos-- from original range + //here we can only guarantee that Left<=w, so need to check if Left==w at lcp + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){ + resultStartPos = leftPos; + } + else{ + resultStartPos = rightPos; + } + + //search for right bound ( the value which is the max(text<=w)) + //at any time, Left<w<=Right (actually Left<=w<=Right) + leftPos = rangeStartPos; + rightPos = rangeEndPos; + while( rightPos > (leftPos+1)){ //stop when right = left + 1 + middlePos = (TextLenType) ((leftPos + rightPos) 
/ 2 ); //bias towards left + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){ // phrase >= middlePos in Text, go right + leftPos = middlePos; + } + else{ + rightPos = middlePos; // ==1, phrase < middlePos + } + } + //in previous implementation, we can gurantee that w<Right, because we take rangeEndPos++ from original range + //here we can only guarantee that w<=Right, so need to check if Right==w at lcp + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){ + resultEndPos = rightPos; + } + else{ + resultEndPos = leftPos; + } + + if(resultEndPos>=resultStartPos){ + return true; + } + + return false; //could not find this phrase +} + +///memory allocated here, remember to free the memory when the table is not needed any more in the +///calling function +S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(sentText); + sentLen = sentInVocId.size(); + + return this->constructNgramSearchTable4SentWithLCP(sentInVocId); +} + + +///constructing the n-gram search table +///memory allocated here, remember to free the memory when the table is not needed any more in the +///calling function +/// +///faster than constructNgramSearchTable4Sent because the suffixes in the range given by n-1 gram can +///guaranteed to have the first n-1 words to be the same as the n-1 gram +///only needs to compare the following one word +/// +/// for a sentence as:w1, w2,.... 
+/// cell [i,j] in the table is for n-gram from w_(j-1)...w_(j+i-1), that is a +/// (i+1)-gram starting at position j+1 in sentence +S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement)); + + //for consistency, initialize all cells + for(int c=0;c<(sentLen*sentLen);c++){ + table[c].found = false; + table[c].startPosInSA = 0; + table[c].endingPosInSA = 0; + } + + TextLenType startPos, endPos; + + //initialize word level elements + for(int i=0;i<sentLen;i++){ + IndexType vocId = sentInVocId[i]; + //cout<<vocId<<" "; + if((vocId==0)||(vocId>=this->sentIdStart)){ //vocId ==0 means this word is OOV <unk>, if vocId>=sentIdStart means for this corpus, we don't know this word + table[i].found = false; + } + else{ + table[i].startPosInSA = this->level1Buckets[vocId].first; + table[i].endingPosInSA = this->level1Buckets[vocId].last; + + if(table[i].startPosInSA<=table[i].endingPosInSA){ + table[i].found = true; + } + else{ //because vocabulary is built on top of an existing voc, this corpus may not have all the occurrences of all the words in the voc + table[i].found = false; + } + } + } + + + //filling in the cells in the table row by row + //basically this means we start by looking for smaller units first + //if they are found, search for longer n-grams + for(int n=1;n<sentLen;n++){ //finding n+1 gram. 
when n=sentLen-1, we are search for the occurrence of the whole sent + int levelN_1_0 = (n - 1) * sentLen; //map from two dimensional position to one-dimension + int levelN_0 = n * sentLen; + for(int j=0;j<= (sentLen - 1 - n); j++){ //possible starting point for n+1 gram + //necessary conditions that this n+1 gram exist are: + //the two sub n-gram all exist in the corpus + if( table[levelN_1_0 + j].found && table[levelN_1_0 + j +1].found){ + IndexType nextWord = sentInVocId[j+n]; //the last word of the n+1 gram + + //n+1 gram has to be in the range of the n-gram in SA + startPos = table[levelN_1_0 + j].startPosInSA; + endPos = table[levelN_1_0 + j].endingPosInSA; + + TextLenType foundPosStart = 0; + TextLenType foundPosEnd = 0; + + //the prefix of n words of all suffixes between [startPos, endPos] is the same as the + //prefix of the n words in the proposed n+1 gram, no need to compare + //only need to compare the n+1 word, which is "nextWord" here + if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){ + table[levelN_0 + j].found = true; + table[levelN_0 + j].startPosInSA = foundPosStart; + table[levelN_0 + j].endingPosInSA = foundPosEnd; + } + else{ + table[levelN_0 + j].found = false; + } + + } + else{ + table[levelN_0 + j].found = false; + } + } + } + return table; +} + +void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(sent); + this->displayNgramMatchingFreq4Sent(sentInVocId); +} + +void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + + int i,j; + + //construct the n-gram search table + S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId); + + //show sentence + cout<<"\t"; + for(i=0;i<sentLen;i++){ + cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t"; + } + cout<<endl; + + //show 
frequency of each n-gram + i=0; + bool stillMatch = true; + while(stillMatch &&( i<sentLen)){ + cout<<i+1<<"\t"; + int startForRow = i*sentLen; + bool anyGood = false; + for(j=0;j<= (sentLen - 1 - i); j++){ + if(table[startForRow+j].found){ + //this is for regular case + if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){ //more than one occurrence + cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1; + anyGood = true; + } + else{ + cout<<"0"; + } + + } + else{ + cout<<"0"; + } + cout<<"\t"; + } + + stillMatch = anyGood; + cout<<endl; + i++; + } + + free(table); +} + +///given the pos of a word in corpus, return its offset in the sentence +///and the sentence ID +///offset has to be loaded +///we do not check it here for efficicency purposes +void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset) +{ + offset = this->offset_list[pos]; + sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1; + + offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus +} + +void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen) +{ + offset = this->offset_list[pos]; + sentLen = this->offset_list[pos-offset]; + sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1; + + offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus +} + +vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs) +{ + if(srcSentAsVocIDs.size()>255){ + cerr<<"Sorry, I prefer to handle sentences with less than 255 words. 
Please cut the sentence short and try it again.\n"; + exit(0); + } + + unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs); + + //Now, we know all the n-grams we are looking for + //output the results + vector<S_phraseLocationElement> allFoundNgrams; + S_phraseLocationElement tmpNode; + + int longestUnitToReportForThisSent = sentLen; + if(this->longestUnitToReport!=-1){ + //and if longestUnitToReport is shorter than sentLen + if(this->longestUnitToReport<sentLen){ + longestUnitToReportForThisSent = this->longestUnitToReport; + } + } + + for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){ + int firstPosInRow = r*sentLen; + for(unsigned char c=0; c<= (sentLen - 1 - r); c++){ + if(table[firstPosInRow + c].found){ //at this position the ngram was found + tmpNode.posStartInSrcSent = c + 1; //position starts from 1 + tmpNode.posEndInSrcSent = r + c + 1; + + //now for all ocurrences, find their sentId and realative positions + TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA; + TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA; + + if( (this->highestFreqThresholdForReport <= 0) || //no limit + ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport )) + ){ + // we don't want to retrieve high-freq n-gram which is very time consuming + //and meaningless for translation, such as 1M occurrences of "of the" in the corpus + + + if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){ + //and for each n-gram, report only a limited amount of occurrences + endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1; + } + + TextLenType sentId; + unsigned char posInSent; + for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; 
iterator++ ){ + this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent); + tmpNode.sentIdInCorpus = sentId; + tmpNode.posInSentInCorpus = posInSent; + + allFoundNgrams.push_back(tmpNode); + } + } + } + + } + } + + free(table); + + return allFoundNgrams; +} + +vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent) +{ + //use the vocabulary associated with this corpus to convert words to vocIDs + vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent); + + return this->findPhrasesInASentence(srcSentAsVocIDs); +} + + +bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd) +{ + int phraseLen = phrase.size(); + + //first check if there are any <unk> in the phrase + for(int i=0;i<phrase.size();i++){ + if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){ + return false; //return empty matching result + } + } + + TextLenType currentRangeStart, currentRangeEnd; + TextLenType narrowedRangeStart, narrowedRangeEnd; + IndexType vocId; + + //for word 1 + vocId = phrase[0]; + currentRangeStart = this->level1Buckets[vocId].first; + currentRangeEnd = this->level1Buckets[vocId].last; + + if(currentRangeStart>currentRangeEnd){ + return false; //even this 1-gram does not exist + } + + int posInPhrase = 1; + while( posInPhrase<phraseLen ){ + vocId = phrase[posInPhrase]; + bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd); + + if(! 
stillExist){ + return false; + } + + currentRangeStart = narrowedRangeStart; + currentRangeEnd = narrowedRangeEnd; + + posInPhrase++; + } + + //we find the range of matching phrase, now get the sentId + rangeStart = currentRangeStart; + rangeEnd = currentRangeEnd; + + return true; +} + +///similar to construct the freq table +///but only search for the exact phrase matching +///Important: because locateSentIdFromPos is called which requires the offset information +///Suffix array has to be initialized with offset loaded +///i.e. initilized with loadData_forSearch(corpusName, bool noVoc, noOffset=fase) +///otherwise the program will have segmentation fault +///SALM does not check if offset has been loaded already for efficiency reasons because locateSendIdFromPos() is called frequently +vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase) +{ + vector<S_SimplePhraseLocationElement> matchingResult; + + TextLenType rangeStart, rangeEnd; + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + //we find some match + S_SimplePhraseLocationElement tmpNode; + for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){ + this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus); + matchingResult.push_back(tmpNode); + } + } + + return matchingResult; +} + +vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->locateExactPhraseInCorpus(phraseAsVocIDs); +} + + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase) +{ + TextLenType rangeStart, rangeEnd; + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + return 
rangeEnd - rangeStart + 1; + } + + return 0; +} + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->freqOfExactPhraseMatch(phraseAsVocIDs); +} + + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen) +{ + TextLenType rangeStart, rangeEnd; + + sentLen = phrase.size(); + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + startPosInSA = rangeStart; + return rangeEnd - rangeStart + 1; + } + + return 0; +} + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen); +} + + +TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber() +{ + return this->totalSentNum; +} + +///given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n> +///startingPosInSrcSent starts at 0, n is the n-gram length +void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n) +{ + n = index / sentLen + 1; + posInSrcSent = index % sentLen; +} + +///given the starting position in src sentence and the length of the n-gram +///calculate the index in the table +///posInSent starts at 0, n is the actual len of n-gram, starts at 1 +unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned 
int n, unsigned int sentLen) +{ + unsigned int indexInTable = (n-1)*sentLen + posInSent; + + return indexInTable; +} + +///simple return how many n-grams are matched +unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent); + return this->numberOfMatcedNgram(sentInVocId); +} + +///simply return how many n-grams are matched +unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId); + + unsigned int totalMatched = 0; + + for(unsigned int i=0;i<(sentLen*sentLen);i++){ + if(table[i].found){ + totalMatched++; + } + } + + free(table); + return totalMatched; +} + + +map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent); + return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen); +} + +map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen) +{ + sentLen = sentInVocId.size(); + map<int, pair<int, unsigned long> > nGramMatched; + map<int, pair<int, unsigned long> >::iterator iterNGramMatched; + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId); + + for(int n = 1; n <= sentLen; n++){ + for(int startPos=0; startPos <= (sentLen - n); startPos++){ + int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen); + + if(table[indexInTable].found){ + + unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1; + iterNGramMatched = nGramMatched.find(n); + 
if(iterNGramMatched==nGramMatched.end()){//has not seen this before + nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) )); + } + else{ + iterNGramMatched->second.first++; + iterNGramMatched->second.second+=freqInTraining; + } + } + } + } + + free(table); + + return nGramMatched; +} + diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h new file mode 100755 index 0000000..2c0070d --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h @@ -0,0 +1,127 @@ +#if !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_) +#define __SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_ + +#include "_SuffixArrayApplicationBase.h" +/** +* \ingroup search +* Used by locateExactPhraseInCorpus() to return the location of an matched n-gram in the corpus +* as a pair of <sentenceId, offset pos in sentence> +**/ +typedef struct simplePhraseLocationElement +{ + TextLenType sentIdInCorpus; + unsigned char posInSentInCorpus; +}S_SimplePhraseLocationElement; + +/** +* \ingroup search +* Used by findPhraseInASentence() to return the location of an embedded n-gram in the corpus +* <posStartInSrcSent, posEndInSrcSent> represents the embedded n-gram in the sentence +* <sentIdInCorpus, posInSentInCorpus> represents the location in the corpus +**/ +typedef struct phraseLocationElement +{ + unsigned char posStartInSrcSent; + unsigned char posEndInSrcSent; + TextLenType sentIdInCorpus; + unsigned char posInSentInCorpus; +}S_phraseLocationElement; + +/** +* \ingroup search +**/ +typedef struct phraseLocationWithSrcSentElement +{ + int srcPosStart; + int srcPosEnd; + TextLenType sentId; + TextLenType posInSent; + vector<C_String> sentence; +}S_phraseLocationWithSrcSentElement; + +/** +* \ingroup search +**/ +typedef struct sentSearchTableElement +{ + bool found; + TextLenType startPosInSA; + TextLenType endingPosInSA; 
+}S_sentSearchTableElement; + + +/** +* \ingroup search +* Base class for suffix array search applications +* Provides functions to search n-grams in the corpus +* Including the frequency of the n-gram and the actual location (sentenceID+offset in sentence) +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_SuffixArraySearchApplicationBase : public C_SuffixArrayApplicationBase +{ +public: + void loadData_forSearch(const char * filename, bool noVoc, bool noOffset); + + unsigned int numberOfMatcedNgram(const char * srcSent); + unsigned int numberOfMatcedNgram(vector<IndexType> & sentInVocId); + + TextLenType freqOfExactPhraseMatch(const char * phrase); + TextLenType freqOfExactPhraseMatch(vector<IndexType> & phrase); + + TextLenType freqOfExactPhraseMatchAndFirstOccurrence(const char * phrase, TextLenType & startPosInSA, int & sentLen); + TextLenType freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen); + + vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(const char * phrase); + vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(vector<IndexType> & phrase); + + vector<S_phraseLocationElement> findPhrasesInASentence(const char * srcSent); + vector<S_phraseLocationElement> findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs); + + void displayNgramMatchingFreq4Sent(const char *); + void displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId); + + map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen); + map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int & sentLen); + + S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen); + S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId); + + void 
setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram); + void setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport); + void setParam_longestUnitToReport(int longestUnitToReport); + void setParam_shortestUnitToReport(int shortestUnitToReport); + + TextLenType returnTotalSentNumber(); + + vector<IndexType> convertStringToVocId(const char * sentText); + vector<C_String> convertCharStringToCStringVector(const char * sentText); + vector<IndexType> convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector); + + + C_SuffixArraySearchApplicationBase(); + virtual ~C_SuffixArraySearchApplicationBase(); + +protected: + bool locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd); + + bool searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType & resultStartPos, TextLenType & resultEndPos); + char comparePhraseWithTextWithLCP(IndexType, int, TextLenType); + + void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset); + void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen); + + + unsigned int twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen); + void oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n); + + int reportMaxOccurrenceOfOneNgram; + int highestFreqThresholdForReport; + int longestUnitToReport; + int shortestUnitToReport; + + TextLenType totalSentNum; +}; + +#endif // !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_) diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp new file mode 100755 index 0000000..91962fe --- /dev/null +++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp 
@@ -0,0 +1,314 @@ +/** +* Revision $Rev: 3815 $ +* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $ +**/ + +#include "_SuffixArrayApplicationBase.h" + +#include "malloc.h" +#include "time.h" + +#include <iostream> +#include <fstream> +#include <stdlib.h> + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_SuffixArrayApplicationBase::C_SuffixArrayApplicationBase() +{ + this->level1Buckets = NULL; + this->noVocabulary = false; //by default, still load the vocabulary + this->noOffset = false; //by default, load offset + this->noLevel1Bucket = false; //by default, construct level1 bucket +} + +C_SuffixArrayApplicationBase::~C_SuffixArrayApplicationBase() +{ + if(this->level1Buckets!=NULL){ + free(this->level1Buckets); + } + + //not necessary too + free(this->corpus_list); + free(this->suffix_list); + + if(! this->noOffset){ + free(this->offset_list); + } + + if(! this->noVocabulary){ + delete(this->voc); + } +} + +/** +* Load the indexed corpus, suffix array, vocabulary, offset into memory for follow up applications +* It is optional to load vocabulary, offset depends on the argument. +* In the case when the testing data shares the same vocabulary as the training data and only vocIDs are used to represent the sentence/n-grams +* then vocabulary which maps between vocId and the word text can be skipped to save some memory. +* +* If the suffix array object does not need to locate the sentence id of an occurred n-gram, then offset information is not needed. +* +* Be very careful here, the suffix array class does not check if offset has been loaded in the search function to make it efficient +* you need to know what the suffix array class will be used (whether offset is needed) and load it properly +* @param fileNameStem The filename of the corpus. 
This should be the same filename used in IndexSA +* @param noVoc If set to be 'true', vocabulary will not be loaded +* @param noOffset If set to be 'true', the offset information will not be loaded. <sentId, offsetInSent> information for an n-gram's occurrences can not be calculated. +* @param noLevel1Bucket Level1Bucket is used to speed up the search at the cost of additional memory. For applications which do not need to locate n-grams in the corpus (such as the corpus scanning application), then there is no need to create Level1Bucket +**/ +void C_SuffixArrayApplicationBase::loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket) +{ + long ltime1, ltime2; + + this->noVocabulary = noVoc; + this->noOffset = noOffset; + this->noLevel1Bucket = noLevel1Bucket; + + + char tmpString[1000]; + + //the order of loading the data is important, do not change + if(! this->noVocabulary){ + time( <ime1 ); + cerr<<"Loading Vocabulary...\n"; + sprintf(tmpString,"%s.id_voc",fileNameStem); + this->loadVoc(tmpString); + time( <ime2); + cerr<<"Vocabulary loaded in "<<ltime2-ltime1<<" seconds.\n"; + } + + time( <ime1 ); + cerr<<"Loading corpus...\n"; + sprintf(tmpString,"%s.sa_corpus",fileNameStem); + this->loadCorpusAndInitMem(tmpString); + time( <ime2); + cerr<<"Corpus loaded in "<<ltime2-ltime1<<" seconds.\n"; + + time( <ime1 ); + cerr<<"Loading suffix...\n"; + sprintf(tmpString,"%s.sa_suffix",fileNameStem); + this->loadSuffix(tmpString); + time( <ime2); + cerr<<"Suffix loaded in "<<ltime2-ltime1<<" seconds.\n"; + + if(! 
this->noOffset){ + time( <ime1 ); + cerr<<"Loading offset...\n"; + sprintf(tmpString,"%s.sa_offset",fileNameStem); + this->loadOffset(tmpString); + time( <ime2); + cerr<<"Offset loaded in "<<ltime2-ltime1<<" seconds.\n"; + } +} + +void C_SuffixArrayApplicationBase::loadVoc(const char *filename) +{ + this->voc = new C_IDVocabulary(filename); +} + +void C_SuffixArrayApplicationBase::loadCorpusAndInitMem(const char *filename) +{ + unsigned int dwRead = 0; + FILE * CorpusInputFile = fopen(filename, "rb"); + + if(!CorpusInputFile){ + cerr<<"Corpus file: "<<filename<<" does not exist or can not be opened!\n"; + exit(0); + } + + //first, read the size of the corpus + dwRead = fread( &(this->corpusSize), sizeof(TextLenType), 1, CorpusInputFile); + + //allocate memory for all data structure + this->corpus_list = (IndexType *) malloc(sizeof(IndexType)*this->corpusSize); + if(! this->corpus_list){ + cerr<<"Can not allocate memory to load the corpus!\n"; + exit(0); + } + + this->suffix_list = (TextLenType *) malloc(sizeof(TextLenType)*this->corpusSize); + if(! this->suffix_list){ + cerr<<"Can not allocate memory to load the suffix!\n"; + exit(0); + } + + if(! this->noOffset){ + this->offset_list = (unsigned char *) malloc(sizeof(unsigned char)*this->corpusSize); + if(! this->offset_list){ + cerr<<"Can not allocate memory to load the offset!\n"; + exit(0); + } + } + + //read the corpus file + unsigned int totalRead = 0; + unsigned int remaining = this->corpusSize; + unsigned int oneBatchReadSize; + char * currentPosInCorpusList = (char *) this->corpus_list; + while(! 
feof(CorpusInputFile) && (totalRead<this->corpusSize)){ + oneBatchReadSize = SIZE_ONE_READ; + if(remaining<SIZE_ONE_READ){ + oneBatchReadSize = remaining; + } + + dwRead = fread( currentPosInCorpusList, sizeof(IndexType), oneBatchReadSize, CorpusInputFile); + + totalRead+=dwRead; + remaining-=dwRead; + + currentPosInCorpusList+=sizeof(IndexType)*dwRead; + } + if(totalRead!=this->corpusSize){ + cerr<<"Expecting "<<this->corpusSize<<" words from the corpus, read-in "<<totalRead<<endl; + exit(0); + } + fclose(CorpusInputFile); + + this->sentIdStart = this->corpus_list[0]; + this->vocIdForSentStart = this->corpus_list[1]; + this->vocIdForCorpusEnd = this->corpus_list[this->corpusSize-1]; + this->vocIdForSentEnd = this->corpus_list[this->corpusSize-2]; + + if(! this->noLevel1Bucket){ + //in this corpus, we will have at most sentIdStart-1 word types + //the index in the array correspond to the vocId, 0 is for <unk> and the last one is for <sentIdStart-1> which is the largest vocId observed in the data + this->level1Buckets = (S_level1BucketElement *) malloc(sizeof(S_level1BucketElement)* this->sentIdStart); + + //initialize the level1 buckets + for(IndexType i=0;i<this->sentIdStart;i++){ + this->level1Buckets[i].first = (TextLenType) -1; + this->level1Buckets[i].last = 0; + } + } +} + +void C_SuffixArrayApplicationBase::loadSuffix(const char *filename) +{ + unsigned int dwRead = 0; + FILE * SuffixInputFile = fopen(filename, "rb"); + if(!SuffixInputFile){ + cerr<<"Suffix file: "<<filename<<" does not exist!"<<endl; + exit(0); + } + + //first, read in the size of the suffix array + TextLenType suffixArraySize; + dwRead = fread( &suffixArraySize, sizeof(TextLenType), 1, SuffixInputFile); + + if(suffixArraySize!=this->corpusSize){ + cerr<<"Something wrong, the suffix array size is different from the corpus size.\n"; + cerr<<"Corpus has "<<this->corpusSize<<" words and suffix array reported: "<<suffixArraySize<<endl; + exit(0); + } + + //read all the suffix into memory + 
unsigned int totalRead = 0; + unsigned int remaining = suffixArraySize; + unsigned int oneBatchReadSize; + char * currentPosInSuffixList = (char *) this->suffix_list; + while(! feof(SuffixInputFile) && (totalRead<suffixArraySize)){ + oneBatchReadSize = SIZE_ONE_READ; + if(remaining<SIZE_ONE_READ){ + oneBatchReadSize = remaining; + } + + dwRead = fread( currentPosInSuffixList, sizeof(TextLenType), oneBatchReadSize, SuffixInputFile); + + totalRead+=dwRead; + remaining -= dwRead; + + currentPosInSuffixList+=sizeof(TextLenType)*dwRead; + } + if(totalRead!=suffixArraySize){ + cerr<<"Expecting "<<suffixArraySize<<" words from the suffix list, read-in "<<totalRead<<endl; + exit(0); + } + + fclose(SuffixInputFile); + + if(! this->noLevel1Bucket){ + //build level-1 bucket + cerr<<"Initialize level-1 buckets...\n"; + IndexType currentVocId = 0; + IndexType vocId; + TextLenType pos; + TextLenType lastSaIndex = 0; + + for(TextLenType i=0; i<suffixArraySize; i++){ + pos = this->suffix_list[i]; + + //for level1 bucket + vocId = this->corpus_list[pos]; + + if(vocId<this->sentIdStart){ //is a meaningful word type + if(vocId!=currentVocId){ + this->level1Buckets[currentVocId].last = lastSaIndex; //for first word which is <unk> this does not matter + this->level1Buckets[vocId].first = i; + + currentVocId=vocId; + } + + lastSaIndex = i; + } + } + + //for the last word type + this->level1Buckets[currentVocId].last = lastSaIndex; + } + else{ + this->level1Buckets = NULL; + } +} + +void C_SuffixArrayApplicationBase::loadOffset(const char *filename) +{ + unsigned int dwRead = 0; + FILE * OffsetInputFile = fopen(filename, "rb"); + + if(!OffsetInputFile){ + cerr<<"Offset file: "<<filename<<" does not exist!"<<endl; + exit(0); + } + + //first, read the size of the corpus + TextLenType offsetListLen; + dwRead = fread( &offsetListLen, sizeof(TextLenType), 1, OffsetInputFile); + if(offsetListLen!=this->corpusSize){ + cerr<<"Text length is inconsistent with the length of the offset.\n"; + 
exit(0); + } + + //read all the suffix into memory + unsigned int totalRead = 0; + unsigned int remaining = offsetListLen; + unsigned int oneBatchReadSize; + char * currentOffsetListPos = (char *) this->offset_list; + while(! feof(OffsetInputFile) && (totalRead < offsetListLen)){ + oneBatchReadSize = SIZE_ONE_READ; + + if(remaining<SIZE_ONE_READ){ + oneBatchReadSize = remaining; + } + + dwRead = fread( currentOffsetListPos, sizeof(unsigned char), oneBatchReadSize, OffsetInputFile); + + totalRead+=dwRead; + remaining-=dwRead; + + currentOffsetListPos+=sizeof(unsigned char)*dwRead; + + } + if(totalRead!=offsetListLen){ + cerr<<"Expecting "<<offsetListLen<<" words from the offset list, read-in "<<totalRead<<endl; + exit(0); + } + fclose(OffsetInputFile); + +} + +TextLenType C_SuffixArrayApplicationBase::returnCorpusSize() +{ + return this->corpusSize; +} diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~ new file mode 100755 index 0000000..bd17287 --- /dev/null +++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~ @@ -0,0 +1,313 @@ +/** +* Revision $Rev: 3815 $ +* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $ +**/ + +#include "_SuffixArrayApplicationBase.h" + +#include "malloc.h" +#include "time.h" + +#include <iostream> +#include <fstream> + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_SuffixArrayApplicationBase::C_SuffixArrayApplicationBase() +{ + this->level1Buckets = NULL; + this->noVocabulary = false; //by default, still load the vocabulary + this->noOffset = false; //by default, load offset + this->noLevel1Bucket = false; //by default, construct level1 bucket +} + +C_SuffixArrayApplicationBase::~C_SuffixArrayApplicationBase() +{ + if(this->level1Buckets!=NULL){ + free(this->level1Buckets); + } + + 
//not necessary too + free(this->corpus_list); + free(this->suffix_list); + + if(! this->noOffset){ + free(this->offset_list); + } + + if(! this->noVocabulary){ + delete(this->voc); + } +} + +/** +* Load the indexed corpus, suffix array, vocabulary, offset into memory for follow up applications +* It is optional to load vocabulary, offset depends on the argument. +* In the case when the testing data shares the same vocabulary as the training data and only vocIDs are used to represent the sentence/n-grams +* then vocabulary which maps between vocId and the word text can be skipped to save some memory. +* +* If the suffix array object does not need to locate the sentence id of an occurred n-gram, then offset information is not needed. +* +* Be very careful here, the suffix array class does not check if offset has been loaded in the search function to make it efficient +* you need to know what the suffix array class will be used (whether offset is needed) and load it properly +* @param fileNameStem The filename of the corpus. This should be the same filename used in IndexSA +* @param noVoc If set to be 'true', vocabulary will not be loaded +* @param noOffset If set to be 'true', the offset information will not be loaded. <sentId, offsetInSent> information for an n-gram's occurrences can not be calculated. +* @param noLevel1Bucket Level1Bucket is used to speed up the search at the cost of additional memory. For applications which do not need to locate n-grams in the corpus (such as the corpus scanning application), then there is no need to create Level1Bucket +**/ +void C_SuffixArrayApplicationBase::loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket) +{ + long ltime1, ltime2; + + this->noVocabulary = noVoc; + this->noOffset = noOffset; + this->noLevel1Bucket = noLevel1Bucket; + + + char tmpString[1000]; + + //the order of loading the data is important, do not change + if(! 
this->noVocabulary){ + time( <ime1 ); + cerr<<"Loading Vocabulary...\n"; + sprintf(tmpString,"%s.id_voc",fileNameStem); + this->loadVoc(tmpString); + time( <ime2); + cerr<<"Vocabulary loaded in "<<ltime2-ltime1<<" seconds.\n"; + } + + time( <ime1 ); + cerr<<"Loading corpus...\n"; + sprintf(tmpString,"%s.sa_corpus",fileNameStem); + this->loadCorpusAndInitMem(tmpString); + time( <ime2); + cerr<<"Corpus loaded in "<<ltime2-ltime1<<" seconds.\n"; + + time( <ime1 ); + cerr<<"Loading suffix...\n"; + sprintf(tmpString,"%s.sa_suffix",fileNameStem); + this->loadSuffix(tmpString); + time( <ime2); + cerr<<"Suffix loaded in "<<ltime2-ltime1<<" seconds.\n"; + + if(! this->noOffset){ + time( <ime1 ); + cerr<<"Loading offset...\n"; + sprintf(tmpString,"%s.sa_offset",fileNameStem); + this->loadOffset(tmpString); + time( <ime2); + cerr<<"Offset loaded in "<<ltime2-ltime1<<" seconds.\n"; + } +} + +void C_SuffixArrayApplicationBase::loadVoc(const char *filename) +{ + this->voc = new C_IDVocabulary(filename); +} + +void C_SuffixArrayApplicationBase::loadCorpusAndInitMem(const char *filename) +{ + unsigned int dwRead = 0; + FILE * CorpusInputFile = fopen(filename, "rb"); + + if(!CorpusInputFile){ + cerr<<"Corpus file: "<<filename<<" does not exist or can not be opened!\n"; + exit(0); + } + + //first, read the size of the corpus + dwRead = fread( &(this->corpusSize), sizeof(TextLenType), 1, CorpusInputFile); + + //allocate memory for all data structure + this->corpus_list = (IndexType *) malloc(sizeof(IndexType)*this->corpusSize); + if(! this->corpus_list){ + cerr<<"Can not allocate memory to load the corpus!\n"; + exit(0); + } + + this->suffix_list = (TextLenType *) malloc(sizeof(TextLenType)*this->corpusSize); + if(! this->suffix_list){ + cerr<<"Can not allocate memory to load the suffix!\n"; + exit(0); + } + + if(! this->noOffset){ + this->offset_list = (unsigned char *) malloc(sizeof(unsigned char)*this->corpusSize); + if(! 
this->offset_list){ + cerr<<"Can not allocate memory to load the offset!\n"; + exit(0); + } + } + + //read the corpus file + unsigned int totalRead = 0;
+ unsigned int remaining = this->corpusSize;
+ unsigned int oneBatchReadSize; + char * currentPosInCorpusList = (char *) this->corpus_list; + while(! feof(CorpusInputFile) && (totalRead<this->corpusSize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+ + dwRead = fread( currentPosInCorpusList, sizeof(IndexType), oneBatchReadSize, CorpusInputFile);
+
+ totalRead+=dwRead;
+ remaining-=dwRead;
+ + currentPosInCorpusList+=sizeof(IndexType)*dwRead; + } + if(totalRead!=this->corpusSize){ + cerr<<"Expecting "<<this->corpusSize<<" words from the corpus, read-in "<<totalRead<<endl; + exit(0); + } + fclose(CorpusInputFile); + + this->sentIdStart = this->corpus_list[0]; + this->vocIdForSentStart = this->corpus_list[1]; + this->vocIdForCorpusEnd = this->corpus_list[this->corpusSize-1]; + this->vocIdForSentEnd = this->corpus_list[this->corpusSize-2]; + + if(! this->noLevel1Bucket){ + //in this corpus, we will have at most sentIdStart-1 word types + //the index in the array correspond to the vocId, 0 is for <unk> and the last one is for <sentIdStart-1> which is the largest vocId observed in the data + this->level1Buckets = (S_level1BucketElement *) malloc(sizeof(S_level1BucketElement)* this->sentIdStart); + + //initialize the level1 buckets + for(IndexType i=0;i<this->sentIdStart;i++){ + this->level1Buckets[i].first = (TextLenType) -1; + this->level1Buckets[i].last = 0; + } + } +} + +void C_SuffixArrayApplicationBase::loadSuffix(const char *filename) +{ + unsigned int dwRead = 0; + FILE * SuffixInputFile = fopen(filename, "rb"); + if(!SuffixInputFile){ + cerr<<"Suffix file: "<<filename<<" does not exist!"<<endl; + exit(0); + } + + //first, read in the size of the suffix array + TextLenType suffixArraySize; + dwRead = fread( &suffixArraySize, sizeof(TextLenType), 1, SuffixInputFile); + + if(suffixArraySize!=this->corpusSize){ + cerr<<"Something wrong, the suffix array size is different from the corpus size.\n"; + cerr<<"Corpus has "<<this->corpusSize<<" words and suffix array reported: "<<suffixArraySize<<endl; + exit(0); + } + + //read all the suffix into memory + unsigned int totalRead = 0;
+ unsigned int remaining = suffixArraySize;
+ unsigned int oneBatchReadSize; + char * currentPosInSuffixList = (char *) this->suffix_list; + while(! feof(SuffixInputFile) && (totalRead<suffixArraySize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+ + dwRead = fread( currentPosInSuffixList, sizeof(TextLenType), oneBatchReadSize, SuffixInputFile);
+ + totalRead+=dwRead;
+ remaining -= dwRead;
+ + currentPosInSuffixList+=sizeof(TextLenType)*dwRead; + } + if(totalRead!=suffixArraySize){ + cerr<<"Expecting "<<suffixArraySize<<" words from the suffix list, read-in "<<totalRead<<endl; + exit(0); + } + + fclose(SuffixInputFile); + + if(! this->noLevel1Bucket){ + //build level-1 bucket + cerr<<"Initialize level-1 buckets...\n"; + IndexType currentVocId = 0; + IndexType vocId; + TextLenType pos; + TextLenType lastSaIndex = 0; + + for(TextLenType i=0; i<suffixArraySize; i++){ + pos = this->suffix_list[i]; + + //for level1 bucket + vocId = this->corpus_list[pos]; + + if(vocId<this->sentIdStart){ //is a meaningful word type + if(vocId!=currentVocId){ + this->level1Buckets[currentVocId].last = lastSaIndex; //for first word which is <unk> this does not matter + this->level1Buckets[vocId].first = i; + + currentVocId=vocId; + } + + lastSaIndex = i; + } + } + + //for the last word type + this->level1Buckets[currentVocId].last = lastSaIndex; + } + else{ + this->level1Buckets = NULL; + } +} + +void C_SuffixArrayApplicationBase::loadOffset(const char *filename) +{ + unsigned int dwRead = 0; + FILE * OffsetInputFile = fopen(filename, "rb"); + + if(!OffsetInputFile){ + cerr<<"Offset file: "<<filename<<" does not exist!"<<endl; + exit(0); + } + + //first, read the size of the corpus + TextLenType offsetListLen; + dwRead = fread( &offsetListLen, sizeof(TextLenType), 1, OffsetInputFile); + if(offsetListLen!=this->corpusSize){ + cerr<<"Text length is inconsistent with the length of the offset.\n"; + exit(0); + } + + //read all the suffix into memory + unsigned int totalRead = 0;
+ unsigned int remaining = offsetListLen;
+ unsigned int oneBatchReadSize; + char * currentOffsetListPos = (char *) this->offset_list; + while(! feof(OffsetInputFile) && (totalRead < offsetListLen)){ + oneBatchReadSize = SIZE_ONE_READ;
+
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentOffsetListPos, sizeof(unsigned char), oneBatchReadSize, OffsetInputFile); +
+ totalRead+=dwRead;
+ remaining-=dwRead;
+ + currentOffsetListPos+=sizeof(unsigned char)*dwRead; + + } + if(totalRead!=offsetListLen){ + cerr<<"Expecting "<<offsetListLen<<" words from the offset list, read-in "<<totalRead<<endl; + exit(0); + } + fclose(OffsetInputFile); + +} + +TextLenType C_SuffixArrayApplicationBase::returnCorpusSize() +{ + return this->corpusSize; +} diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h new file mode 100755 index 0000000..74fad4e --- /dev/null +++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h @@ -0,0 +1,58 @@ +#if !defined(__SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_) +#define __SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_ + +#include "salm_shared.h" +#include "_IDVocabulary.h" +#include "_String.h" + +using namespace std; + +typedef struct level1BucketElement +{ + TextLenType first; + TextLenType last; +} S_level1BucketElement; + + +/** +* Base class of Suffix Array applications +* Providing functions to load the suffix array and initialize the required vocIDs +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +class C_SuffixArrayApplicationBase +{ +public: + void loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket); + TextLenType returnCorpusSize(); + + C_SuffixArrayApplicationBase(); + virtual ~C_SuffixArrayApplicationBase(); + +protected: + TextLenType corpusSize; + + void loadVoc(const char * filename); + void loadOffset(const char * filename); + void loadSuffix(const char * filename); + void loadCorpusAndInitMem(const char * filename); + + bool noVocabulary; + bool noOffset; + bool noLevel1Bucket; + + C_IDVocabulary * voc; + IndexType sentIdStart; + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + + IndexType * corpus_list; + unsigned char * offset_list; + TextLenType * suffix_list; + + S_level1BucketElement * level1Buckets; + +}; + +#endif // 
!defined(__SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_) diff --git a/Src/Utils/InitializeVocabulary.cpp b/Src/Utils/InitializeVocabulary.cpp new file mode 100755 index 0000000..b749568 --- /dev/null +++ b/Src/Utils/InitializeVocabulary.cpp @@ -0,0 +1,30 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_IDVocabulary.h" + +#include <iostream> + +using namespace std; + +/** +* \ingroup utils +* Intialize an empty vocabulary with reserved words +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + if(argc<2){ + cerr<<"\nUsage:"; + cerr<<"\n\t"<<argv[0]<<" vocabularyFileName\n\n"; + exit(0); + } + + C_IDVocabulary voc; + + voc.addingReservedWords(); + voc.outputToFile(argv[1]); + + return 0; + +} diff --git a/Src/Utils/UpdateUniversalVoc.cpp b/Src/Utils/UpdateUniversalVoc.cpp new file mode 100755 index 0000000..02ea6cb --- /dev/null +++ b/Src/Utils/UpdateUniversalVoc.cpp @@ -0,0 +1,28 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_UniversalVocabulary.h" + +#include <iostream> + +using namespace std; + +/** +* \ingroup utils +* Update the universal vocabulary with words in corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + if(argc<3){ + cerr<<"\nUsage:"; + cerr<<"\n\t"<<argv[0]<<" universal_voc corpusFileName\n\n"; + exit(0); + } + + C_UniversalVocabulary universalVoc(argv[1]); + + universalVoc.updateWithNewCorpus(argv[2]); + + return 1; +} diff --git a/Src/Utils/_UniversalVocabulary.cpp b/Src/Utils/_UniversalVocabulary.cpp new file mode 100755 index 0000000..3be91d2 --- /dev/null +++ b/Src/Utils/_UniversalVocabulary.cpp @@ -0,0 +1,118 @@ +#include "_UniversalVocabulary.h" +#include "malloc.h" +#include <string> +#include <fstream> +#include <iostream> +#include <cstring> +#include <stdlib.h> + +using namespace std; + 
+C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName) +{ + int fileNameSize=strlen(universalVocFileName); + fileNameSize++; + + this->universalCorpusFileName = (char *) malloc(sizeof(char)*fileNameSize); + sprintf(this->universalCorpusFileName,"%s\0", universalVocFileName); + + this->universalVoc = new C_IDVocabulary(universalVocFileName); + +} + +C_UniversalVocabulary::~C_UniversalVocabulary() +{ + free(this->universalCorpusFileName); + delete(this->universalVoc); +} + + +/** +* Update the universal vocabulary with words in a new corpus +* Output the updated universal vocabulary +* Output the vocabulary needed for the new corpus too +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName) +{ + + ifstream textStream; + textStream.open(newCorpusFileName); + + if(textStream==NULL){ + fprintf(stderr,"Corpus file %s does not exist. 
Exit!\n",newCorpusFileName); + exit(-1); + } + + + //add reserved words from universal voc + for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){ + C_String reservedWordText = this->universalVoc->getText(vocId); + this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId)); + } + + string aLine; + unsigned int sentNumber = 1; + unsigned int corpusSize = 0; + + char * thisToken; + char delimit[] =" \t\r\n"; + map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus; + + + getline(textStream, aLine); + while(!textStream.eof()){ + + if(aLine.length()>0){ + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + C_String thisWord(thisToken); + + //check if this word has already been seen + iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord); + + if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){ + //new type + IndexType vocId = this->universalVoc->getId(thisWord); + this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId)); + } + + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + + } + + getline(textStream, aLine); + } + + + //now output the updated universal vocabulary + this->universalVoc->outputToFile(this->universalCorpusFileName); + + //output the vocabulary needed for the new corpus + char vocabularyForNewCorpusFileName[1024]; + sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName); + + ofstream outputVocFile; + outputVocFile.open(vocabularyForNewCorpusFileName); + + if(!outputVocFile){ + cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n"; + exit(-1); + } + + iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin(); + while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){ + outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl; + 
iterWordsUsedInTheNewCorpus++; + } + + outputVocFile.close(); +} diff --git a/Src/Utils/_UniversalVocabulary.cpp~ b/Src/Utils/_UniversalVocabulary.cpp~ new file mode 100755 index 0000000..50a7396 --- /dev/null +++ b/Src/Utils/_UniversalVocabulary.cpp~ @@ -0,0 +1,117 @@ +#include "_UniversalVocabulary.h" +#include "malloc.h" +#include <string> +#include <fstream> +#include <iostream> +#include <cstring> + +using namespace std; + +C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName) +{ + int fileNameSize=strlen(universalVocFileName); + fileNameSize++; + + this->universalCorpusFileName = (char *) malloc(sizeof(char)*fileNameSize); + sprintf(this->universalCorpusFileName,"%s\0", universalVocFileName); + + this->universalVoc = new C_IDVocabulary(universalVocFileName); + +} + +C_UniversalVocabulary::~C_UniversalVocabulary() +{ + free(this->universalCorpusFileName); + delete(this->universalVoc); +} + + +/** +* Update the universal vocabulary with words in a new corpus +* Output the updated universal vocabulary +* Output the vocabulary needed for the new corpus too +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName) +{ + + ifstream textStream; + textStream.open(newCorpusFileName); + + if(textStream==NULL){ + fprintf(stderr,"Corpus file %s does not exist. 
Exit!\n",newCorpusFileName); + exit(-1); + } + + + //add reserved words from universal voc + for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){ + C_String reservedWordText = this->universalVoc->getText(vocId); + this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId)); + } + + string aLine; + unsigned int sentNumber = 1; + unsigned int corpusSize = 0; + + char * thisToken; + char delimit[] =" \t\r\n"; + map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus; + + + getline(textStream, aLine); + while(!textStream.eof()){ + + if(aLine.length()>0){ + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + C_String thisWord(thisToken); + + //check if this word has already been seen + iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord); + + if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){ + //new type + IndexType vocId = this->universalVoc->getId(thisWord); + this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId)); + } + + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + + } + + getline(textStream, aLine); + } + + + //now output the updated universal vocabulary + this->universalVoc->outputToFile(this->universalCorpusFileName); + + //output the vocabulary needed for the new corpus + char vocabularyForNewCorpusFileName[1024]; + sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName); + + ofstream outputVocFile; + outputVocFile.open(vocabularyForNewCorpusFileName); + + if(!outputVocFile){ + cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n"; + exit(-1); + } + + iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin(); + while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){ + outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl; + 
iterWordsUsedInTheNewCorpus++; + } + + outputVocFile.close(); +} diff --git a/Src/Utils/_UniversalVocabulary.h b/Src/Utils/_UniversalVocabulary.h new file mode 100755 index 0000000..2df4954 --- /dev/null +++ b/Src/Utils/_UniversalVocabulary.h @@ -0,0 +1,38 @@ +#if !defined (__HEADER_UNIVERSAL_VOC_INCLUDED__) +#define __HEADER_UNIVERSAL_VOC_INCLUDED__ + +#include "salm_shared.h" +#include "_IDVocabulary.h" +#include "_String.h" + +#include <map> + +using namespace std; + +/** +* \ingroup utils +* Universal Vocabulary class provides function to update the univeral vocabulary +* with the words in a new corpus +* and output the vocabulary needed for the new corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_UniversalVocabulary{ + +public: + void updateWithNewCorpus(const char * newCorpusFileName); + + C_UniversalVocabulary(const char * universalVocFileName); + ~C_UniversalVocabulary(); + +private: + char * universalCorpusFileName; + C_IDVocabulary * universalVoc; + + map<C_String, IndexType, ltstr> wordsUsedInTheNewCorpus; + +}; + + +#endif |