github.com/moses-smt/salm.git
commit a146dbec8f0391e247db1ae4c9b7af5c225436f9 (patch)
Author:    Hieu Hoang <hieu@hoang.co.uk>  2013-11-25 13:56:37 +0400
Committer: Hieu Hoang <hieu@hoang.co.uk>  2013-11-25 13:56:37 +0400
Tree:      1fa97934675448cdcffb26b4737887d551822a39 /Src

    initial add of salm to github
Diffstat (limited to 'Src')
-rwxr-xr-x  Src/IndexSA/IndexSA.cpp | 58
-rwxr-xr-x  Src/IndexSA/IndexSA.cpp~ | 57
-rwxr-xr-x  Src/IndexSA/_MonoCorpus.cpp | 440
-rwxr-xr-x  Src/IndexSA/_MonoCorpus.cpp~ | 439
-rwxr-xr-x  Src/IndexSA/_MonoCorpus.h | 60
-rwxr-xr-x  Src/SALM-API-Description.txt | 24
-rwxr-xr-x  Src/Shared/_IDVocabulary.cpp | 219
-rwxr-xr-x  Src/Shared/_IDVocabulary.cpp~ | 218
-rwxr-xr-x  Src/Shared/_IDVocabulary.h | 55
-rwxr-xr-x  Src/Shared/_String.cpp | 253
-rwxr-xr-x  Src/Shared/_String.h | 45
-rwxr-xr-x  Src/Shared/salm_shared.h | 36
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp | 63
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~ | 62
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt | 5
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp | 1113
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h | 210
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp | 691
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ | 690
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h | 137
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp | 34
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp | 70
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp | 32
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp | 338
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~ | 338
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h | 53
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp | 130
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~ | 129
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp | 72
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~ | 71
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp | 47
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~ | 46
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp | 85
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~ | 84
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp | 67
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~ | 66
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp | 132
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~ | 131
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp | 50
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~ | 49
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp | 144
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~ | 145
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp | 178
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~ | 177
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp | 754
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~ | 753
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h | 127
-rwxr-xr-x  Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp | 314
-rwxr-xr-x  Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~ | 313
-rwxr-xr-x  Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h | 58
-rwxr-xr-x  Src/Utils/InitializeVocabulary.cpp | 30
-rwxr-xr-x  Src/Utils/UpdateUniversalVoc.cpp | 28
-rwxr-xr-x  Src/Utils/_UniversalVocabulary.cpp | 118
-rwxr-xr-x  Src/Utils/_UniversalVocabulary.cpp~ | 117
-rwxr-xr-x  Src/Utils/_UniversalVocabulary.h | 38
55 files changed, 10193 insertions, 0 deletions
diff --git a/Src/IndexSA/IndexSA.cpp b/Src/IndexSA/IndexSA.cpp
new file mode 100755
index 0000000..3013d4c
--- /dev/null
+++ b/Src/IndexSA/IndexSA.cpp
@@ -0,0 +1,58 @@
+/**
+* Main function to index a corpus according to its suffix array
+* Revision: $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <cstring>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include "_MonoCorpus.h"
+#include "salm_shared.h"
+
+using namespace std;
+
+IndexType * corpus; //because the compare function needs to see this, make it global
+TextLenType actualCorpusSize;
+
+int main(int argc, char* argv[]){
+
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+
+ fprintf(stderr,"\nUsage:");
+ fprintf(stderr,"\n%s fileNameStem [existingIDVocFileName]\n",argv[0]);
+
+ exit(0);
+ }
+
+ C_MonoCorpus corpus;
+
+ char vocFileName[1024];
+ sprintf(vocFileName, "%s.id_voc", argv[1]);
+
+ if(argc==2){ //no existing vocabulary given
+ cerr<<"Initialize vocabulary file: "<<vocFileName<<endl;
+ corpus.initializeVocabulary(argv[1]);
+ corpus.loadCorpusAndSort(argv[1], vocFileName, true);
+ }
+ else{
+ if(strcmp(vocFileName, argv[2])!=0){
+			cerr<<"Error! The existing ID vocabulary file must be named: "<<vocFileName<<" and must cover all the words in the corpus."<<endl;
+ exit(-1);
+ }
+ corpus.loadCorpusAndSort(argv[1], argv[2], false);
+ }
+
+ corpus.output(argv[1]);
+
+ return 0;
+}
+
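Usage sketch for the indexer above (binary and corpus names are illustrative, not from the commit). A first run,

    IndexSA train.txt

initializes train.txt.id_voc and then writes train.txt.sa_corpus, train.txt.sa_offset and train.txt.sa_suffix via output(). The second form passes an existing vocabulary,

    IndexSA train.txt train.txt.id_voc

which, per the strcmp check above, must be named exactly <stem>.id_voc and must cover every word in the corpus.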
diff --git a/Src/IndexSA/_MonoCorpus.cpp b/Src/IndexSA/_MonoCorpus.cpp
new file mode 100755
index 0000000..ab53813
--- /dev/null
+++ b/Src/IndexSA/_MonoCorpus.cpp
@@ -0,0 +1,440 @@
+/**
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+
+#include "_MonoCorpus.h"
+#include "malloc.h"
+#include "time.h"
+
+#include <fstream>
+#include <iostream>
+#include <cstring>
+#include <string>
+#include <algorithm>
+
+using namespace std;
+
+extern IndexType * corpus;
+extern TextLenType actualCorpusSize;
+
+bool operator<(const C_SuffixPointer& a, const C_SuffixPointer& b)
+{
+ bool stillEqual = true;
+ TextLenType currentPosOfA = a.pointer;
+ TextLenType currentPosOfB = b.pointer;
+
+ if(currentPosOfA==currentPosOfB){
+ return false;
+ }
+
+ while(stillEqual){
+ if(corpus[currentPosOfA]<corpus[currentPosOfB]){
+ return true;
+ }
+
+ if(corpus[currentPosOfA]>corpus[currentPosOfB]){
+ return false;
+ }
+
+ //then still equal at these two positions
+ currentPosOfA++;
+ currentPosOfB++;
+
+ if(currentPosOfA>=actualCorpusSize){
+ currentPosOfA=0;
+ }
+
+ if(currentPosOfB>=actualCorpusSize){
+ currentPosOfB=0;
+ }
+ }
+
+ //equal
+ return false;
+}
+
+
+C_SuffixPointer::C_SuffixPointer()
+{
+
+}
+
+//copy constructor
+C_SuffixPointer::C_SuffixPointer(const C_SuffixPointer & obj)
+{
+ this->pointer = obj.pointer;
+}
+
+C_SuffixPointer::~C_SuffixPointer()
+{
+
+}
+
+
+C_SuffixPointer::C_SuffixPointer(TextLenType pointer)
+{
+ this->pointer = pointer;
+}
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_MonoCorpus::C_MonoCorpus()
+{
+ this->currentPosInCorpus = 0;
+ this->maxVocIdFromCorpus = 0;
+}
+
+C_MonoCorpus::~C_MonoCorpus()
+{
+ free(corpus);
+ free(this->suffix);
+ free(this->offsetList);
+}
+
+
+/**
+* Initialize an IDVocabulary file
+**/
+void C_MonoCorpus::initializeVocabulary(char *fileNameStem)
+{
+ C_IDVocabulary tmpVoc;
+ tmpVoc.addingReservedWords();
+
+ char vocFileName[1024];
+ sprintf(vocFileName, "%s.id_voc", fileNameStem);
+
+ tmpVoc.outputToFile(vocFileName);
+}
+
+
+void C_MonoCorpus::loadCorpusAndSort(const char *fileName, const char * idVocFileName, bool vocNeedsToBeUpdated)
+{
+ IndexType id = 0;
+
+ //load vocabulary
+ this->voc = new C_IDVocabulary(idVocFileName);
+ this->vocNeedsToBeUpdated = vocNeedsToBeUpdated;
+
+ this->vocIdForSentIdPlaceHolder = this->voc->returnId(C_String("_SENT_ID_PLACEHOLDER_"));
+ if(this->vocIdForSentIdPlaceHolder==0){
+ cerr<<"ID vocabulary does not have the type _SENT_ID_PLACEHOLDER_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForSentIdPlaceHolder>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForSentIdPlaceHolder;
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"ID vocabulary does not have the type _SENTENCE_START_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForSentStart>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForSentStart;
+ }
+
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+ if(this->vocIdForSentEnd==0){
+ cerr<<"ID vocabulary does not have the type _END_OF_SENTENCE_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForSentEnd>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForSentEnd;
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"ID vocabulary does not have the type _END_OF_CORPUS_, error!\n Add this word to the universal vocabulary and try again!\n";
+ exit(-1);
+ }
+ if(this->vocIdForCorpusEnd>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = this->vocIdForCorpusEnd;
+ }
+
+ ifstream textStream1;
+ textStream1.open(fileName);
+
+	if(!textStream1){	//stream converts to false when the open failed
+ fprintf(stderr,"Text %s does not exist. Exit!\n",fileName);
+ exit(-1);
+ }
+
+ long ltime1, ltime2;
+ time( &ltime1 );
+
+ string aLine;
+ unsigned int sentNumber = 1;
+ unsigned int sentLen = 0;
+ unsigned int corpusSize = 0;
+
+ char * thisToken;
+ char delimit[] =" \t\r\n";
+
+ //first, scan the corpus to estimate the size and check if each line is shorter than 256 words
+ getline(textStream1, aLine);
+ while(!textStream1.eof()){
+
+ if(aLine.length()>0){
+ sentLen = 0;
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ if(this->vocNeedsToBeUpdated){
+ id = this->voc->getId(C_String(thisToken));
+ }
+ else{ //the provided vocabulary should cover all the words in this corpus
+ id = this->voc->returnId(C_String(thisToken));
+
+ if(id==0){ //word does not exist
+ cerr<<"Vocabulary: "<<idVocFileName<<" does not cover all the words in the corpus!"<<endl;
+ cerr<<"Word: "<<thisToken<<" does not exist in the voc!\n";
+ exit(-1);
+ }
+ }
+
+
+
+ sentLen++;
+
+ if(id>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = id;
+ }
+
+ if(sentLen>=256){
+				cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Cannot handle such a long sentence. Please shorten it first!\n";
+ exit(-1);
+ }
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+ corpusSize+=sentLen;
+
+ sentLen = 0;
+ sentNumber++;
+ }
+ else{
+ cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this message if this is the last sentence.\n";
+ }
+ getline(textStream1, aLine);
+ }
+
+ sentNumber--;
+ unsigned int estimatedSize = corpusSize+3*sentNumber+1000; //with some redundancy
+ cerr<<sentNumber<<" sentences and "<<corpusSize<<" words in corpus\n";
+	cerr<<"Reserve "<<estimatedSize*(sizeof(IndexType)+sizeof(C_SuffixPointer)+sizeof(unsigned char))<<" bytes in RAM for sorting\n";
+ textStream1.close();
+
+
+ //second pass, convert the corpus into vocIDs and create suffix array
+ ifstream textStream2;
+ textStream2.open(fileName);
+
+ this->allocateMem(estimatedSize);
+ this->currentPosInCorpus = 0;
+ sentNumber = 1;
+
+ getline(textStream2, aLine);
+ while(!textStream2.eof()){
+
+ if(aLine.length()>0){
+ sentLen = 0;
+
+ //add sentId
+			//offset at this position will store the actual sentence length
+ corpus[this->currentPosInCorpus]=this->vocIdForSentIdPlaceHolder;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->currentPosInCorpus++;
+
+ //add <s>
+ sentLen++; //not real sentence length, but to keep track of offset
+ corpus[this->currentPosInCorpus]=this->vocIdForSentStart;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen;
+ this->currentPosInCorpus++;
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ id = this->voc->returnId(C_String(thisToken));
+ if(id==0){
+ cerr<<"Word \""<<thisToken<<"\" is not listed in the IDVocabulary.\n";
+ exit(-1);
+ }
+
+ sentLen++;
+
+ if(id>this->maxVocIdFromCorpus){
+ this->maxVocIdFromCorpus = id;
+ }
+
+ corpus[this->currentPosInCorpus]=id;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen;
+ this->currentPosInCorpus++;
+
+ if(sentLen>=256){
+				cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Cannot handle such a long sentence. Please shorten it first!\n";
+ exit(-1);
+ }
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+
+ //add <sentEnd>
+ corpus[this->currentPosInCorpus]=this->vocIdForSentEnd;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) (sentLen + 1);
+			this->offsetList[this->currentPosInCorpus - sentLen - 1] = (unsigned char) (sentLen-1);	//write the real sentence length at the sentence-begin position corresponding to <sentId>
+ this->currentPosInCorpus++;
+
+ sentLen = 0;
+ sentNumber++;
+ }
+ else{
+ cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this if this is the last sentence.\n";
+ }
+
+ aLine[0]=0;
+ getline(textStream2, aLine);
+ }
+ textStream2.close();
+
+ //add <endOfCorpus> to the end of data
+ corpus[this->currentPosInCorpus]=this->vocIdForCorpusEnd;
+ this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus);
+ this->offsetList[this->currentPosInCorpus] = (unsigned char) 0;
+ this->currentPosInCorpus++;
+
+ actualCorpusSize = this->currentPosInCorpus;
+
+ time( &ltime2 );
+ cerr<<"\nCorpus loaded in: "<<ltime2-ltime1<<" seconds."<<endl;
+ cerr<<"Total "<<sentNumber-1<<" sentences loaded.\n";
+
+ //replace the sentId place holder to actual sentId
+ time( &ltime1 );
+ cerr<<"Inserting sentence IDs into the corpus...\n";
+ IndexType sentId = this->maxVocIdFromCorpus+1;
+ for(TextLenType i=0;i<actualCorpusSize;i++){
+ if(corpus[i]==this->vocIdForSentIdPlaceHolder){
+ corpus[i]=sentId;
+ sentId++;
+ }
+ }
+ time( &ltime2 );
+ cerr<<"\nSentence IDs inserted in: "<<ltime2-ltime1<<" seconds."<<endl;
+
+ //sorting
+ time( &ltime1 );
+ cerr<<"Sorting the suffix...\n";
+ sort(this->suffix, this->suffix+actualCorpusSize);
+ time( &ltime2 );
+ cerr<<"\nCorpus sorted in: "<<ltime2-ltime1<<" seconds."<<endl;
+ cerr<<"Done."<<endl;
+
+}
+
+void C_MonoCorpus::allocateMem(TextLenType corpusSize)
+{
+ corpus = (IndexType *) malloc(sizeof(IndexType)*corpusSize);
+
+ if(corpus==0){
+ cerr<<"Failed to allocate memory for corpus. Quit!\n";
+ exit(-1);
+ }
+
+ this->suffix = (C_SuffixPointer *) malloc(sizeof(C_SuffixPointer)*corpusSize);
+ if(this->suffix==0){
+ cerr<<"Failed to allocate memory for suffix. Quit!\n";
+ exit(-1);
+ }
+
+ this->offsetList = (unsigned char *) malloc(sizeof(unsigned char)*corpusSize);
+ if(this->offsetList==0){
+ cerr<<"Failed to allocate memory for offset. Quit!\n";
+ exit(-1);
+ }
+
+}
+
+
+void C_MonoCorpus::outputCorpus(char *filename)
+{
+ cerr<<"Writing corpus to file: "<<filename<<endl;
+ ofstream textOutStream;
+ textOutStream.open(filename, ios::binary);
+
+ //first, write down the corpus size
+ textOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType));
+
+ for(TextLenType i=0; i<actualCorpusSize;i++){
+ textOutStream.write((char *)&(corpus[i]), sizeof(IndexType));
+ }
+
+ textOutStream.close();
+
+}
+
+void C_MonoCorpus::outputOffset(char *filename)
+{
+ cerr<<"Writing offset to file: "<<filename<<endl;
+
+ ofstream offsetOutStream;
+ offsetOutStream.open(filename, ios::binary);
+
+ //first, write down the corpus size
+ offsetOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType));
+
+ for(TextLenType i=0; i<actualCorpusSize; i++){
+ offsetOutStream.write((char *)& (this->offsetList[i]), sizeof(unsigned char));
+ }
+ offsetOutStream.close();
+}
+
+void C_MonoCorpus::outputSuffix(char *filename)
+{
+ cerr<<"Writing suffix information to file: "<<filename<<endl;
+
+ ofstream saOutStream;
+ saOutStream.open(filename, ios::binary);
+
+ //first, write down the corpus size
+ saOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType));
+
+ for(TextLenType i=0;i<actualCorpusSize; i++){
+ saOutStream.write((char *) & (this->suffix[i].pointer), sizeof(TextLenType));
+ }
+
+ saOutStream.close();
+}
+
+void C_MonoCorpus::output(char *filename)
+{
+ char outputVocFileName[1024];
+ char outputCorpusFileName[1024];
+ char outputOffsetFileName[1024];
+ char outputSuffixFileName[1024];
+
+
+ if(this->vocNeedsToBeUpdated){
+ sprintf(outputVocFileName, "%s.id_voc", filename);
+ this->voc->outputToFile(outputVocFileName);
+ }
+
+ sprintf(outputCorpusFileName, "%s.sa_corpus", filename);
+ sprintf(outputOffsetFileName, "%s.sa_offset", filename);
+ sprintf(outputSuffixFileName, "%s.sa_suffix", filename);
+
+
+ this->outputCorpus(outputCorpusFileName);
+ this->outputOffset(outputOffsetFileName);
+ this->outputSuffix(outputSuffixFileName);
+}
+
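The operator< at the top of _MonoCorpus.cpp above orders two suffixes by walking the corpus word by word from each starting position and wrapping to position 0 at the end; in SALM, the unique per-sentence IDs and the _END_OF_CORPUS_ token guarantee two distinct suffixes differ before the scan wraps all the way around. A self-contained sketch of the same ordering on a toy vocId array (illustrative only, not part of the commit; build with -std=c++11 or later):

    // toy_suffix_sort.cpp -- mirrors operator<(C_SuffixPointer, C_SuffixPointer)
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    typedef unsigned int IndexType;

    int main(){
        std::vector<IndexType> text = {7, 3, 9, 3, 5};  // corpus as vocIds
        std::vector<size_t> suffix = {0, 1, 2, 3, 4};   // one suffix per position

        std::sort(suffix.begin(), suffix.end(), [&](size_t a, size_t b){
            for(size_t k = 0; k < text.size(); k++){
                IndexType wa = text[(a + k) % text.size()];  // wrap around, as SALM does
                IndexType wb = text[(b + k) % text.size()];
                if(wa != wb) return wa < wb;
            }
            return false;  // identical rotations compare equal
        });

        for(size_t p : suffix) printf("%zu ", p);  // prints: 3 1 4 0 2
        printf("\n");
        return 0;
    }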
diff --git a/Src/IndexSA/_MonoCorpus.h b/Src/IndexSA/_MonoCorpus.h
new file mode 100755
index 0000000..4c834b0
--- /dev/null
+++ b/Src/IndexSA/_MonoCorpus.h
@@ -0,0 +1,60 @@
+#if !defined(__MonoCorpus__H__INCLUDED_)
+#define __MonoCorpus__H__INCLUDED_
+
+#include "_IDVocabulary.h"
+#include "salm_shared.h"
+
+/**
+* \ingroup index
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+* Defines the wrapper class for the comparing function
+**/
+class C_SuffixPointer
+{
+public:
+ C_SuffixPointer(const C_SuffixPointer &);
+ C_SuffixPointer();
+ ~C_SuffixPointer();
+ C_SuffixPointer(TextLenType pointer);
+ TextLenType pointer;
+};
+
+/**
+* \ingroup index
+* Monolingual corpus class for loading the corpus from file, sort it according to the suffix array order
+* and convert it to the binary format for suffix array applications
+**/
+class C_MonoCorpus
+{
+public:
+ void initializeVocabulary(char * fileNameStem);
+ void output(char * filename);
+ void loadCorpusAndSort(const char * fileName, const char * idVocFileName, bool vocNeedsToBeUpdated);
+
+ C_MonoCorpus();
+ virtual ~C_MonoCorpus();
+
+private:
+ IndexType maxVocIdFromCorpus;
+ void outputSuffix(char * filename);
+ void outputOffset(char * filename);
+ void outputCorpus(char * filename);
+
+ IndexType vocIdForSentIdPlaceHolder;
+ IndexType vocIdForSentStart;
+ IndexType vocIdForSentEnd;
+ IndexType vocIdForCorpusEnd;
+
+ TextLenType currentPosInCorpus;
+ void allocateMem(TextLenType corpusSize);
+
+ C_SuffixPointer * suffix;
+ unsigned char * offsetList;
+ C_IDVocabulary * voc;
+
+ bool vocNeedsToBeUpdated;
+
+};
+
+#endif // !defined(__MonoCorpus__H__INCLUDED_)
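Reading loadCorpusAndSort in _MonoCorpus.cpp above, each input sentence of n words appears to be laid out in the corpus and offset arrays as follows (a reconstruction from the code, not documented in the commit):

    position:  P          P+1   P+2  ...  P+n+1   P+n+2
    corpus:    <sentId>   <s>   w1   ...  wn      </s>
    offset:    n          1     2    ...  n+1     n+2

The offset byte stores each token's position within the sentence (counting <s> as 1), except at the <sentId> slot, where it stores the real sentence length n; since offsets live in an unsigned char, sentences are capped at 256 words. <sentId> is first written as the _SENT_ID_PLACEHOLDER_ vocId and later replaced by a unique per-sentence ID, and the final corpus position holds _END_OF_CORPUS_ with offset 0.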
diff --git a/Src/SALM-API-Description.txt b/Src/SALM-API-Description.txt
new file mode 100755
index 0000000..c36f60c
--- /dev/null
+++ b/Src/SALM-API-Description.txt
@@ -0,0 +1,24 @@
+/**
+* \defgroup index Indexing the corpus
+* \defgroup search Search Applications
+* \defgroup scan Scan Applications
+* \defgroup lm Suffix Array Language Model
+* \defgroup utils Utilities
+*
+* \mainpage SALM API Documentation
+* Author: <a href=mailto:joy+salm@cs.cmu.edu > Ying (Joy) Zhang </a>
+* \section intro Introduction
+*
+* There are three main modules in <a href=http://projectile.is.cs.cmu.edu/research/public/tools/salm/salm.htm > SALM </a> : Indexing, Searching and Scanning.
+* To start, use IndexSA to index the corpus according to its suffix array.
+* This is the first step for all applications.
+* Once the corpus is indexed, SALM can perform many kinds of interesting processing on it.
+* \section search Applications based on searching the corpus
+* These applications search for the occurrences of an n-gram, or of all the embedded n-grams of a sentence, in the corpus.
+* \section scan Applications based on scanning the corpus
+* These applications scan through the corpus in linear time and collect information such as the type/token frequencies of the n-grams in the data.
+* \section lm Suffix Array Language Model
+* An online language model based on suffix array indexing. The suffix array language model can use arbitrarily long histories and very large corpora.
+* \section utils Utilities
+* Utility functions such as updating the universal ID vocabulary after observing a new corpus
+**/
diff --git a/Src/Shared/_IDVocabulary.cpp b/Src/Shared/_IDVocabulary.cpp
new file mode 100755
index 0000000..a34b043
--- /dev/null
+++ b/Src/Shared/_IDVocabulary.cpp
@@ -0,0 +1,219 @@
+/**
+* _IDVocabulary.cpp: implementation of the C_IDVocabulary class.
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+
+#include "_IDVocabulary.h"
+#include <fstream>
+#include <iostream>
+#include <cstring>
+#include <memory.h>
+#include <stdlib.h>
+
+using namespace std;
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_IDVocabulary::C_IDVocabulary()
+{
+ this->maxIdInVoc = 0;
+}
+
+C_IDVocabulary::C_IDVocabulary(const char * fileName)
+{
+
+ this->maxIdInVoc = 0;
+
+ this->loadFromFile(fileName);
+}
+
+C_IDVocabulary::~C_IDVocabulary()
+{
+
+}
+
+/// Return the vocID of word "text" if it exists in the vocabulary
+/// Otherwise return 0
+IndexType C_IDVocabulary::returnId(C_String text)
+{
+ IndexType id;
+
+ map<C_String, IndexType, ltstr>::iterator iterText2Id;
+ iterText2Id = this->text2id.find(text);
+
+ if(iterText2Id==this->text2id.end()){ //this word does not exist in the voc yet, return ID for <unk>
+ id = 0;
+ }
+ else{
+ id = iterText2Id->second;
+ }
+
+ return id;
+}
+
+/// Return the text of the word given its vocID
+/// return <UNK> if specified vocID does not exist
+C_String C_IDVocabulary::getText(IndexType id)
+{
+ map<IndexType, C_String>::iterator iterId2Text;
+ iterId2Text = this->id2text.find(id);
+
+ if(iterId2Text==this->id2text.end()){
+ return C_String("<UNK>");
+ }
+
+ return iterId2Text->second;
+}
+
+IndexType C_IDVocabulary::getSize()
+{
+ return this->text2id.size();
+}
+
+
+/// Load the vocabulary file into memory
+/// The format of the vocabulary file is:
+/// word vocID
+// in each line.
+void C_IDVocabulary::loadFromFile(const char *fileName)
+{
+
+ ifstream existingVocFile;
+ existingVocFile.open(fileName);
+
+ if(!existingVocFile){
+ cerr<<"Can not open existing vocabulary file "<<fileName<<endl;
+ exit(0);
+ }
+
+ cerr<<"Loading existing vocabulary file: "<<fileName<<endl;
+
+ char aLine[1024];
+ char * aToken;
+ char delimit[] = " \t\r\n";
+ IndexType vocId = 0;
+
+ while(!existingVocFile.eof()){
+ existingVocFile.getline(aLine, 1024, '\n');
+
+ if(strlen(aLine)>0){ //a meaningful word, esp for the last line during reading file
+ vector<C_String> tokensInLine;
+
+ aToken = strtok(aLine, delimit);
+ while( aToken != NULL ) {
+ tokensInLine.push_back(C_String(aToken));
+ aToken = strtok( NULL, delimit);
+ }
+
+ if(tokensInLine.size()!=2){
+ cerr<<"Not valid format for Vocabulary: "<<aLine<<endl;
+ }
+
+ vocId = atoi(tokensInLine[1].toString());
+
+ if(vocId>this->maxIdInVoc){
+ this->maxIdInVoc = vocId;
+ }
+
+ this->text2id.insert(make_pair(tokensInLine[0], vocId));
+ this->id2text.insert(make_pair(vocId, tokensInLine[0] ));
+
+ }
+
+ aLine[0]=0;
+ }
+ cerr<<"Total "<<this->text2id.size()<<" word types loaded\n";
+ cerr<<"Max VocID="<<this->maxIdInVoc<<endl;
+}
+
+/// Return the maximum ID from all words in the vocabulary
+/// Usually equals the size of the vocabulary if the vocabulary is created from this corpus only.
+/// If the vocabulary includes words from other corpora and the vocabulary only lists words in this corpus,
+/// then max voc ID could be different from the vocabulary size
+IndexType C_IDVocabulary::returnMaxID()
+{
+ return this->maxIdInVoc;
+}
+
+IndexType C_IDVocabulary::returnNullWordID()
+{
+ return 0;
+}
+
+/**
+* Output the vocabulary to a file
+**/
+void C_IDVocabulary::outputToFile(char *filename)
+{
+
+ ofstream outputVocFile;
+ outputVocFile.open(filename);
+
+ if(!outputVocFile){
+ cerr<<"Can not open "<<filename<<" to write vocabulary\n";
+ exit(-1);
+ }
+
+ map<C_String, IndexType, ltstr>::iterator iterText2Id;
+
+ iterText2Id = this->text2id.begin();
+ while(iterText2Id!=this->text2id.end()){
+ outputVocFile<<iterText2Id->first.toString()<<"\t"<<iterText2Id->second<<endl;
+ iterText2Id++;
+ }
+
+ outputVocFile.close();
+}
+
+/// Reserve vocIDs 0-NUMBER_OF_RESERVED_WORDS_IN_VOC for special words that might be useful for applications
+/// Here we reserve 5 words:
+/// _SENT_ID_PLACEHOLDER_ 1
+/// _END_OF_SENTENCE_ 2
+/// _TOO_LONG_TOKEN_ 3
+/// _SENTENCE_START_ 4
+/// _END_OF_CORPUS_ 5
+/// You can add other special words to the list as long as the assignment of vocID and its interpretation is consistent between application and indexing
+void C_IDVocabulary::addingReservedWords()
+{
+ this->insertWord(C_String("_SENT_ID_PLACEHOLDER_"), 1);
+ this->insertWord(C_String("_END_OF_SENTENCE_"), 2);
+ this->insertWord(C_String("_TOO_LONG_TOKEN_"), 3);
+ this->insertWord(C_String("_SENTENCE_START_"), 4);
+ this->insertWord(C_String("_END_OF_CORPUS_"), 5);
+
+ char reservedWord[20];
+ for(int i=6; i<=NUMBER_OF_RESERVED_WORDS_IN_VOC; i++){
+ memset(reservedWord, 0, 20);
+ sprintf(reservedWord, "_RESERVED_WORDS_%d", i);
+ this->insertWord(C_String(reservedWord), i);
+ }
+}
+
+void C_IDVocabulary::insertWord(C_String text, IndexType id)
+{
+ this->text2id.insert(make_pair(text, id));
+ this->id2text.insert(make_pair(id, text));
+
+}
+
+/**
+* Check if the word already exists in the voc,
+* if so, return the vocID of the word,
+* otherwise assign an ID to this word and insert it into the voc
+**/
+IndexType C_IDVocabulary::getId(C_String text)
+{
+ IndexType id = this->returnId(text);
+ if(id==0){
+ this->maxIdInVoc++;
+ this->insertWord(text, this->maxIdInVoc);
+ return this->maxIdInVoc;
+ }
+
+ //else, already exist
+ return id;
+}
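A minimal sketch of the two lookup modes above (illustrative only, not part of the commit): getId() assigns a fresh ID the first time a word is seen, while returnId() is a read-only lookup that reports 0, the null/<unk> ID, for unseen words.

    #include "_IDVocabulary.h"
    #include <cstdio>

    int main(){
        C_IDVocabulary voc;                                      // empty vocabulary
        IndexType a = voc.getId(C_String((char *)"hello"));      // 1: new word, ID assigned
        IndexType b = voc.returnId(C_String((char *)"hello"));   // 1: lookup only
        IndexType c = voc.returnId(C_String((char *)"world"));   // 0: not in the vocabulary
        printf("%u %u %u\n", a, b, c);                           // prints: 1 1 0
        return 0;
    }

Note that insertWord() does not advance maxIdInVoc, so addingReservedWords() is only used to seed a vocabulary that is then written out and reloaded through loadFromFile(), which does set maxIdInVoc; that is how the real pipeline keeps the reserved IDs from colliding with corpus words.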
diff --git a/Src/Shared/_IDVocabulary.h b/Src/Shared/_IDVocabulary.h
new file mode 100755
index 0000000..fa50add
--- /dev/null
+++ b/Src/Shared/_IDVocabulary.h
@@ -0,0 +1,55 @@
+#if !defined(__IDVocabulary_H__INCLUDED_)
+#define __IDVocabulary_H__INCLUDED_
+
+#include "_String.h"
+#include <string>
+#include <map>
+#include <vector>
+#include "salm_shared.h"
+
+using namespace std;
+
+
+struct ltstr
+{
+ bool operator()(C_String s1, C_String s2) const
+ {
+ return s1<s2;
+ }
+};
+
+/**
+* Vocabulary class
+* Mapping between words and their IDs
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+class C_IDVocabulary
+{
+
+public:
+ ///Return the ID of word "text", if the word does not exist, add the word into the voc and return the newly assigned ID
+ IndexType getId(C_String text);
+
+ void addingReservedWords();
+ void outputToFile(char * filename);
+ IndexType returnNullWordID();
+ IndexType returnMaxID();
+ IndexType returnId(C_String text);
+
+ IndexType getSize();
+ C_String getText(IndexType);
+
+ C_IDVocabulary();
+ C_IDVocabulary(const char * fileName);
+ virtual ~C_IDVocabulary();
+
+private:
+ void insertWord(C_String text, IndexType id);
+ void loadFromFile(const char * fileName);
+ IndexType maxIdInVoc;
+ map<C_String, IndexType, ltstr> text2id;
+ map<IndexType, C_String> id2text;
+};
+
+#endif // !defined(__IDVocabulary_H__INCLUDED_)
diff --git a/Src/Shared/_String.cpp b/Src/Shared/_String.cpp
new file mode 100755
index 0000000..75ba8e8
--- /dev/null
+++ b/Src/Shared/_String.cpp
@@ -0,0 +1,253 @@
+/**
+* _String.cpp: implementation of the C_String class.
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_String.h"
+#include "malloc.h"
+#include "string.h"
+#include "stdio.h"
+#include "stdlib.h"
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_String::C_String()
+{
+ this->content = (char *) malloc(sizeof(char));
+ this->content[0]='\0';
+ this->hasContent = true;
+}
+
+void C_String::freeContent()
+{
+ if(this->hasContent){
+ this->hasContent = false;
+ free(this->content);
+ }
+}
+
+C_String::~C_String()
+{
+ this->freeContent();
+}
+
+/**
+* Copy constructor from a char string
+**/
+C_String::C_String(char * str1)
+{
+
+ this->content = (char *) malloc(sizeof(char)*strlen(str1)+1);
+ if(this->content==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n"); exit(-1);
+ }
+
+ strcpy(this->content, str1);
+
+ this->hasContent = true;
+}
+
+
+C_String::C_String(C_String const &strObj1)
+{
+ this->hasContent = false;
+ copy(strObj1);
+}
+
+C_String::C_String(const C_String & obj1, const C_String & obj2)
+{
+	this->hasContent = false;	//members are uninitialized here, so there is nothing to free yet
+
+ int len1 = strlen(obj1.content);
+ int len2 = strlen(obj2.content);
+
+ int fullLen = len1+len2;
+ this->content = (char *) malloc(sizeof(char)*len1 + sizeof(char)*len2 + 1);
+
+ if(this->content==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n"); exit(-1);
+ }
+
+ char * pointer = (char*) this->content;
+ strcpy(pointer, obj1.content); //copy first part
+ pointer += len1;
+ strcpy(pointer, obj2.content); //copy second part
+
+ this->content[fullLen]='\0';
+
+ this->hasContent = true;
+}
+
+void C_String::operator=(const C_String &strObj2)
+{
+ copy(strObj2);
+}
+
+void C_String::copy(const C_String &strObj)
+{
+ this->freeContent();
+
+ this->content = (char *) malloc(sizeof(char)*strlen(strObj.content)+1);
+ if(this->content==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n"); exit(-1);
+ }
+
+ strcpy(this->content, strObj.content);
+ this->hasContent = true;
+}
+
+void C_String::copy(const C_String &strObj, int copyLen)
+{
+ this->freeContent();
+
+ this->content = (char *) malloc(sizeof(char)*(copyLen+1) );
+ if(this->content==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n"); exit(-1);
+ }
+
+ for(int i=0;i<copyLen;i++){
+ this->content[i]=strObj.getCharAtPos(i);
+ }
+
+ this->content[copyLen]='\0';
+
+ this->hasContent = true;
+
+}
+
+void C_String::print2stream(FILE *stream)
+{
+	fprintf(stream, "%s", content);	//never pass raw data as the format string
+}
+
+
+int C_String::length() const
+{
+ if(this->hasContent){
+ return strlen(this->content);
+ }
+
+ return 0;
+}
+
+bool C_String::operator==(const C_String &obj1) const
+{
+ if(strcmp(this->content, obj1.content)==0){
+ return true;
+ };
+
+ return false;
+}
+
+bool C_String::operator!=(const C_String &obj1) const
+{
+ if(strcmp(this->content, obj1.content)!=0){
+ return true;
+ };
+
+ return false;
+}
+
+bool C_String::operator<(const C_String &obj1) const
+{
+ if(strcmp(this->content, obj1.content)<0){
+ return true;
+ };
+
+ return false;
+}
+
+char * C_String::toString() const
+{
+ return this->content;
+}
+
+void C_String::clear()
+{
+ this->freeContent();
+
+ this->content = (char *) malloc(sizeof(char));
+ this->content[0]='\0';
+ this->hasContent = true;
+}
+
+
+char C_String::getCharAtPos(int pos) const
+{
+ if(pos>=this->length()){
+		fprintf(stderr,"Cannot get char at pos %d, out of bounds! Exit.\n", pos);
+ exit(0);
+ }
+
+ return this->content[pos];
+}
+
+
+void C_String::appending(const C_String &obj)
+{
+ int len1 = 0;
+
+ if(this->hasContent){
+ len1 = strlen(this->content);
+ }
+
+ int len2 = strlen(obj.content);
+
+ int fullLen = len1+len2;
+
+ char * newContent = (char *) malloc(sizeof(char)*fullLen + 1);
+
+ if(newContent==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n"); exit(-1);
+ }
+
+ char * pointer = newContent;
+ if(this->hasContent){
+ strcpy(pointer, content); //copy first part
+ pointer += len1;
+ }
+
+ strcpy(pointer, obj.content); //copy second part
+ newContent[fullLen]='\0';
+
+ //free old content
+ this->freeContent();
+
+ //point to new content
+ this->content = newContent;
+
+ this->hasContent = true;
+}
+
+void C_String::appending(const char nextChar)
+{
+ int len1 = 0;
+
+ if(this->hasContent){
+ len1 = strlen(this->content);
+ }
+
+ int fullLen = len1+1;
+
+ char * newContent = (char *) malloc(sizeof(char)*fullLen + 1);
+
+ if(newContent==NULL){
+		fprintf(stderr,"Memory allocation error, Quit.\n"); exit(-1);
+ }
+
+	if(this->hasContent) strcpy(newContent, content);	//copy first part (skip when there is no old content)
+
+ newContent[len1]=nextChar; //copy second part
+ newContent[fullLen]='\0';
+
+ //free old content
+ this->freeContent();
+
+ //point to new content
+ this->content = newContent;
+
+ this->hasContent = true;
+}
diff --git a/Src/Shared/_String.h b/Src/Shared/_String.h
new file mode 100755
index 0000000..d8f633d
--- /dev/null
+++ b/Src/Shared/_String.h
@@ -0,0 +1,45 @@
+#if !defined(__STRING_H__INCLUDED_)
+#define __STRING_H__INCLUDED_
+
+/**
+* Definition of class C_String
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+#include "stdio.h"
+
+class C_String
+{
+public:
+
+ char getCharAtPos(int) const;
+ void clear();
+ char * toString() const;
+ int length() const;
+ void print2stream(FILE *);
+
+ C_String(const C_String & obj1, const C_String & obj2);
+ C_String(C_String const&);
+ C_String(char *);
+ C_String();
+
+ bool operator==(const C_String &) const;
+ bool operator!=(const C_String &) const;
+ bool operator<(const C_String &) const;
+ void operator=(const C_String &strObj2);
+
+ void appending(const C_String & obj);
+ void appending(const char nextChar);
+
+ virtual ~C_String();
+
+private:
+ void freeContent();
+ void copy(const C_String &);
+ void copy(const C_String &strObj, int copyLen);
+
+ bool hasContent;
+ char * content;
+};
+
+#endif // !defined(__STRING_H__INCLUDED_)
diff --git a/Src/Shared/salm_shared.h b/Src/Shared/salm_shared.h
new file mode 100755
index 0000000..2c0e186
--- /dev/null
+++ b/Src/Shared/salm_shared.h
@@ -0,0 +1,36 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+#if !defined(_SA_common_h)
+#define _SA_common_h
+
+#include "math.h"
+
+typedef unsigned int IndexType;
+typedef unsigned int TextLenType;
+typedef unsigned short int SearchLenType;
+
+//constants
+const int SIZE_ONE_READ = 16384; //when loading the data, each I/O read in SIZE_ONE_READ data points
+const int MAX_TOKEN_LEN = 1024; //length of the longest word
+
+const int NUMBER_OF_RESERVED_WORDS_IN_VOC = 100;
+
+/// for language modeling
+const double SALM_PROB_UNK = 0.00000000023283064365386962890625; // 1/4G
+const double SALM_LOG_PROB_UNK = log(SALM_PROB_UNK);
+const double SALM_LOG_0 = -20;
+
+/**
+* \ingroup scan
+**/
+typedef struct s_nGramScanningInfoElement
+{
+ IndexType vocId;
+ TextLenType freqThreshForOutput;
+ TextLenType freqSoFar;
+}S_nGramScanningInfoElement;
+
+#endif
+
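For reference on the language-model constants above: SALM_PROB_UNK is exactly 1/2^32 = 1/(4*2^30), which is what the "1/4G" comment means, so SALM_LOG_PROB_UNK = ln(2^-32) = -32*ln 2 ≈ -22.18 (log() here is the natural logarithm), while SALM_LOG_0 = -20 acts as a finite stand-in for log 0.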
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp
new file mode 100755
index 0000000..ab2915d
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp
@@ -0,0 +1,63 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "time.h"
+#include "_SuffixArrayLanguageModel.h"
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <cstring>
+
+using namespace std;
+
+/**
+* A simple example of using the C_SuffixArrayLanguageModel class to calculate the LM prob of input sentences
+*
+* Revision $Rev: 3816 $
+* Last Modified $LastChangedDate: 2007-07-06 14:36:11 -0400 (Fri, 06 Jul 2007) $
+**/
+int main(int argc, char * argv[]){
+ if(argc<2){
+ cerr<<"\nUsage:\n\t"<<argv[0]<<" configurationFileName < sentences\n";
+ exit(0);
+ }
+
+ C_SuffixArrayLanguageModel salm(argv[1]);
+
+ long ltime1, ltime2;
+ time( &ltime1 );
+
+ string aWord;
+ char aLine[10240];
+ while(!cin.eof()){
+ cin.getline(aLine, 10240, '\n');
+
+ if(strlen(aLine)>0){
+ istringstream inputLine(aLine, istringstream::in);
+ LMState lmState = salm.beginOfSentenceState();
+
+ LMState nextState;
+ double logProb = 0;
+
+ while(! inputLine.eof()){
+ inputLine>>aWord;
+ if(aWord.length()>0){
+ IndexType vocId = salm.returnVocId(C_String((char *) aWord.c_str()));
+ logProb+=salm.logProb(lmState, vocId, nextState);
+ lmState = nextState;
+ }
+ aWord="";
+ }
+
+ logProb+=salm.logProbEnd(lmState);
+ cout<<"LogProb="<<logProb<<endl;
+
+ }
+
+ aLine[0]=0;
+ }
+
+ time( &ltime2 );
+ cerr<<"\n"<<ltime2-ltime1<<" seconds spent."<<endl;
+
+	return 0;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt
new file mode 100755
index 0000000..17cd5a8
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt
@@ -0,0 +1,5 @@
+June 27, 2007
+
+Working branch of applying KN smoothing in LM.
+Not finished yet.
+Do not distribute! \ No newline at end of file
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp
new file mode 100755
index 0000000..583b222
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp
@@ -0,0 +1,1113 @@
+/**
+* Revision $Rev: 3665 $
+* $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+
+#include "_SuffixArrayLanguageModel.h"
+#include <iostream>
+#include <fstream>
+#include <set>
+
+#include "math.h"
+
+using namespace std;
+
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel()
+{
+
+}
+
+C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel()
+{
+
+}
+
+
+/**
+* Construct the suffix array language model object
+* Take the configuration filename as the parameter for the constructor
+*
+* The configuration file is of the following format for each line:
+*
+* Keyword<tab>value
+* <p>
+* Note: keywords are all case sensitive.
+* <ul>
+* <li> <b>CORPUS</b> filename of the corpus for LM training. It should be the same as used in IndexSA
+* <li> <b>N</b> Highest order of n considered for n-gram LM. Default value = <i>5</i>
+* <li> <b>SMOOTHING_STRATEGY</b> Smoothing strategy.
+* <ul>
+* <li> <i>k</i> : default value. Modified Kneser-Ney Smoothing @see <a href=http://acl.ldc.upenn.edu/P/P96/P96-1041.pdf> An Empirical Study of Smoothing Techniques for Language Modeling </a>
+* <li> <i>g</i> : Good-Turing discounting @see <a href=http://l2r.cs.uiuc.edu/~danr/Teaching/CS598-05/Papers/Gale-Sampson-smoothgoodturing.pdf> Good Turing without Tears</a>
+* </ul>
+* <li> <b>INTERPOLATION_STRATEGY</b> : Interpolation strategy
+* <ul>
+* <li> <i>e</i> : Probabilities of the next word predicted by histories of different orders are equally interpolated
+* <li> <i>m</i> : Use the maximum conditional probability from all different order of history as the probability for the next word
+* <li> <i>i</i> : Use deleted interpolation based on heuristics developed by IBM
+* </ul>
+* <li> <b>MAX_FREQ_DISC</b>: <br>
+* <i>default</i>=50<br>
+* If the frequency of an n-gram is lower than this value and SMOOTHING is set, discounting will be applied. <br>
+* If this value is set to 0 or negative values, smoothing/discounting will not be used. <br>
+* <li> <b>PURGE_CACHE</b>: Check entries in the cache after "PURGE_CACHE" number of sentences have been processed. Default = 100.
+* <li> <b>FRESH_TIME</b>: Entries in the cache that have not been used since "current time - FRESH_TIME" will be purged from the cache. Measured in seconds of wall-clock time.
+* </ul>
+* @param cfgFileName Configuration file name
+**/
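+// A minimal example configuration file (one Keyword<tab>value pair per line;
+// the corpus filename is hypothetical, the other values are the documented defaults):
+//
+//	CORPUS	trainingCorpus.txt
+//	N	5
+//	SMOOTHING_STRATEGY	k
+//	INTERPOLATION_STRATEGY	e
+//	MAX_FREQ_DISC	50
+//	PURGE_CACHE	100
+//	FRESH_TIME	50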
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName)
+{
+
+ fstream cfgFile;
+ cfgFile.open(cfgFileName,ios::in);
+
+ if(!cfgFile){
+ fprintf(stderr,"Configuration file %s does not exist! quit!!\n", cfgFileName);
+ exit(-1);
+ }
+
+ //-----------------------------------------------------------------------------
+ //reading parameters
+ char paraName[1024];
+ char corpusFileNameStem[1024];
+
+ corpusFileNameStem[0]='\0';
+
+ //default values for member variables
+ this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob
+ this->smoothingStrategy = 'k'; //default smoothing strategy: modified Kneser-Ney smoothing
+ this->maxFreqForDiscounting = 50; //default, freq that is lower than this value will not be applied with discounting
+ this->maxN= 5; // default value; consider up to 5 words
+
+ this->numberOfSentSeenToPurgeCache = 100; //default value, purge cache after processing 100 sentences
+ this->freshTime = 50; //entries in the cache that are older than 50 seconds are subject to purging
+ this->sentenceProcessedSoFar = 0;
+ this->typeOfBigrams = 0;
+
+ while(!cfgFile.eof()){
+ cfgFile>>paraName;
+
+ if(strcmp(paraName,"CORPUS")==0){
+ cfgFile>>corpusFileNameStem;
+ }
+ else if(strcmp(paraName, "SMOOTHING_STRATEGY")==0){
+ cfgFile>>this->smoothingStrategy;
+ }
+ else if(strcmp(paraName,"N")==0){
+ cfgFile>>this->maxN;
+ }
+ else if(strcmp(paraName,"MAX_FREQ_DISC")==0){
+ cfgFile>>this->maxFreqForDiscounting;
+ }
+ else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){
+ cfgFile>>this->interpolationStrategy;
+ }
+ else if(strcmp(paraName,"FRESH_TIME")==0){
+ cfgFile>>this->freshTime;
+ }
+ else if(strcmp(paraName, "PURGE_CACHE")==0){
+ cfgFile>>this->numberOfSentSeenToPurgeCache;
+ }
+
+ paraName[0]=0;
+
+ }
+
+
+ if(strlen(corpusFileNameStem)==0){
+ cerr<<"CORPUS not specified in the configuration file! Quit!"<<endl;
+ exit(-1);
+ }
+
+
+	this->loadData_forSearch(corpusFileNameStem, false, true);	//load the corpus, its suffix array and the vocabulary via the super class (no offset)
+
+
+ this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+
+ //initialize the scanning list
+ for(int i=0;i<this->maxN;i++){
+ this->nGramScanningList[i].freqSoFar=0;
+ this->nGramScanningList[i].vocId = 0;
+ this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output
+ }
+
+ //get vocID for sentEnd
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+ if(this->vocIdForSentEnd==0){
+ cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ if(this->maxFreqForDiscounting<=0){
+ this->applyDiscounting = false;
+ }
+ else{
+ if(this->maxFreqForDiscounting<3){
+ cerr<<"MAX_FREQ_DISC has to be at least 3!"<<endl;
+ exit(-1);
+ }
+
+ this->applyDiscounting = true;
+ this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map
+ }
+
+}
+
+/**
+* Set the strategy to interpolate the conditional probabilities of the next word given different orders of histories
+* 'e' for equal weighted interpolation of unigram, bigram, trigram... probabilities
+* 'm' for using the maximum probability from all histories as P(next word | history)
+* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable
+**/
+void C_SuffixArrayLanguageModel::setParam_interpolationStrategy(char interpolationStrategy)
+{
+ this->interpolationStrategy = interpolationStrategy;
+}
+
+/**
+* Set the value for parameter: numberOfSentSeenToPurgeCache
+* The LM checks the cache for stale entries after this many sentences have been processed
+**/
+void C_SuffixArrayLanguageModel::setParam_numberOfSentSeenToPurgeCache(int numberOfSentSeenToPurgeCache)
+{
+ this->numberOfSentSeenToPurgeCache = numberOfSentSeenToPurgeCache;
+}
+
+/**
+* Set the value for parameter: freshTime
+* LM will purge the entries in the cache that have not been used in 'freshTime'
+**/
+void C_SuffixArrayLanguageModel::setParam_freshTime(long freshTime)
+{
+ this->freshTime = freshTime;
+}
+
+/**
+* Similar to the function in C_SuffixArrayScanningBase
+* Scan the corpus to obtain count of counts information
+* and construct the discounting map using Good-Turing smoothing
+* Also, estimate the Y, D1, D2, D3+ values as needed for the modified Kneser-Ney smoothing
+**/
+void C_SuffixArrayLanguageModel::constructDiscountingMap()
+{
+ unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting);
+ this->typeOfBigrams = 0;
+
+ if(countOfCountsTable==NULL){
+ cerr<<"Count of counts table can not be initialized. Exit\n";
+ exit(0);
+ }
+
+ for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){
+ countOfCountsTable[c]=0;
+ }
+
+
+ int i,j;
+ bool stillMeaningful = true;
+ TextLenType saPos=0;
+
+ while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+ TextLenType posInCorpus = this->suffix_list[saPos];
+ IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+ if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting
+
+			if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){	//n-grams starting with <s>, </s>, or <end of corpus> are not of interest
+
+ bool quit =false;
+ i=0;
+
+ while(!quit && (i<this->maxN)){
+ wordInCorpus = this->corpus_list[posInCorpus+i];
+ if(
+ (wordInCorpus<this->sentIdStart)&&
+ (wordInCorpus!=this->vocIdForSentEnd)&&
+ (wordInCorpus!=this->vocIdForSentStart)&&
+ (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match
+
+ this->nGramScanningList[i].freqSoFar++;
+ }
+					else{	//new (i+1)-grams and longer n-grams are coming; before that, check whether the count of counts should be increased for this n-gram type
+
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+
+
+
+ for(j=i;j<this->maxN;j++){
+
+
+ if(this->nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid
+ validNgramUpSoFar = false;
+ }
+
+						if(validNgramUpSoFar){	//update the count of counts (and the bigram type count) for this valid n-gram
+
+ if(j==1){ //a new bigram type, this information is important for KN-smoothing
+ this->typeOfBigrams++;
+ }
+
+
+ freqSoFar = this->nGramScanningList[j].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (j+1)-gram with freq freqSoFar
+ countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+
+						//finished updating, now reset the scanning list entry at position j
+ if((posInCorpus+j)<this->corpusSize){
+ wordInCorpus = this->corpus_list[posInCorpus+j];
+ }
+ else{
+ wordInCorpus = 0; //out of bound for corpus
+ }
+
+ if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+ wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
+ this->nGramScanningList[j].freqSoFar = 0;
+ }
+ else{
+ this->nGramScanningList[j].freqSoFar = 1;
+ }
+
+ this->nGramScanningList[j].vocId = wordInCorpus;
+ }
+
+					quit=true;	//the (i+1)-gram already does not match, no need to check longer n-grams
+ }
+
+ i++;
+ }
+ }
+ }
+ else{
+			stillMeaningful = false;	//once vocId is larger than or equal to sentIdStart, everything that follows in suffix order is <sentId>, not actual text
+ }
+
+ saPos++;
+ }
+
+ //at the end of corpus (according to suffix order)
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ for(i=0;i<this->maxN;i++){
+		if(this->nGramScanningList[i].vocId==0){	//invalid word
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){
+
+ if(i==1){
+ this->typeOfBigrams++;
+ }
+
+ freqSoFar = this->nGramScanningList[i].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (i+1)-gram with freq freqSoFar
+ countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+ }
+
+ //now, use Good-Turing discounting to create frequency mapping
+	//still assign an N*Freq table for simplicity, even though for each N only maxFreq-1 frequency types will be discounted
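+	//Good-Turing maps an observed count r to r* = (r+1) * n_{r+1} / n_r, where n_r is the
+	//number of n-gram types occurring exactly r times; freq below is 0-based, so r = freq+1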
+ this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting);
+
+ for(i=0;i<this->maxN;i++){
+ //for (i+1)-gram
+
+ unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
+ double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting;
+
+ for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq
+ //for all (freq+1) ngrams
+ if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists
+ discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]);
+ }
+ else{
+ discountingMapForThisN[freq] = -1;
+ }
+ }
+
+ discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency
+ }
+
+
+ //estimate the Y, D1, D2 and D3+ values for each order of n.
+ //these values will be used for KN-smoothing to estimate the gamma, the discounting factor
+ this->Y = (double *) malloc(sizeof(double) * this->maxN);
+ this->D1 = (double *) malloc(sizeof(double) * this->maxN);
+ this->D2 = (double *) malloc(sizeof(double) * this->maxN);
+ this->D3plus = (double *) malloc(sizeof(double) * this->maxN);
+
+ for(i=0;i<this->maxN;i++){
+ unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
+ double n1 = ccTableForThisN[0]; //number of n-gram types that have freq equals 1
+ double n2 = ccTableForThisN[1]; //number of n-gram types that have freq equals 2;
+ double n3 = ccTableForThisN[2]; //number of n-gram types that have freq equals 3;
+ double n4 = ccTableForThisN[3]; //number of n-gram types that have freq equals 4;
+
+ this->Y[i] = n1/(n1+2*n2); //for (i+1)-gram
+ this->D1[i] = 1-2*Y[i]*n2/n1;
+ this->D2[i] = 2-3*Y[i]*n3/n2;
+ this->D3plus[i] = 3 - 4*Y[i]*n4/n3;
+ }
+
+ free(countOfCountsTable);
+}
+
+///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1]
+///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord
+///only need to get freq(w_n | history) of different history
+///return in freq table, freq(history+Wn, history) for all the matched n
+///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history
+/// 3-gram freq, freq of 3-gram history
+///freqTable should have length of 2*n
+///return the longest match with this updated n-gram
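+///e.g. with maxN=3, predicting w3 after the matched history "w1 w2", the table reads:
+///  freqTable[0]=freq(w3), freqTable[1]=corpusSize,
+///  freqTable[2]=freq(w2 w3), freqTable[3]=freq(w2),
+///  freqTable[4]=freq(w1 w2 w3), freqTable[5]=freq(w1 w2)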
+void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+ vector<IndexType> nGram;
+
+ if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk>
+ if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram
+ currentMatchStart++;
+ currentMatchLen--;
+ }
+
+ for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
+ nGram.push_back(this->corpus_list[pos]);
+ }
+ }
+
+ nGram.push_back(nextWord);
+
+ int sentLen = nGram.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);
+
+ int startPosForNgram;
+ int startPosForLongestMatchingWithNextWord;
+ int cellIndexForLongestMatchingWithNextWord;
+
+ bool stillMatched = true;
+ bool atLeastOneMatched = false;
+
+ int indexForNgram;
+
+ unsigned int totalOccurrences;
+ unsigned int totalOccurrencesOfHistory;
+
+ //for unigram
+ indexForNgram = sentLen - 1;
+ if(table[indexForNgram].found){
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ if(this->smoothingStrategy=='g'){ //if use Good-Turing for discounting
+ freqTable[0] = this->discountFreq_GT(1, totalOccurrences);
+ }
+ else{
+ freqTable[0] = totalOccurrences;
+ }
+
+ freqTable[1] = this->corpusSize;
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = sentLen-1;
+ atLeastOneMatched = true;
+ }
+ else{
+ stillMatched = false;
+ }
+
+ int n=2; //considering 2-gram and longer n-gram now
+ startPosForNgram = sentLen - 2;
+ while((stillMatched)&&(startPosForNgram>=0)){
+
+ indexForNgram = (n-1) * sentLen + startPosForNgram;
+ int indexForHistory = (n-2) * sentLen + startPosForNgram;
+
+ if(table[indexForNgram].found){
+
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;
+
+
+ if(this->applyDiscounting){
+ freqTable[2*n-2] = this->discountFreq_GT(n, totalOccurrences);
+ }
+ else{
+ freqTable[2*n-2] = (double)totalOccurrences;
+ }
+
+ freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history
+
+			if(n<this->maxN){	//the new history is at most this->maxN-1 words long
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = startPosForNgram;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ startPosForNgram--;
+ n++;
+ }
+
+ if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord'
+ updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
+ updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
+ }
+ else{
+ updatedMatchingStart = (TextLenType) -1;
+ updatedMatchingLen = 0;
+ }
+
+ free(table);
+
+}
+
+
+void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, S_ContextTypeInfo * contextTypeInfo, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+ vector<IndexType> nGram;
+
+ if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk>
+ if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram
+ currentMatchStart++;
+ currentMatchLen--;
+ }
+
+ for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
+ nGram.push_back(this->corpus_list[pos]);
+ }
+ }
+
+ nGram.push_back(nextWord);
+
+ int sentLen = nGram.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);
+
+ int startPosForNgram;
+ int startPosForLongestMatchingWithNextWord;
+ int cellIndexForLongestMatchingWithNextWord;
+
+ bool stillMatched = true;
+ bool atLeastOneMatched = false;
+
+ int indexForNgram;
+
+ unsigned int totalOccurrences;
+ unsigned int totalOccurrencesOfHistory;
+
+ //for unigram
+ indexForNgram = sentLen - 1;
+ if(table[indexForNgram].found){
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+
+ freqTable[0] = totalOccurrences;
+ freqTable[1] = this->corpusSize;
+
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = sentLen-1;
+ atLeastOneMatched = true;
+ }
+ else{
+ stillMatched = false;
+ }
+
+ int n=2; //considering 2-gram and longer n-gram now for token freq
+ startPosForNgram = sentLen - n;
+ while((stillMatched)&&(startPosForNgram>=0)){
+
+ indexForNgram = (n-1) * sentLen + startPosForNgram;
+ int indexForHistory = (n-2) * sentLen + startPosForNgram;
+
+ if(table[indexForNgram].found){
+
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;
+
+
+ freqTable[2*n-2] = (double)totalOccurrences;
+ freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history
+
+			if(n<this->maxN){	//the new history is at most this->maxN-1 words long
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = startPosForNgram;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ startPosForNgram--;
+ n++;
+ }
+
+ if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord'
+ updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
+ updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
+ }
+ else{
+ updatedMatchingStart = (TextLenType) -1;
+ updatedMatchingLen = 0;
+ }
+
+
+ //estimate the context type information which will be used for KN-smoothing
+ for(n=2;n<=sentLen;n++){
+ startPosForNgram = sentLen - n;
+ TextLenType w_in2_i1_startPos_in_SA = 0;
+ TextLenType w_in2_i1_endPos_in_SA = 0;
+
+ if(n>2){
+ int indexForW_in2_i1 = (n-3) * sentLen + startPosForNgram + 1; //the location information for w_{i-n+2}^{i-1} of length n-2
+ w_in2_i1_startPos_in_SA = table[indexForW_in2_i1].startPosInSA;
+ w_in2_i1_endPos_in_SA = table[indexForW_in2_i1].endingPosInSA;
+ }
+
+ int indexForW_in1_i1 = (n-2) * sentLen + startPosForNgram; //the location information of w_{i-n+1}^{i-1} of length n-1
+
+ this->scanCorpusForContextTypeInfo(n, nextWord,
+ w_in2_i1_startPos_in_SA, w_in2_i1_endPos_in_SA,
+ table[indexForW_in1_i1].startPosInSA, table[indexForW_in1_i1].endingPosInSA,
+ contextTypeInfo[n-1]);
+ }
+
+ free(table);
+
+
+}
+
+///given observedFreq of n-gram, return discounted freq using Good-Turing smoothing
+double C_SuffixArrayLanguageModel::discountFreq_GT(int n, unsigned int observedFreq)
+{
+ if(n>=this->maxN){ //do not discount
+ return (double) observedFreq;
+ }
+
+ if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq
+ return (double) observedFreq;
+ }
+
+ //else, check the discount map
+ double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1];
+
+ if(discountedFreq>0){
+ return discountedFreq;
+ }
+
+ //else, no discounting
+ return (double) observedFreq;
+}
+
+
+///Start a new sentence now, clear up the sentence LM state
+///Increase the count of 'sentenceProcessedSoFar'
+///If LM has processed 'numberOfSentSeenToPurgeCache' sentences
+///it is time to check if old entries in the cache should be cleaned
+LMState C_SuffixArrayLanguageModel::beginOfSentenceState()
+{
+ long currentTime;
+ time(&currentTime);
+
+ this->resetLmStates();
+ this->initialLmState();
+
+ this->sentenceProcessedSoFar++;
+
+ if(this->sentenceProcessedSoFar==this->numberOfSentSeenToPurgeCache){
+ //purge the cache
+ this->purgeCache(currentTime-this->freshTime);
+
+ this->sentenceProcessedSoFar = 0;
+ }
+
+ return 0;
+}
+
+void C_SuffixArrayLanguageModel::initialLmState()
+{
+ //add sentence start
+ S_LMStateInfo sentStartNode;
+ sentStartNode.posInCorpus = 1; //if corpus is indexed correctly position 1 should be <s>
+ sentStartNode.len = 1;
+
+ this->allLMStates.push_back(sentStartNode);
+ this->lmStateInfo2Id.insert(make_pair(sentStartNode, 0));
+}
+
+void C_SuffixArrayLanguageModel::resetLmStates()
+{
+ this->buffer.clear();
+ this->allLMStates.clear();
+ this->lmStateInfo2Id.clear();
+}
+
+/**
+* Purge entries in the cache that are not visited after "lastVisitedTime"
+* @param lastVisitedTime Entries in the cache that are older than 'lastVisitedTime' parameter will be purged
+**/
+void C_SuffixArrayLanguageModel::purgeCache(long lastVisitedTime)
+{
+ //cerr<<this->cached_sa_access.size()<<" entries in cache, purged to ";
+
+ map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key>::iterator iter1,iter2;
+
+ iter1 = this->cached_sa_access.begin();
+
+ while(iter1!=this->cached_sa_access.end()){
+ iter2=iter1;
+ iter2++;
+
+ if(iter1->second.lastTimedUsed<lastVisitedTime){
+ this->cached_sa_access.erase(iter1);
+ }
+
+ iter1=iter2;
+ }
+ //cerr<<this->cached_sa_access.size()<<" entries"<<endl;
+}
+
+/**
+* Given the current history (as represented by the 'lmState'),
+* calculate the log prob of nextWord given this history, P(nextWord|history),
+* and return the updated language model state with next word appended to the history
+* @param lmState Current language model state
+* @param nextWord The vocId of the next word (the word to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState)
+{
+
+ //first check if we have already seen this before
+ map< pair<LMState, IndexType>, S_BufferedLmInfo>::iterator iterBuffer;
+ iterBuffer = this->buffer.find( make_pair( lmState, nextWord) );
+
+ if(iterBuffer==this->buffer.end()){ //we haven't seen this lmState+word yet
+ //search for it in the corpus
+ S_LMStateInfo lmStateInfo = this->allLMStates[lmState];
+ TextLenType updatedMatchingStart;
+ unsigned char updatedMatchingLen;
+
+ double logProb = this->logProbOfNgramFromCorpusInfo(lmStateInfo.posInCorpus, lmStateInfo.len, nextWord, updatedMatchingStart, updatedMatchingLen);
+
+
+ S_LMStateInfo updatedLmStateInfo;
+ updatedLmStateInfo.posInCorpus = updatedMatchingStart;
+ updatedLmStateInfo.len = updatedMatchingLen;
+
+ int updatedLmStateId;
+ map<S_LMStateInfo, int, lt_lmStateInfo>::iterator iterLmStateInfo2Id;
+ iterLmStateInfo2Id = this->lmStateInfo2Id.find(updatedLmStateInfo);
+ if(iterLmStateInfo2Id==this->lmStateInfo2Id.end()){ //this updated lm state does not exist yet
+ this->allLMStates.push_back(updatedLmStateInfo);
+ updatedLmStateId = this->allLMStates.size()-1;
+ this->lmStateInfo2Id.insert(make_pair(updatedLmStateInfo, updatedLmStateId));
+ }
+ else{
+ updatedLmStateId = iterLmStateInfo2Id->second;
+ }
+
+ //buffer this
+ S_BufferedLmInfo bufferedLmInfo;
+ bufferedLmInfo.logProb = logProb;
+ bufferedLmInfo.nextState = updatedLmStateId;
+
+ this->buffer.insert(make_pair( make_pair(lmState, nextWord), bufferedLmInfo));
+
+ //updated next state
+ nextState = updatedLmStateId;
+
+ return logProb;
+ }
+
+ nextState = iterBuffer->second.nextState;
+
+ return iterBuffer->second.logProb;
+}
+
+
+/**
+* Given the history as lmState, append a phrase given as a vector of IndexType,
+* calculate the LM prob and update the lm state
+* @param lmState Current language model state
+* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState)
+{
+ double logProb = 0;
+ for(int i=0;i<phrase.size();i++){
+ logProb+=this->logProb(lmState, phrase[i], nextState);
+ lmState = nextState;
+ }
+
+ return logProb;
+}
+
+/**
+* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s>
+**/
+double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState)
+{
+ LMState dummyNextState;
+ return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState);
+}
+
+/**
+* Extend the currently matched n-gram with the next word, calculate the prob and return the updated matching range
+* the n-gram is represented by its position in the suffix array and the length
+* @param currentMatchStart Starting position of the current matched n-gram in corpus
+* @param currentMatchLen Length of the matched n-gram
+* @param nextWord Vocabulary ID of the next word (the word to be predicted)
+* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus
+* @param &updatedMatchingLen The length of the extended n-gram
+**/
+double C_SuffixArrayLanguageModel::logProbOfNgramFromCorpusInfo(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+ long currentTime;
+ time(&currentTime);
+
+ double logProb;
+
+ //first check if information is already in cache
+ S_CachedSA_Access_Key accessKey;
+ accessKey.currentMatchStart = currentMatchStart;
+ accessKey.currentMatchLen = currentMatchLen;
+ accessKey.nextWord = nextWord;
+
+ map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key>::iterator iter_cached_sa_access;
+
+ iter_cached_sa_access = this->cached_sa_access.find(accessKey);
+
+ if(iter_cached_sa_access==this->cached_sa_access.end()){ //information not in cache yet
+ double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN));
+ memset(freqTable, 0, 2*this->maxN*sizeof(double));
+
+ S_ContextTypeInfo * contextTypeInfo = (S_ContextTypeInfo *) malloc(sizeof(S_ContextTypeInfo)*this->maxN);
+
+ switch(this->smoothingStrategy){
+ case 'k': //for Modified Kneser-Ney smoothing
+
+ this->calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, contextTypeInfo, updatedMatchingStart, updatedMatchingLen);
+ logProb = this->calcLogProb_kneserNeySmoothing(freqTable, contextTypeInfo);
+ break;
+ default: //all other cases including 'g' (Good-Turing smoothing)
+ this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen);
+ logProb = this->calcLogProb(freqTable);
+ }
+
+ free(freqTable);
+ free(contextTypeInfo);
+
+ //insert the info into the cache
+ S_Cached_SA_Access_Info accessInfo;
+ accessInfo.updatedMatchingStart = updatedMatchingStart;
+ accessInfo.updatedMatchingLen = updatedMatchingLen;
+ accessInfo.logProb = logProb;
+ accessInfo.lastTimedUsed = currentTime;
+
+ this->cached_sa_access.insert(make_pair(accessKey, accessInfo));
+
+ return logProb;
+ }
+
+	//otherwise, the entry already exists in the cache; update its last-touched time and reuse it
+	iter_cached_sa_access->second.lastTimedUsed = currentTime;
+	updatedMatchingStart = iter_cached_sa_access->second.updatedMatchingStart;
+	updatedMatchingLen = iter_cached_sa_access->second.updatedMatchingLen;
+	logProb = iter_cached_sa_access->second.logProb;
+
+ return logProb;
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb(double *freq)
+{
+ switch(this->interpolationStrategy){
+ case 'e':
+ return this->calcLogProb_equalWeightedInterpolation(freq);
+ break;
+ case 'i':
+ return this->calcLogProb_ibmHeuristicInterpolation(freq);
+ break;
+ case 'm':
+ return this->calcLogProb_maxProbInterpolation(freq);
+ break;
+ default:
+ cerr<<"Unknown interpolation strategy!\n";
+ exit(0);
+ }
+}
+
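+/**
+* Equal-weight interpolation: P(w|h) is the average of the matched-order MLE estimates,
+* P(w|h) = (1/maxN) * sum over matched n of freq(n-gram)/freq(history);
+* orders that do not match contribute 0 to the sum
+**/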
+double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq)
+{
+ double prob = 0.0;
+
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ prob+=freq[2*i]/freq[2*i+1];
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(prob/(double)this->maxN);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
+
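+/**
+* IBM-style deleted interpolation heuristic: walk from the longest matched order down,
+* giving each order the weight remainingWeight * (0.3 + 0.1*min(log(historyFreq), 1)),
+* so higher-order estimates dominate when their histories are frequent
+**/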
+double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq)
+{
+ double prob = 0.0;
+ if(freq[0]==0){ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+
+ double remainingWeightSum = 1.0;
+
+ //find the first non-zero match
+ int i = this->maxN - 1;
+
+ while(freq[2*i]==0){ //will stop for sure because freq[0]!=0
+ i--;
+ }
+
+ for(int j=i;j>=0;j--){
+ //for (j+1)-gram
+ double historyFreq = freq[2*j+1];
+ double logHistoryFreq = log(historyFreq);
+ if(logHistoryFreq>1){
+ logHistoryFreq = 1.0; //cap it to 1
+ }
+
+ double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history
+ double adjustedWeights = remainingWeightSum * reliability;
+
+		prob+=adjustedWeights * freq[2*j]/freq[2*j+1];
+
+ remainingWeightSum -= adjustedWeights;
+ }
+
+ return log(prob);
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq)
+{
+ double maxProb = 0.0;
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ double prob=freq[2*i]/freq[2*i+1];
+
+ if(prob>maxProb){
+ maxProb = prob;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(maxProb);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
+
+/**
+* Follows the implementation described on page 23 of the Chen & Goodman tech report (sections 4.1.6 and 4.1.7)
+* Use notation described in James 2000 pp3 for MODKN-COUNT
+**/
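+// For reference, a sketch of the intended highest-order term of interpolated
+// modified Kneser-Ney (Chen & Goodman 1998), which this stub does not compute yet:
+//   P_KN(w_i | w_{i-n+1}^{i-1}) = (c(w_{i-n+1}^i) - D(c)) / c(w_{i-n+1}^{i-1})
+//                                 + gamma(w_{i-n+1}^{i-1}) * P_KN(w_i | w_{i-n+2}^{i-1})
+// with D(c) in {D1, D2, D3plus} chosen by c, and
+//   gamma(h) = (D1*N1(h .) + D2*N2(h .) + D3plus*N3plus(h .)) / c(h)
+// using the D1, D2, D3plus values estimated in constructDiscountingMap()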
+double C_SuffixArrayLanguageModel::calcLogProb_kneserNeySmoothing(double *freq, S_ContextTypeInfo * contextTypeFreq)
+{
+	//TODO: this branch is unfinished (see Readme.txt): the interpolated modified-KN
+	//computation over freq[] and contextTypeFreq[] has not been implemented yet,
+	//so every word currently falls back to the <unk> probability
+	(void) freq;
+	(void) contextTypeFreq;
+
+	//unknown word
+ return SALM_LOG_PROB_UNK;
+}
+
+
+IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord)
+{
+ return this->voc->returnId(aWord);
+}
+
+
+/**
+* Scan corpus to collect important context-type information needed for KN-smoothing
+* Knowing where n-gram w_(i-n+2)^(i-1) occurs, scan corpus for N_{1+}(dot w_{i-n+2}^i)
+* and N_{1+}(dot w_{i-n+2}^{i-1} dot)
+* Also, collect type freq of n-grams w_{i-n+1}^{i-1} that occur exactly 1, 2 and 3+ times
+* to estimate the discounting factor gamma
+*
+* @see Chen & Goodman 1998 page 19-20 for detailed description
+*
+* @param n order of n-gram
+* @param w_i VocId of w<sub>i</sub>, the next word to be predicted
+* @param leftBoundaryOfSaRangeFor_w_in2_i1
+* @param rightBoundaryOfSaRangeFor_w_in2_i1 [leftBoundaryOfSaRangeFor_w_in2_i1, rightBoundaryOfSaRangeFor_w_in2_i1] is the range of suffix array positions that correspond to the locations of phrase w<sub>i-n+2</sub><sup>i-1</sup>
+* @param leftBoundaryOfSaRangeFor_w_in1_i1
+* @param rigthBoundaryOfSaRangeFor_w_in1_i1 [leftBoundaryOfSaRangeFor_w_in1_i1, rigthBoundaryOfSaRangeFor_w_in1_i1] is the range of suffix array positions that correspond to the locations of phrase w<sub>i-n+1</sub><sup>i-1</sup>
+* @param &result Returns the S_ContextTypeInfo containing the context type information
+**/
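+// Notation gloss: N_{1+}(. w) is the number of distinct word types observed immediately
+// before w in the corpus; a trailing "." ranges over the following word analogously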
+void C_SuffixArrayLanguageModel::scanCorpusForContextTypeInfo(int n, IndexType w_i, TextLenType leftBoundaryOfSaRangeFor_w_in2_i1, TextLenType rightBoundaryOfSaRangeFor_w_in2_i1, TextLenType leftBoundaryOfSaRangeFor_w_in1_i1, TextLenType rigthBoundaryOfSaRangeFor_w_in1_i1, S_ContextTypeInfo & result)
+{
+
+ TextLenType i;
+ TextLenType posInCorpus;
+ IndexType nextWordInCorpus;
+ int n1 = n-1; //this value will be used frequently here
+
+ //first scan the corpus for all the word types that follow w_{i-n+1}^{i-1}
+ //to collect N1(w_in1^i1 dot) N2, and N3+ info needed
+ result.N1_w_in1_i1_dot = 0;
+ result.N2_w_in1_i1_dot = 0;
+ result.N3plus_w_in1_i1_dot = 0;
+
+ int freqOfCurrentType = -1; //freq of 'dot' with current type
+ IndexType currentNextWordType = 0;
+ for(i=leftBoundaryOfSaRangeFor_w_in1_i1;i<=rigthBoundaryOfSaRangeFor_w_in1_i1;i++){
+ posInCorpus = this->suffix_list[i] + n1;
+ //suffix_list[i] is the position of w_{i-n+1} in the corpus
+		//suffix_list[i]+n-1 is the position of the word (the "dot" in the equation) that follows w_{i-n+1}^{i-1}
+
+ nextWordInCorpus = this->corpus_list[posInCorpus];
+ freqOfCurrentType++;
+ if(nextWordInCorpus!=currentNextWordType){
+
+ if(freqOfCurrentType==1){
+ result.N1_w_in1_i1_dot++;
+ }
+ else if(freqOfCurrentType==2){
+ result.N2_w_in1_i1_dot++;
+ }
+			else if(freqOfCurrentType>=3){	//freq of this type is >=3; the initial sentinel type (vocId 0, freq 0) must not be counted
+				result.N3plus_w_in1_i1_dot++;
+			}
+
+ currentNextWordType = nextWordInCorpus;
+ freqOfCurrentType=0;
+ }
+ }
+
+ //for the last type in the range
+ freqOfCurrentType++;
+
+ if(freqOfCurrentType==1){
+ result.N1_w_in1_i1_dot++;
+ }
+ else if(freqOfCurrentType==2){
+ result.N2_w_in1_i1_dot++;
+ }
+	else if(freqOfCurrentType>=3){	//freq of this type is >=3; an empty range (freq 0) is not counted
+		result.N3plus_w_in1_i1_dot++;
+	}
+
+
+ //step 2, scan the corpus for N_{1+}(dot w_{i-n+2}^{i}) and N_{1+}(dot w_{i-n+2}^{i-1} dot)
+ IndexType precedingWord;
+ IndexType followingWord;
+ if(n==2){ //the special case
+ result.N1plus_dot_w_in2_i1_dot = this->typeOfBigrams;
+
+ //check if we have the N_1+(dot w_i) information already
+ map<IndexType, unsigned int>::iterator iterTypeFreqPrecedingWord;
+ iterTypeFreqPrecedingWord = this->typeFreqPrecedingWord.find(w_i);
+
+ if(iterTypeFreqPrecedingWord==this->typeFreqPrecedingWord.end()){ //does not exist yet
+ TextLenType startPosInSA = this->level1Buckets[w_i].first;
+ TextLenType endPosInSA = this->level1Buckets[w_i].last;
+
+ set<IndexType> wordTypePrecedesW_i;
+ for(i=startPosInSA;i<=endPosInSA;i++){
+ posInCorpus = this->suffix_list[i] - 1;
+ precedingWord = this->corpus_list[posInCorpus];
+
+ wordTypePrecedesW_i.insert(precedingWord);
+ }
+
+ result.N1plus_dot_w_in2_i = (double) wordTypePrecedesW_i.size();
+
+ //and save this for future references
+ this->typeFreqPrecedingWord.insert(make_pair(w_i, wordTypePrecedesW_i.size()));
+ }
+ else{ //already has the information in typeFreqPrecedingWord
+ result.N1plus_dot_w_in2_i = (double) (iterTypeFreqPrecedingWord->second);
+ }
+ }
+ else{
+ set<IndexType> wordTypesPrecedesW_in2_i;
+ set< pair<IndexType, IndexType> > wordTypesSurroundW_in2_i1;
+
+ for(i=leftBoundaryOfSaRangeFor_w_in2_i1;i<=rightBoundaryOfSaRangeFor_w_in2_i1;i++){
+ posInCorpus = this->suffix_list[i] -1; //pos of preceding word (w_{i-n+1}) in the corpus
+ precedingWord = this->corpus_list[posInCorpus];
+
+ posInCorpus+=n1; //pos of following word w_i in the corpus
+ followingWord = this->corpus_list[posInCorpus];
+
+ pair<IndexType, IndexType> tmpPair = make_pair(precedingWord, followingWord);
+
+ //if w_i equals next word, add the preceding word to set
+ if(followingWord==w_i){
+ wordTypesPrecedesW_in2_i.insert(precedingWord);
+ }
+
+ //add the pair to set
+ wordTypesSurroundW_in2_i1.insert(tmpPair);
+
+ }
+
+
+ result.N1plus_dot_w_in2_i = wordTypesPrecedesW_in2_i.size();
+ result.N1plus_dot_w_in2_i1_dot = wordTypesSurroundW_in2_i1.size();
+ }
+
+ result.valid = true;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h
new file mode 100755
index 0000000..9f9155a
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h
@@ -0,0 +1,210 @@
+#if ! defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__)
+#define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__
+
+
+#include "_SuffixArraySearchApplicationBase.h"
+#include "salm_shared.h"
+#include "time.h"
+
+/**
+* \ingroup lm
+* Context type information needed in KN-smoothing
+**/
+typedef struct s_contextTypeInfo{
+ double N1plus_dot_w_in2_i; //Goodman and Chen 98, eq 23
+ double N1plus_dot_w_in2_i1_dot;
+ double N1_w_in1_i1_dot; //Goodman and Chen 98, eq 19
+ double N2_w_in1_i1_dot;
+ double N3plus_w_in1_i1_dot;
+ bool valid;
+}S_ContextTypeInfo;
+
+
+/**
+* \ingroup lm
+**/
+typedef unsigned int LMState;
+
+
+/**
+* \ingroup lm
+**/
+typedef struct s_lmStateInfo{
+ TextLenType posInCorpus;
+ unsigned char len;
+}S_LMStateInfo;
+
+/**
+* \ingroup lm
+**/
+typedef struct s_bufferedLmInfo{
+ int nextState;
+ double logProb;
+}S_BufferedLmInfo;
+
+
+/**
+* \ingroup lm
+**/
+struct lt_lmStateInfo
+{
+ bool operator()(S_LMStateInfo a, S_LMStateInfo b) const{
+ if(a.posInCorpus<b.posInCorpus){
+ return true;
+ }
+
+ if(a.posInCorpus>b.posInCorpus){
+ return false;
+ }
+
+ if(a.len<b.len){
+ return true;
+ }
+
+ return false;
+ }
+};
+
+
+/**
+* \ingroup lm
+* structure for elements in the cache for accessing the suffix array for LM prob
+**/
+typedef struct s_cached_SA_access_key{
+ TextLenType currentMatchStart;
+ unsigned char currentMatchLen;
+ IndexType nextWord;
+}S_CachedSA_Access_Key;
+
+typedef struct s_cached_SA_access_info{
+ TextLenType updatedMatchingStart;
+ unsigned char updatedMatchingLen;
+ double logProb;
+ long lastTimedUsed;
+}S_Cached_SA_Access_Info;
+
+struct lt_s_cached_SA_access_key
+{
+ bool operator()(S_CachedSA_Access_Key a, S_CachedSA_Access_Key b) const{
+ if(a.currentMatchStart<b.currentMatchStart){
+ return true;
+ }
+
+ if(a.currentMatchStart>b.currentMatchStart){
+ return false;
+ }
+
+ if(a.currentMatchLen<b.currentMatchLen){
+ return true;
+ }
+
+ if(a.currentMatchLen>b.currentMatchLen){
+ return false;
+ }
+
+ if(a.nextWord<b.nextWord){
+ return true;
+ }
+
+ return false;
+ }
+};
+
+
+/**
+* \ingroup lm
+* C_SuffixArrayLanguageModel inherits the C_SuffixArraySearchApplicationBase class (and adapts the scanning functionality of C_SuffixArrayScanningBase)
+* to provide functionality for estimating the likelihood of a sentence given an indexed training corpus
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase
+{
+
+public:
+ IndexType returnVocId(C_String aWord);
+
+ /// At the beginning of a sentence, return the LMState and reset the cache
+ LMState beginOfSentenceState();
+
+ /// Calculate the log prob of a word predicted by the history LM state
+ double logProb(LMState lmState, IndexType nextWord, LMState & nextState);
+
+ /// The log prob of a phrase extending the history as a LMState
+ double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState);
+
+ /// End of sentence
+ double logProbEnd(LMState lmState);
+
+ /// Constructors
+ C_SuffixArrayLanguageModel(const char * cfgFileName);
+ C_SuffixArrayLanguageModel();
+ ~C_SuffixArrayLanguageModel();
+
+
+private:
+ void scanCorpusForContextTypeInfo(int n, IndexType w_i, TextLenType leftBoundaryOfSaRangeFor_w_in2_i1, TextLenType rightBoundaryOfSaRangeFor_w_in2_i1, TextLenType leftBoundaryOfSaRangeFor_w_in1_i1, TextLenType rigthBoundaryOfSaRangeFor_w_in1_i1, S_ContextTypeInfo & result);
+
+ void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
+ void calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, S_ContextTypeInfo * contextTypeInfo, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
+
+ //Log prob calculation
+ double logProbOfNgramFromCorpusInfo(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
+ double calcLogProb(double *freq);
+ double calcLogProb_equalWeightedInterpolation(double *freq);
+ double calcLogProb_ibmHeuristicInterpolation(double *freq);
+ double calcLogProb_maxProbInterpolation(double * freq);
+ double calcLogProb_kneserNeySmoothing(double *freq, S_ContextTypeInfo * contextTypeFreq);
+
+ ///parameter and settings
+	///set the interpolation strategy
+ void setParam_interpolationStrategy(char interpolationStrategy);
+
+ ///set the number of sentences processed by the LM before purging the cache
+ void setParam_numberOfSentSeenToPurgeCache(int numberOfSentSeenToPurgeCache);
+
+ ///set the fresh time thresh for the cache entries
+ void setParam_freshTime(long freshTime);
+
+ char smoothingStrategy;
+ char interpolationStrategy;
+ int maxN;
+ IndexType vocIdForSentStart;
+ IndexType vocIdForSentEnd;
+ IndexType vocIdForCorpusEnd;
+
+
+ ///Discounting
+ void constructDiscountingMap();
+ double discountFreq_GT(int n, unsigned int observedFreq);
+
+ double * Y; // following the notation of Chen&Goodman 98, Eq. 26
+ double * D1;
+ double * D2;
+ double * D3plus;
+ double typeOfBigrams; //will be needed for KN-smoothing
+
+ double *discountingMap;
+ bool applyDiscounting;
+ int maxFreqForDiscounting;
+ S_nGramScanningInfoElement * nGramScanningList;
+ map<IndexType, unsigned int> typeFreqPrecedingWord;
+
+ ///LM State and related functions
+ void resetLmStates();
+ void initialLmState();
+ map< pair<LMState, IndexType>, S_BufferedLmInfo> buffer;
+ vector<S_LMStateInfo> allLMStates;
+ map<S_LMStateInfo, int, lt_lmStateInfo> lmStateInfo2Id;
+
+ //caching information for SA access
+ unsigned int sentenceProcessedSoFar;
+ long freshTime;
+ unsigned int numberOfSentSeenToPurgeCache;
+ map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key> cached_sa_access;
+ void purgeCache(long lastVisitedTime);
+
+};
+
+#endif
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp
new file mode 100755
index 0000000..0a94ff0
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp
@@ -0,0 +1,691 @@
+/**
+* Revision $Rev: 3815 $
+* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $
+**/
+
+#include "_SuffixArrayLanguageModel.h"
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include <memory.h>
+#include <cstring>
+
+#include "math.h"
+
+using namespace std;
+
+
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel()
+{
+
+}
+
+C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel()
+{
+
+}
+
+
+/**
+* Construct the suffix array language model object
+* Using the training data corpusFileNameStem that has been indexed by IndexSA
+* Consider at most maxN-gram in language modeling
+* For frequencies that are lower than maxFreqForDiscounting, use Good-Turing for discounting
+* If maxFreqForDiscounting is set to be 0 or negative value, then discounting is turned off. Use MLE to estimate the probability of a word given history
+* @param cfgFileName Configuration file that specifies the value of parameters for SALM
+*
+* Each line in the configuration file is a Keyword Value pair. Legal keywords are:
+* CORPUS : corpusFileNameStem The training corpus filename used by IndexSA. Must be specified!
+* N : Highest order of n considered for n-gram LM estimation, default value = 5
+* MAX_FREQ_DISC : When Good-Turing discounting is used, n-grams which have frequencies higher than this value will not be discounted. Negative value will disable the discounting. default value = -1.
+* INTERPOLATION_STRATEGY : Set the strategy to interpolate the conditional probabilities of the next word given different orders of histories
+*							'e' default. Equal weighted interpolation of unigram, bigram, trigram... probabilities
+*							'm' for using the maximum probability from all histories as P(next word | history)
+* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable
+**/
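+// A minimal example configuration (the corpus filename is hypothetical; the other
+// values are the documented defaults):
+//
+//	CORPUS	trainingCorpus.txt
+//	N	5
+//	MAX_FREQ_DISC	-1
+//	INTERPOLATION_STRATEGY	e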
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName)
+{
+
+ fstream cfgFile;
+ cfgFile.open(cfgFileName,ios::in);
+
+ if(!cfgFile){
+ fprintf(stderr,"Configuration file does not exist! quit!!\n");
+ exit(0);
+ }
+
+ //-----------------------------------------------------------------------------
+ //reading parameters
+ char paraName[1024];
+ char corpusFileNameStem[1024];
+ corpusFileNameStem[0]=0;
+ this->maxFreqForDiscounting=-1;
+
+ this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob
+ this->maxN = 5; // default value; consider up to 5 words
+
+ while(!cfgFile.eof()){
+ cfgFile>>paraName;
+
+ if(strcmp(paraName,"CORPUS")==0){
+ cfgFile>>corpusFileNameStem;
+ }
+ else if(strcmp(paraName,"N")==0){
+ cfgFile>>this->maxN;
+ }
+ else if(strcmp(paraName,"MAX_FREQ_DISC")==0){
+ cfgFile>>maxFreqForDiscounting;
+ }
+ else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){
+ cfgFile>>this->interpolationStrategy;
+ }
+
+ paraName[0]=0;
+
+ }
+
+ //load corpus and suffix array
+ if(strlen(corpusFileNameStem)==0){
+		cerr<<"CORPUS needs to be specified in the configuration file. This should be the corpus name used for LM.\n";
+ exit(-1);
+ }
+	this->loadData_forSearch(corpusFileNameStem, false, true);	//load the corpus, its suffix array and the vocabulary via the super class (no offset)
+
+
+ //if apply discounting construct the discounting map
+ if(this->maxFreqForDiscounting<=0){
+ this->applyDiscounting = false;
+ }
+ else{
+ this->applyDiscounting = true;
+ this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map
+ }
+
+ //get vocID for sentEnd
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+ if(this->vocIdForSentEnd==0){
+ cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+}
+
+
+/**
+* Similar to the function in C_SuffixArrayScanningBase
+* Scan the corpus to obtain count of counts information
+* and construct the discounting map using Good-Turing smoothing
+**/
+void C_SuffixArrayLanguageModel::constructDiscountingMap()
+{
+ int i,j;
+ unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting);
+
+ if(countOfCountsTable==NULL){
+ cerr<<"Count of counts table can not be initialized. Exit\n";
+ exit(0);
+ }
+
+ //initialize count of counts table
+ for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){
+ countOfCountsTable[c]=0;
+ }
+
+ //initialize the scanning list
+ S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+ for(i=0;i<this->maxN;i++){
+ nGramScanningList[i].freqSoFar=0;
+ nGramScanningList[i].vocId = 0;
+ nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output
+ }
+
+ bool stillMeaningful = true;
+ TextLenType saPos=0;
+
+ while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+ TextLenType posInCorpus = this->suffix_list[saPos];
+ IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+ if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting
+
+			if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){	//n-grams starting with <s>, </s>, or <end of corpus> are not of interest
+
+ bool quit =false;
+ i=0;
+
+ while(!quit && (i<this->maxN)){
+ wordInCorpus = this->corpus_list[posInCorpus+i];
+ if(
+ (wordInCorpus<this->sentIdStart)&&
+ (wordInCorpus!=this->vocIdForSentEnd)&&
+ (wordInCorpus!=this->vocIdForSentStart)&&
+ (wordInCorpus==nGramScanningList[i].vocId)){ //still match
+
+ nGramScanningList[i].freqSoFar++;
+ }
+					else{	//new (i+1)-grams and longer n-grams are coming; before that, check whether the count of counts should be increased for this n-gram type
+
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+
+ for(j=i;j<this->maxN;j++){
+
+
+ if(nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid
+ validNgramUpSoFar = false;
+ }
+
+						if(validNgramUpSoFar){	//update the count of counts for this valid n-gram
+
+ freqSoFar = nGramScanningList[j].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (j+1)-gram with freq freqSoFar
+ countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+
+						//finished updating, now reset the scanning list entry at position j
+ if((posInCorpus+j)<this->corpusSize){
+ wordInCorpus = this->corpus_list[posInCorpus+j];
+ }
+ else{
+ wordInCorpus = 0; //out of bound for corpus
+ }
+
+ if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+ wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
+ nGramScanningList[j].freqSoFar = 0;
+ }
+ else{
+ nGramScanningList[j].freqSoFar = 1;
+ }
+
+ nGramScanningList[j].vocId = wordInCorpus;
+ }
+
+					quit=true;	//the (i+1)-gram already does not match, no need to check longer n-grams
+ }
+
+ i++;
+ }
+ }
+ }
+ else{
+			stillMeaningful = false;	//once vocId is larger than or equal to sentIdStart, everything that follows in suffix order is <sentId>, not actual text
+ }
+
+ saPos++;
+ }
+
+ //at the end of corpus (according to suffix order)
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ for(i=0;i<this->maxN;i++){
+		if(nGramScanningList[i].vocId==0){	//invalid word
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){
+
+ freqSoFar = nGramScanningList[i].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (i+1)-gram with freq freqSoFar
+ countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+ }
+
+ //now, use Good-Turing discounting to create frequency mapping
+	//still assign an N*Freq table for simplicity, even though for each N only maxFreq-1 frequency types will be discounted
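+	//Good-Turing maps an observed count r to r* = (r+1) * n_{r+1} / n_r, where n_r is the
+	//number of n-gram types occurring exactly r times; freq below is 0-based, so r = freq+1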
+ this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting);
+
+ for(i=0;i<this->maxN;i++){
+ //for (i+1)-gram
+
+ unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
+ double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting;
+
+ for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq
+ //for all (freq+1) ngrams
+ if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists
+ discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]);
+ }
+ else{
+ discountingMapForThisN[freq] = -1;
+ }
+ }
+
+ discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency
+ }
+
+
+ free(countOfCountsTable);
+
+}
+
+///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1]
+///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord
+///only need to get freq(w_n | history) of different history
+///return in freq table, freq(history+Wn, history) for all the matched n
+///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history
+/// 3-gram freq, freq of 3-gram history
+///freqTable should have length of 2*n
+///return the longest match with this updated n-gram
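+///e.g. with maxN=3, predicting w3 after the matched history "w1 w2", the table reads:
+///  freqTable[0]=freq(w3), freqTable[1]=corpusSize,
+///  freqTable[2]=freq(w2 w3), freqTable[3]=freq(w2),
+///  freqTable[4]=freq(w1 w2 w3), freqTable[5]=freq(w1 w2)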
+void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+ vector<IndexType> nGram;
+
+ if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk>
+ if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram
+ currentMatchStart++;
+ currentMatchLen--;
+ }
+
+ for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
+ nGram.push_back(this->corpus_list[pos]);
+ }
+ }
+
+ nGram.push_back(nextWord);
+
+ int sentLen = nGram.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);
+
+ int startPosForNgram;
+ int startPosForLongestMatchingWithNextWord;
+ int cellIndexForLongestMatchingWithNextWord;
+
+ bool stillMatched = true;
+ bool atLeastOneMatched = false;
+
+ int indexForNgram;
+
+ unsigned int totalOccurrences;
+ unsigned int totalOccurrencesOfHistory;
+
+ //for unigram
+ indexForNgram = sentLen - 1;
+ if(table[indexForNgram].found){
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ if(this->applyDiscounting){
+ freqTable[0] = this->discountFreq(1, totalOccurrences);
+ }
+ else{
+ freqTable[0] = totalOccurrences;
+ }
+
+ freqTable[1] = this->corpusSize;
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = sentLen-1;
+ atLeastOneMatched = true;
+ }
+ else{
+ stillMatched = false;
+ }
+
+ int n=2; //considering 2-gram and longer n-gram now
+ startPosForNgram = sentLen - 2;
+ while((stillMatched)&&(startPosForNgram>=0)){
+
+ indexForNgram = (n-1) * sentLen + startPosForNgram;
+ int indexForHistory = (n-2) * sentLen + startPosForNgram;
+
+ if(table[indexForNgram].found){
+
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;
+
+
+ if(this->applyDiscounting){
+ freqTable[2*n-2] = this->discountFreq(n, totalOccurrences);
+ }
+ else{
+ freqTable[2*n-2] = (double)totalOccurrences;
+ }
+
+ freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history
+
+ if(n<this->maxN){ //the new history is at most this->maxN-1 words long
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = startPosForNgram;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ startPosForNgram--;
+ n++;
+ }
+
+ if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord'
+ updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
+ updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
+ }
+ else{
+ updatedMatchingStart = (TextLenType) -1;
+ updatedMatchingLen = 0;
+ }
+
+ free(table);
+
+}
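+
+//Layout of freqTable after the call above (length 2*maxN, zero-initialized by the
+//caller): freqTable[2*(n-1)] holds the (possibly discounted) frequency of the matched
+//n-gram ending in nextWord, and freqTable[2*n-1] holds the frequency of its
+//(n-1)-word history; for n=1 the history slot holds the corpus size.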
+
+
+//given observedFreq of n-gram, return discounted freq using Good-Turing smoothing
+double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq)
+{
+ if(n>=this->maxN){ //do not discount
+ return (double) observedFreq;
+ }
+
+ if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq
+ return (double) observedFreq;
+ }
+
+ //else, check the discount map
+ double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1];
+
+ if(discountedFreq>0){
+ return discountedFreq;
+ }
+
+ //else, no discounting
+ return (double) observedFreq;
+}
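+
+//Lookup illustration: discountingMap is a flattened maxN x maxFreqForDiscounting
+//table, so an n-gram observed r times reads discountingMap[(n-1)*maxFreqForDiscounting + r - 1].
+//E.g. with maxFreqForDiscounting=10, a trigram seen 3 times reads slot 2*10+3-1 = 22.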
+
+
+///Start a new sentence now, clear up the sentence LM state
+LMState C_SuffixArrayLanguageModel::beginOfSentenceState()
+{
+
+ this->resetLmStates();
+ this->initialLmState();
+
+ return 0;
+}
+
+void C_SuffixArrayLanguageModel::initialLmState()
+{
+ //add sentence start
+ S_LMStateInfo sentStartNode;
+ sentStartNode.locationInCorpus.posInCorpus = 1; //if corpus is indexed correctly position 1 should be <s>
+ sentStartNode.locationInCorpus.len = 1;
+ sentStartNode.cachedNextWordExtension.clear();
+
+ this->allLMStates.push_back(sentStartNode);
+ this->ngramLocation2LmStateId.insert(make_pair(sentStartNode.locationInCorpus, 0));
+}
+
+void C_SuffixArrayLanguageModel::resetLmStates()
+{
+ this->allLMStates.clear();
+ this->ngramLocation2LmStateId.clear();
+}
+
+
+/**
+* Given the current history (as represented by 'lmState'),
+* calculate the log prob of nextWord given this history, P(nextWord|history),
+* and return the updated language model state with the next word appended to the history
+* @param lmState Current language model state
+* @param nextWord The vocId of the next word (the word to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState)
+{
+ if(lmState>=this->allLMStates.size()){
+ cerr<<"Invalid LM State: "<<lmState<<endl;
+ exit(-1);
+ }
+
+ //first check if we have already seen this 'nextWord' before
+ map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache;
+ iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find( nextWord );
+
+ if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){ //we haven't seen this lmState+word yet
+
+ //search for it in the corpus
+ S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus;
+ S_NgramLocationInCorpus updatedNgramLocation;
+
+ double logProb = this->logProbFromFreq(
+ correspondingNgramLocation.posInCorpus,
+ correspondingNgramLocation.len,
+ nextWord,
+ updatedNgramLocation.posInCorpus,
+ updatedNgramLocation.len);
+
+ //caching the logprob of 'nextword' given the lmState
+ int updatedLmStateId;
+ map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId;
+ iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation);
+ if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){ //this updated lm state does not exist yet
+ S_LMStateInfo newLmStateNode;
+
+ newLmStateNode.locationInCorpus = updatedNgramLocation;
+ newLmStateNode.cachedNextWordExtension.clear();
+
+ this->allLMStates.push_back(newLmStateNode);
+ updatedLmStateId = this->allLMStates.size() -1 ;
+ this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId));
+ }
+ else{
+ updatedLmStateId = iterNgramLocation2LmStateId->second;
+ }
+
+ //cache this
+ S_CachedLmInfo cachedLmInfo;
+ cachedLmInfo.logProb = logProb;
+ cachedLmInfo.nextState = updatedLmStateId;
+
+ this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo));
+
+ //update the returned next state
+ nextState = updatedLmStateId;
+
+ return logProb;
+ }
+
+ nextState = iterNextWordExtensionCache->second.nextState;
+
+ return iterNextWordExtensionCache->second.logProb;
+}
+
+
+/**
+* Given the history as an lmState, append a phrase given as a vector of IndexType,
+* calculate the LM prob and update the LM state.
+* Modification suggested by Erik Peterson (eepter@cs.cmu.edu): check the size of the phrase.
+* If the phrase is empty, i.e. phrase.size()==0, nextState would otherwise not be updated correctly and could cause problems in the calling function.
+* @param lmState Current language model state
+* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState)
+{
+ double logProb = 0;
+
+ if (phrase.size() == 0) {
+ nextState = lmState;
+ return logProb;
+ }
+
+ for(size_t i=0;i<phrase.size();i++){
+ logProb+=this->logProb(lmState, phrase[i], nextState);
+ lmState = nextState;
+ }
+
+ return logProb;
+}
+
+/**
+* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s>
+**/
+double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState)
+{
+ LMState dummyNextState;
+ return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState);
+}
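+
+//Hypothetical end-to-end use of the LMState interface above (this helper and its
+//word list are made up for illustration; it is not part of SALM):
+static double scoreSentence_sketch(C_SuffixArrayLanguageModel &lm, const vector<IndexType> &words)
+{
+ LMState state = lm.beginOfSentenceState();
+ LMState next;
+ double total = 0.0;
+
+ for(size_t i=0;i<words.size();i++){
+ total += lm.logProb(state, words[i], next); //log P(word | history)
+ state = next;
+ }
+
+ return total + lm.logProbEnd(state); //extend with the sentence end symbol
+}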
+
+/**
+* Extend the current matched n-gram with next word, calculate the prob and update the updated range
+* the n-gram is represented by its position in the suffix array and the length
+* @param currentMatchStart Starting position of the current matched n-gram in corpus
+* @param currentMatchLen Length of the matched n-gram
+* @param nextWord Vocabulary ID of the next word (the word to be predicted)
+* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus
+* @param &updatedMatchingLen The length of the extended n-gram
+**/
+double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+
+ double logProb;
+
+ double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN));
+ memset(freqTable, 0, 2*this->maxN*sizeof(double));
+
+ this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen);
+
+ logProb = this->calcLogProb(freqTable);
+
+ free(freqTable);
+
+ return logProb;
+
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb(double *freq)
+{
+ switch(this->interpolationStrategy){
+ case 'e':
+ return this->calcLogProb_equalWeightedInterpolation(freq);
+ break;
+ case 'i':
+ return this->calcLogProb_ibmHeuristicInterpolation(freq);
+ break;
+ case 'm':
+ return this->calcLogProb_maxProbInterpolation(freq);
+ break;
+ default:
+ cerr<<"Unknown interpolation strategy!\n";
+ exit(0);
+ }
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq)
+{
+ double prob = 0.0;
+
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ prob+=freq[2*i]/freq[2*i+1];
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(prob/(double)this->maxN);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
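+
+//In effect the loop above computes P(w|h) = (1/maxN) * sum over the matched orders n
+//of c(h_n w)/c(h_n), where c() is the (possibly discounted) corpus count; the sum
+//stops at the longest matched order, so unmatched orders contribute nothing.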
+
+double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq)
+{
+ double prob = 0.0;
+ if(freq[0]==0){ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+
+ double remainingWeightSum = 1.0;
+
+ //find the first non-zero match
+ int i = this->maxN - 1;
+
+ while(freq[2*i]==0){ //will stop for sure because freq[0]!=0
+ i--;
+ }
+
+ for(int j=i;j>=0;j--){
+ //for (j+1)-gram
+ double historyFreq = freq[2*j+1];
+ double logHistoryFreq = log(historyFreq);
+ if(logHistoryFreq>1){
+ logHistoryFreq = 1.0; //cap it to 1
+ }
+
+ double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history
+ double adjustedWeights = remainingWeightSum * reliability;
+
+ prob+=adjustedWeights * freq[2*j]/freq[2*j+1]; //probability contribution of the (j+1)-gram
+
+ remainingWeightSum -= adjustedWeights;
+ }
+
+ return log(prob);
+}
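+
+//Weighting scheme above: starting from the longest matched order, each order takes a
+//share reliability*remainingWeightSum of the remaining mass, with
+//reliability = 0.1*min(log c(h), 1) + 0.3, i.e. at most 0.4. Whatever mass is left
+//after the unigram level is simply dropped, so the result is a heuristic score
+//rather than a normalized probability.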
+
+double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq)
+{
+ double maxProb = 0.0;
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ double prob=freq[2*i]/freq[2*i+1];
+
+ if(prob>maxProb){
+ maxProb = prob;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(maxProb);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
+
+IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord)
+{
+ return this->voc->returnId(aWord);
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h
new file mode 100755
index 0000000..62427e5
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h
@@ -0,0 +1,137 @@
+// Revision $Rev: 3794 $
+// Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+
+#if ! defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__)
+#define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__
+
+
+#include "_SuffixArraySearchApplicationBase.h"
+#include "salm_shared.h"
+
+/**
+* \ingroup lm
+**/
+typedef unsigned int LMState;
+
+
+/**
+* \ingroup lm
+**/
+typedef struct s_cachedLmInfo{
+ int nextState;
+ double logProb;
+}S_CachedLmInfo;
+
+/**
+* \ingroup lm
+**/
+typedef struct s_NgramLocationInCorpus{
+ TextLenType posInCorpus;
+ unsigned char len;
+}S_NgramLocationInCorpus;
+
+/**
+* \ingroup lm
+**/
+typedef struct s_lmStateInfo{
+ S_NgramLocationInCorpus locationInCorpus;
+ map<IndexType, S_CachedLmInfo> cachedNextWordExtension; //cached information of this LMState extended by the next word
+}S_LMStateInfo;
+
+/**
+* \ingroup lm
+**/
+struct lt_ngramLocationInCorpus
+{
+ bool operator()(S_NgramLocationInCorpus a, S_NgramLocationInCorpus b) const{
+ if(a.posInCorpus<b.posInCorpus){
+ return true;
+ }
+
+ if(a.posInCorpus>b.posInCorpus){
+ return false;
+ }
+
+ if(a.len<b.len){
+ return true;
+ }
+
+ return false;
+ }
+};
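+
+//The comparator above orders n-gram locations lexicographically by (posInCorpus, len),
+//providing the strict weak ordering std::map requires so that S_NgramLocationInCorpus
+//can serve as the key of ngramLocation2LmStateId below.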
+
+
+/**
+* \ingroup lm
+* C_SuffixArrayLanguageModel inherits from the C_SuffixArraySearchApplicationBase class
+* to provide functionality for estimating the likelihood of a sentence given an indexed training corpus
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase
+{
+
+public:
+ IndexType returnVocId(C_String aWord);
+
+ /// At the beginning of a sentence, return the LMState and reset the cache
+ LMState beginOfSentenceState();
+
+ /// Calculate the log prob of a word predicted by the history LM state
+ double logProb(LMState lmState, IndexType nextWord, LMState & nextState);
+
+ /// The log prob of a phrase extending the history given as an LMState
+ double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState);
+
+ /// End of sentence
+ double logProbEnd(LMState lmState);
+
+ ///Set the interpolation strategy
+ void setParam_interpolationStrategy(char interpolationStrategy);
+
+
+ C_SuffixArrayLanguageModel(const char * cfgFileName);
+ C_SuffixArrayLanguageModel();
+ ~C_SuffixArrayLanguageModel();
+
+
+private:
+
+ void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
+
+ //Log prob calculation
+ double logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen);
+ double calcLogProb(double *freq);
+ double calcLogProb_equalWeightedInterpolation(double *freq);
+ double calcLogProb_ibmHeuristicInterpolation(double *freq);
+ double calcLogProb_maxProbInterpolation(double * freq);
+
+ char interpolationStrategy;
+ int maxN;
+ IndexType vocIdForSentStart;
+ IndexType vocIdForSentEnd;
+ IndexType vocIdForCorpusEnd;
+
+ ///Discounting
+ void constructDiscountingMap();
+ double *discountingMap;
+ double discountFreq(int n, unsigned int observedFreq);
+ bool applyDiscounting;
+ int maxFreqForDiscounting;
+ S_nGramScanningInfoElement * nGramScanningList;
+
+
+ ///LM State and related functions
+ void resetLmStates();
+ void initialLmState();
+
+ //caching lm prob for each sentence
+ vector<S_LMStateInfo> allLMStates;
+ map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus> ngramLocation2LmStateId;
+
+
+
+};
+
+#endif
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp
new file mode 100755
index 0000000..d7c96a2
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp
@@ -0,0 +1,34 @@
+
+#include "_SuffixArrayScanningBase.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include <iostream>
+#include <fstream>
+#include <map>
+
+using namespace std;
+
+/**
+* Given a corpus indexed by its suffix array, output the count-of-count information
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ //-----------------------------------------------------------------------------
+ if(argc<4){
+ fprintf(stderr,"\nGiven an indexed corpus, output the count of counts for n-grams.\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem maxN maxFreq\n\n",argv[0]);
+ exit(0);
+ }
+
+ unsigned int maxN = atoi(argv[2]);
+ unsigned int maxFreq = atoi(argv[3]);
+
+ C_SuffixArrayScanningBase saObj(argv[1], maxN);
+ saObj.scanSuffixArrayForCountofCounts(maxFreq);
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp
new file mode 100755
index 0000000..8e9544a
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp
@@ -0,0 +1,70 @@
+#include "_SuffixArrayScanningBase.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include <iostream>
+#include <fstream>
+#include <map>
+
+using namespace std;
+
+/**
+* Output n-gram types whose frequencies are equal to or higher than the specified thresholds
+*
+*
+* CfgFile Format:
+* n1<tab>freq thresh for output n1-gram
+* n2<tab>freq thresh for output n2-gram
+* ... ... ...
+* nK<tab>freq thresh for output nK-gram
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ //-----------------------------------------------------------------------------
+ if(argc<3){
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem cfgFile\n\n",argv[0]);
+
+ fprintf(stderr,"\n\tCfgFile Format:");
+ fprintf(stderr,"\n\t\tn1<tab>freq thresh for output n1-gram");
+ fprintf(stderr,"\n\t\tn2<tab>freq thresh for output n2-gram");
+ fprintf(stderr,"\n\t\t... ... ...");
+ fprintf(stderr,"\n\t\tn1<tab>freq thresh for output n1-gram\n");
+
+
+ exit(0);
+ }
+
+ //processing the threshold file
+ map<int, unsigned int> threshMap;
+ map<int, unsigned int>::iterator iterThreshMap;
+ fstream threshFile;
+ threshFile.open(argv[2]);
+ int n;
+ int maxN = 0;
+ unsigned int thresh;
+ while(threshFile>>n>>thresh){ //looping on the stream state avoids re-processing the last entry at EOF
+ if(n>maxN){
+ maxN=n;
+ }
+ iterThreshMap = threshMap.find(n);
+ if(iterThreshMap==threshMap.end()){
+ threshMap.insert(make_pair(n,thresh)); //a little overkill here; a well-defined cfg file format would make this check unnecessary
+ }
+ }
+
+ C_SuffixArrayScanningBase saObj(argv[1], maxN);
+ iterThreshMap = threshMap.begin();
+ while(iterThreshMap!=threshMap.end()){
+ saObj.setNgramOutputFreqThresh(iterThreshMap->first, iterThreshMap->second);
+ iterThreshMap++;
+ }
+
+ saObj.scanSuffixArrayForHighFreqNgramType();
+
+ return 1;
+}
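+
+//A made-up example cfg file (tab-separated) to illustrate the format: the two lines
+// 2	100
+// 3	20
+//would print every bigram occurring at least 100 times and every trigram occurring
+//at least 20 times; maxN is taken to be the largest n listed, here 3.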
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp
new file mode 100755
index 0000000..35f9d3d
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp
@@ -0,0 +1,32 @@
+#include "_SuffixArrayScanningBase.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include <iostream>
+#include <fstream>
+#include <map>
+
+using namespace std;
+
+/**
+* Given an indexed corpus, output the type/token information of the n-grams in the corpus.
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ //-----------------------------------------------------------------------------
+ if(argc<3){
+ fprintf(stderr,"\nGiven an indexed corpus, output the type token information for n-grams.\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem maxN \n\n",argv[0]);
+ exit(0);
+ }
+
+ unsigned int maxN = atoi(argv[2]);
+
+ C_SuffixArrayScanningBase saObj(argv[1], maxN);
+ saObj.scanSuffixArrayForTypeToken();
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp
new file mode 100755
index 0000000..9050408
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp
@@ -0,0 +1,338 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_SuffixArrayScanningBase.h"
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+
+C_SuffixArrayScanningBase::C_SuffixArrayScanningBase()
+{
+ this->countOfCountsTable = 0; //no memory has been allocated
+ this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
+}
+
+C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int maxN)
+{
+ this->countOfCountsTable = 0; //no memory has been allocated
+ this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
+
+ //load suffix array
+ this->loadData(filename, false, true, true);
+
+ this->initializeForScanning(filename, maxN);
+}
+
+void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered)
+{
+ this->maxFreqConsidered = maxFreqConsidered;
+}
+
+
+/**
+* Initialize data structure needed for scanning after the suffix array has been loaded
+**/
+void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN)
+{
+ this->maxN = maxN;
+ this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+ this->countOfCountsTable = 0; //no memory has been allocated
+
+ //initialize the scanning list
+ for(int i=0;i<this->maxN;i++){
+ this->nGramScanningList[i].freqSoFar=0;
+ this->nGramScanningList[i].vocId = 0;
+ this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output
+ }
+
+ //get vocID for sentEnd
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+ if(this->vocIdForSentEnd==0){
+ cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
+ exit(0);
+ }
+}
+
+C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase()
+{
+ free(this->nGramScanningList);
+
+ if(this->countOfCountsTable!=0){
+ free(this->countOfCountsTable);
+ }
+
+}
+
+void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh)
+{
+ if(n>this->maxN){
+ cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl;
+ exit(0);
+ }
+
+ this->nGramScanningList[n-1].freqThreshForOutput = freqThresh;
+}
+
+void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType()
+{
+ this->scanSuffixArray('H');
+
+}
+
+/// Count of counts is the number of n-gram types that occur a certain number of times in the corpus.
+/// Count of counts is important information for LM smoothing.
+/// We scan the corpus for n-gram type/token frequencies and collect counts for 1-grams, 2-grams, ... up to maxN-grams, for frequencies up to maxFreqConsidered
+void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered)
+{
+ this->maxFreqConsidered = maxFreqConsidered;
+ this->constructCountOfCountsTable();
+
+ //output the count of counts
+ cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl;
+ for(int i=0;i<this->maxN;i++){
+ cout<<i+1<<endl;
+
+ unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered;
+ for(int freq=0;freq<maxFreqConsidered;freq++){
+ cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl;
+ }
+ }
+
+}
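+
+//Output format produced above: a header line "maxN<tab>maxFreqConsidered", then for
+//each n from 1 to maxN a line holding n alone, followed by maxFreqConsidered lines of
+//"freq<tab>count", where count is the number of n-gram types occurring exactly freq times.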
+
+///Check from 1-gram to maxN-gram for type-token information
+///the process is similar to "scanSuffixArrayForHighFreqNgramType"
+void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken()
+{
+ this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
+ this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
+
+ //initialize
+ for(int n=0;n<maxN;n++){
+ this->typeFreq[n]=0;
+ this->tokenFreq[n]=0;
+ }
+
+
+ //scan the suffix array
+ this->scanSuffixArray('T');
+
+ //output
+ cout<<"n\tType\tToken\n";
+ for(int i=0;i<this->maxN;i++){
+ cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl;
+ }
+
+ free(this->typeFreq); //these arrays are only needed for this report; free them to avoid a leak
+ free(this->tokenFreq);
+}
+
+/**
+* Allocate memory for count-of-counts table and scan the corpus to fill in count of counts
+* memory will be freed in the destructor
+**/
+void C_SuffixArrayScanningBase::constructCountOfCountsTable()
+{
+ if(this->countOfCountsTable!=0){ //if there is already a count of counts table
+ free(this->countOfCountsTable);
+ }
+
+ this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered);
+
+ if(this->countOfCountsTable==NULL){
+ cerr<<"Count of counts table can not be initialized. Exit\n";
+ exit(0);
+ }
+
+ for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){
+ this->countOfCountsTable[c]=0;
+ }
+
+ this->scanSuffixArray('C');
+
+
+}
+
+/**
+* Scan through the indexed corpus and, each time a complete n-gram type has been seen,
+* perform the action selected by actionType
+**/
+void C_SuffixArrayScanningBase::scanSuffixArray(char actionType)
+{
+
+ int i,j;
+ bool stillMeaningful = true;
+ TextLenType saPos=0;
+
+ while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+ TextLenType posInCorpus = this->suffix_list[saPos];
+ IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+ if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting
+
+ if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams starting with <s>, </s>, or <end of corpus> are not of interest
+
+ bool quit =false;
+ i=0;
+
+ while(!quit && (i<this->maxN)){
+ wordInCorpus = this->corpus_list[posInCorpus+i];
+ if(
+ (wordInCorpus<this->sentIdStart)&&
+ (wordInCorpus!=this->vocIdForSentEnd)&&
+ (wordInCorpus!=this->vocIdForSentStart)&&
+ (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match
+
+ this->nGramScanningList[i].freqSoFar++;
+ }
+ else{ //a new (i+1)-gram (and longer) starts here; before moving on, check whether the n-gram types just completed should update the statistics
+
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ C_String tmpPhrase; //for output high freq n-grams
+
+ //prepare the prefix of the n-grams
+ if(actionType=='H'){
+ //common i-gram
+ for(j=0;j<=i-1;j++){
+ if(this->nGramScanningList[j].vocId==0){ //one of the words in the common i-gram is a NULL word, so this is not a valid n-gram
+ validNgramUpSoFar = false;
+ }
+ tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
+ tmpPhrase.appending(C_String(" "));
+ }
+ }
+
+
+ for(j=i;j<this->maxN;j++){
+
+
+ if(this->nGramScanningList[j].vocId==0){ //a NULL word: this n-gram and all longer ones in the scan window are invalid
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){ //perform the action selected by actionType
+
+ switch(actionType){
+
+ case 'C': //count of counts
+ freqSoFar = this->nGramScanningList[j].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
+ //increase the count for (j+1)-gram with freq freqSoFar
+ this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++;
+ }
+ break;
+
+ case 'H': //output high-freq n-grams
+ tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
+ tmpPhrase.appending(C_String(" "));
+
+ if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){
+ cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl;
+ }
+ break;
+
+ case 'T': //type-token statistics
+ if(this->nGramScanningList[j].freqSoFar>0){
+ typeFreq[j]++;
+ }
+
+ tokenFreq[j]+=this->nGramScanningList[j].freqSoFar;
+
+ break;
+ default:
+ cerr<<"Unknown action!\n";
+ exit(-1);
+ }
+ }
+
+ //done with this entry; reseed the (j+1)-gram from the new corpus position
+ if((posInCorpus+j)<this->corpusSize){
+ wordInCorpus = this->corpus_list[posInCorpus+j];
+ }
+ else{
+ wordInCorpus = 0; //out of bound for corpus
+ }
+
+ if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+ wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
+ this->nGramScanningList[j].freqSoFar = 0;
+ }
+ else{
+ this->nGramScanningList[j].freqSoFar = 1;
+ }
+
+ this->nGramScanningList[j].vocId = wordInCorpus;
+ }
+
+ quit=true; //at i+1 gram, already not match, no need to check for longer
+ }
+
+ i++;
+ }
+ }
+ }
+ else{
+ stillMeaningful = false; //once the vocID reaches sentIdStart, everything that follows in suffix order is a <sentId>, not actual text
+ }
+
+ saPos++;
+ }
+
+ //at the end of corpus (according to suffix order)
+ C_String finalTmpString; //for output high-freq n-gram type
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ for(i=0;i<this->maxN;i++){
+ if(this->nGramScanningList[i].vocId==0){ //invalid word
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){
+ switch(actionType){
+ case 'C': //for count-of-counts
+ freqSoFar = this->nGramScanningList[i].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
+ //increase the count for (i+1)-gram with freq freqSoFar
+ this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++;
+ }
+ break;
+
+ case 'H': //for high-freq n-gram types
+ finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId));
+ finalTmpString.appending(C_String(" "));
+ if(this->nGramScanningList[i].freqSoFar>=this->nGramScanningList[i].freqThreshForOutput){ //>= matches the threshold test used during the main scan
+ cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl;
+ }
+ break;
+
+ case 'T': //for type-token statistics
+ if(this->nGramScanningList[i].freqSoFar>0){
+ typeFreq[i]++;
+ }
+
+ tokenFreq[i]+=this->nGramScanningList[i].freqSoFar;
+ break;
+
+ default:
+ cerr<<"Unknown action!\n";
+ exit(-1);
+ }
+ }
+ }
+
+}
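+
+//Why a single pass suffices: the suffix array lists corpus positions in lexicographic
+//order of their suffixes, so all occurrences of a given n-gram are contiguous.
+//nGramScanningList[i] tracks the (i+1)-gram currently being scanned together with its
+//running frequency; as soon as order i stops matching, every order >= i has just
+//completed an n-gram type, so its action is performed and its entry is reseeded from
+//the new corpus position.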
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~
new file mode 100755
index 0000000..fd8bae8
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~
@@ -0,0 +1,338 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_SuffixArrayScanningBase.h"
+#include <iostream>
+#include <cstring>
+
+using namespace std;
+
+C_SuffixArrayScanningBase::C_SuffixArrayScanningBase()
+{
+ this->countOfCountsTable = 0; //no memory has been allocated
+ this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
+}
+
+C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int maxN)
+{
+ this->countOfCountsTable = 0; //no memory has been allocated
+ this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough
+
+ //load suffix array
+ this->loadData(filename, false, true, true);
+
+ this->initializeForScanning(filename, maxN);
+}
+
+void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered)
+{
+ this->maxFreqConsidered = maxFreqConsidered;
+}
+
+
+/**
+* Initialize data structure needed for scanning after the suffix array has been loaded
+**/
+void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN)
+{
+ this->maxN = maxN;
+ this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+ this->countOfCountsTable = 0; //no memory has been allocated
+
+ //initialize the scanning list
+ for(int i=0;i<this->maxN;i++){
+ this->nGramScanningList[i].freqSoFar=0;
+ this->nGramScanningList[i].vocId = 0;
+ this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output
+ }
+
+ //get vocID for sentEnd
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+ if(this->vocIdForSentEnd==0){
+ cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
+ exit(0);
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
+ exit(0);
+ }
+}
+
+C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase()
+{
+ free(this->nGramScanningList);
+
+ if(this->countOfCountsTable!=0){
+ free(this->countOfCountsTable);
+ }
+
+}
+
+void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh)
+{
+ if(n>this->maxN){
+ cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl;
+ exit(0);
+ }
+
+ this->nGramScanningList[n-1].freqThreshForOutput = freqThresh;
+}
+
+void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType()
+{
+ this->scanSuffixArray('H');
+
+}
+
+/// Count of counts is the number of n-gram types that occur a certain times in the corpus.
+/// Count of counts is important information in LM smoothing
+/// We scan the corpus for n-gram's type/token frequency and collect information for 1-gram, 2-gram,...and up to maxFreqConsidered-gram
+void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered)
+{
+ this->maxFreqConsidered = maxFreqConsidered;
+ this->constructCountOfCountsTable();
+
+ //output the count of counts
+ cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl;
+ for(int i=0;i<this->maxN;i++){
+ cout<<i+1<<endl;
+
+ unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered;
+ for(int freq=0;freq<maxFreqConsidered;freq++){
+ cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl;
+ }
+ }
+
+}
+
+///Check from 1-gram to maxN-gram for type-token information
+///the process is similar to "scanSuffixArrayForHighFreqNgramType"
+void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken()
+{
+ this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
+ this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN);
+
+ //initialize
+ for(int n=0;n<maxN;n++){
+ this->typeFreq[n]=0;
+ this->tokenFreq[n]=0;
+ }
+
+
+ //scan the suffix array
+ this->scanSuffixArray('T');
+
+ //output
+ cout<<"n\tType\tToken\n";
+ for(int i=0;i<this->maxN;i++){
+ cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl;
+ }
+}
+
+/**
+* Allocate memory for count-of-counts table and scan the corpus to fill in count of counts
+* memory will be freed in the destructor
+**/
+void C_SuffixArrayScanningBase::constructCountOfCountsTable()
+{
+ if(this->countOfCountsTable!=0){ //if there is already a count of counts table
+ free(this->countOfCountsTable);
+ }
+
+ this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered);
+
+ if(this->countOfCountsTable==NULL){
+ cerr<<"Count of counts table can not be initialized. Exit\n";
+ exit(0);
+ }
+
+ for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){
+ this->countOfCountsTable[c]=0;
+ }
+
+ this->scanSuffixArray('C');
+
+
+}
+
+/**
+* Scan through the indexed corpus and, depending on the action type,
+* perform the corresponding action whenever a new n-gram type is seen
+**/
+void C_SuffixArrayScanningBase::scanSuffixArray(char actionType)
+{
+
+ int i,j;
+ bool stillMeaningful = true;
+ TextLenType saPos=0;
+
+ while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+ TextLenType posInCorpus = this->suffix_list[saPos];
+ IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+ if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting
+
+ if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams starting with <s>, </s>, or <end of corpus> are not of interest
+
+ bool quit =false;
+ i=0;
+
+ while(!quit && (i<this->maxN)){
+ wordInCorpus = this->corpus_list[posInCorpus+i];
+ if(
+ (wordInCorpus<this->sentIdStart)&&
+ (wordInCorpus!=this->vocIdForSentEnd)&&
+ (wordInCorpus!=this->vocIdForSentStart)&&
+ (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match
+
+ this->nGramScanningList[i].freqSoFar++;
+ }
+ else{ //the match breaks here; before starting new (i+1)-gram and longer types, finalize the n-gram types that just ended (e.g. update their count of counts)
+
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ C_String tmpPhrase; //for output high freq n-grams
+
+ //prepare the prefix of the n-grams
+ if(actionType=='H'){
+ //common i-gram
+ for(j=0;j<=i-1;j++){
+ if(this->nGramScanningList[j].vocId==0){ //one of the words in the common i-gram prefix is a NULL word, so this is not a valid n-gram
+ validNgramUpSoFar = false;
+ }
+ tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
+ tmpPhrase.appending(C_String(" "));
+ }
+ }
+
+
+ for(j=i;j<this->maxN;j++){
+
+
+ if(this->nGramScanningList[j].vocId==0){ //a NULL word: this n-gram and all longer ones in the scan window are invalid
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){ //perform the action corresponding to actionType
+
+ switch(actionType){
+
+ case 'C': //count of counts
+ freqSoFar = this->nGramScanningList[j].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
+ //increase the count for (j+1)-gram with freq freqSoFar
+ this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++;
+ }
+ break;
+
+ case 'H': //output high-freq n-grams
+ tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId));
+ tmpPhrase.appending(C_String(" "));
+
+ if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){
+ cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl;
+ }
+ break;
+
+ case 'T': //type-token statistics
+ if(this->nGramScanningList[j].freqSoFar>0){
+ typeFreq[j]++;
+ }
+
+ tokenFreq[j]+=this->nGramScanningList[j].freqSoFar;
+
+ break;
+ default:
+ cerr<<"Unknown action!\n";
+ exit(-1);
+ }
+ }
+
+ //finished with this order; now reset the scanning-list entry for order j
+ if((posInCorpus+j)<this->corpusSize){
+ wordInCorpus = this->corpus_list[posInCorpus+j];
+ }
+ else{
+ wordInCorpus = 0; //out of bounds of the corpus
+ }
+
+ if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+ wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
+ this->nGramScanningList[j].freqSoFar = 0;
+ }
+ else{
+ this->nGramScanningList[j].freqSoFar = 1;
+ }
+
+ this->nGramScanningList[j].vocId = wordInCorpus;
+ }
+
+ quit=true; //the (i+1)-gram already fails to match, so longer n-grams need not be checked
+ }
+
+ i++;
+ }
+ }
+ }
+ else{
+ stillMeaningful = false; //once the vocID is >= sentIdStart, everything that follows in suffix order is a <sentId>, not actual text
+ }
+
+ saPos++;
+ }
+
+ //flush the scanning list at the end of the corpus (in suffix order)
+ C_String finalTmpString; //for output high-freq n-gram type
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ for(i=0;i<this->maxN;i++){
+ if(this->nGramScanningList[i].vocId==0){ //invalid word
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){
+ switch(actionType){
+ case 'C': //for count-of-counts
+ freqSoFar = this->nGramScanningList[i].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){
+ //increase the count for (i+1)-gram with freq freqSoFar
+ this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++;
+ }
+ break;
+
+ case 'H': //for high-freq n-gram types
+ finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId));
+ finalTmpString.appending(C_String(" "));
+ if(this->nGramScanningList[i].freqSoFar>=this->nGramScanningList[i].freqThreshForOutput){ //>= to match the threshold check in the main scan loop
+ cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl;
+ }
+ break;
+
+ case 'T': //for type-token statistics
+ if(this->nGramScanningList[i].freqSoFar>0){
+ typeFreq[i]++;
+ }
+
+ tokenFreq[i]+=this->nGramScanningList[i].freqSoFar;
+ break;
+
+ default:
+ cerr<<"Unknown action!\n";
+ exit(-1);
+ }
+ }
+ }
+
+}
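+
+/*
+ The core trick above, in miniature (an illustrative sketch, simplified to
+ unigrams and ignoring sentence-boundary words): because the suffix array is
+ sorted, all suffixes starting with the same word are contiguous, so a single
+ linear pass with one running counter per order visits each n-gram type
+ exactly once.
+
+   unsigned int unigramTypes = 0;
+   IndexType prev = 0;                          // 0 = no word seen yet
+   for(TextLenType p = 0; p < corpusSize; p++){
+       IndexType w = corpus_list[suffix_list[p]];
+       if(w >= sentIdStart) break;              // the rest are <sentId> entries
+       if(w != prev){ unigramTypes++; prev = w; }
+   }
+
+ scanSuffixArray() generalizes this to all orders 1..maxN at once by keeping
+ one running (vocId, freqSoFar) slot per order in nGramScanningList.
+*/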
diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h
new file mode 100755
index 0000000..c517b72
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h
@@ -0,0 +1,53 @@
+#if !defined (_HEADER_SUFFIX_ARRAY_SCANNING_BASE_CLASS_)
+#define _HEADER_SUFFIX_ARRAY_SCANNING_BASE_CLASS_
+
+
+#include "_SuffixArrayApplicationBase.h"
+
+
+
+
+/**
+* \ingroup scan
+* C_SuffixArrayScanningBase class provides functions to scan through an indexed corpus
+* and output information such as the type/token frequency of the data
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+class C_SuffixArrayScanningBase : public C_SuffixArrayApplicationBase
+{
+public:
+ void setNgramOutputFreqThresh(int n, unsigned int freqThresh);
+ void scanSuffixArrayForHighFreqNgramType();
+ void scanSuffixArrayForCountofCounts(int maxFreqConsidered);
+ void scanSuffixArrayForTypeToken();
+
+ C_SuffixArrayScanningBase(const char * filename, unsigned int maxN);
+ C_SuffixArrayScanningBase();
+ ~C_SuffixArrayScanningBase();
+
+protected:
+ void setParam_maxFreqConsidered(int maxFreqConsidered);
+ void constructCountOfCountsTable();
+ void initializeForScanning(const char * filename, unsigned int maxN);
+
+ int maxN;
+ int maxFreqConsidered;
+
+ unsigned int * countOfCountsTable;
+
+ IndexType vocIdForSentStart;
+ IndexType vocIdForSentEnd;
+ IndexType vocIdForCorpusEnd;
+
+private:
+ void scanSuffixArray(char actionType);
+
+ S_nGramScanningInfoElement * nGramScanningList;
+
+
+ unsigned int * typeFreq;
+ unsigned int * tokenFreq;
+};
+
+#endif
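+
+/*
+ Typical usage (an illustrative sketch; "corpus" is a hypothetical file name
+ stem for a corpus already indexed by IndexSA):
+
+   C_SuffixArrayScanningBase scanner("corpus", 5);  // consider up to 5-grams
+   scanner.scanSuffixArrayForTypeToken();           // type/token statistics
+   scanner.setNgramOutputFreqThresh(3, 100);        // trigrams with freq>=100
+   scanner.scanSuffixArrayForHighFreqNgramType();   // print them to stdout
+   scanner.scanSuffixArrayForCountofCounts(1000);   // count-of-counts table
+*/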
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp
new file mode 100755
index 0000000..24b8cc4
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp
@@ -0,0 +1,130 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstring>
+
+using namespace std;
+int SHOW_DEBUG_INFO = 0;
+
+typedef struct s_ngram_freq_info{
+ C_String ngramText;
+ vector<IndexType> ngram;
+ unsigned int freq;
+}S_Ngram_Freq_Info;
+
+/**
+* Given several corpora indexed by their suffix array,
+* collect counts of n-grams in a list from all the corpora.
+* This is useful when a corpus is very large:
+* one can split the data into many chunks and sum up the n-gram frequencies.
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //check parameters
+ if(argc<2){
+ cerr<<"\n-------------------------------------------";
+ cerr<<"\nUsage:";
+ cerr<<"\n\t"<<argv[0]<<" n-gram_list_filename < list of suffix arry used";
+ cerr<<"\nNote:";
+ cerr<<"\n\tn-gram_list_filename.id_voc must exist first.";
+ cerr<<"\n-------------------------------------------\n\n";
+
+ exit(0);
+ }
+
+ //load vocabulary
+ char id_voc_filename[1024];
+ sprintf(id_voc_filename, "%s.id_voc", argv[1]);
+ C_IDVocabulary voc(id_voc_filename);
+
+ //load the n-gram list
+ vector<S_Ngram_Freq_Info> ngramList;
+
+ ifstream NgramListFile;
+ NgramListFile.open(argv[1]);
+ char tmpString[4096];
+ while(!NgramListFile.eof()){
+
+ NgramListFile.getline(tmpString, 4096, '\n');
+
+ if(strlen(tmpString)>0){
+ S_Ngram_Freq_Info tmpNode;
+ tmpNode.ngramText = C_String(tmpString);
+ tmpNode.freq = 0; //start from zero so the printed value is the exact sum over all chunks
+ tmpNode.ngram.clear();
+
+ //convert the n-gram from text to a sequence of vocIds
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+ int pos = 0;
+ int inputLen = strlen(tmpString);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = tmpString[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken)));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+ if(pos>=MAX_TOKEN_LEN){ //we cannot handle tokens this long
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken)));
+ }
+
+ ngramList.push_back(tmpNode);
+ }
+ tmpString[0]='\0';
+ }
+ cerr<<"Total "<<ngramList.size()<<" ngrams loaded.\n";
+
+ //loop over all suffix arrays and collect the n-gram counts
+ char sa_filename[1024];
+ while(! cin.eof()){
+ cin>>sa_filename;
+
+ if(strlen(sa_filename)>0){
+ cerr<<"Considering "<<sa_filename<<endl;
+
+ C_SuffixArraySearchApplicationBase sa;
+ sa.loadData_forSearch(sa_filename, true, true);
+
+ for(int i=0; i<ngramList.size(); i++){
+ unsigned int freq;
+
+ freq = sa.freqOfExactPhraseMatch(ngramList[i].ngram);
+
+ ngramList[i].freq+=freq;
+ }
+ }
+
+ sa_filename[0]=0;
+ }
+
+
+ for(int m=0;m<ngramList.size();m++){
+ cout<<ngramList[m].freq<<"\t";
+ cout<<ngramList[m].ngramText.toString()<<"\n";
+ }
+
+
+ return 1;
+}
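+
+/*
+ Usage sketch (illustrative; all file names are hypothetical): with chunked
+ corpora indexed under the stems chunk1 and chunk2, an n-gram list ngrams.txt,
+ and its vocabulary ngrams.txt.id_voc,
+
+   printf "chunk1\nchunk2\n" | CollectNgramFreqCount ngrams.txt > summed.counts
+
+ writes one line per n-gram: the total frequency over all chunks, a tab, and
+ the n-gram text.
+*/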
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp
new file mode 100755
index 0000000..9d47f3a
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp
@@ -0,0 +1,72 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <map>
+#include <cstring>
+
+using namespace std;
+
+/**
+* Given a corpus indexed by its suffix array, filter out the duplicated sentences in the data
+* and output the unique sentences within.
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ if(argc<2){
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem < original corpus > corpus with uniq sentences\n",argv[0]);
+
+ exit(0);
+ }
+
+ map< pair<TextLenType, int>, bool> duplicatedSentAlreadyOutput;
+ map< pair<TextLenType, int>, bool>::iterator iterDuplicatedSentAlreadyOutput;
+
+
+ C_SuffixArraySearchApplicationBase sa;
+ sa.loadData_forSearch(argv[1], false, true);
+
+ unsigned long totalFilteredSent = 0;
+
+ cerr<<"Filtering duplicated sentences:\n";
+ char tmpString[4000];
+ while(!cin.eof()){
+ cin.getline(tmpString,sizeof(tmpString),'\n'); //the limit must not exceed the buffer size
+ if(strlen(tmpString)>0){
+ TextLenType freq = 0;
+ TextLenType firstOccurrence;
+ int sentLen;
+
+ freq = sa.freqOfExactPhraseMatchAndFirstOccurrence(tmpString, firstOccurrence, sentLen);
+
+ if(freq>1){ //freq is at least 1, because this is the same corpus
+ //then there are multiple occurrences of this sentence
+ //check if we have already output it
+ iterDuplicatedSentAlreadyOutput = duplicatedSentAlreadyOutput.find(make_pair(firstOccurrence, sentLen));
+
+ if(iterDuplicatedSentAlreadyOutput == duplicatedSentAlreadyOutput.end()){ //we haven't output it
+ cout<<tmpString<<endl;
+ duplicatedSentAlreadyOutput.insert(make_pair(make_pair(firstOccurrence, sentLen), true));
+ }
+ else{
+ //it has been output already, ignore it
+ totalFilteredSent++;
+ }
+ }
+ else{ //freq==1, no duplication
+ cout<<tmpString<<endl;
+ }
+
+ }
+ }
+
+ cerr<<"Total "<<totalFilteredSent<<" duplicated sentences are filtered\n";
+
+ return 1;
+}
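+
+/*
+ Design note: a sentence is keyed by the pair (position of its first
+ occurrence in the corpus, sentence length), so every duplicate of a sentence
+ maps to the same key and only the first copy read from stdin is echoed.
+ E.g. if a sentence occurs three times in the corpus, all three lookups
+ return the same firstOccurrence; the first copy is printed and the other
+ two are counted as filtered.
+*/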
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp
new file mode 100755
index 0000000..3daf337
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp
@@ -0,0 +1,47 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <cstring>
+
+using namespace std;
+
+int SHOW_DEBUG_INFO = 0;
+
+
+/**
+* Application main function: ExactNgramMatchingFreq
+* Reads n-grams from stdin, one n-gram per line
+* Search the corpus for the occurrences of each n-gram and output their frequencies in the corpus
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ if(argc<2){
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s fileNameStem \n",argv[0]);
+
+ exit(0);
+ }
+
+
+ C_SuffixArraySearchApplicationBase sa;
+ sa.loadData_forSearch(argv[1], false, true); //we need vocabulary, but do not need offset information here
+
+ cerr<<"Input N-grams:\n";
+ char tmpString[1000];
+ while(!cin.eof()){
+ cin.getline(tmpString,sizeof(tmpString),'\n'); //the limit must not exceed the buffer size
+ if(strlen(tmpString)>0){
+ TextLenType freq = 0;
+ freq = sa.freqOfExactPhraseMatch(tmpString);
+ cout<<freq<<": "<<tmpString<<endl;
+ }
+ }
+
+ return 0;
+}
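+
+/*
+ Usage sketch (illustrative; "corpus" is a hypothetical index stem):
+
+   echo "of the" | FrequencyOfNgrams corpus
+
+ prints a line such as "1234: of the", i.e. the corpus frequency of the
+ bigram followed by the n-gram itself.
+*/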
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp
new file mode 100755
index 0000000..421e503
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp
@@ -0,0 +1,85 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include <vector>
+#include <iostream>
+#include <cstring>
+#include "_SuffixArraySearchApplicationBase.h"
+
+using namespace std;
+
+
+/**
+* Return locations of all the embedded n-grams of a sentence in the indexed corpus
+*
+* Revision $Rev: 3794 $
+* Last modified: $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+
+ //-----------------------------------------------------------------------------
+ //check arguments
+ if(argc<2){
+ fprintf(stderr,"\n\nOutput locations of all the matched embedded n-grams of a sentence in an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem [highestFreq maxRet smallestUnit longestUnit] < list of sentences\n\n",argv[0]);
+
+ exit(-1);
+ }
+
+
+ int highFreq;
+ int maxRet;
+ int smallestUnit;
+ int longestUnit;
+
+ C_SuffixArraySearchApplicationBase saObj;
+
+ saObj.loadData_forSearch(argv[1], false, false);
+
+ if(argc>=6){ //if the highestFreq, maxRet, smallestUnit and longestUnit arguments are set
+ highFreq = atoi(argv[2]);
+ maxRet = atoi(argv[3]);
+ smallestUnit = atoi(argv[4]);
+ longestUnit = atoi(argv[5]);
+
+ saObj.setParam_highestFreqThresholdForReport(highFreq);
+ saObj.setParam_reportMaxOccurrenceOfOneNgram(maxRet);
+ saObj.setParam_shortestUnitToReport(smallestUnit);
+ saObj.setParam_longestUnitToReport(longestUnit);
+ }
+
+ cerr<<"Input sentences:\n";
+
+ char sentence[10000];
+
+ while(!cin.eof()){
+ cin.getline(sentence,10000,'\n');
+ if(strlen(sentence)>0){
+
+ vector<C_String> sentAsCStringVector = saObj.convertCharStringToCStringVector(sentence); //for later display purpose
+
+
+ vector<S_phraseLocationElement> locations;
+ locations = saObj.findPhrasesInASentence(sentence);
+
+ if(locations.size()==0){
+ cout<<"Nothing can be found in the corpus.\n";
+ }
+ else{
+ for(int i=0;i<locations.size(); i++){
+ cout<<"N-gram ["<<(int)locations[i].posStartInSrcSent<<", "<<(int)locations[i].posEndInSrcSent<<"]: ";
+ for(int j=locations[i].posStartInSrcSent; j<=locations[i].posEndInSrcSent; j++){
+ cout<<sentAsCStringVector[j-1].toString()<<" ";
+ }
+ cout<<" found in corpus: ";
+ cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
+ }
+ }
+ cout<<endl;
+ }
+ }
+
+
+
+ return 0;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp
new file mode 100755
index 0000000..deb8b81
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp
@@ -0,0 +1,67 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include "_SuffixArraySearchApplicationBase.h"
+
+#include <vector>
+#include <iostream>
+#include <cstring>
+
+using namespace std;
+
+/**
+* \ingroup search
+*
+* Locate an n-gram in the indexed corpus, return its locations as <sentId, offsetInSent> pairs
+* SentID and offset are both 1-based
+*
+* Note:
+* The offset of the n-gram in a sentence is represented as "char" in the returned structure S_SimplePhraseLocationElement
+* To output it as a number, one needs to cast it to integer type for proper display
+*
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+ if(argc<2){
+ fprintf(stderr,"\nOutput all the locations of an n-gram in an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < list of n-grams\n\n",argv[0]);
+
+ exit(-1);
+ }
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase saObj;
+
+ //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
+ saObj.loadData_forSearch(argv[1], false, false);
+
+
+ cerr<<"Input N-grams:\n";
+ char tmpString[10000];
+ while(!cin.eof()){
+ cin.getline(tmpString,10000,'\n');
+ if(strlen(tmpString)>0){
+ vector<S_SimplePhraseLocationElement> locations;
+
+ locations = saObj.locateExactPhraseInCorpus(tmpString);
+
+ if(locations.size()==0){
+ cout<<"No occurrences found.\n";
+ }
+ else{
+ for(int i=0;i<locations.size(); i++){
+ cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
+ }
+ }
+ cout<<endl;
+ }
+ }
+
+ return 0;
+}
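+
+/*
+ Note on the cast above: posInSentInCorpus is stored as a char, so printing
+ it without the (int) cast would emit a raw character instead of the 1-based
+ offset.
+*/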
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp
new file mode 100755
index 0000000..e614fdc
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp
@@ -0,0 +1,132 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <cstring>
+
+#include "_SuffixArraySearchApplicationBase.h"
+
+
+#include <time.h>
+#include <stdio.h>
+#include <map>
+
+using namespace std;
+
+/**
+* Given the indexed training corpus, analyze the token/type matching ratio of the n-grams in the testing data.
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+ fprintf(stderr,"\nOutput the n-gram matching statistics of a testing data given an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ map<int, pair<int, unsigned long> > results4OneSent;
+ map<int, pair<int, unsigned long> >::iterator iterResult;
+
+ vector<int> nGramTokenCountsInTest;
+ vector<int> nGramInTestMatched;
+ vector<double> nGramFreqInTrainMatched;
+
+ int maxSentLen = 4086;
+ nGramTokenCountsInTest.reserve(maxSentLen);
+ nGramInTestMatched.reserve(maxSentLen);
+ nGramFreqInTrainMatched.reserve(maxSentLen);
+
+ //initialize
+ for(int i=0;i<maxSentLen;i++){
+ nGramTokenCountsInTest.push_back(0);
+ nGramInTestMatched.push_back(0);
+ nGramFreqInTrainMatched.push_back(0);
+ }
+
+ char fileName[1000];
+ char tmpString[10000];
+
+ strcpy(fileName, argv[1]);
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(fileName, false, true);
+
+ fprintf(stderr,"Input sentences:\n");
+
+ long ltime1, ltime2;
+
+ time( &ltime1 );
+
+ int totalSentences = 0;
+ int matchedSentences = 0;
+ while(!cin.eof()){
+ int sentLen;
+ cin.getline(tmpString,10000,'\n');
+
+ if(strlen(tmpString)>0){
+
+ totalSentences++;
+
+ results4OneSent.clear();
+ results4OneSent = SA.returnNGramMatchingStatForOneSent(tmpString, sentLen);
+
+ if(sentLen>maxSentLen){
+ cerr<<"Sentence too long, we can not handle it! Exit.\n";
+ exit(0);
+ }
+
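+ //e.g. a 5-token sentence contributes 5 unigrams, 4 bigrams, ... and one
+ //5-gram: (sentLen-j+1) j-gram tokens for each order j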
+ for(int j=1;j<=sentLen;j++){ //j-gram
+ nGramTokenCountsInTest[j]+=(sentLen-j+1); //number of j-grams in the sentence;
+ }
+
+ iterResult=results4OneSent.begin();
+ while(iterResult!=results4OneSent.end()){
+
+ nGramInTestMatched[iterResult->first]+=iterResult->second.first;
+ nGramFreqInTrainMatched[iterResult->first]+=iterResult->second.second;
+
+ if(iterResult->first==sentLen){ //a complete match
+ matchedSentences++;
+ }
+
+ iterResult++;
+ }
+ }
+
+ tmpString[0]=0;
+
+ }
+
+ int n = 1;
+ while(nGramInTestMatched[n]!=0){
+ int matched = nGramInTestMatched[n];
+ int totalInTest = nGramTokenCountsInTest[n];
+ cout<<"N="<<n<<":\t"<<matched<<" / "<<totalInTest<<"\t";
+ printf("%.1f\t", double(matched)/double(totalInTest)*100.0);
+ cout<<"OccInTrain= "<<nGramFreqInTrainMatched[n]<<endl;
+
+ n++;
+ }
+
+ cout<<"\nOut of "<<totalSentences<<" input sentences, "<<matchedSentences<<" can be found in the training data.\n";;
+ time( &ltime2 );
+ cout<<"Time cost:"<<ltime2-ltime2<<" seconds\n";
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp
new file mode 100755
index 0000000..ca12119
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp
@@ -0,0 +1,50 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <cstring>
+
+using namespace std;
+
+int SHOW_DEBUG_INFO = 0;
+
+/**
+* Given a corpus indexed by its suffix array, input a sentence from STDIN and output the frequencies of its embedded n-grams in the corpus.
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ char tmpString[1000];
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(argv[1], false, true);
+
+ fprintf(stderr,"Input Sentences:\n");
+
+ while(!cin.eof()){
+ cin.getline(tmpString,sizeof(tmpString),'\n'); //the limit must not exceed the buffer size
+ if(strlen(tmpString)>0){
+ SA.displayNgramMatchingFreq4Sent(tmpString);
+ }
+ }
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp
new file mode 100755
index 0000000..544a230
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp
@@ -0,0 +1,144 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "float.h"
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <vector>
+#include <cstring>
+
+using namespace std;
+
+int SHOW_DEBUG_INFO = 0;
+
+///Given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n>
+///startingPosInSrcSent starts at 0, n is the n-gram length
+void local_oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
+{
+ n = index / sentLen + 1;
+ posInSrcSent = index % sentLen;
+}
+
+///Given the starting position in src sentence and the length of the n-gram
+///calculate the index in the table
+///posInSent starts at 0, n is the actual len of n-gram, starts at 1
+unsigned int local_twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
+{
+ unsigned int indexInTable = (n-1)*sentLen + posInSent;
+
+ return indexInTable;
+}
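+
+/*
+ Worked example: with sentLen=5, the bigram (n=2) starting at position 3 maps
+ to index (2-1)*5+3 = 8, and local_oneDimensionTableIndexToTwoDimension(8, 5, ...)
+ recovers posInSrcSent = 8%5 = 3 and n = 8/5+1 = 2.
+*/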
+
+/**
+* Given a corpus indexed by its suffix array
+* calculate the non-compositionality of each embedded n-gram in a testing sentence
+*
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ char tmpString[1000];
+ double bigN = 1000000;
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(argv[1], false, true);
+
+ fprintf(stderr,"Input Sentences:\n");
+
+ while(!cin.eof()){
+ cin.getline(tmpString,sizeof(tmpString),'\n'); //the limit must not exceed the buffer size
+ if(strlen(tmpString)>0){
+
+ SA.displayNgramMatchingFreq4Sent(tmpString);
+
+ printf("\n");
+
+ int sentLen;
+
+ S_sentSearchTableElement * matchingTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen);
+
+ //convert this to frequency table
+ double * freqTable = (double *) malloc (sizeof(double)*sentLen*sentLen);
+
+ for(unsigned int i=0;i<(sentLen*sentLen);i++){
+ //all shorter n-grams have already been processed, so their frequencies are in the table
+ unsigned int startPos, n;
+ double minNc;
+ int leftNWithMinNc = 0; //initialized in case no valid split is found
+
+ local_oneDimensionTableIndexToTwoDimension(i, sentLen, startPos, n);
+
+ if(matchingTable[i].found){
+ double freq = matchingTable[i].endingPosInSA - matchingTable[i].startPosInSA +1;
+ freqTable[i]=freq;
+
+
+
+ //consider all split points of the n-gram
+ minNc = DBL_MAX;
+
+ for(unsigned int leftN=1;leftN<n;leftN++){
+ int index_left = local_twoDimensionIndexToOneDimensionTableIndex(startPos, leftN, sentLen);
+ int index_right = local_twoDimensionIndexToOneDimensionTableIndex(startPos+leftN, n-leftN, sentLen);
+
+ double leftFreq = freqTable[index_left];
+ double rightFreq = freqTable[index_right];
+
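+ //nc is a PMI-style ratio: the whole n-gram's frequency against the
+ //product of its two parts' frequencies, scaled by the constant bigN;
+ //the minimum over all split points is kept as the weakest decomposition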
+ double nc = freq*bigN/(leftFreq*rightFreq);
+
+ if(nc<minNc){
+ minNc = nc;
+ leftNWithMinNc = leftN;
+ }
+
+ }
+ }
+ else{
+ freqTable[i]=0;
+ minNc = 0;
+ }
+
+ if(startPos==0){
+ printf("\n%d\t",n);
+ }
+
+ if(n==1){
+ printf("A\t"); //atom word, no way to break it
+ }
+ else{
+ if(minNc>0){
+ printf("%.1f[%d]\t", minNc, leftNWithMinNc);
+ }
+ else{
+ printf("_\t");
+ }
+ }
+ }
+
+ printf("\n");
+
+
+ free(matchingTable);
+ free(freqTable);
+
+
+ }
+ }
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp
new file mode 100755
index 0000000..9697f4a
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp
@@ -0,0 +1,178 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include "_String.h"
+#include "_SuffixArraySearchApplicationBase.h"
+
+#include <time.h>
+#include <stdio.h>
+#include <map>
+#include <cstring>
+
+using namespace std;
+
+
+vector<C_String> convertTextToStringVector(const char * sentText)
+{
+
+ vector<C_String> sentAsStringVect;
+
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+
+ int pos = 0;
+
+ int inputLen = strlen(sentText);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = sentText[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ sentAsStringVect.push_back(C_String(tmpToken));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+ if(pos>=MAX_TOKEN_LEN){ //we cannot handle tokens this long
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ sentAsStringVect.push_back(C_String(tmpToken));
+ }
+
+ return sentAsStringVect;
+}
+
+/**
+* \ingroup search
+*
+* Given the training corpus indexed by its suffix array,
+* output all the n-grams in a testing data that can be found in the training corpus
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+ fprintf(stderr,"\nOutput the matched n-gram types a testing data set given an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ map<C_String, double> matchedNgrams;
+ map<C_String, double>::iterator iterMatchedNgrams;
+
+
+ int maxSentLen = 4086;
+
+
+ char fileName[1000];
+ char tmpString[10000];
+
+ strcpy(fileName, argv[1]);
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(fileName, false, true);
+
+ cerr<<"Input sentences:\n";
+
+ long ltime1, ltime2;
+
+ time( &ltime1 );
+
+ int totalSentences = 0;
+ int matchedSentences = 0;
+ while(!cin.eof()){
+ cin.getline(tmpString,10000,'\n');
+
+ if(strlen(tmpString)>0){
+ vector<C_String> sentAsStringVector = convertTextToStringVector(tmpString);
+
+ int sentLen;
+ S_sentSearchTableElement * freqTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen);
+
+ if(sentLen!=sentAsStringVector.size()){
+ cerr<<"Something wrong, can not proceed.!\n";
+ exit(-1);
+ }
+
+
+ //go over the frequency table
+ for(int startPos = 0; startPos<sentLen; startPos++){
+ C_String ngram;
+ bool stillMatching = true;
+ int n=1;
+ while(stillMatching && (n<=(sentLen-startPos)) ){
+
+ ngram.appending(sentAsStringVector[startPos+n-1]);
+
+ int posInFreqTable = (n-1)*sentLen+startPos;
+ if(freqTable[posInFreqTable].found){
+ double frequency = freqTable[posInFreqTable].endingPosInSA - freqTable[posInFreqTable].startPosInSA + 1;
+
+ iterMatchedNgrams = matchedNgrams.find(ngram);
+ if(iterMatchedNgrams!=matchedNgrams.end()){ //exist already
+ iterMatchedNgrams->second=frequency; //the frequency value is not meaningful here; the map is used only as a set of n-gram types
+ }
+ else{
+ matchedNgrams.insert(make_pair(ngram, frequency));
+ }
+ }
+ else{
+ stillMatching = false;
+ }
+
+
+ ngram.appending(C_String(" "));
+
+ n++;
+ }
+ }
+
+ }
+
+ tmpString[0]=0;
+
+ }
+
+
+ //now output all the n-grams
+ iterMatchedNgrams = matchedNgrams.begin();
+ while(iterMatchedNgrams != matchedNgrams.end()){
+ cout<<(iterMatchedNgrams->first).toString()<<endl;
+
+ iterMatchedNgrams++;
+ }
+
+
+ time( &ltime2 );
+ cerr<<"Time spent:"<<ltime2-ltime2<<" seconds\n";
+
+ return 1;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~
new file mode 100755
index 0000000..5418db6
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~
@@ -0,0 +1,177 @@
+#include "stdio.h"
+#include "stdlib.h"
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+#include "_String.h"
+#include "_SuffixArraySearchApplicationBase.h"
+
+#include <time.h>
+#include <stdio.h>
+#include <map>
+
+using namespace std;
+
+
+vector<C_String> convertTextToStringVector(const char * sentText)
+{
+
+ vector<C_String> sentAsStringVect;
+
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+
+ int pos = 0;
+
+ int inputLen = strlen(sentText);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = sentText[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ sentAsStringVect.push_back(C_String(tmpToken));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+ if(pos>=MAX_TOKEN_LEN){ //we can handle it
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ sentAsStringVect.push_back(C_String(tmpToken));
+ }
+
+ return sentAsStringVect;
+}
+
+/**
+* \ingroup search
+*
+* Given the training corpus indexed by its suffix array,
+* output all the n-grams in a testing data that can be found in the training corpus
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char* argv[]){
+ //-----------------------------------------------------------------------------
+ //check parameter
+
+
+ if(argc<2){
+ fprintf(stderr,"\nOutput the matched n-gram types a testing data set given an indexed corpus\n");
+ fprintf(stderr,"\nUsage:\n");
+ fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]);
+
+ exit(0);
+ }
+
+
+ //-----------------------------------------------------------------------------
+
+ C_SuffixArraySearchApplicationBase SA;
+
+ map<C_String, double> matchedNgrams;
+ map<C_String, double>::iterator iterMatchedNgrams;
+
+
+ int maxSentLen = 4086;
+
+
+ char fileName[1000];
+ char tmpString[10000];
+
+ strcpy(fileName, argv[1]);
+
+ fprintf(stderr,"Loading data...\n");
+ SA.loadData_forSearch(fileName, false, true);
+
+ cerr<<"Input sentences:\n";
+
+ long ltime1, ltime2;
+
+ time( &ltime1 );
+
+ int totalSentences = 0;
+ int matchedSentences = 0;
+ while(!cin.eof()){
+ cin.getline(tmpString,10000,'\n');
+
+ if(strlen(tmpString)>0){
+ vector<C_String> sentAsStringVector = convertTextToStringVector(tmpString);
+
+ int sentLen;
+ S_sentSearchTableElement * freqTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen);
+
+ if(sentLen!=sentAsStringVector.size()){
+				cerr<<"Something is wrong, cannot proceed!\n";
+ exit(-1);
+ }
+
+
+ //go over the frequency table
+ for(int startPos = 0; startPos<sentLen; startPos++){
+ C_String ngram;
+ bool stillMatching = true;
+ int n=1;
+				while(stillMatching && (n<=(sentLen-startPos)) ){
+
+ ngram.appending(sentAsStringVector[startPos+n-1]);
+
+ int posInFreqTable = (n-1)*sentLen+startPos;
+ if(freqTable[posInFreqTable].found){
+ double frequency = freqTable[posInFreqTable].endingPosInSA - freqTable[posInFreqTable].startPosInSA + 1;
+
+ iterMatchedNgrams = matchedNgrams.find(ngram);
+						if(iterMatchedNgrams!=matchedNgrams.end()){ //exists already
+							iterMatchedNgrams->second=frequency;	//the frequency value is not meaningful here; the map just needs some value mapped to it
+ }
+ else{
+ matchedNgrams.insert(make_pair(ngram, frequency));
+ }
+ }
+ else{
+ stillMatching = false;
+ }
+
+
+ ngram.appending(C_String(" "));
+
+ n++;
+ }
+			}
+
+			free(freqTable);	//release the table malloc'ed by constructNgramSearchTable4SentWithLCP
+		}
+
+ tmpString[0]=0;
+
+ }
+
+
+ //now output all the n-grams
+ iterMatchedNgrams = matchedNgrams.begin();
+ while(iterMatchedNgrams != matchedNgrams.end()){
+ cout<<(iterMatchedNgrams->first).toString()<<endl;
+
+ iterMatchedNgrams++;
+ }
+
+
+ time( &ltime2 );
+	cerr<<"Time spent:"<<ltime2-ltime1<<" seconds\n";
+
+	return 0;
+}
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp
new file mode 100755
index 0000000..ebb2ed5
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp
@@ -0,0 +1,754 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <stdlib.h>
+#include <cstring>
+
+using namespace std;
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase()
+{
+
+ this->reportMaxOccurrenceOfOneNgram = -1;
+ this->highestFreqThresholdForReport = -1;
+ this->shortestUnitToReport = 1;
+ this->longestUnitToReport = -1; //no constraint
+
+ this->level1Buckets = NULL;
+ this->noLevel1Bucket = false; //by default, build level1 bucket
+
+ this->noOffset = false; //by default, load offset
+}
+
+C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase()
+{
+
+}
+
+/**
+* When findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in a sentence,
+* parameter highestFreqThresholdForReport can be set so that very high-frequency n-grams, such as the unigram "the", are skipped.
+* High-frequency n-grams occur too often in the corpus and their statistics can often be estimated offline.
+* Default value = -1 (no effective threshold)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport)
+{
+ this->highestFreqThresholdForReport = highestFreqThresholdForReport;
+}
+
+/**
+* When findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in a sentence,
+* parameter shortestUnitToReport can be set so that short n-grams are skipped to speed up the process
+* Default value = 1 (no effective constraint)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport)
+{
+ this->shortestUnitToReport = shortestUnitToReport;
+}
+
+/**
+* When findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in a sentence,
+* parameter longestUnitToReport can be set to skip long n-gram matches
+*
+* Default value = -1 (no effective limit, output all the matched n-grams no matter how long they are)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport)
+{
+ this->longestUnitToReport = longestUnitToReport;
+}
+
+/**
+* When findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in a sentence,
+* parameter reportMaxOccurrenceOfOneNgram can be set to output information on only the "first" few occurrences of each matched n-gram.
+* Since the order is based on the sorted order of the corresponding suffixes,
+* the output occurrences are usually not the first few occurrences of the n-gram in the corpus
+**/
+void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram)
+{
+ this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram;
+}
+
+
+
+/**
+* Load the indexed corpus, suffix array, offset and vocabulary into memory
+* Note: if C_SuffixArraySearchApplicationBase will be used in the application to return the sentenceId/offset in sentence for the matched n-gram
+* then noOffset needs to be set to false (to load the offset)
+**/
+void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset)
+{
+
+ this->loadData(filename, noVoc, noOffset, false); //call the constructor of the super class, load data and build level1Bucket
+
+ if(! this->noOffset){
+ TextLenType lastSentId;
+ unsigned char tmpOffset;
+ this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset);
+ this->totalSentNum = lastSentId;
+ }
+ else{
+ //we do not have offset information, simply travel to the sentence head
+ TextLenType pos = this->corpusSize-3;
+ while(this->corpus_list[pos]<this->sentIdStart){ //still actual words
+ pos--;
+ }
+ //at this position, it should be the <sentId> for the last sentence
+ this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1;
+ }
+ cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n";
+
+}
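+
+///Editorial usage sketch (added in editing, not part of the original SALM sources):
+///a minimal caller that loads an indexed corpus and reports its size. "corpusStem" is a
+///placeholder for the file name stem produced by the indexer.
+static void example_loadAndReport(const char * corpusStem)
+{
+	C_SuffixArraySearchApplicationBase sa;
+	sa.loadData_forSearch(corpusStem, false, false);	//load vocabulary and offset
+	cerr<<"Corpus holds "<<sa.returnTotalSentNumber()<<" sentences.\n";
+}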
+
+
+///return 0 if w = text
+///return 1 if w < text
+///return 2 if w > text
+///given that the prefix of lcp words are the same
+char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText)
+{
+
+ IndexType vocInText = this->corpus_list[posInText+lcp];
+
+ if(vocInWord == vocInText){
+ return 0;
+ }
+
+ if(vocInWord < vocInText){
+ return 1;
+ }
+
+ return 2;
+}
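+
+///Editorial worked example: with lcp=2 and the prefix "in the" already known to match,
+///comparing nextWord "house" against the suffix "in the yard ..." inspects only the word
+///at posInText+2; if "house" has the smaller vocId, the function returns 1 (phrase < text).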
+
+/** Utility function
+* Convert an input sentence as char string into a vector of C_String objects
+**/
+vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText)
+{
+ vector<C_String> sentAsStringVector;
+
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+
+ int pos = 0;
+
+ int inputLen = strlen(sentText);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = sentText[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ sentAsStringVector.push_back(C_String(tmpToken));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+			if(pos>=MAX_TOKEN_LEN){ //we cannot handle tokens this long
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ sentAsStringVector.push_back(C_String(tmpToken));
+ }
+
+ return sentAsStringVector;
+
+}
+
+/**
+* Utility function: convert a sentence as a vector of C_String to a vector of vocIDs
+**/
+vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector)
+{
+ if(this->noVocabulary){
+ cerr<<"Vocabulary not available!\n";
+ exit(-1);
+ }
+
+ vector<IndexType> sentAsVocIdVector;
+
+ for(int i=0;i<sentAsStringVector.size();i++){
+ sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i]));
+ }
+ return sentAsVocIdVector;
+}
+
+
+/**
+* Utility function:
+* Convert a sentence as character string to a vector of vocIDs
+**/
+vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText)
+{
+ vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText);
+ return this->convertCStringVectorToVocIdVector(sentAsCStringVector);
+}
+
+
+/**
+* If we know the range where the phrase is, search in this range for it.
+* Positions here are all positions in the SA, not positions in the text string.
+*
+* LCP indicates that all the suffixes in the range share a prefix of LCP words with the proposed n-gram phrase,
+* so we only need to compare the "nextWord" at position LCP+1
+*
+* return true if such a phrase can be found inside the range, false if not
+**/
+bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos)
+{
+ TextLenType leftPos, rightPos, middlePos;
+
+	//in case the phrase to be searched is beyond the bucket although the first LCP words are the same as this bucket
+	//e.g. the range corresponds to [ab, ad], but we are searching for (aa)
+ //so first step is to make sure the lcp+next word is still in this range
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){
+ //phrase+next word < text corresponding rangeStart, we could not find it inside this range
+ return false;
+ }
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){
+ //phrase+next word > text corresponding to rangeEnd
+ return false;
+ }
+
+ //now we are sure that text(SA[rangeStart]) <= phrase <= text(SA[rangeEnd])
+
+
+ //search for left bound ( the pos in text which is the min(text>=w))
+ //at any time, Left<w<=Right (actually Left<=w<=Right)
+ leftPos = rangeStartPos;
+ rightPos = rangeEndPos;
+ while( rightPos > (leftPos+1)){ //at the time when right = left +1, we should stop
+
+ middlePos = (TextLenType)((leftPos + rightPos) / 2);
+ if(((leftPos + rightPos) % 2) != 0){
+ middlePos++; //bias towards right
+ }
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){
+ // phrase <= middlePos in Text, go left
+ rightPos = middlePos;
+ }
+ else{
+ leftPos = middlePos; //word > middle, go right
+ }
+
+ }
+	//in the previous implementation we could guarantee that Left<w, because we took rangeStartPos-- from the original range
+	//here we can only guarantee that Left<=w, so we need to check whether Left==w at lcp
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){
+ resultStartPos = leftPos;
+ }
+ else{
+ resultStartPos = rightPos;
+ }
+
+ //search for right bound ( the value which is the max(text<=w))
+ //at any time, Left<w<=Right (actually Left<=w<=Right)
+ leftPos = rangeStartPos;
+ rightPos = rangeEndPos;
+ while( rightPos > (leftPos+1)){ //stop when right = left + 1
+ middlePos = (TextLenType) ((leftPos + rightPos) / 2 ); //bias towards left
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){ // phrase >= middlePos in Text, go right
+ leftPos = middlePos;
+ }
+ else{
+ rightPos = middlePos; // ==1, phrase < middlePos
+ }
+ }
+	//in the previous implementation we could guarantee that w<Right, because we took rangeEndPos++ from the original range
+	//here we can only guarantee that w<=Right, so we need to check whether Right==w at lcp
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){
+ resultEndPos = rightPos;
+ }
+ else{
+ resultEndPos = leftPos;
+ }
+
+ if(resultEndPos>=resultStartPos){
+ return true;
+ }
+
+ return false; //could not find this phrase
+}
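+
+///Editorial sketch of the bound semantics above, assuming a toy SA range over the suffixes
+///[aa, ab, ab, ac] with nextWord=b at lcp=1: the left search converges on the first "ab"
+///(the smallest suffix >= the phrase) and the right search on the last "ab" (the largest
+///suffix <= the phrase), so resultStartPos/resultEndPos bracket exactly the two "ab" entries.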
+
+///memory allocated here, remember to free the memory when the table is not needed any more in the
+///calling function
+S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(sentText);
+ sentLen = sentInVocId.size();
+
+ return this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+}
+
+
+///constructing the n-gram search table
+///memory allocated here, remember to free the memory when the table is not needed any more in the
+///calling function
+///
+///faster than constructNgramSearchTable4Sent because the suffixes in the range given by the n-1 gram are
+///guaranteed to have their first n-1 words identical to the n-1 gram,
+///only needs to compare the following one word
+///
+/// for a sentence w1, w2, ...
+/// cell [i,j] in the table is for the n-gram w_(j+1)...w_(j+i+1), that is, an
+/// (i+1)-gram starting at position j+1 in the sentence
+S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+ S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement));
+
+ //for consistency, initialize all cells
+ for(int c=0;c<(sentLen*sentLen);c++){
+ table[c].found = false;
+ table[c].startPosInSA = 0;
+ table[c].endingPosInSA = 0;
+ }
+
+ TextLenType startPos, endPos;
+
+ //initialize word level elements
+ for(int i=0;i<sentLen;i++){
+ IndexType vocId = sentInVocId[i];
+ //cout<<vocId<<" ";
+		if((vocId==0)||(vocId>=this->sentIdStart)){ //vocId==0 means this word is OOV (<unk>); vocId>=sentIdStart means the word is unknown in this corpus
+ table[i].found = false;
+ }
+ else{
+ table[i].startPosInSA = this->level1Buckets[vocId].first;
+ table[i].endingPosInSA = this->level1Buckets[vocId].last;
+
+ if(table[i].startPosInSA<=table[i].endingPosInSA){
+ table[i].found = true;
+ }
+ else{ //because vocabulary is built on top of an existing voc, this corpus may not have all the occurrences of all the words in the voc
+ table[i].found = false;
+ }
+ }
+ }
+
+
+ //filling in the cells in the table row by row
+ //basically this means we start by looking for smaller units first
+ //if they are found, search for longer n-grams
+	for(int n=1;n<sentLen;n++){ //finding the n+1 gram; when n=sentLen-1, we are searching for the occurrence of the whole sentence
+ int levelN_1_0 = (n - 1) * sentLen; //map from two dimensional position to one-dimension
+ int levelN_0 = n * sentLen;
+ for(int j=0;j<= (sentLen - 1 - n); j++){ //possible starting point for n+1 gram
+ //necessary conditions that this n+1 gram exist are:
+			//a necessary condition for this n+1 gram to exist is that
+			//both sub n-grams exist in the corpus
+ IndexType nextWord = sentInVocId[j+n]; //the last word of the n+1 gram
+
+ //n+1 gram has to be in the range of the n-gram in SA
+ startPos = table[levelN_1_0 + j].startPosInSA;
+ endPos = table[levelN_1_0 + j].endingPosInSA;
+
+ TextLenType foundPosStart = 0;
+ TextLenType foundPosEnd = 0;
+
+ //the prefix of n words of all suffixes between [startPos, endPos] is the same as the
+ //prefix of the n words in the proposed n+1 gram, no need to compare
+ //only need to compare the n+1 word, which is "nextWord" here
+ if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){
+ table[levelN_0 + j].found = true;
+ table[levelN_0 + j].startPosInSA = foundPosStart;
+ table[levelN_0 + j].endingPosInSA = foundPosEnd;
+ }
+ else{
+ table[levelN_0 + j].found = false;
+ }
+
+ }
+ else{
+ table[levelN_0 + j].found = false;
+ }
+ }
+ }
+ return table;
+}
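+
+///Editorial usage sketch (added in editing, not part of the original SALM sources):
+///walk the table using the row-major layout cell = (n-1)*sentLen + startPos documented above
+///and print the corpus frequency of every matched n-gram.
+static void example_walkSearchTable(C_SuffixArraySearchApplicationBase & sa, const char * sent)
+{
+	int sentLen = 0;
+	S_sentSearchTableElement * table = sa.constructNgramSearchTable4SentWithLCP(sent, sentLen);
+	for(int n=1; n<=sentLen; n++){
+		for(int startPos=0; startPos<=(sentLen-n); startPos++){
+			S_sentSearchTableElement & cell = table[(n-1)*sentLen + startPos];
+			if(cell.found){
+				cout<<n<<"-gram at "<<startPos<<" freq="<<(cell.endingPosInSA - cell.startPosInSA + 1)<<endl;
+			}
+		}
+	}
+	free(table);	//the table was malloc'ed by the callee
+}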
+
+void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(sent);
+ this->displayNgramMatchingFreq4Sent(sentInVocId);
+}
+
+void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+
+ int i,j;
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ //show sentence
+ cout<<"\t";
+ for(i=0;i<sentLen;i++){
+ cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t";
+ }
+ cout<<endl;
+
+ //show frequency of each n-gram
+ i=0;
+ bool stillMatch = true;
+ while(stillMatch &&( i<sentLen)){
+ cout<<i+1<<"\t";
+ int startForRow = i*sentLen;
+ bool anyGood = false;
+ for(j=0;j<= (sentLen - 1 - i); j++){
+ if(table[startForRow+j].found){
+ //this is for regular case
+				if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){ //at least one occurrence
+ cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1;
+ anyGood = true;
+ }
+ else{
+ cout<<"0";
+ }
+
+ }
+ else{
+ cout<<"0";
+ }
+ cout<<"\t";
+ }
+
+ stillMatch = anyGood;
+ cout<<endl;
+ i++;
+ }
+
+ free(table);
+}
+
+///given the pos of a word in corpus, return its offset in the sentence
+///and the sentence ID
+///offset has to be loaded
+///we do not check it here for efficiency reasons
+void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset)
+{
+ offset = this->offset_list[pos];
+ sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
+
+ offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
+}
+
+void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen)
+{
+ offset = this->offset_list[pos];
+ sentLen = this->offset_list[pos-offset];
+ sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
+
+ offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
+}
+
+vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs)
+{
+ if(srcSentAsVocIDs.size()>255){
+		cerr<<"Sorry, I can only handle sentences with at most 255 words. Please shorten the sentence and try again.\n";
+ exit(0);
+ }
+
+ unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs);
+
+ //Now, we know all the n-grams we are looking for
+ //output the results
+ vector<S_phraseLocationElement> allFoundNgrams;
+ S_phraseLocationElement tmpNode;
+
+ int longestUnitToReportForThisSent = sentLen;
+ if(this->longestUnitToReport!=-1){
+ //and if longestUnitToReport is shorter than sentLen
+ if(this->longestUnitToReport<sentLen){
+ longestUnitToReportForThisSent = this->longestUnitToReport;
+ }
+ }
+
+ for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){
+ int firstPosInRow = r*sentLen;
+ for(unsigned char c=0; c<= (sentLen - 1 - r); c++){
+ if(table[firstPosInRow + c].found){ //at this position the ngram was found
+ tmpNode.posStartInSrcSent = c + 1; //position starts from 1
+ tmpNode.posEndInSrcSent = r + c + 1;
+
+				//now, for all occurrences, find their sentIds and relative positions
+ TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA;
+ TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA;
+
+ if( (this->highestFreqThresholdForReport <= 0) || //no limit
+ ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport ))
+ ){
+					//we don't want to retrieve high-frequency n-grams, which is very time-consuming
+					//and meaningless for translation, e.g. 1M occurrences of "of the" in the corpus
+
+
+ if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){
+ //and for each n-gram, report only a limited amount of occurrences
+ endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1;
+ }
+
+ TextLenType sentId;
+ unsigned char posInSent;
+ for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; iterator++ ){
+ this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent);
+ tmpNode.sentIdInCorpus = sentId;
+ tmpNode.posInSentInCorpus = posInSent;
+
+ allFoundNgrams.push_back(tmpNode);
+ }
+ }
+ }
+
+ }
+ }
+
+ free(table);
+
+ return allFoundNgrams;
+}
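+
+///Editorial usage sketch (added in editing, not part of the original SALM sources):
+///locate every reportable n-gram of a sentence and print where it occurs in the corpus;
+///offsets must have been loaded (loadData_forSearch with noOffset=false).
+static void example_findPhrases(C_SuffixArraySearchApplicationBase & sa, const char * sent)
+{
+	vector<S_phraseLocationElement> hits = sa.findPhrasesInASentence(sent);
+	for(int i=0; i<hits.size(); i++){
+		cout<<"src["<<(int)hits[i].posStartInSrcSent<<","<<(int)hits[i].posEndInSrcSent
+			<<"] -> sent "<<hits[i].sentIdInCorpus<<" pos "<<(int)hits[i].posInSentInCorpus<<endl;
+	}
+}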
+
+vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIDs
+ vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent);
+
+ return this->findPhrasesInASentence(srcSentAsVocIDs);
+}
+
+
+bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd)
+{
+ int phraseLen = phrase.size();
+
+ //first check if there are any <unk> in the phrase
+ for(int i=0;i<phrase.size();i++){
+ if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){
+ return false; //return empty matching result
+ }
+ }
+
+ TextLenType currentRangeStart, currentRangeEnd;
+ TextLenType narrowedRangeStart, narrowedRangeEnd;
+ IndexType vocId;
+
+ //for word 1
+ vocId = phrase[0];
+ currentRangeStart = this->level1Buckets[vocId].first;
+ currentRangeEnd = this->level1Buckets[vocId].last;
+
+ if(currentRangeStart>currentRangeEnd){
+ return false; //even this 1-gram does not exist
+ }
+
+ int posInPhrase = 1;
+ while( posInPhrase<phraseLen ){
+ vocId = phrase[posInPhrase];
+ bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd);
+
+ if(! stillExist){
+ return false;
+ }
+
+ currentRangeStart = narrowedRangeStart;
+ currentRangeEnd = narrowedRangeEnd;
+
+ posInPhrase++;
+ }
+
+	//we have found the range of the matching phrase; return it
+ rangeStart = currentRangeStart;
+ rangeEnd = currentRangeEnd;
+
+ return true;
+}
+
+///similar to constructing the freq table,
+///but only searches for the exact phrase match
+///Important: because locateSendIdFromPos is called, which requires the offset information,
+///the suffix array has to be initialized with the offset loaded,
+///i.e. initialized with loadData_forSearch(corpusName, noVoc, noOffset=false);
+///otherwise the program will have a segmentation fault
+///SALM does not check whether the offset has been loaded, for efficiency reasons, because locateSendIdFromPos() is called frequently
+vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase)
+{
+ vector<S_SimplePhraseLocationElement> matchingResult;
+
+ TextLenType rangeStart, rangeEnd;
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+		//we found at least one match
+ S_SimplePhraseLocationElement tmpNode;
+ for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){
+ this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus);
+ matchingResult.push_back(tmpNode);
+ }
+ }
+
+ return matchingResult;
+}
+
+vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->locateExactPhraseInCorpus(phraseAsVocIDs);
+}
+
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase)
+{
+ TextLenType rangeStart, rangeEnd;
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ return rangeEnd - rangeStart + 1;
+ }
+
+ return 0;
+}
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->freqOfExactPhraseMatch(phraseAsVocIDs);
+}
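+
+///Editorial usage sketch (added in editing, not part of the original SALM sources):
+///count one exact phrase and, when offsets are loaded, list every place it occurs.
+static void example_locatePhrase(C_SuffixArraySearchApplicationBase & sa, const char * phrase)
+{
+	cout<<"\""<<phrase<<"\" occurs "<<sa.freqOfExactPhraseMatch(phrase)<<" times\n";
+	vector<S_SimplePhraseLocationElement> locations = sa.locateExactPhraseInCorpus(phrase);
+	for(int i=0; i<locations.size(); i++){
+		cout<<"  sent "<<locations[i].sentIdInCorpus<<" pos "<<(int)locations[i].posInSentInCorpus<<endl;
+	}
+}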
+
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen)
+{
+ TextLenType rangeStart, rangeEnd;
+
+ sentLen = phrase.size();
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ startPosInSA = rangeStart;
+ return rangeEnd - rangeStart + 1;
+ }
+
+ return 0;
+}
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen);
+}
+
+
+TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber()
+{
+ return this->totalSentNum;
+}
+
+///given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n>
+///startingPosInSrcSent starts at 0, n is the n-gram length
+void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
+{
+ n = index / sentLen + 1;
+ posInSrcSent = index % sentLen;
+}
+
+///given the starting position in src sentence and the length of the n-gram
+///calculate the index in the table
+///posInSent starts at 0, n is the actual len of n-gram, starts at 1
+unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
+{
+ unsigned int indexInTable = (n-1)*sentLen + posInSent;
+
+ return indexInTable;
+}
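+
+///Editorial worked example: for sentLen=10, the trigram starting at position 4 maps to
+///index (3-1)*10+4 = 24; the inverse mapping recovers n = 24/10+1 = 3 and posInSrcSent = 24%10 = 4.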
+
+///simply return how many n-grams are matched
+unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
+ return this->numberOfMatcedNgram(sentInVocId);
+}
+
+///simply return how many n-grams are matched
+unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ unsigned int totalMatched = 0;
+
+ for(unsigned int i=0;i<(sentLen*sentLen);i++){
+ if(table[i].found){
+ totalMatched++;
+ }
+ }
+
+ free(table);
+ return totalMatched;
+}
+
+
+map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
+ return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen);
+}
+
+map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen)
+{
+ sentLen = sentInVocId.size();
+ map<int, pair<int, unsigned long> > nGramMatched;
+ map<int, pair<int, unsigned long> >::iterator iterNGramMatched;
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ for(int n = 1; n <= sentLen; n++){
+ for(int startPos=0; startPos <= (sentLen - n); startPos++){
+ int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen);
+
+ if(table[indexInTable].found){
+
+ unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1;
+ iterNGramMatched = nGramMatched.find(n);
+ if(iterNGramMatched==nGramMatched.end()){//has not seen this before
+ nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) ));
+ }
+ else{
+ iterNGramMatched->second.first++;
+ iterNGramMatched->second.second+=freqInTraining;
+ }
+ }
+ }
+ }
+
+ free(table);
+
+ return nGramMatched;
+}
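+
+///Editorial usage sketch (added in editing, not part of the original SALM sources):
+///print, for each n, how many n-grams of the sentence matched and their total
+///frequency in the training corpus.
+static void example_matchingStat(C_SuffixArraySearchApplicationBase & sa, const char * sent)
+{
+	int sentLen = 0;
+	map<int, pair<int, unsigned long> > stat = sa.returnNGramMatchingStatForOneSent(sent, sentLen);
+	for(map<int, pair<int, unsigned long> >::iterator it = stat.begin(); it != stat.end(); it++){
+		cout<<it->first<<"-grams: "<<it->second.first<<" matched, total corpus freq "<<it->second.second<<endl;
+	}
+}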
+
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~
new file mode 100755
index 0000000..94d272c
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~
@@ -0,0 +1,753 @@
+/**
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+
+#include "_SuffixArraySearchApplicationBase.h"
+#include <iostream>
+#include <stdlib.h>
+
+using namespace std;
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase()
+{
+
+ this->reportMaxOccurrenceOfOneNgram = -1;
+ this->highestFreqThresholdForReport = -1;
+ this->shortestUnitToReport = 1;
+ this->longestUnitToReport = -1; //no constraint
+
+ this->level1Buckets = NULL;
+ this->noLevel1Bucket = false; //by default, build level1 bucket
+
+ this->noOffset = false; //by default, load offset
+}
+
+C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase()
+{
+
+}
+
+/**
+* When findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in a sentence,
+* parameter highestFreqThresholdForReport can be set so that very high-frequency n-grams, such as the unigram "the", are skipped.
+* High-frequency n-grams occur too often in the corpus and their statistics can often be estimated offline.
+* Default value = -1 (no effective threshold)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport)
+{
+ this->highestFreqThresholdForReport = highestFreqThresholdForReport;
+}
+
+/**
+* When findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in a sentence,
+* parameter shortestUnitToReport can be set so that short n-grams are skipped to speed up the process
+* Default value = 1 (no effective constraint)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport)
+{
+ this->shortestUnitToReport = shortestUnitToReport;
+}
+
+/**
+* When findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in a sentence,
+* parameter longestUnitToReport can be set to skip long n-gram matches
+*
+* Default value = -1 (no effective limit, output all the matched n-grams no matter how long they are)
+**/
+void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport)
+{
+ this->longestUnitToReport = longestUnitToReport;
+}
+
+/**
+* When findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in a sentence,
+* parameter reportMaxOccurrenceOfOneNgram can be set to output information on only the "first" few occurrences of each matched n-gram.
+* Since the order is based on the sorted order of the corresponding suffixes,
+* the output occurrences are usually not the first few occurrences of the n-gram in the corpus
+**/
+void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram)
+{
+ this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram;
+}
+
+
+
+/**
+* Load the indexed corpus, suffix array, offset and vocabulary into memory
+* Note: if C_SuffixArraySearchApplicationBase will be used in the application to return the sentenceId/offset in sentence for the matched n-gram
+* then noOffset needs to be set to false (to load the offset)
+**/
+void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset)
+{
+
+ this->loadData(filename, noVoc, noOffset, false); //call the constructor of the super class, load data and build level1Bucket
+
+ if(! this->noOffset){
+ TextLenType lastSentId;
+ unsigned char tmpOffset;
+ this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset);
+ this->totalSentNum = lastSentId;
+ }
+ else{
+ //we do not have offset information, simply travel to the sentence head
+ TextLenType pos = this->corpusSize-3;
+ while(this->corpus_list[pos]<this->sentIdStart){ //still actual words
+ pos--;
+ }
+ //at this position, it should be the <sentId> for the last sentence
+ this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1;
+ }
+ cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n";
+
+}
+
+
+///return 0 if w = text
+///return 1 if w < text
+///return 2 if w > text
+///given that the prefix of lcp words are the same
+char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText)
+{
+
+ IndexType vocInText = this->corpus_list[posInText+lcp];
+
+ if(vocInWord == vocInText){
+ return 0;
+ }
+
+ if(vocInWord < vocInText){
+ return 1;
+ }
+
+ return 2;
+}
+
+/** Utility function
+* Convert an input sentence as char string into a vector of C_String objects
+**/
+vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText)
+{
+ vector<C_String> sentAsStringVector;
+
+ char tmpToken[MAX_TOKEN_LEN];
+ memset(tmpToken,0,MAX_TOKEN_LEN);
+
+ int pos = 0;
+
+ int inputLen = strlen(sentText);
+
+ for(int posInInput = 0; posInInput<inputLen; posInInput++){
+ char thisChar = sentText[posInInput];
+
+ if((thisChar==' ')||(thisChar=='\t')){ //delimiters
+ if(strlen(tmpToken)>0){
+ tmpToken[pos] = '\0';
+ sentAsStringVector.push_back(C_String(tmpToken));
+ pos=0;
+ tmpToken[pos] = '\0';
+ }
+ }
+ else{
+ tmpToken[pos] = thisChar;
+ pos++;
+			if(pos>=MAX_TOKEN_LEN){ //we cannot handle tokens this long
+ fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN);
+ exit(0);
+ }
+ }
+ }
+
+ tmpToken[pos] = '\0';
+ if(strlen(tmpToken)>0){
+ sentAsStringVector.push_back(C_String(tmpToken));
+ }
+
+ return sentAsStringVector;
+
+}
+
+/**
+* Utility function: convert a sentence as a vector of C_String to a vector of vocIDs
+**/
+vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector)
+{
+ if(this->noVocabulary){
+ cerr<<"Vocabulary not available!\n";
+ exit(-1);
+ }
+
+ vector<IndexType> sentAsVocIdVector;
+
+ for(int i=0;i<sentAsStringVector.size();i++){
+ sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i]));
+ }
+ return sentAsVocIdVector;
+}
+
+
+/**
+* Utility function:
+* Convert a sentence as character string to a vector of vocIDs
+**/
+vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText)
+{
+ vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText);
+ return this->convertCStringVectorToVocIdVector(sentAsCStringVector);
+}
+
+
+/**
+* If we know the range where the phrase is, search in this range for it.
+* Positions here are all positions in the SA, not positions in the text string.
+*
+* LCP indicates that all the suffixes in the range share a prefix of LCP words with the proposed n-gram phrase,
+* so we only need to compare the "nextWord" at position LCP+1
+*
+* return true if such a phrase can be found inside the range, false if not
+**/
+bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos)
+{
+ TextLenType leftPos, rightPos, middlePos;
+
+	//in case the phrase to be searched is beyond the bucket although the first LCP words are the same as this bucket
+	//e.g. the range corresponds to [ab, ad], but we are searching for (aa)
+ //so first step is to make sure the lcp+next word is still in this range
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){
+ //phrase+next word < text corresponding rangeStart, we could not find it inside this range
+ return false;
+ }
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){
+ //phrase+next word > text corresponding to rangeEnd
+ return false;
+ }
+
+ //now we are sure that text(SA[rangeStart]) <= phrase <= text(SA[rangeEnd])
+
+
+ //search for left bound ( the pos in text which is the min(text>=w))
+ //at any time, Left<w<=Right (actually Left<=w<=Right)
+ leftPos = rangeStartPos;
+ rightPos = rangeEndPos;
+ while( rightPos > (leftPos+1)){ //at the time when right = left +1, we should stop
+
+ middlePos = (TextLenType)((leftPos + rightPos) / 2);
+ if(((leftPos + rightPos) % 2) != 0){
+ middlePos++; //bias towards right
+ }
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){
+ // phrase <= middlePos in Text, go left
+ rightPos = middlePos;
+ }
+ else{
+ leftPos = middlePos; //word > middle, go right
+ }
+
+ }
+	//in the previous implementation we could guarantee that Left<w, because we took rangeStartPos-- from the original range
+	//here we can only guarantee that Left<=w, so we need to check whether Left==w at lcp
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){
+ resultStartPos = leftPos;
+ }
+ else{
+ resultStartPos = rightPos;
+ }
+
+ //search for right bound ( the value which is the max(text<=w))
+ //at any time, Left<w<=Right (actually Left<=w<=Right)
+ leftPos = rangeStartPos;
+ rightPos = rangeEndPos;
+ while( rightPos > (leftPos+1)){ //stop when right = left + 1
+ middlePos = (TextLenType) ((leftPos + rightPos) / 2 ); //bias towards left
+
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){ // phrase >= middlePos in Text, go right
+ leftPos = middlePos;
+ }
+ else{
+ rightPos = middlePos; // ==1, phrase < middlePos
+ }
+ }
+	//in the previous implementation we could guarantee that w<Right, because we took rangeEndPos++ from the original range
+	//here we can only guarantee that w<=Right, so we need to check whether Right==w at lcp
+ if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){
+ resultEndPos = rightPos;
+ }
+ else{
+ resultEndPos = leftPos;
+ }
+
+ if(resultEndPos>=resultStartPos){
+ return true;
+ }
+
+ return false; //could not find this phrase
+}
+
+///memory allocated here, remember to free the memory when the table is not needed any more in the
+///calling function
+S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(sentText);
+ sentLen = sentInVocId.size();
+
+ return this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+}
+
+
+///constructing the n-gram search table
+///memory allocated here, remember to free the memory when the table is not needed any more in the
+///calling function
+///
+///faster than constructNgramSearchTable4Sent because the suffixes in the range given by the n-1 gram are
+///guaranteed to have their first n-1 words identical to the n-1 gram,
+///only needs to compare the following one word
+///
+/// for a sentence w1, w2, ...
+/// cell [i,j] in the table is for the n-gram w_(j+1)...w_(j+i+1), that is, an
+/// (i+1)-gram starting at position j+1 in the sentence
+S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+ S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement));
+
+ //for consistency, initialize all cells
+ for(int c=0;c<(sentLen*sentLen);c++){
+ table[c].found = false;
+ table[c].startPosInSA = 0;
+ table[c].endingPosInSA = 0;
+ }
+
+ TextLenType startPos, endPos;
+
+ //initialize word level elements
+ for(int i=0;i<sentLen;i++){
+ IndexType vocId = sentInVocId[i];
+ //cout<<vocId<<" ";
+		if((vocId==0)||(vocId>=this->sentIdStart)){ //vocId==0 means this word is OOV (<unk>); vocId>=sentIdStart means the word is unknown in this corpus
+ table[i].found = false;
+ }
+ else{
+ table[i].startPosInSA = this->level1Buckets[vocId].first;
+ table[i].endingPosInSA = this->level1Buckets[vocId].last;
+
+ if(table[i].startPosInSA<=table[i].endingPosInSA){
+ table[i].found = true;
+ }
+ else{ //because vocabulary is built on top of an existing voc, this corpus may not have all the occurrences of all the words in the voc
+ table[i].found = false;
+ }
+ }
+ }
+
+
+ //filling in the cells in the table row by row
+ //basically this means we start by looking for smaller units first
+ //if they are found, search for longer n-grams
+	for(int n=1;n<sentLen;n++){ //finding the n+1 gram; when n=sentLen-1, we are searching for the occurrence of the whole sentence
+ int levelN_1_0 = (n - 1) * sentLen; //map from two dimensional position to one-dimension
+ int levelN_0 = n * sentLen;
+ for(int j=0;j<= (sentLen - 1 - n); j++){ //possible starting point for n+1 gram
+			//a necessary condition for this n+1 gram to exist is that
+			//both sub n-grams exist in the corpus
+ if( table[levelN_1_0 + j].found && table[levelN_1_0 + j +1].found){
+ IndexType nextWord = sentInVocId[j+n]; //the last word of the n+1 gram
+
+ //n+1 gram has to be in the range of the n-gram in SA
+ startPos = table[levelN_1_0 + j].startPosInSA;
+ endPos = table[levelN_1_0 + j].endingPosInSA;
+
+ TextLenType foundPosStart = 0;
+ TextLenType foundPosEnd = 0;
+
+ //the prefix of n words of all suffixes between [startPos, endPos] is the same as the
+ //prefix of the n words in the proposed n+1 gram, no need to compare
+ //only need to compare the n+1 word, which is "nextWord" here
+ if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){
+ table[levelN_0 + j].found = true;
+ table[levelN_0 + j].startPosInSA = foundPosStart;
+ table[levelN_0 + j].endingPosInSA = foundPosEnd;
+ }
+ else{
+ table[levelN_0 + j].found = false;
+ }
+
+ }
+ else{
+ table[levelN_0 + j].found = false;
+ }
+ }
+ }
+ return table;
+}
+
+void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(sent);
+ this->displayNgramMatchingFreq4Sent(sentInVocId);
+}
+
+void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+
+ int i,j;
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ //show sentence
+ cout<<"\t";
+ for(i=0;i<sentLen;i++){
+ cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t";
+ }
+ cout<<endl;
+
+ //show frequency of each n-gram
+ i=0;
+ bool stillMatch = true;
+ while(stillMatch &&( i<sentLen)){
+ cout<<i+1<<"\t";
+ int startForRow = i*sentLen;
+ bool anyGood = false;
+ for(j=0;j<= (sentLen - 1 - i); j++){
+ if(table[startForRow+j].found){
+ //this is for regular case
+				if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){ //at least one occurrence
+ cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1;
+ anyGood = true;
+ }
+ else{
+ cout<<"0";
+ }
+
+ }
+ else{
+ cout<<"0";
+ }
+ cout<<"\t";
+ }
+
+ stillMatch = anyGood;
+ cout<<endl;
+ i++;
+ }
+
+ free(table);
+}
+
+///given the pos of a word in corpus, return its offset in the sentence
+///and the sentence ID
+///offset has to be loaded
+///we do not check it here for efficiency reasons
+void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset)
+{
+ offset = this->offset_list[pos];
+ sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
+
+ offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
+}
+
+void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen)
+{
+ offset = this->offset_list[pos];
+ sentLen = this->offset_list[pos-offset];
+ sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1;
+
+ offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus
+}
+
+vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs)
+{
+ if(srcSentAsVocIDs.size()>255){
+		cerr<<"Sorry, I can only handle sentences with at most 255 words. Please shorten the sentence and try again.\n";
+ exit(0);
+ }
+
+ unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs);
+
+ //Now, we know all the n-grams we are looking for
+ //output the results
+ vector<S_phraseLocationElement> allFoundNgrams;
+ S_phraseLocationElement tmpNode;
+
+ int longestUnitToReportForThisSent = sentLen;
+ if(this->longestUnitToReport!=-1){
+ //and if longestUnitToReport is shorter than sentLen
+ if(this->longestUnitToReport<sentLen){
+ longestUnitToReportForThisSent = this->longestUnitToReport;
+ }
+ }
+
+ for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){
+ int firstPosInRow = r*sentLen;
+ for(unsigned char c=0; c<= (sentLen - 1 - r); c++){
+ if(table[firstPosInRow + c].found){ //at this position the ngram was found
+ tmpNode.posStartInSrcSent = c + 1; //position starts from 1
+ tmpNode.posEndInSrcSent = r + c + 1;
+
+				//now, for all occurrences, find their sentIds and relative positions
+ TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA;
+ TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA;
+
+ if( (this->highestFreqThresholdForReport <= 0) || //no limit
+ ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport ))
+ ){
+					//we don't want to retrieve high-frequency n-grams, which is very time-consuming
+					//and meaningless for translation, e.g. 1M occurrences of "of the" in the corpus
+
+
+ if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){
+ //and for each n-gram, report only a limited amount of occurrences
+ endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1;
+ }
+
+ TextLenType sentId;
+ unsigned char posInSent;
+ for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; iterator++ ){
+ this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent);
+ tmpNode.sentIdInCorpus = sentId;
+ tmpNode.posInSentInCorpus = posInSent;
+
+ allFoundNgrams.push_back(tmpNode);
+ }
+ }
+ }
+
+ }
+ }
+
+ free(table);
+
+ return allFoundNgrams;
+}
+
+vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIDs
+ vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent);
+
+ return this->findPhrasesInASentence(srcSentAsVocIDs);
+}
+
+
+bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd)
+{
+ int phraseLen = phrase.size();
+
+ //first check if there are any <unk> in the phrase
+ for(int i=0;i<phrase.size();i++){
+ if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){
+ return false; //return empty matching result
+ }
+ }
+
+ TextLenType currentRangeStart, currentRangeEnd;
+ TextLenType narrowedRangeStart, narrowedRangeEnd;
+ IndexType vocId;
+
+ //for word 1
+ vocId = phrase[0];
+ currentRangeStart = this->level1Buckets[vocId].first;
+ currentRangeEnd = this->level1Buckets[vocId].last;
+
+ if(currentRangeStart>currentRangeEnd){
+ return false; //even this 1-gram does not exist
+ }
+
+ int posInPhrase = 1;
+ while( posInPhrase<phraseLen ){
+ vocId = phrase[posInPhrase];
+ bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd);
+
+ if(! stillExist){
+ return false;
+ }
+
+ currentRangeStart = narrowedRangeStart;
+ currentRangeEnd = narrowedRangeEnd;
+
+ posInPhrase++;
+ }
+
+	//we have found the range of the matching phrase; return it
+ rangeStart = currentRangeStart;
+ rangeEnd = currentRangeEnd;
+
+ return true;
+}
+
+///similar to constructing the freq table,
+///but only searches for the exact phrase match
+///Important: because locateSendIdFromPos is called, which requires the offset information,
+///the suffix array has to be initialized with the offset loaded,
+///i.e. initialized with loadData_forSearch(corpusName, noVoc, noOffset=false);
+///otherwise the program will have a segmentation fault
+///SALM does not check whether the offset has been loaded, for efficiency reasons, because locateSendIdFromPos() is called frequently
+vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase)
+{
+ vector<S_SimplePhraseLocationElement> matchingResult;
+
+ TextLenType rangeStart, rangeEnd;
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+		//we found at least one match
+ S_SimplePhraseLocationElement tmpNode;
+ for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){
+ this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus);
+ matchingResult.push_back(tmpNode);
+ }
+ }
+
+ return matchingResult;
+}
+
+vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->locateExactPhraseInCorpus(phraseAsVocIDs);
+}
+
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase)
+{
+ TextLenType rangeStart, rangeEnd;
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ return rangeEnd - rangeStart + 1;
+ }
+
+ return 0;
+}
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->freqOfExactPhraseMatch(phraseAsVocIDs);
+}
+
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen)
+{
+ TextLenType rangeStart, rangeEnd;
+
+ sentLen = phrase.size();
+
+ if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){
+ startPosInSA = rangeStart;
+ return rangeEnd - rangeStart + 1;
+ }
+
+ return 0;
+}
+
+TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen)
+{
+ //use the vocabulary associated with this corpus to convert words to vocIds
+ vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase);
+
+ return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen);
+}
+
+
+TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber()
+{
+ return this->totalSentNum;
+}
+
+///given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n>
+///startingPosInSrcSent starts at 0, n is the n-gram length
+void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n)
+{
+ n = index / sentLen + 1;
+ posInSrcSent = index % sentLen;
+}
+
+///given the starting position in src sentence and the length of the n-gram
+///calculate the index in the table
+///posInSent starts at 0, n is the actual len of n-gram, starts at 1
+unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen)
+{
+ unsigned int indexInTable = (n-1)*sentLen + posInSent;
+
+ return indexInTable;
+}
+
+///simply return how many n-grams are matched
+unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
+ return this->numberOfMatcedNgram(sentInVocId);
+}
+
+///simply return how many n-grams are matched
+unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId)
+{
+ int sentLen = sentInVocId.size();
+
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ unsigned int totalMatched = 0;
+
+ for(unsigned int i=0;i<(sentLen*sentLen);i++){
+ if(table[i].found){
+ totalMatched++;
+ }
+ }
+
+ free(table);
+ return totalMatched;
+}
+
+
+map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen)
+{
+ vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent);
+ return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen);
+}
+
+map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen)
+{
+ sentLen = sentInVocId.size();
+ map<int, pair<int, unsigned long> > nGramMatched;
+ map<int, pair<int, unsigned long> >::iterator iterNGramMatched;
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId);
+
+ for(int n = 1; n <= sentLen; n++){
+ for(int startPos=0; startPos <= (sentLen - n); startPos++){
+ int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen);
+
+ if(table[indexInTable].found){
+
+ unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1;
+ iterNGramMatched = nGramMatched.find(n);
+ if(iterNGramMatched==nGramMatched.end()){//has not seen this before
+ nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) ));
+ }
+ else{
+ iterNGramMatched->second.first++;
+ iterNGramMatched->second.second+=freqInTraining;
+ }
+ }
+ }
+ }
+
+ free(table);
+
+ return nGramMatched;
+}
+
diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h
new file mode 100755
index 0000000..2c0070d
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h
@@ -0,0 +1,127 @@
+#if !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_)
+#define __SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_
+
+#include "_SuffixArrayApplicationBase.h"
+/**
+* \ingroup search
+* Used by locateExactPhraseInCorpus() to return the location of a matched n-gram in the corpus
+* as a pair of <sentenceId, offset pos in sentence>
+**/
+typedef struct simplePhraseLocationElement
+{
+ TextLenType sentIdInCorpus;
+ unsigned char posInSentInCorpus;
+}S_SimplePhraseLocationElement;
+
+/**
+* \ingroup search
+* Used by findPhrasesInASentence() to return the location of an embedded n-gram in the corpus
+* <posStartInSrcSent, posEndInSrcSent> represents the embedded n-gram in the sentence
+* <sentIdInCorpus, posInSentInCorpus> represents the location in the corpus
+**/
+typedef struct phraseLocationElement
+{
+ unsigned char posStartInSrcSent;
+ unsigned char posEndInSrcSent;
+ TextLenType sentIdInCorpus;
+ unsigned char posInSentInCorpus;
+}S_phraseLocationElement;
+
+/**
+* \ingroup search
+**/
+typedef struct phraseLocationWithSrcSentElement
+{
+ int srcPosStart;
+ int srcPosEnd;
+ TextLenType sentId;
+ TextLenType posInSent;
+ vector<C_String> sentence;
+}S_phraseLocationWithSrcSentElement;
+
+/**
+* \ingroup search
+**/
+typedef struct sentSearchTableElement
+{
+ bool found;
+ TextLenType startPosInSA;
+ TextLenType endingPosInSA;
+}S_sentSearchTableElement;
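+
+/**
+* Editorial note: for a cell with found==true, the n-gram's corpus frequency is
+* endingPosInSA - startPosInSA + 1; e.g. startPosInSA=10, endingPosInSA=12 means
+* three occurrences, reachable through suffix_list[10..12].
+**/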
+
+
+/**
+* \ingroup search
+* Base class for suffix array search applications
+* Provides functions to search for n-grams in the corpus,
+* including the frequency of the n-gram and its actual location (sentence ID + offset in sentence)
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+class C_SuffixArraySearchApplicationBase : public C_SuffixArrayApplicationBase
+{
+public:
+ void loadData_forSearch(const char * filename, bool noVoc, bool noOffset);
+
+ unsigned int numberOfMatcedNgram(const char * srcSent);
+ unsigned int numberOfMatcedNgram(vector<IndexType> & sentInVocId);
+
+ TextLenType freqOfExactPhraseMatch(const char * phrase);
+ TextLenType freqOfExactPhraseMatch(vector<IndexType> & phrase);
+
+ TextLenType freqOfExactPhraseMatchAndFirstOccurrence(const char * phrase, TextLenType & startPosInSA, int & sentLen);
+ TextLenType freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen);
+
+ vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(const char * phrase);
+ vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(vector<IndexType> & phrase);
+
+ vector<S_phraseLocationElement> findPhrasesInASentence(const char * srcSent);
+ vector<S_phraseLocationElement> findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs);
+
+ void displayNgramMatchingFreq4Sent(const char *);
+ void displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId);
+
+ map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen);
+ map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int & sentLen);
+
+ S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen);
+ S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId);
+
+ void setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram);
+ void setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport);
+ void setParam_longestUnitToReport(int longestUnitToReport);
+ void setParam_shortestUnitToReport(int shortestUnitToReport);
+
+ TextLenType returnTotalSentNumber();
+
+ vector<IndexType> convertStringToVocId(const char * sentText);
+ vector<C_String> convertCharStringToCStringVector(const char * sentText);
+ vector<IndexType> convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector);
+
+
+ C_SuffixArraySearchApplicationBase();
+ virtual ~C_SuffixArraySearchApplicationBase();
+
+protected:
+ bool locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd);
+
+ bool searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType & resultStartPos, TextLenType & resultEndPos);
+ char comparePhraseWithTextWithLCP(IndexType, int, TextLenType);
+
+ void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset);
+ void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen);
+
+
+ unsigned int twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen);
+ void oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n);
+
+ int reportMaxOccurrenceOfOneNgram;
+ int highestFreqThresholdForReport;
+ int longestUnitToReport;
+ int shortestUnitToReport;
+
+ TextLenType totalSentNum;
+};
+
+#endif // !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_)
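[Editorial usage sketch, not part of the commit] Counting and locating an exact phrase with the class declared above; "train" is a placeholder index stem, and locateExactPhraseInCorpus() requires that the offsets were loaded (noOffset == false):

    #include "_SuffixArraySearchApplicationBase.h"
    #include <iostream>

    int main(){
        C_SuffixArraySearchApplicationBase sa;
        sa.loadData_forSearch("train", false, false);   //offsets are required for locating

        cout<<"frequency: "<<sa.freqOfExactPhraseMatch("in the house")<<endl;

        vector<S_SimplePhraseLocationElement> hits = sa.locateExactPhraseInCorpus("in the house");
        for(size_t i = 0; i < hits.size(); i++){
            cout<<"sentence "<<hits[i].sentIdInCorpus
                <<", position "<<(int) hits[i].posInSentInCorpus<<endl;
        }
        return 0;
    }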
diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp
new file mode 100755
index 0000000..91962fe
--- /dev/null
+++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp
@@ -0,0 +1,314 @@
+/**
+* Revision $Rev: 3815 $
+* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $
+**/
+
+#include "_SuffixArrayApplicationBase.h"
+
+#include "malloc.h"
+#include "time.h"
+
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+C_SuffixArrayApplicationBase::C_SuffixArrayApplicationBase()
+{
+ this->level1Buckets = NULL;
+ this->noVocabulary = false; //by default, still load the vocabulary
+ this->noOffset = false; //by default, load offset
+ this->noLevel1Bucket = false; //by default, construct level1 bucket
+}
+
+C_SuffixArrayApplicationBase::~C_SuffixArrayApplicationBase()
+{
+ if(this->level1Buckets!=NULL){
+ free(this->level1Buckets);
+ }
+
+	//not strictly necessary either; these buffers would be released at process exit anyway
+ free(this->corpus_list);
+ free(this->suffix_list);
+
+ if(! this->noOffset){
+ free(this->offset_list);
+ }
+
+ if(! this->noVocabulary){
+ delete(this->voc);
+ }
+}
+
+/**
+* Load the indexed corpus, suffix array, vocabulary and offsets into memory for follow-up applications.
+* Loading the vocabulary and the offsets is optional, controlled by the arguments.
+* When the testing data shares the vocabulary of the training data and only vocIDs are used to represent sentences/n-grams,
+* the vocabulary, which maps between vocIds and word text, can be skipped to save memory.
+*
+* If the suffix array object never needs to locate the sentence ID of an n-gram occurrence, the offset information is not needed either.
+*
+* Be very careful here: for efficiency, the suffix array class does not check in its search functions whether the offsets have been loaded.
+* You need to know how the suffix array class will be used (whether offsets are needed) and load the data accordingly.
+* @param fileNameStem The filename of the corpus. This should be the same filename used in IndexSA
+* @param noVoc If set to 'true', the vocabulary will not be loaded
+* @param noOffset If set to 'true', the offset information will not be loaded, and <sentId, offsetInSent> information for an n-gram's occurrences cannot be computed
+* @param noLevel1Bucket The level-1 bucket speeds up searches at the cost of additional memory. Applications that never locate n-grams in the corpus (such as the corpus scanning application) do not need it
+**/
+void C_SuffixArrayApplicationBase::loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket)
+{
+ long ltime1, ltime2;
+
+ this->noVocabulary = noVoc;
+ this->noOffset = noOffset;
+ this->noLevel1Bucket = noLevel1Bucket;
+
+
+ char tmpString[1000];
+
+ //the order of loading the data is important, do not change
+ if(! this->noVocabulary){
+ time( &ltime1 );
+ cerr<<"Loading Vocabulary...\n";
+ sprintf(tmpString,"%s.id_voc",fileNameStem);
+ this->loadVoc(tmpString);
+ time( &ltime2);
+ cerr<<"Vocabulary loaded in "<<ltime2-ltime1<<" seconds.\n";
+ }
+
+ time( &ltime1 );
+ cerr<<"Loading corpus...\n";
+ sprintf(tmpString,"%s.sa_corpus",fileNameStem);
+ this->loadCorpusAndInitMem(tmpString);
+ time( &ltime2);
+ cerr<<"Corpus loaded in "<<ltime2-ltime1<<" seconds.\n";
+
+ time( &ltime1 );
+ cerr<<"Loading suffix...\n";
+ sprintf(tmpString,"%s.sa_suffix",fileNameStem);
+ this->loadSuffix(tmpString);
+ time( &ltime2);
+ cerr<<"Suffix loaded in "<<ltime2-ltime1<<" seconds.\n";
+
+ if(! this->noOffset){
+ time( &ltime1 );
+ cerr<<"Loading offset...\n";
+ sprintf(tmpString,"%s.sa_offset",fileNameStem);
+ this->loadOffset(tmpString);
+ time( &ltime2);
+ cerr<<"Offset loaded in "<<ltime2-ltime1<<" seconds.\n";
+ }
+}
+
+void C_SuffixArrayApplicationBase::loadVoc(const char *filename)
+{
+ this->voc = new C_IDVocabulary(filename);
+}
+
+void C_SuffixArrayApplicationBase::loadCorpusAndInitMem(const char *filename)
+{
+ unsigned int dwRead = 0;
+ FILE * CorpusInputFile = fopen(filename, "rb");
+
+ if(!CorpusInputFile){
+ cerr<<"Corpus file: "<<filename<<" does not exist or can not be opened!\n";
+ exit(0);
+ }
+
+ //first, read the size of the corpus
+ dwRead = fread( &(this->corpusSize), sizeof(TextLenType), 1, CorpusInputFile);
+
+ //allocate memory for all data structure
+ this->corpus_list = (IndexType *) malloc(sizeof(IndexType)*this->corpusSize);
+ if(! this->corpus_list){
+ cerr<<"Can not allocate memory to load the corpus!\n";
+ exit(0);
+ }
+
+ this->suffix_list = (TextLenType *) malloc(sizeof(TextLenType)*this->corpusSize);
+ if(! this->suffix_list){
+ cerr<<"Can not allocate memory to load the suffix!\n";
+ exit(0);
+ }
+
+ if(! this->noOffset){
+ this->offset_list = (unsigned char *) malloc(sizeof(unsigned char)*this->corpusSize);
+ if(! this->offset_list){
+ cerr<<"Can not allocate memory to load the offset!\n";
+ exit(0);
+ }
+ }
+
+ //read the corpus file
+ unsigned int totalRead = 0;
+ unsigned int remaining = this->corpusSize;
+ unsigned int oneBatchReadSize;
+ char * currentPosInCorpusList = (char *) this->corpus_list;
+ while(! feof(CorpusInputFile) && (totalRead<this->corpusSize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentPosInCorpusList, sizeof(IndexType), oneBatchReadSize, CorpusInputFile);
+
+ totalRead+=dwRead;
+ remaining-=dwRead;
+
+ currentPosInCorpusList+=sizeof(IndexType)*dwRead;
+ }
+ if(totalRead!=this->corpusSize){
+ cerr<<"Expecting "<<this->corpusSize<<" words from the corpus, read-in "<<totalRead<<endl;
+ exit(0);
+ }
+ fclose(CorpusInputFile);
+
+ this->sentIdStart = this->corpus_list[0];
+ this->vocIdForSentStart = this->corpus_list[1];
+ this->vocIdForCorpusEnd = this->corpus_list[this->corpusSize-1];
+ this->vocIdForSentEnd = this->corpus_list[this->corpusSize-2];
+
+ if(! this->noLevel1Bucket){
+		//in this corpus there are at most sentIdStart-1 word types
+		//the index in the array corresponds to the vocId: 0 is for <unk> and the last one is sentIdStart-1, the largest vocId observed in the data
+ this->level1Buckets = (S_level1BucketElement *) malloc(sizeof(S_level1BucketElement)* this->sentIdStart);
+
+ //initialize the level1 buckets
+ for(IndexType i=0;i<this->sentIdStart;i++){
+ this->level1Buckets[i].first = (TextLenType) -1;
+ this->level1Buckets[i].last = 0;
+ }
+ }
+}
+
+void C_SuffixArrayApplicationBase::loadSuffix(const char *filename)
+{
+ unsigned int dwRead = 0;
+ FILE * SuffixInputFile = fopen(filename, "rb");
+ if(!SuffixInputFile){
+ cerr<<"Suffix file: "<<filename<<" does not exist!"<<endl;
+ exit(0);
+ }
+
+ //first, read in the size of the suffix array
+ TextLenType suffixArraySize;
+ dwRead = fread( &suffixArraySize, sizeof(TextLenType), 1, SuffixInputFile);
+
+ if(suffixArraySize!=this->corpusSize){
+		cerr<<"Something is wrong: the suffix array size differs from the corpus size.\n";
+ cerr<<"Corpus has "<<this->corpusSize<<" words and suffix array reported: "<<suffixArraySize<<endl;
+ exit(0);
+ }
+
+ //read all the suffix into memory
+ unsigned int totalRead = 0;
+ unsigned int remaining = suffixArraySize;
+ unsigned int oneBatchReadSize;
+ char * currentPosInSuffixList = (char *) this->suffix_list;
+ while(! feof(SuffixInputFile) && (totalRead<suffixArraySize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentPosInSuffixList, sizeof(TextLenType), oneBatchReadSize, SuffixInputFile);
+
+ totalRead+=dwRead;
+ remaining -= dwRead;
+
+ currentPosInSuffixList+=sizeof(TextLenType)*dwRead;
+ }
+ if(totalRead!=suffixArraySize){
+ cerr<<"Expecting "<<suffixArraySize<<" words from the suffix list, read-in "<<totalRead<<endl;
+ exit(0);
+ }
+
+ fclose(SuffixInputFile);
+
+ if(! this->noLevel1Bucket){
+ //build level-1 bucket
+ cerr<<"Initialize level-1 buckets...\n";
+ IndexType currentVocId = 0;
+ IndexType vocId;
+ TextLenType pos;
+ TextLenType lastSaIndex = 0;
+
+ for(TextLenType i=0; i<suffixArraySize; i++){
+ pos = this->suffix_list[i];
+
+ //for level1 bucket
+ vocId = this->corpus_list[pos];
+
+ if(vocId<this->sentIdStart){ //is a meaningful word type
+ if(vocId!=currentVocId){
+					this->level1Buckets[currentVocId].last = lastSaIndex;	//for the first word, which is <unk>, this does not matter
+ this->level1Buckets[vocId].first = i;
+
+ currentVocId=vocId;
+ }
+
+ lastSaIndex = i;
+ }
+ }
+
+ //for the last word type
+ this->level1Buckets[currentVocId].last = lastSaIndex;
+ }
+ else{
+ this->level1Buckets = NULL;
+ }
+}
+
+void C_SuffixArrayApplicationBase::loadOffset(const char *filename)
+{
+ unsigned int dwRead = 0;
+ FILE * OffsetInputFile = fopen(filename, "rb");
+
+ if(!OffsetInputFile){
+ cerr<<"Offset file: "<<filename<<" does not exist!"<<endl;
+ exit(0);
+ }
+
+	//first, read the length of the offset list
+ TextLenType offsetListLen;
+ dwRead = fread( &offsetListLen, sizeof(TextLenType), 1, OffsetInputFile);
+ if(offsetListLen!=this->corpusSize){
+		cerr<<"Text length is inconsistent with the length of the offset list.\n";
+ exit(0);
+ }
+
+	//read all the offsets into memory
+ unsigned int totalRead = 0;
+ unsigned int remaining = offsetListLen;
+ unsigned int oneBatchReadSize;
+ char * currentOffsetListPos = (char *) this->offset_list;
+ while(! feof(OffsetInputFile) && (totalRead < offsetListLen)){
+ oneBatchReadSize = SIZE_ONE_READ;
+
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentOffsetListPos, sizeof(unsigned char), oneBatchReadSize, OffsetInputFile);
+
+ totalRead+=dwRead;
+ remaining-=dwRead;
+
+ currentOffsetListPos+=sizeof(unsigned char)*dwRead;
+
+ }
+ if(totalRead!=offsetListLen){
+ cerr<<"Expecting "<<offsetListLen<<" words from the offset list, read-in "<<totalRead<<endl;
+ exit(0);
+ }
+ fclose(OffsetInputFile);
+
+}
+
+TextLenType C_SuffixArrayApplicationBase::returnCorpusSize()
+{
+ return this->corpusSize;
+}
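[Editorial sketch, not part of the commit] The four loadData() flags trade memory for functionality. A scanning application that never maps occurrences back to sentences can keep only the corpus and suffix array; a search application that reports <sentId, offset> locations needs all four structures ("train" is a placeholder stem):

    #include "_SuffixArrayApplicationBase.h"
    #include <iostream>

    int main(){
        C_SuffixArrayApplicationBase app;             //in practice, a concrete subclass
        app.loadData("train", true, true, true);      //corpus + suffix array only: minimal memory
        //app.loadData("train", false, false, false); //also vocabulary, offsets, level-1 buckets
        cerr<<"corpus size: "<<app.returnCorpusSize()<<" words"<<endl;
        return 0;
    }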
diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h
new file mode 100755
index 0000000..74fad4e
--- /dev/null
+++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h
@@ -0,0 +1,58 @@
+#if !defined(__SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_)
+#define __SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_
+
+#include "salm_shared.h"
+#include "_IDVocabulary.h"
+#include "_String.h"
+
+using namespace std;
+
+typedef struct level1BucketElement
+{
+ TextLenType first;
+ TextLenType last;
+} S_level1BucketElement;
+
+
+/**
+* Base class of Suffix Array applications.
+* Provides functions to load the suffix array data and initialize the required vocIDs.
+* Revision $Rev: 3665 $
+* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $
+**/
+class C_SuffixArrayApplicationBase
+{
+public:
+ void loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket);
+ TextLenType returnCorpusSize();
+
+ C_SuffixArrayApplicationBase();
+ virtual ~C_SuffixArrayApplicationBase();
+
+protected:
+ TextLenType corpusSize;
+
+ void loadVoc(const char * filename);
+ void loadOffset(const char * filename);
+ void loadSuffix(const char * filename);
+ void loadCorpusAndInitMem(const char * filename);
+
+ bool noVocabulary;
+ bool noOffset;
+ bool noLevel1Bucket;
+
+ C_IDVocabulary * voc;
+ IndexType sentIdStart;
+ IndexType vocIdForSentStart;
+ IndexType vocIdForSentEnd;
+ IndexType vocIdForCorpusEnd;
+
+ IndexType * corpus_list;
+ unsigned char * offset_list;
+ TextLenType * suffix_list;
+
+ S_level1BucketElement * level1Buckets;
+
+};
+
+#endif // !defined(__SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_)
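[Editorial note, not part of the commit] As initialized in loadSuffix() above, level1Buckets[w] holds the first and last suffix array positions whose suffixes start with vocId w, so a one-word lookup is a direct array access. A hedged fragment as it might appear inside a subclass method, with w a hypothetical vocId smaller than sentIdStart:

    if(this->level1Buckets[w].first != (TextLenType) -1){       //w occurs in the corpus
        TextLenType saStart = this->level1Buckets[w].first;     //first suffix starting with w
        TextLenType saEnd   = this->level1Buckets[w].last;      //last suffix starting with w
        TextLenType freqOfW = saEnd - saStart + 1;              //occurrences of w
    }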
diff --git a/Src/Utils/InitializeVocabulary.cpp b/Src/Utils/InitializeVocabulary.cpp
new file mode 100755
index 0000000..b749568
--- /dev/null
+++ b/Src/Utils/InitializeVocabulary.cpp
@@ -0,0 +1,30 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_IDVocabulary.h"
+
+#include <iostream>
+
+using namespace std;
+
+/**
+* \ingroup utils
+* Initialize an empty vocabulary with the reserved words
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ if(argc<2){
+ cerr<<"\nUsage:";
+ cerr<<"\n\t"<<argv[0]<<" vocabularyFileName\n\n";
+ exit(0);
+ }
+
+ C_IDVocabulary voc;
+
+ voc.addingReservedWords();
+ voc.outputToFile(argv[1]);
+
+ return 0;
+
+}
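[Editorial usage note] Run the tool once to create an empty vocabulary containing only the reserved words; the file name is a placeholder:

    InitializeVocabulary universal.id_voc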
diff --git a/Src/Utils/UpdateUniversalVoc.cpp b/Src/Utils/UpdateUniversalVoc.cpp
new file mode 100755
index 0000000..02ea6cb
--- /dev/null
+++ b/Src/Utils/UpdateUniversalVoc.cpp
@@ -0,0 +1,28 @@
+#include "stdio.h"
+#include "stdlib.h"
+#include "_UniversalVocabulary.h"
+
+#include <iostream>
+
+using namespace std;
+
+/**
+* \ingroup utils
+* Update the universal vocabulary with the words in a corpus
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+int main(int argc, char * argv[]){
+ if(argc<3){
+ cerr<<"\nUsage:";
+ cerr<<"\n\t"<<argv[0]<<" universal_voc corpusFileName\n\n";
+ exit(0);
+ }
+
+ C_UniversalVocabulary universalVoc(argv[1]);
+
+ universalVoc.updateWithNewCorpus(argv[2]);
+
+	return 0;
+}
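[Editorial usage note] With the universal vocabulary initialized, each new corpus is folded in with (file names are placeholders):

    UpdateUniversalVoc universal.id_voc corpus.en

This rewrites universal.id_voc with any newly seen words and writes corpus.en.id_voc, listing exactly the words used in corpus.en (see updateWithNewCorpus() below).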
diff --git a/Src/Utils/_UniversalVocabulary.cpp b/Src/Utils/_UniversalVocabulary.cpp
new file mode 100755
index 0000000..3be91d2
--- /dev/null
+++ b/Src/Utils/_UniversalVocabulary.cpp
@@ -0,0 +1,118 @@
+#include "_UniversalVocabulary.h"
+#include "malloc.h"
+#include <string>
+#include <fstream>
+#include <iostream>
+#include <cstring>
+#include <stdlib.h>
+
+using namespace std;
+
+C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName)
+{
+ int fileNameSize=strlen(universalVocFileName);
+ fileNameSize++;
+
+ this->universalCorpusFileName = (char *) malloc(sizeof(char)*fileNameSize);
+	strcpy(this->universalCorpusFileName, universalVocFileName);
+
+ this->universalVoc = new C_IDVocabulary(universalVocFileName);
+
+}
+
+C_UniversalVocabulary::~C_UniversalVocabulary()
+{
+ free(this->universalCorpusFileName);
+ delete(this->universalVoc);
+}
+
+
+/**
+* Update the universal vocabulary with words in a new corpus
+* Output the updated universal vocabulary
+* Output the vocabulary needed for the new corpus too
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName)
+{
+
+ ifstream textStream;
+ textStream.open(newCorpusFileName);
+
+	if(!textStream){
+ fprintf(stderr,"Corpus file %s does not exist. Exit!\n",newCorpusFileName);
+ exit(-1);
+ }
+
+
+ //add reserved words from universal voc
+ for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){
+ C_String reservedWordText = this->universalVoc->getText(vocId);
+ this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId));
+ }
+
+ string aLine;
+ unsigned int sentNumber = 1;
+ unsigned int corpusSize = 0;
+
+ char * thisToken;
+ char delimit[] =" \t\r\n";
+ map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus;
+
+
+ getline(textStream, aLine);
+ while(!textStream.eof()){
+
+ if(aLine.length()>0){
+
+ thisToken = strtok((char*) aLine.c_str(), delimit );
+ while( thisToken != NULL ) {
+
+ C_String thisWord(thisToken);
+
+ //check if this word has already been seen
+ iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord);
+
+ if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){
+ //new type
+ IndexType vocId = this->universalVoc->getId(thisWord);
+ this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId));
+ }
+
+
+ // While there are tokens in "string"
+ // Get next token:
+ thisToken = strtok( NULL, delimit);
+ }
+
+ }
+
+ getline(textStream, aLine);
+ }
+
+
+ //now output the updated universal vocabulary
+ this->universalVoc->outputToFile(this->universalCorpusFileName);
+
+ //output the vocabulary needed for the new corpus
+ char vocabularyForNewCorpusFileName[1024];
+ sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName);
+
+ ofstream outputVocFile;
+ outputVocFile.open(vocabularyForNewCorpusFileName);
+
+ if(!outputVocFile){
+ cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n";
+ exit(-1);
+ }
+
+ iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin();
+ while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){
+ outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl;
+ iterWordsUsedInTheNewCorpus++;
+ }
+
+ outputVocFile.close();
+}
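[Editorial sketch, not part of the commit] The per-corpus vocabulary written above is plain text, one "word<TAB>vocId" pair per line, so it can be read back with standard stream extraction ("corpus.en.id_voc" is a placeholder name, and IndexType is assumed to come from salm_shared.h as elsewhere in the sources):

    #include "salm_shared.h"
    #include <fstream>
    #include <string>

    using namespace std;

    int main(){
        ifstream vocFile("corpus.en.id_voc");
        string word;
        IndexType vocId;
        while(vocFile >> word >> vocId){
            //word <-> vocId mapping used by the new corpus
        }
        return 0;
    }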
diff --git a/Src/Utils/_UniversalVocabulary.h b/Src/Utils/_UniversalVocabulary.h
new file mode 100755
index 0000000..2df4954
--- /dev/null
+++ b/Src/Utils/_UniversalVocabulary.h
@@ -0,0 +1,38 @@
+#if !defined (__HEADER_UNIVERSAL_VOC_INCLUDED__)
+#define __HEADER_UNIVERSAL_VOC_INCLUDED__
+
+#include "salm_shared.h"
+#include "_IDVocabulary.h"
+#include "_String.h"
+
+#include <map>
+
+using namespace std;
+
+/**
+* \ingroup utils
+* The Universal Vocabulary class provides functions to update the universal vocabulary
+* with the words in a new corpus
+* and to output the vocabulary needed for the new corpus
+*
+* Revision $Rev: 3794 $
+* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
+**/
+class C_UniversalVocabulary{
+
+public:
+ void updateWithNewCorpus(const char * newCorpusFileName);
+
+ C_UniversalVocabulary(const char * universalVocFileName);
+ ~C_UniversalVocabulary();
+
+private:
+ char * universalCorpusFileName;
+ C_IDVocabulary * universalVoc;
+
+ map<C_String, IndexType, ltstr> wordsUsedInTheNewCorpus;
+
+};
+
+
+#endif