diff options
author | Hieu Hoang <hieu@hoang.co.uk> | 2013-11-25 13:56:37 +0400 |
---|---|---|
committer | Hieu Hoang <hieu@hoang.co.uk> | 2013-11-25 13:56:37 +0400 |
commit | a146dbec8f0391e247db1ae4c9b7af5c225436f9 (patch) | |
tree | 1fa97934675448cdcffb26b4737887d551822a39 /Src |
initial add of salm to github
Diffstat (limited to 'Src')
55 files changed, 10193 insertions, 0 deletions
diff --git a/Src/IndexSA/IndexSA.cpp b/Src/IndexSA/IndexSA.cpp new file mode 100755 index 0000000..3013d4c --- /dev/null +++ b/Src/IndexSA/IndexSA.cpp @@ -0,0 +1,58 @@ +/** +* Main function to index a corpus according to its suffix array +* Revision: $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "stdio.h" +#include "stdlib.h" + +#include <cstring> +#include <string> +#include <iostream> +#include <fstream> +#include "_MonoCorpus.h" +#include "salm_shared.h" + +using namespace std; + +IndexType * corpus; //because the compare function needs to see this, make it global +TextLenType actualCorpusSize; + +int main(int argc, char* argv[]){ + + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:"); + fprintf(stderr,"\n%s fileNameStem [existingIDVocFileName]\n",argv[0]); + + exit(0); + } + + C_MonoCorpus corpus; + + char vocFileName[1024]; + sprintf(vocFileName, "%s.id_voc", argv[1]); + + if(argc==2){ //no existing vocabulary given + cerr<<"Initialize vocabulary file: "<<vocFileName<<endl; + corpus.initializeVocabulary(argv[1]); + corpus.loadCorpusAndSort(argv[1], vocFileName, true); + } + else{ + if(strcmp(vocFileName, argv[2])!=0){ + cerr<<"Error! 
ExistingIDVocFileName has to be called: "<<vocFileName<<" and cover all the words in the corpus."<<endl; + exit(-1); + } + corpus.loadCorpusAndSort(argv[1], argv[2], false); + } + + corpus.output(argv[1]); + + return 0; +} + diff --git a/Src/IndexSA/IndexSA.cpp~ b/Src/IndexSA/IndexSA.cpp~ new file mode 100755 index 0000000..d8ad043 --- /dev/null +++ b/Src/IndexSA/IndexSA.cpp~ @@ -0,0 +1,57 @@ +/** +* Main function to index a corpus according to its suffix array +* Revision: $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include "_MonoCorpus.h" +#include "salm_shared.h" + +using namespace std; + +IndexType * corpus; //because the compare function needs to see this, make it global +TextLenType actualCorpusSize; + +int main(int argc, char* argv[]){ + + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:"); + fprintf(stderr,"\n%s fileNameStem [existingIDVocFileName]\n",argv[0]); + + exit(0); + } + + C_MonoCorpus corpus; + + char vocFileName[1024]; + sprintf(vocFileName, "%s.id_voc", argv[1]); + + if(argc==2){ //no existing vocabulary given + cerr<<"Initialize vocabulary file: "<<vocFileName<<endl; + corpus.initializeVocabulary(argv[1]); + corpus.loadCorpusAndSort(argv[1], vocFileName, true); + } + else{ + if(strcmp(vocFileName, argv[2])!=0){ + cerr<<"Error! 
ExistingIDVocFileName has to be called: "<<vocFileName<<" and cover all the words in the corpus."<<endl; + exit(-1); + } + corpus.loadCorpusAndSort(argv[1], argv[2], false); + } + + corpus.output(argv[1]); + + return 0; +} + diff --git a/Src/IndexSA/_MonoCorpus.cpp b/Src/IndexSA/_MonoCorpus.cpp new file mode 100755 index 0000000..ab53813 --- /dev/null +++ b/Src/IndexSA/_MonoCorpus.cpp @@ -0,0 +1,440 @@ +/** +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "_MonoCorpus.h" +#include "malloc.h" +#include "time.h" + +#include <fstream> +#include <iostream> +#include <cstring> +#include <string> +#include <algorithm> + +using namespace std; + +extern IndexType * corpus; +extern TextLenType actualCorpusSize; + +bool operator<(const C_SuffixPointer& a, const C_SuffixPointer& b) +{ + bool stillEqual = true; + TextLenType currentPosOfA = a.pointer; + TextLenType currentPosOfB = b.pointer; + + if(currentPosOfA==currentPosOfB){ + return false; + } + + while(stillEqual){ + if(corpus[currentPosOfA]<corpus[currentPosOfB]){ + return true; + } + + if(corpus[currentPosOfA]>corpus[currentPosOfB]){ + return false; + } + + //then still equal at these two positions + currentPosOfA++; + currentPosOfB++; + + if(currentPosOfA>=actualCorpusSize){ + currentPosOfA=0; + } + + if(currentPosOfB>=actualCorpusSize){ + currentPosOfB=0; + } + } + + //equal + return false; +} + + +C_SuffixPointer::C_SuffixPointer() +{ + +} + +//copy constructor +C_SuffixPointer::C_SuffixPointer(const C_SuffixPointer & obj) +{ + this->pointer = obj.pointer; +} + +C_SuffixPointer::~C_SuffixPointer() +{ + +} + + +C_SuffixPointer::C_SuffixPointer(TextLenType pointer) +{ + this->pointer = pointer; +} +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_MonoCorpus::C_MonoCorpus() +{ + this->currentPosInCorpus = 0; + 
this->maxVocIdFromCorpus = 0; +} + +C_MonoCorpus::~C_MonoCorpus() +{ + free(corpus); + free(this->suffix); + free(this->offsetList); +} + + +/** +* Initialize an IDVocabulary file +**/ +void C_MonoCorpus::initializeVocabulary(char *fileNameStem) +{ + C_IDVocabulary tmpVoc; + tmpVoc.addingReservedWords(); + + char vocFileName[1024]; + sprintf(vocFileName, "%s.id_voc", fileNameStem); + + tmpVoc.outputToFile(vocFileName); +} + + +void C_MonoCorpus::loadCorpusAndSort(const char *fileName, const char * idVocFileName, bool vocNeedsToBeUpdated) +{ + IndexType id = 0; + + //load vocabulary + this->voc = new C_IDVocabulary(idVocFileName); + this->vocNeedsToBeUpdated = vocNeedsToBeUpdated; + + this->vocIdForSentIdPlaceHolder = this->voc->returnId(C_String("_SENT_ID_PLACEHOLDER_")); + if(this->vocIdForSentIdPlaceHolder==0){ + cerr<<"ID vocabulary does not have the type _SENT_ID_PLACEHOLDER_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentIdPlaceHolder>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentIdPlaceHolder; + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"ID vocabulary does not have the type _SENTENCE_START_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentStart>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentStart; + } + + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + if(this->vocIdForSentEnd==0){ + cerr<<"ID vocabulary does not have the type _END_OF_SENTENCE_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentEnd>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentEnd; + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"ID vocabulary 
does not have the type _END_OF_CORPUS_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForCorpusEnd>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForCorpusEnd; + } + + ifstream textStream1; + textStream1.open(fileName); + + if(textStream1==NULL){ + fprintf(stderr,"Text %s does not exist. Exit!\n",fileName); + exit(-1); + } + + long ltime1, ltime2; + time( <ime1 ); + + string aLine; + unsigned int sentNumber = 1; + unsigned int sentLen = 0; + unsigned int corpusSize = 0; + + char * thisToken; + char delimit[] =" \t\r\n"; + + //first, scan the corpus to estimate the size and check if each line is shorter than 256 words + getline(textStream1, aLine); + while(!textStream1.eof()){ + + if(aLine.length()>0){ + sentLen = 0; + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + if(this->vocNeedsToBeUpdated){ + id = this->voc->getId(C_String(thisToken)); + } + else{ //the provided vocabulary should cover all the words in this corpus + id = this->voc->returnId(C_String(thisToken)); + + if(id==0){ //word does not exist + cerr<<"Vocabulary: "<<idVocFileName<<" does not cover all the words in the corpus!"<<endl; + cerr<<"Word: "<<thisToken<<" does not exist in the voc!\n"; + exit(-1); + } + } + + + + sentLen++; + + if(id>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = id; + } + + if(sentLen>=256){ + cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. Please cut it short first!\n"; + exit(-1); + } + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + corpusSize+=sentLen; + + sentLen = 0; + sentNumber++; + } + else{ + cerr<<"Warning: sentence "<<sentNumber<< " is empty. 
Ignore this message if this is the last sentence.\n"; + } + getline(textStream1, aLine); + } + + sentNumber--; + unsigned int estimatedSize = corpusSize+3*sentNumber+1000; //with some redundancy + cerr<<sentNumber<<" sentences and "<<corpusSize<<" words in corpus\n"; + cerr<<"Reserve "<<estimatedSize*2<<" bytes in RAM for sorting\n"; + textStream1.close(); + + + //second pass, convert the corpus into vocIDs and create suffix array + ifstream textStream2; + textStream2.open(fileName); + + this->allocateMem(estimatedSize); + this->currentPosInCorpus = 0; + sentNumber = 1; + + getline(textStream2, aLine); + while(!textStream2.eof()){ + + if(aLine.length()>0){ + sentLen = 0; + + //add sentId + //offset at this position will store the acutal sentence length + corpus[this->currentPosInCorpus]=this->vocIdForSentIdPlaceHolder; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->currentPosInCorpus++; + + //add <s> + sentLen++; //not real sentence length, but to keep track of offset + corpus[this->currentPosInCorpus]=this->vocIdForSentStart; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen; + this->currentPosInCorpus++; + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + id = this->voc->returnId(C_String(thisToken)); + if(id==0){ + cerr<<"Word \""<<thisToken<<"\" is not listed in the IDVocabulary.\n"; + exit(-1); + } + + sentLen++; + + if(id>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = id; + } + + corpus[this->currentPosInCorpus]=id; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen; + this->currentPosInCorpus++; + + if(sentLen>=256){ + cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. 
Please cut it short first!\n"; + exit(-1); + } + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + + //add <sentEnd> + corpus[this->currentPosInCorpus]=this->vocIdForSentEnd; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) (sentLen + 1); + this->offsetList[this->currentPosInCorpus - sentLen - 1] = (unsigned char) (sentLen-1); //write the sentLen to sent begin correspond to <sentId> + this->currentPosInCorpus++; + + sentLen = 0; + sentNumber++; + } + else{ + cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this if this is the last sentence.\n"; + } + + aLine[0]=0; + getline(textStream2, aLine); + } + textStream2.close(); + + //add <endOfCorpus> to the end of data + corpus[this->currentPosInCorpus]=this->vocIdForCorpusEnd; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) 0; + this->currentPosInCorpus++; + + actualCorpusSize = this->currentPosInCorpus; + + time( <ime2 ); + cerr<<"\nCorpus loaded in: "<<ltime2-ltime1<<" seconds."<<endl; + cerr<<"Total "<<sentNumber-1<<" sentences loaded.\n"; + + //replace the sentId place holder to actual sentId + time( <ime1 ); + cerr<<"Inserting sentence IDs into the corpus...\n"; + IndexType sentId = this->maxVocIdFromCorpus+1; + for(TextLenType i=0;i<actualCorpusSize;i++){ + if(corpus[i]==this->vocIdForSentIdPlaceHolder){ + corpus[i]=sentId; + sentId++; + } + } + time( <ime2 ); + cerr<<"\nSentence IDs inserted in: "<<ltime2-ltime1<<" seconds."<<endl; + + //sorting + time( <ime1 ); + cerr<<"Sorting the suffix...\n"; + sort(this->suffix, this->suffix+actualCorpusSize); + time( <ime2 ); + cerr<<"\nCorpus sorted in: "<<ltime2-ltime1<<" seconds."<<endl; + cerr<<"Done."<<endl; + +} + +void C_MonoCorpus::allocateMem(TextLenType corpusSize) +{ + corpus = (IndexType *) 
malloc(sizeof(IndexType)*corpusSize); + + if(corpus==0){ + cerr<<"Failed to allocate memory for corpus. Quit!\n"; + exit(-1); + } + + this->suffix = (C_SuffixPointer *) malloc(sizeof(C_SuffixPointer)*corpusSize); + if(this->suffix==0){ + cerr<<"Failed to allocate memory for suffix. Quit!\n"; + exit(-1); + } + + this->offsetList = (unsigned char *) malloc(sizeof(unsigned char)*corpusSize); + if(this->offsetList==0){ + cerr<<"Failed to allocate memory for offset. Quit!\n"; + exit(-1); + } + +} + + +void C_MonoCorpus::outputCorpus(char *filename) +{ + cerr<<"Writing corpus to file: "<<filename<<endl; + ofstream textOutStream; + textOutStream.open(filename, ios::binary); + + //first, write down the corpus size + textOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0; i<actualCorpusSize;i++){ + textOutStream.write((char *)&(corpus[i]), sizeof(IndexType)); + } + + textOutStream.close(); + +} + +void C_MonoCorpus::outputOffset(char *filename) +{ + cerr<<"Writing offset to file: "<<filename<<endl; + + ofstream offsetOutStream; + offsetOutStream.open(filename, ios::binary); + + //first, write down the corpus size + offsetOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0; i<actualCorpusSize; i++){ + offsetOutStream.write((char *)& (this->offsetList[i]), sizeof(unsigned char)); + } + offsetOutStream.close(); +} + +void C_MonoCorpus::outputSuffix(char *filename) +{ + cerr<<"Writing suffix information to file: "<<filename<<endl; + + ofstream saOutStream; + saOutStream.open(filename, ios::binary); + + //first, write down the corpus size + saOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0;i<actualCorpusSize; i++){ + saOutStream.write((char *) & (this->suffix[i].pointer), sizeof(TextLenType)); + } + + saOutStream.close(); +} + +void C_MonoCorpus::output(char *filename) +{ + char outputVocFileName[1024]; + char outputCorpusFileName[1024]; + char 
outputOffsetFileName[1024]; + char outputSuffixFileName[1024]; + + + if(this->vocNeedsToBeUpdated){ + sprintf(outputVocFileName, "%s.id_voc", filename); + this->voc->outputToFile(outputVocFileName); + } + + sprintf(outputCorpusFileName, "%s.sa_corpus", filename); + sprintf(outputOffsetFileName, "%s.sa_offset", filename); + sprintf(outputSuffixFileName, "%s.sa_suffix", filename); + + + this->outputCorpus(outputCorpusFileName); + this->outputOffset(outputOffsetFileName); + this->outputSuffix(outputSuffixFileName); +} + diff --git a/Src/IndexSA/_MonoCorpus.cpp~ b/Src/IndexSA/_MonoCorpus.cpp~ new file mode 100755 index 0000000..3e3a29b --- /dev/null +++ b/Src/IndexSA/_MonoCorpus.cpp~ @@ -0,0 +1,439 @@ +/** +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "_MonoCorpus.h" +#include "malloc.h" +#include "time.h" + +#include <fstream> +#include <iostream> +#include <string> +#include <algorithm> + +using namespace std; + +extern IndexType * corpus; +extern TextLenType actualCorpusSize; + +bool operator<(const C_SuffixPointer& a, const C_SuffixPointer& b) +{ + bool stillEqual = true; + TextLenType currentPosOfA = a.pointer; + TextLenType currentPosOfB = b.pointer; + + if(currentPosOfA==currentPosOfB){ + return false; + } + + while(stillEqual){ + if(corpus[currentPosOfA]<corpus[currentPosOfB]){ + return true; + } + + if(corpus[currentPosOfA]>corpus[currentPosOfB]){ + return false; + } + + //then still equal at these two positions + currentPosOfA++; + currentPosOfB++; + + if(currentPosOfA>=actualCorpusSize){ + currentPosOfA=0; + } + + if(currentPosOfB>=actualCorpusSize){ + currentPosOfB=0; + } + } + + //equal + return false; +} + + +C_SuffixPointer::C_SuffixPointer() +{ + +} + +//copy constructor +C_SuffixPointer::C_SuffixPointer(const C_SuffixPointer & obj) +{ + this->pointer = obj.pointer; +} + +C_SuffixPointer::~C_SuffixPointer() +{ + +} + + +C_SuffixPointer::C_SuffixPointer(TextLenType pointer) +{ 
+ this->pointer = pointer; +} +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_MonoCorpus::C_MonoCorpus() +{ + this->currentPosInCorpus = 0; + this->maxVocIdFromCorpus = 0; +} + +C_MonoCorpus::~C_MonoCorpus() +{ + free(corpus); + free(this->suffix); + free(this->offsetList); +} + + +/** +* Initialize an IDVocabulary file +**/ +void C_MonoCorpus::initializeVocabulary(char *fileNameStem) +{ + C_IDVocabulary tmpVoc; + tmpVoc.addingReservedWords(); + + char vocFileName[1024]; + sprintf(vocFileName, "%s.id_voc", fileNameStem); + + tmpVoc.outputToFile(vocFileName); +} + + +void C_MonoCorpus::loadCorpusAndSort(const char *fileName, const char * idVocFileName, bool vocNeedsToBeUpdated) +{ + IndexType id = 0; + + //load vocabulary + this->voc = new C_IDVocabulary(idVocFileName); + this->vocNeedsToBeUpdated = vocNeedsToBeUpdated; + + this->vocIdForSentIdPlaceHolder = this->voc->returnId(C_String("_SENT_ID_PLACEHOLDER_")); + if(this->vocIdForSentIdPlaceHolder==0){ + cerr<<"ID vocabulary does not have the type _SENT_ID_PLACEHOLDER_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentIdPlaceHolder>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentIdPlaceHolder; + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"ID vocabulary does not have the type _SENTENCE_START_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForSentStart>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentStart; + } + + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + if(this->vocIdForSentEnd==0){ + cerr<<"ID vocabulary does not have the type _END_OF_SENTENCE_, error!\n Add this word to the universal vocabulary and try 
again!\n"; + exit(-1); + } + if(this->vocIdForSentEnd>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForSentEnd; + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"ID vocabulary does not have the type _END_OF_CORPUS_, error!\n Add this word to the universal vocabulary and try again!\n"; + exit(-1); + } + if(this->vocIdForCorpusEnd>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = this->vocIdForCorpusEnd; + } + + ifstream textStream1; + textStream1.open(fileName); + + if(textStream1==NULL){ + fprintf(stderr,"Text %s does not exist. Exit!\n",fileName); + exit(-1); + } + + long ltime1, ltime2; + time( <ime1 ); + + string aLine; + unsigned int sentNumber = 1; + unsigned int sentLen = 0; + unsigned int corpusSize = 0; + + char * thisToken; + char delimit[] =" \t\r\n"; + + //first, scan the corpus to estimate the size and check if each line is shorter than 256 words + getline(textStream1, aLine); + while(!textStream1.eof()){ + + if(aLine.length()>0){ + sentLen = 0; + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + if(this->vocNeedsToBeUpdated){ + id = this->voc->getId(C_String(thisToken)); + } + else{ //the provided vocabulary should cover all the words in this corpus + id = this->voc->returnId(C_String(thisToken)); + + if(id==0){ //word does not exist + cerr<<"Vocabulary: "<<idVocFileName<<" does not cover all the words in the corpus!"<<endl; + cerr<<"Word: "<<thisToken<<" does not exist in the voc!\n"; + exit(-1); + } + } + + + + sentLen++; + + if(id>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = id; + } + + if(sentLen>=256){ + cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. 
Please cut it short first!\n"; + exit(-1); + } + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + corpusSize+=sentLen; + + sentLen = 0; + sentNumber++; + } + else{ + cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this message if this is the last sentence.\n"; + } + getline(textStream1, aLine); + } + + sentNumber--; + unsigned int estimatedSize = corpusSize+3*sentNumber+1000; //with some redundancy + cerr<<sentNumber<<" sentences and "<<corpusSize<<" words in corpus\n"; + cerr<<"Reserve "<<estimatedSize*2<<" bytes in RAM for sorting\n"; + textStream1.close(); + + + //second pass, convert the corpus into vocIDs and create suffix array + ifstream textStream2; + textStream2.open(fileName); + + this->allocateMem(estimatedSize); + this->currentPosInCorpus = 0; + sentNumber = 1; + + getline(textStream2, aLine); + while(!textStream2.eof()){ + + if(aLine.length()>0){ + sentLen = 0; + + //add sentId + //offset at this position will store the acutal sentence length + corpus[this->currentPosInCorpus]=this->vocIdForSentIdPlaceHolder; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->currentPosInCorpus++; + + //add <s> + sentLen++; //not real sentence length, but to keep track of offset + corpus[this->currentPosInCorpus]=this->vocIdForSentStart; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen; + this->currentPosInCorpus++; + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + id = this->voc->returnId(C_String(thisToken)); + if(id==0){ + cerr<<"Word \""<<thisToken<<"\" is not listed in the IDVocabulary.\n"; + exit(-1); + } + + sentLen++; + + if(id>this->maxVocIdFromCorpus){ + this->maxVocIdFromCorpus = id; + } + + corpus[this->currentPosInCorpus]=id; + 
this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) sentLen; + this->currentPosInCorpus++; + + if(sentLen>=256){ + cerr<<"Sentence "<<sentNumber<<" has more than 256 words. Can not handle such long sentence. Please cut it short first!\n"; + exit(-1); + } + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + + //add <sentEnd> + corpus[this->currentPosInCorpus]=this->vocIdForSentEnd; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) (sentLen + 1); + this->offsetList[this->currentPosInCorpus - sentLen - 1] = (unsigned char) (sentLen-1); //write the sentLen to sent begin correspond to <sentId> + this->currentPosInCorpus++; + + sentLen = 0; + sentNumber++; + } + else{ + cerr<<"Warning: sentence "<<sentNumber<< " is empty. Ignore this if this is the last sentence.\n"; + } + + aLine[0]=0; + getline(textStream2, aLine); + } + textStream2.close(); + + //add <endOfCorpus> to the end of data + corpus[this->currentPosInCorpus]=this->vocIdForCorpusEnd; + this->suffix[this->currentPosInCorpus]=C_SuffixPointer(this->currentPosInCorpus); + this->offsetList[this->currentPosInCorpus] = (unsigned char) 0; + this->currentPosInCorpus++; + + actualCorpusSize = this->currentPosInCorpus; + + time( <ime2 ); + cerr<<"\nCorpus loaded in: "<<ltime2-ltime1<<" seconds."<<endl; + cerr<<"Total "<<sentNumber-1<<" sentences loaded.\n"; + + //replace the sentId place holder to actual sentId + time( <ime1 ); + cerr<<"Inserting sentence IDs into the corpus...\n"; + IndexType sentId = this->maxVocIdFromCorpus+1; + for(TextLenType i=0;i<actualCorpusSize;i++){ + if(corpus[i]==this->vocIdForSentIdPlaceHolder){ + corpus[i]=sentId; + sentId++; + } + } + time( <ime2 ); + cerr<<"\nSentence IDs inserted in: "<<ltime2-ltime1<<" seconds."<<endl; + + //sorting + time( <ime1 
); + cerr<<"Sorting the suffix...\n"; + sort(this->suffix, this->suffix+actualCorpusSize); + time( <ime2 ); + cerr<<"\nCorpus sorted in: "<<ltime2-ltime1<<" seconds."<<endl; + cerr<<"Done."<<endl; + +} + +void C_MonoCorpus::allocateMem(TextLenType corpusSize) +{ + corpus = (IndexType *) malloc(sizeof(IndexType)*corpusSize); + + if(corpus==0){ + cerr<<"Failed to allocate memory for corpus. Quit!\n"; + exit(-1); + } + + this->suffix = (C_SuffixPointer *) malloc(sizeof(C_SuffixPointer)*corpusSize); + if(this->suffix==0){ + cerr<<"Failed to allocate memory for suffix. Quit!\n"; + exit(-1); + } + + this->offsetList = (unsigned char *) malloc(sizeof(unsigned char)*corpusSize); + if(this->offsetList==0){ + cerr<<"Failed to allocate memory for offset. Quit!\n"; + exit(-1); + } + +} + + +void C_MonoCorpus::outputCorpus(char *filename) +{ + cerr<<"Writing corpus to file: "<<filename<<endl; + ofstream textOutStream; + textOutStream.open(filename, ios::binary); + + //first, write down the corpus size + textOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0; i<actualCorpusSize;i++){ + textOutStream.write((char *)&(corpus[i]), sizeof(IndexType)); + } + + textOutStream.close(); + +} + +void C_MonoCorpus::outputOffset(char *filename) +{ + cerr<<"Writing offset to file: "<<filename<<endl; + + ofstream offsetOutStream; + offsetOutStream.open(filename, ios::binary); + + //first, write down the corpus size + offsetOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + for(TextLenType i=0; i<actualCorpusSize; i++){ + offsetOutStream.write((char *)& (this->offsetList[i]), sizeof(unsigned char)); + } + offsetOutStream.close(); +} + +void C_MonoCorpus::outputSuffix(char *filename) +{ + cerr<<"Writing suffix information to file: "<<filename<<endl; + + ofstream saOutStream; + saOutStream.open(filename, ios::binary); + + //first, write down the corpus size + saOutStream.write((char *)&actualCorpusSize, sizeof(TextLenType)); + + 
for(TextLenType i=0;i<actualCorpusSize; i++){ + saOutStream.write((char *) & (this->suffix[i].pointer), sizeof(TextLenType)); + } + + saOutStream.close(); +} + +void C_MonoCorpus::output(char *filename) +{ + char outputVocFileName[1024]; + char outputCorpusFileName[1024]; + char outputOffsetFileName[1024]; + char outputSuffixFileName[1024]; + + + if(this->vocNeedsToBeUpdated){ + sprintf(outputVocFileName, "%s.id_voc", filename); + this->voc->outputToFile(outputVocFileName); + } + + sprintf(outputCorpusFileName, "%s.sa_corpus", filename); + sprintf(outputOffsetFileName, "%s.sa_offset", filename); + sprintf(outputSuffixFileName, "%s.sa_suffix", filename); + + + this->outputCorpus(outputCorpusFileName); + this->outputOffset(outputOffsetFileName); + this->outputSuffix(outputSuffixFileName); +} + diff --git a/Src/IndexSA/_MonoCorpus.h b/Src/IndexSA/_MonoCorpus.h new file mode 100755 index 0000000..4c834b0 --- /dev/null +++ b/Src/IndexSA/_MonoCorpus.h @@ -0,0 +1,60 @@ +#if !defined(__MonoCorpus__H__INCLUDED_) +#define __MonoCorpus__H__INCLUDED_ + +#include "_IDVocabulary.h" +#include "salm_shared.h" + +/** +* \ingroup index +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +* Defines the wrapper class for the comparing function +**/ +class C_SuffixPointer +{ +public: + C_SuffixPointer(const C_SuffixPointer &); + C_SuffixPointer(); + ~C_SuffixPointer(); + C_SuffixPointer(TextLenType pointer); + TextLenType pointer; +}; + +/** +* \ingroup index +* Monolingual corpus class for loading the corpus from file, sort it according to the suffix array order +* and convert it to the binary format for suffix array applications +**/ +class C_MonoCorpus +{ +public: + void initializeVocabulary(char * fileNameStem); + void output(char * filename); + void loadCorpusAndSort(const char * fileName, const char * idVocFileName, bool vocNeedsToBeUpdated); + + C_MonoCorpus(); + virtual ~C_MonoCorpus(); + +private: + IndexType 
maxVocIdFromCorpus; + void outputSuffix(char * filename); + void outputOffset(char * filename); + void outputCorpus(char * filename); + + IndexType vocIdForSentIdPlaceHolder; + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + + TextLenType currentPosInCorpus; + void allocateMem(TextLenType corpusSize); + + C_SuffixPointer * suffix; + unsigned char * offsetList; + C_IDVocabulary * voc; + + bool vocNeedsToBeUpdated; + +}; + +#endif // !defined(__MonoCorpus__H__INCLUDED_) diff --git a/Src/SALM-API-Description.txt b/Src/SALM-API-Description.txt new file mode 100755 index 0000000..c36f60c --- /dev/null +++ b/Src/SALM-API-Description.txt @@ -0,0 +1,24 @@ +/**
+* \defgroup index Indexing the corpus
+* \defgroup search Search Applications
+* \defgroup scan Scan Applications
+* \defgroup lm Suffix Array Language Model
+* \defgroup utils Utilities
+*
+* \mainpage SALM API Documentation
+* Author: <a href=mailto:joy+salm@cs.cmu.edu > Ying (Joy) Zhang </a>
+* \section intro Introduction
+*
+* There are three main modules in <a href=http://projectile.is.cs.cmu.edu/research/public/tools/salm/salm.htm > SALM </a>: Indexing, Searching and Scanning.
+* To start, use IndexSA to index the corpus according to its suffix array.
+* This is the first step for all applications.
+* Once the corpus is indexed, we can use SALM to perform all kinds of interesting processing on this corpus.
+* \section search Applications based on searching the corpus
+* These applications search for the occurrences of an n-gram or all the embedded n-grams of a sentence in the corpus.
+* \section scan Applications based on scanning the corpus
+* These applications scan through the corpus in linear time and collect information such as the type/token frequency of the n-grams in the data.
+* \section lm Suffix Array Language Model
+* An online language model based on the suffix array indexing. Suffix array language model can use arbitrarily long history and very large corpus.
+* \section utils Utilities
+* Utility functions such as updating the universal ID vocabulary after observing a new corpus.
+**/
diff --git a/Src/Shared/_IDVocabulary.cpp b/Src/Shared/_IDVocabulary.cpp new file mode 100755 index 0000000..a34b043 --- /dev/null +++ b/Src/Shared/_IDVocabulary.cpp @@ -0,0 +1,219 @@ +/** +* _IDVocabulary.cpp: implementation of the C_IDVocabulary class. +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + + +#include "_IDVocabulary.h" +#include <fstream> +#include <iostream> +#include <cstring> +#include <memory.h> +#include <stdlib.h> + +using namespace std; + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_IDVocabulary::C_IDVocabulary() +{ + this->maxIdInVoc = 0; +} + +C_IDVocabulary::C_IDVocabulary(const char * fileName) +{ + + this->maxIdInVoc = 0; + + this->loadFromFile(fileName); +} + +C_IDVocabulary::~C_IDVocabulary() +{ + +} + +/// Return the vocID of word "text" if it exist in the vocabulary +/// Otherwise return 0 +IndexType C_IDVocabulary::returnId(C_String text) +{ + IndexType id; + + map<C_String, IndexType, ltstr>::iterator iterText2Id; + iterText2Id = this->text2id.find(text); + + if(iterText2Id==this->text2id.end()){ //this word does not exist in the voc yet, return ID for <unk> + id = 0; + } + else{ + id = iterText2Id->second; + } + + return id; +} + +/// Return the text of the word given its vocID +/// return <UNK> if specified vocID does not exist +C_String C_IDVocabulary::getText(IndexType id) +{ + map<IndexType, C_String>::iterator iterId2Text; + iterId2Text = this->id2text.find(id); + + if(iterId2Text==this->id2text.end()){ + return C_String("<UNK>"); + } + + return iterId2Text->second; +} + +IndexType C_IDVocabulary::getSize() +{ + return this->text2id.size(); +} + + +/// Load the vocabulary file into memory +/// The format of the vocabulary file is: +/// word vocID +// in each line. 
+void C_IDVocabulary::loadFromFile(const char *fileName) +{ + + ifstream existingVocFile; + existingVocFile.open(fileName); + + if(!existingVocFile){ + cerr<<"Can not open existing vocabulary file "<<fileName<<endl; + exit(0); + } + + cerr<<"Loading existing vocabulary file: "<<fileName<<endl; + + char aLine[1024]; + char * aToken; + char delimit[] = " \t\r\n"; + IndexType vocId = 0; + + while(!existingVocFile.eof()){ + existingVocFile.getline(aLine, 1024, '\n'); + + if(strlen(aLine)>0){ //a meaningful word, esp for the last line during reading file + vector<C_String> tokensInLine; + + aToken = strtok(aLine, delimit); + while( aToken != NULL ) { + tokensInLine.push_back(C_String(aToken)); + aToken = strtok( NULL, delimit); + } + + if(tokensInLine.size()!=2){ + cerr<<"Not valid format for Vocabulary: "<<aLine<<endl; + } + + vocId = atoi(tokensInLine[1].toString()); + + if(vocId>this->maxIdInVoc){ + this->maxIdInVoc = vocId; + } + + this->text2id.insert(make_pair(tokensInLine[0], vocId)); + this->id2text.insert(make_pair(vocId, tokensInLine[0] )); + + } + + aLine[0]=0; + } + cerr<<"Total "<<this->text2id.size()<<" word types loaded\n"; + cerr<<"Max VocID="<<this->maxIdInVoc<<endl; +} + +/// Return the maximum ID from all words in the vocabulary +/// Usually equals to the size of the vocabulary if the vocabulary is created from this corpus only. 
+/// If the vocabulary includes words from other corpora and the vocabulary only lists words in this corpus, +/// then max voc ID could be different from the vocabulary size +IndexType C_IDVocabulary::returnMaxID() +{ + return this->maxIdInVoc; +} + +IndexType C_IDVocabulary::returnNullWordID() +{ + return 0; +} + +/** +* Output the vocabulary to a file +**/ +void C_IDVocabulary::outputToFile(char *filename) +{ + + ofstream outputVocFile; + outputVocFile.open(filename); + + if(!outputVocFile){ + cerr<<"Can not open "<<filename<<" to write vocabulary\n"; + exit(-1); + } + + map<C_String, IndexType, ltstr>::iterator iterText2Id; + + iterText2Id = this->text2id.begin(); + while(iterText2Id!=this->text2id.end()){ + outputVocFile<<iterText2Id->first.toString()<<"\t"<<iterText2Id->second<<endl; + iterText2Id++; + } + + outputVocFile.close(); +} + +/// Reserver vocID 0-NUMBER_OF_RESERVED_WORDS_IN_VOC for special words that might be useful for applications +/// Here we reserved 5 words: +/// _SENT_ID_PLACEHOLDER_ 1 +/// _END_OF_SENTENCE_ 2 +/// _TOO_LONG_TOKEN_ 3 +/// _SENTENCE_START_ 4 +/// _END_OF_CORPUS_ 5 +/// You can add other special words to the list as long as the assignment of vocID and its interpretation is consistent between application and indexing +void C_IDVocabulary::addingReservedWords() +{ + this->insertWord(C_String("_SENT_ID_PLACEHOLDER_"), 1); + this->insertWord(C_String("_END_OF_SENTENCE_"), 2); + this->insertWord(C_String("_TOO_LONG_TOKEN_"), 3); + this->insertWord(C_String("_SENTENCE_START_"), 4); + this->insertWord(C_String("_END_OF_CORPUS_"), 5); + + char reservedWord[20]; + for(int i=6; i<=NUMBER_OF_RESERVED_WORDS_IN_VOC; i++){ + memset(reservedWord, 0, 20); + sprintf(reservedWord, "_RESERVED_WORDS_%d", i); + this->insertWord(C_String(reservedWord), i); + } +} + +void C_IDVocabulary::insertWord(C_String text, IndexType id) +{ + this->text2id.insert(make_pair(text, id)); + this->id2text.insert(make_pair(id, text)); + +} + +/** +* Check if the word 
already exist in the voc, +* if so, return the vocID of the word, +* otherwise assign an ID to this word and insert it into the voc +**/ +IndexType C_IDVocabulary::getId(C_String text) +{ + IndexType id = this->returnId(text); + if(id==0){ + this->maxIdInVoc++; + this->insertWord(text, this->maxIdInVoc); + return this->maxIdInVoc; + } + + //else, already exist + return id; +} diff --git a/Src/Shared/_IDVocabulary.cpp~ b/Src/Shared/_IDVocabulary.cpp~ new file mode 100755 index 0000000..d5e6a14 --- /dev/null +++ b/Src/Shared/_IDVocabulary.cpp~ @@ -0,0 +1,218 @@ +/** +* _IDVocabulary.cpp: implementation of the C_IDVocabulary class. +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + + +#include "_IDVocabulary.h" +#include <fstream> +#include <iostream> +#include <cstring> +#include <memory.h> + +using namespace std; + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_IDVocabulary::C_IDVocabulary() +{ + this->maxIdInVoc = 0; +} + +C_IDVocabulary::C_IDVocabulary(const char * fileName) +{ + + this->maxIdInVoc = 0; + + this->loadFromFile(fileName); +} + +C_IDVocabulary::~C_IDVocabulary() +{ + +} + +/// Return the vocID of word "text" if it exist in the vocabulary +/// Otherwise return 0 +IndexType C_IDVocabulary::returnId(C_String text) +{ + IndexType id; + + map<C_String, IndexType, ltstr>::iterator iterText2Id; + iterText2Id = this->text2id.find(text); + + if(iterText2Id==this->text2id.end()){ //this word does not exist in the voc yet, return ID for <unk> + id = 0; + } + else{ + id = iterText2Id->second; + } + + return id; +} + +/// Return the text of the word given its vocID +/// return <UNK> if specified vocID does not exist +C_String C_IDVocabulary::getText(IndexType id) +{ + map<IndexType, C_String>::iterator iterId2Text; + iterId2Text = this->id2text.find(id); + + 
if(iterId2Text==this->id2text.end()){ + return C_String("<UNK>"); + } + + return iterId2Text->second; +} + +IndexType C_IDVocabulary::getSize() +{ + return this->text2id.size(); +} + + +/// Load the vocabulary file into memory +/// The format of the vocabulary file is: +/// word vocID +// in each line. +void C_IDVocabulary::loadFromFile(const char *fileName) +{ + + ifstream existingVocFile; + existingVocFile.open(fileName); + + if(!existingVocFile){ + cerr<<"Can not open existing vocabulary file "<<fileName<<endl; + exit(0); + } + + cerr<<"Loading existing vocabulary file: "<<fileName<<endl; + + char aLine[1024]; + char * aToken; + char delimit[] = " \t\r\n"; + IndexType vocId = 0; + + while(!existingVocFile.eof()){ + existingVocFile.getline(aLine, 1024, '\n'); + + if(strlen(aLine)>0){ //a meaningful word, esp for the last line during reading file + vector<C_String> tokensInLine; + + aToken = strtok(aLine, delimit); + while( aToken != NULL ) { + tokensInLine.push_back(C_String(aToken)); + aToken = strtok( NULL, delimit); + } + + if(tokensInLine.size()!=2){ + cerr<<"Not valid format for Vocabulary: "<<aLine<<endl; + } + + vocId = atoi(tokensInLine[1].toString()); + + if(vocId>this->maxIdInVoc){ + this->maxIdInVoc = vocId; + } + + this->text2id.insert(make_pair(tokensInLine[0], vocId)); + this->id2text.insert(make_pair(vocId, tokensInLine[0] )); + + } + + aLine[0]=0; + } + cerr<<"Total "<<this->text2id.size()<<" word types loaded\n"; + cerr<<"Max VocID="<<this->maxIdInVoc<<endl; +} + +/// Return the maximum ID from all words in the vocabulary +/// Usually equals to the size of the vocabulary if the vocabulary is created from this corpus only. 
+/// If the vocabulary includes words from other corpora and the vocabulary only lists words in this corpus, +/// then max voc ID could be different from the vocabulary size +IndexType C_IDVocabulary::returnMaxID() +{ + return this->maxIdInVoc; +} + +IndexType C_IDVocabulary::returnNullWordID() +{ + return 0; +} + +/** +* Output the vocabulary to a file +**/ +void C_IDVocabulary::outputToFile(char *filename) +{ + + ofstream outputVocFile; + outputVocFile.open(filename); + + if(!outputVocFile){ + cerr<<"Can not open "<<filename<<" to write vocabulary\n"; + exit(-1); + } + + map<C_String, IndexType, ltstr>::iterator iterText2Id; + + iterText2Id = this->text2id.begin(); + while(iterText2Id!=this->text2id.end()){ + outputVocFile<<iterText2Id->first.toString()<<"\t"<<iterText2Id->second<<endl; + iterText2Id++; + } + + outputVocFile.close(); +} + +/// Reserver vocID 0-NUMBER_OF_RESERVED_WORDS_IN_VOC for special words that might be useful for applications +/// Here we reserved 5 words: +/// _SENT_ID_PLACEHOLDER_ 1 +/// _END_OF_SENTENCE_ 2 +/// _TOO_LONG_TOKEN_ 3 +/// _SENTENCE_START_ 4 +/// _END_OF_CORPUS_ 5 +/// You can add other special words to the list as long as the assignment of vocID and its interpretation is consistent between application and indexing +void C_IDVocabulary::addingReservedWords() +{ + this->insertWord(C_String("_SENT_ID_PLACEHOLDER_"), 1); + this->insertWord(C_String("_END_OF_SENTENCE_"), 2); + this->insertWord(C_String("_TOO_LONG_TOKEN_"), 3); + this->insertWord(C_String("_SENTENCE_START_"), 4); + this->insertWord(C_String("_END_OF_CORPUS_"), 5); + + char reservedWord[20]; + for(int i=6; i<=NUMBER_OF_RESERVED_WORDS_IN_VOC; i++){ + memset(reservedWord, 0, 20); + sprintf(reservedWord, "_RESERVED_WORDS_%d", i); + this->insertWord(C_String(reservedWord), i); + } +} + +void C_IDVocabulary::insertWord(C_String text, IndexType id) +{ + this->text2id.insert(make_pair(text, id)); + this->id2text.insert(make_pair(id, text)); + +} + +/** +* Check if the word 
already exist in the voc, +* if so, return the vocID of the word, +* otherwise assign an ID to this word and insert it into the voc +**/ +IndexType C_IDVocabulary::getId(C_String text) +{ + IndexType id = this->returnId(text); + if(id==0){ + this->maxIdInVoc++; + this->insertWord(text, this->maxIdInVoc); + return this->maxIdInVoc; + } + + //else, already exist + return id; +} diff --git a/Src/Shared/_IDVocabulary.h b/Src/Shared/_IDVocabulary.h new file mode 100755 index 0000000..fa50add --- /dev/null +++ b/Src/Shared/_IDVocabulary.h @@ -0,0 +1,55 @@ +#if !defined(__IDVocabulary_H__INCLUDED_) +#define __IDVocabulary_H__INCLUDED_ + +#include "_String.h" +#include <string> +#include <map> +#include <vector> +#include "salm_shared.h" + +using namespace std; + + +struct ltstr +{ + bool operator()(C_String s1, C_String s2) const + { + return s1<s2; + } +}; + +/** +* Vocabulary class +* Mapping between words and their IDs +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_IDVocabulary +{ + +public: + ///Return the ID of word "text", if the word does not exist, add the word into the voc and return the newly assigned ID + IndexType getId(C_String text); + + void addingReservedWords(); + void outputToFile(char * filename); + IndexType returnNullWordID(); + IndexType returnMaxID(); + IndexType returnId(C_String text); + + IndexType getSize(); + C_String getText(IndexType); + + C_IDVocabulary(); + C_IDVocabulary(const char * fileName); + virtual ~C_IDVocabulary(); + +private: + void insertWord(C_String text, IndexType id); + void loadFromFile(const char * fileName); + IndexType maxIdInVoc; + map<C_String, IndexType, ltstr> text2id; + map<IndexType, C_String> id2text; +}; + +#endif // !defined(__IDVocabulary_H__INCLUDED_) diff --git a/Src/Shared/_String.cpp b/Src/Shared/_String.cpp new file mode 100755 index 0000000..75ba8e8 --- /dev/null +++ b/Src/Shared/_String.cpp @@ -0,0 +1,253 @@ +/** +* _String.cpp: 
implementation of the C_String class. +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include "_String.h" +#include "malloc.h" +#include "string.h" +#include "stdio.h" +#include "stdlib.h" + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_String::C_String() +{ + this->content = (char *) malloc(sizeof(char)); + this->content[0]='\0'; + this->hasContent = true; +} + +void C_String::freeContent() +{ + if(this->hasContent){ + this->hasContent = false; + free(this->content); + } +} + +C_String::~C_String() +{ + this->freeContent(); +} + +/** +* Copy constructor from a char string +**/ +C_String::C_String(char * str1) +{ + + this->content = (char *) malloc(sizeof(char)*strlen(str1)+1); + if(this->content==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + strcpy(this->content, str1); + + this->hasContent = true; +} + + +C_String::C_String(C_String const &strObj1) +{ + this->hasContent = false; + copy(strObj1); +} + +C_String::C_String(const C_String & obj1, const C_String & obj2) +{ + this->freeContent(); + + int len1 = strlen(obj1.content); + int len2 = strlen(obj2.content); + + int fullLen = len1+len2; + this->content = (char *) malloc(sizeof(char)*len1 + sizeof(char)*len2 + 1); + + if(this->content==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + char * pointer = (char*) this->content; + strcpy(pointer, obj1.content); //copy first part + pointer += len1; + strcpy(pointer, obj2.content); //copy second part + + this->content[fullLen]='\0'; + + this->hasContent = true; +} + +void C_String::operator=(const C_String &strObj2) +{ + copy(strObj2); +} + +void C_String::copy(const C_String &strObj) +{ + this->freeContent(); + + this->content = (char *) malloc(sizeof(char)*strlen(strObj.content)+1); + if(this->content==NULL){ + 
fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + strcpy(this->content, strObj.content); + this->hasContent = true; +} + +void C_String::copy(const C_String &strObj, int copyLen) +{ + this->freeContent(); + + this->content = (char *) malloc(sizeof(char)*(copyLen+1) ); + if(this->content==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + for(int i=0;i<copyLen;i++){ + this->content[i]=strObj.getCharAtPos(i); + } + + this->content[copyLen]='\0'; + + this->hasContent = true; + +} + +void C_String::print2stream(FILE *stream) +{ + fprintf(stream, content); +} + + +int C_String::length() const +{ + if(this->hasContent){ + return strlen(this->content); + } + + return 0; +} + +bool C_String::operator==(const C_String &obj1) const +{ + if(strcmp(this->content, obj1.content)==0){ + return true; + }; + + return false; +} + +bool C_String::operator!=(const C_String &obj1) const +{ + if(strcmp(this->content, obj1.content)!=0){ + return true; + }; + + return false; +} + +bool C_String::operator<(const C_String &obj1) const +{ + if(strcmp(this->content, obj1.content)<0){ + return true; + }; + + return false; +} + +char * C_String::toString() const +{ + return this->content; +} + +void C_String::clear() +{ + this->freeContent(); + + this->content = (char *) malloc(sizeof(char)); + this->content[0]='\0'; + this->hasContent = true; +} + + +char C_String::getCharAtPos(int pos) const +{ + if(pos>=this->length()){ + fprintf(stderr,"Can not get char at pos %d, out of bound! 
Exit.\n", pos); + exit(0); + } + + return this->content[pos]; +} + + +void C_String::appending(const C_String &obj) +{ + int len1 = 0; + + if(this->hasContent){ + len1 = strlen(this->content); + } + + int len2 = strlen(obj.content); + + int fullLen = len1+len2; + + char * newContent = (char *) malloc(sizeof(char)*fullLen + 1); + + if(newContent==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + char * pointer = newContent; + if(this->hasContent){ + strcpy(pointer, content); //copy first part + pointer += len1; + } + + strcpy(pointer, obj.content); //copy second part + newContent[fullLen]='\0'; + + //free old content + this->freeContent(); + + //point to new content + this->content = newContent; + + this->hasContent = true; +} + +void C_String::appending(const char nextChar) +{ + int len1 = 0; + + if(this->hasContent){ + len1 = strlen(this->content); + } + + int fullLen = len1+1; + + char * newContent = (char *) malloc(sizeof(char)*fullLen + 1); + + if(newContent==NULL){ + fprintf(stderr,"Memory allocation error, Quit.\n"); + } + + strcpy(newContent, content); //copy first part + + newContent[len1]=nextChar; //copy second part + newContent[fullLen]='\0'; + + //free old content + this->freeContent(); + + //point to new content + this->content = newContent; + + this->hasContent = true; +} diff --git a/Src/Shared/_String.h b/Src/Shared/_String.h new file mode 100755 index 0000000..d8f633d --- /dev/null +++ b/Src/Shared/_String.h @@ -0,0 +1,45 @@ +#if !defined(__STRING_H__INCLUDED_) +#define __STRING_H__INCLUDED_ + +/** +* Definition of class C_String +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +#include "stdio.h" + +class C_String +{ +public: + + char getCharAtPos(int) const; + void clear(); + char * toString() const; + int length() const; + void print2stream(FILE *); + + C_String(const C_String & obj1, const C_String & obj2); + C_String(C_String const&); + C_String(char *); + 
C_String(); + + bool operator==(const C_String &) const; + bool operator!=(const C_String &) const; + bool operator<(const C_String &) const; + void operator=(const C_String &strObj2); + + void appending(const C_String & obj); + void appending(const char nextChar); + + virtual ~C_String(); + +private: + void freeContent(); + void copy(const C_String &); + void copy(const C_String &strObj, int copyLen); + + bool hasContent; + char * content; +}; + +#endif // !defined(__STRING_H__INCLUDED_) diff --git a/Src/Shared/salm_shared.h b/Src/Shared/salm_shared.h new file mode 100755 index 0000000..2c0e186 --- /dev/null +++ b/Src/Shared/salm_shared.h @@ -0,0 +1,36 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +#if !defined(_SA_common_h) +#define _SA_common_h + +#include "math.h" + +typedef unsigned int IndexType; +typedef unsigned int TextLenType; +typedef unsigned short int SearchLenType; + +//constants +const int SIZE_ONE_READ = 16384; //when loading the data, each I/O read in SIZE_ONE_READ data points +const int MAX_TOKEN_LEN = 1024; //length of the longest word + +const int NUMBER_OF_RESERVED_WORDS_IN_VOC = 100; + +/// for language modeling +const double SALM_PROB_UNK = 0.00000000023283064365386962890625; // 1/4G +const double SALM_LOG_PROB_UNK = log(SALM_PROB_UNK); +const double SALM_LOG_0 = -20; + +/** +* \ingroup scan +**/ +typedef struct s_nGramScanningInfoElement +{ + IndexType vocId; + TextLenType freqThreshForOutput; + TextLenType freqSoFar; +}S_nGramScanningInfoElement; + +#endif + diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp new file mode 100755 index 0000000..ab2915d --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp @@ -0,0 +1,63 @@ +#include "stdio.h" +#include "stdlib.h" +#include "time.h" +#include 
"_SuffixArrayLanguageModel.h" +#include <iostream> +#include <sstream> +#include <string> +#include <cstring> + +using namespace std; + +/** +* A simple example of using the C_SuffixArrayLanguageModel class to calculate the LM prob of input sentences +* +* Revision $Rev: 3816 $ +* Last Modified $LastChangedDate: 2007-07-06 14:36:11 -0400 (Fri, 06 Jul 2007) $ +**/ +int main(int argc, char * argv[]){ + if(argc<2){ + cerr<<"\nUsage:\n\t"<<argv[0]<<" configurationFileName < sentences\n"; + exit(0); + } + + C_SuffixArrayLanguageModel salm(argv[1]); + + long ltime1, ltime2; + time( <ime1 ); + + string aWord; + char aLine[10240]; + while(!cin.eof()){ + cin.getline(aLine, 10240, '\n'); + + if(strlen(aLine)>0){ + istringstream inputLine(aLine, istringstream::in); + LMState lmState = salm.beginOfSentenceState(); + + LMState nextState; + double logProb = 0; + + while(! inputLine.eof()){ + inputLine>>aWord; + if(aWord.length()>0){ + IndexType vocId = salm.returnVocId(C_String((char *) aWord.c_str())); + logProb+=salm.logProb(lmState, vocId, nextState); + lmState = nextState; + } + aWord=""; + } + + logProb+=salm.logProbEnd(lmState); + cout<<"LogProb="<<logProb<<endl; + + } + + aLine[0]=0; + } + + time( <ime2 ); + cerr<<"\n"<<ltime2-ltime1<<" seconds spent."<<endl; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~ new file mode 100755 index 0000000..95e7993 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/Applications/EvaluateLM.cpp~ @@ -0,0 +1,62 @@ +#include "stdio.h" +#include "stdlib.h" +#include "time.h" +#include "_SuffixArrayLanguageModel.h" +#include <iostream> +#include <sstream> +#include <string> + +using namespace std; + +/** +* A simple example of using the C_SuffixArrayLanguageModel class to calculate the LM prob of input sentences +* +* Revision $Rev: 3816 $ +* Last Modified $LastChangedDate: 
2007-07-06 14:36:11 -0400 (Fri, 06 Jul 2007) $ +**/ +int main(int argc, char * argv[]){ + if(argc<2){ + cerr<<"\nUsage:\n\t"<<argv[0]<<" configurationFileName < sentences\n"; + exit(0); + } + + C_SuffixArrayLanguageModel salm(argv[1]); + + long ltime1, ltime2; + time( <ime1 ); + + string aWord; + char aLine[10240]; + while(!cin.eof()){ + cin.getline(aLine, 10240, '\n'); + + if(strlen(aLine)>0){ + istringstream inputLine(aLine, istringstream::in); + LMState lmState = salm.beginOfSentenceState(); + + LMState nextState; + double logProb = 0; + + while(! inputLine.eof()){ + inputLine>>aWord;
+ if(aWord.length()>0){ + IndexType vocId = salm.returnVocId(C_String((char *) aWord.c_str())); + logProb+=salm.logProb(lmState, vocId, nextState); + lmState = nextState; + }
+ aWord=""; + } + + logProb+=salm.logProbEnd(lmState); + cout<<"LogProb="<<logProb<<endl; + + } + + aLine[0]=0; + } + + time( <ime2 ); + cerr<<"\n"<<ltime2-ltime1<<" seconds spent."<<endl; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt new file mode 100755 index 0000000..17cd5a8 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/Readme.txt @@ -0,0 +1,5 @@ +June 27, 2007
+
+Working branch of applying KN smoothing in LM.
+Not finished yet.
+Do not distribute!
\ No newline at end of file diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp new file mode 100755 index 0000000..583b222 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.cpp @@ -0,0 +1,1113 @@ +/** +* Revision $Rev: 3665 $ +* $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ + +#include "_SuffixArrayLanguageModel.h" +#include <iostream> +#include <fstream> +#include <set> + +#include "math.h" + +using namespace std; + +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel() +{ + +} + +C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel() +{ + +} + + +/** +* Construct the suffix array language model object +* Take the configuration filename as the parameter for the constructor +* +* The configuration file is of the following format for each line: +* +* Keyword<tab>value +* <p> +* Note: keywords are all case sensitive. +* <ul> +* <li> <b>CORPUS</b> filename of the corpus for LM training. It should be the same as used in IndexSA +* <li> <b>N</b> Highest order of n considered for n-gram LM. Default value = <i>5</i> +* <li> <b>SMOOTHING_STRATEGY</b> Smoothing strategy. +* <ul> +* <li> <i>k</i> : default value. 
Modified Kneser-Ney Smoothing @see <a href=http://acl.ldc.upenn.edu/P/P96/P96-1041.pdf> An Empirical Study of Smoothing Techniques for Language Modeling </a> +* <li> <i>g</i> : Good-Turing discounting @see <a href=http://l2r.cs.uiuc.edu/~danr/Teaching/CS598-05/Papers/Gale-Sampson-smoothgoodturing.pdf> Good Turing without Tears</a> +* </ul> +* <li> <b>INTERPOLATION_STRATEGY</b> : Interpolation strategy +* <ul> +* <li> <i>e</i> : Probability of the next word predicted by histories of different orders are equally interpolated +* <li> <i>m</i> : Use the maximum conditional probability from all different order of history as the probability for the next word +* <li> <i>i</i> : Use deleted interpolation based on heuristics developed by IBM +* </ul> +* <li> <b>MAX_FREQ_DISC</b>: <br> +* <i>default</i>=50<br> +* If the frequency of an n-gram is lower than this value and SMOOTHING is set, discounting will be applied. <br> +* If this value is set to 0 or negative values, smoothing/discounting will not be used. <br> +* <li> <b>PURGE_CACHE</b>: Check entries in the cache after "PURGE_CACHE" number of sentences have been processed. Default = 100. +* <li> <b>FRESH_TIME</b>: Entries in the cache that are not used since "current time - FRESH_TIME" will be purged from the cache. Mesured in seconds of wall clock time. +** </ul> +* @param Configuration File Name +* @param corpusFileNameStem The training corpus filename used by IndexSA. +**/ +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName) +{ + + fstream cfgFile; + cfgFile.open(cfgFileName,ios::in); + + if(!cfgFile){ + fprintf(stderr,"Configuration file %s does not exist! 
quit!!\n", cfgFileName); + exit(-1); + } + + //----------------------------------------------------------------------------- + //reading parameters + char paraName[1024]; + char corpusFileNameStem[1024]; + + corpusFileNameStem[0]='\0'; + + //default values for member variables + this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob + this->smoothingStrategy = 'k'; //default smoothing strategy: modified Kneser-Ney smoothing + this->maxFreqForDiscounting = 50; //default, freq that is lower than this value will not be applied with discounting + this->maxN= 5; // default value; consider up to 5 words + + this->numberOfSentSeenToPurgeCache = 100; //default value, purge cache after processing 100 sentences + this->freshTime = 50; //entries in the cache that are older than 50 seconds are subject to purging + this->sentenceProcessedSoFar = 0; + this->typeOfBigrams = 0; + + while(!cfgFile.eof()){ + cfgFile>>paraName; + + if(strcmp(paraName,"CORPUS")==0){ + cfgFile>>corpusFileNameStem; + } + else if(strcmp(paraName, "SMOOTHING_STRATEGY")==0){ + cfgFile>>this->smoothingStrategy; + } + else if(strcmp(paraName,"N")==0){ + cfgFile>>this->maxN; + } + else if(strcmp(paraName,"MAX_FREQ_DISC")==0){ + cfgFile>>this->maxFreqForDiscounting; + } + else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){ + cfgFile>>this->interpolationStrategy; + } + else if(strcmp(paraName,"FRESH_TIME")==0){ + cfgFile>>this->freshTime; + } + else if(strcmp(paraName, "PURGE_CACHE")==0){ + cfgFile>>this->numberOfSentSeenToPurgeCache; + } + + paraName[0]=0; + + } + + + if(strlen(corpusFileNameStem)==0){ + cerr<<"CORPUS not specified in the configuration file! 
Quit!"<<endl; + exit(-1); + } + + + this->loadData_forSearch(corpusFileNameStem, false, true); //call the constructor of the super class + //corpusName, with vocabulary, no offset, + + + this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + + //initialize the scanning list + for(int i=0;i<this->maxN;i++){ + this->nGramScanningList[i].freqSoFar=0; + this->nGramScanningList[i].vocId = 0; + this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n"; + exit(0); + } + + if(this->maxFreqForDiscounting<=0){ + this->applyDiscounting = false; + } + else{ + if(this->maxFreqForDiscounting<3){ + cerr<<"MAX_FREQ_DISC has to be at least 3!"<<endl; + exit(-1); + } + + this->applyDiscounting = true; + this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map + } + +} + +/** +* Set strategy to interploate the conditional probabilities of next word given different order of histories +* 'e' for equal weighted interpolation of unigram, bigram, trigram... 
probabiblities +* 'm' for using the maximum probabilty from all histories and use this value as P(next word | history) +* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable +**/ +void C_SuffixArrayLanguageModel::setParam_interpolationStrategy(char interpolationStrategy) +{ + this->interpolationStrategy = interpolationStrategy; +} + +/** +* Set the value for parameter :numberOfSentSeenToPurgeCache +* LM will purge the entries in the cache that have not been used in 'freshTime' +**/ +void C_SuffixArrayLanguageModel::setParam_numberOfSentSeenToPurgeCache(int numberOfSentSeenToPurgeCache) +{ + this->numberOfSentSeenToPurgeCache = numberOfSentSeenToPurgeCache; +} + +/** +* Set the value for parameter: freshTime +* LM will purge the entries in the cache that have not been used in 'freshTime' +**/ +void C_SuffixArrayLanguageModel::setParam_freshTime(long freshTime) +{ + this->freshTime = freshTime; +} + +/** +* Similar to the function in C_SuffixArrayScanningBase +* Scan the corpus to obtain count of counts information +* and construct the discounting using Good-Turing smoothing +* Also, estimate the Y, D1, D2, D3+ values as needed for the modified Kneser-Ney smoothing +**/ +void C_SuffixArrayLanguageModel::constructDiscountingMap() +{ + unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting); + this->typeOfBigrams = 0; + + if(countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. 
Exit\n"; + exit(0); + } + + for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){ + countOfCountsTable[c]=0; + } + + + int i,j; + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( + (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match + + this->nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + + + + for(j=i;j<this->maxN;j++){ + + + if(this->nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + if(j==1){ //a new bigram type, this information is important for KN-smoothing + this->typeOfBigrams++; + } + + + freqSoFar = this->nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (j+1)-gram with freq freqSoFar + countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = 
this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + this->nGramScanningList[j].freqSoFar = 0; + } + else{ + this->nGramScanningList[j].freqSoFar = 1; + } + + this->nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(this->nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ + + if(i==1){ + this->typeOfBigrams++; + } + + freqSoFar = this->nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (i+1)-gram with freq freqSoFar + countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + } + + //now, use Good-Turing discounting to create frequency mapping + //still assign N*Freq table for simplicity, even though that for each N, only maxFreq-1 freq type will be discounted + this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting); + + for(i=0;i<this->maxN;i++){ + //for (i+1)-gram + + unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting; + double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting; + + for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq + //for all (freq+1) ngrams + 
if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists + discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]); + } + else{ + discountingMapForThisN[freq] = -1; + } + } + + discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency + } + + + //estimate the Y, D1, D2 and D3+ values for each order of n. + //these values will be used for KN-smoothing to estimate the gamma, the discounting factor + this->Y = (double *) malloc(sizeof(double) * this->maxN); + this->D1 = (double *) malloc(sizeof(double) * this->maxN); + this->D2 = (double *) malloc(sizeof(double) * this->maxN); + this->D3plus = (double *) malloc(sizeof(double) * this->maxN); + + for(i=0;i<this->maxN;i++){ + unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting; + double n1 = ccTableForThisN[0]; //number of n-gram types that have freq equals 1 + double n2 = ccTableForThisN[1]; //number of n-gram types that have freq equals 2; + double n3 = ccTableForThisN[2]; //number of n-gram types that have freq equals 3; + double n4 = ccTableForThisN[3]; //number of n-gram types that have freq equals 4; + + this->Y[i] = n1/(n1+2*n2); //for (i+1)-gram + this->D1[i] = 1-2*Y[i]*n2/n1; + this->D2[i] = 2-3*Y[i]*n3/n2; + this->D3plus[i] = 3 - 4*Y[i]*n4/n3; + } + + free(countOfCountsTable); +} + +///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1] +///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord +///only need to get freq(w_n | history) of different history +///return in freq table, freq(history+Wn, history) for all the matched n +///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history +/// 3-gram freq, freq of 3-gram history +///freqTable should have length of 2*n +///return the longest match with this updated n-gram +void 
C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + vector<IndexType> nGram; + + if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk> + if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram + currentMatchStart++; + currentMatchLen--; + } + + for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){ + nGram.push_back(this->corpus_list[pos]); + } + } + + nGram.push_back(nextWord); + + int sentLen = nGram.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram); + + int startPosForNgram; + int startPosForLongestMatchingWithNextWord; + int cellIndexForLongestMatchingWithNextWord; + + bool stillMatched = true; + bool atLeastOneMatched = false; + + int indexForNgram; + + unsigned int totalOccurrences; + unsigned int totalOccurrencesOfHistory; + + //for unigram + indexForNgram = sentLen - 1; + if(table[indexForNgram].found){ + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + if(this->smoothingStrategy=='g'){ //if use Good-Turing for discounting + freqTable[0] = this->discountFreq_GT(1, totalOccurrences); + } + else{ + freqTable[0] = totalOccurrences; + } + + freqTable[1] = this->corpusSize; + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = sentLen-1; + atLeastOneMatched = true; + } + else{ + stillMatched = false; + } + + int n=2; //considering 2-gram and longer n-gram now + startPosForNgram = sentLen - 2; + while((stillMatched)&&(startPosForNgram>=0)){ + + indexForNgram = (n-1) * sentLen + startPosForNgram; + int indexForHistory = (n-2) * sentLen + startPosForNgram; + + if(table[indexForNgram].found){ + + 
totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1; + + + if(this->applyDiscounting){ + freqTable[2*n-2] = this->discountFreq_GT(n, totalOccurrences); + } + else{ + freqTable[2*n-2] = (double)totalOccurrences; + } + + freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history + + if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = startPosForNgram; + } + } + else{ + stillMatched = false; + } + + startPosForNgram--; + n++; + } + + if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord' + updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA]; + updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord); + } + else{ + updatedMatchingStart = (TextLenType) -1; + updatedMatchingLen = 0; + } + + free(table); + +} + + +void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, S_ContextTypeInfo * contextTypeInfo, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + vector<IndexType> nGram; + + if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk> + if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram + currentMatchStart++; + currentMatchLen--; + } + + for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){ + nGram.push_back(this->corpus_list[pos]); + } + } + + nGram.push_back(nextWord); + + int sentLen = nGram.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = 
this->constructNgramSearchTable4SentWithLCP(nGram); + + int startPosForNgram; + int startPosForLongestMatchingWithNextWord; + int cellIndexForLongestMatchingWithNextWord; + + bool stillMatched = true; + bool atLeastOneMatched = false; + + int indexForNgram; + + unsigned int totalOccurrences; + unsigned int totalOccurrencesOfHistory; + + //for unigram + indexForNgram = sentLen - 1; + if(table[indexForNgram].found){ + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + + freqTable[0] = totalOccurrences; + freqTable[1] = this->corpusSize; + + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = sentLen-1; + atLeastOneMatched = true; + } + else{ + stillMatched = false; + } + + int n=2; //considering 2-gram and longer n-gram now for token freq + startPosForNgram = sentLen - n; + while((stillMatched)&&(startPosForNgram>=0)){ + + indexForNgram = (n-1) * sentLen + startPosForNgram; + int indexForHistory = (n-2) * sentLen + startPosForNgram; + + if(table[indexForNgram].found){ + + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1; + + + freqTable[2*n-2] = (double)totalOccurrences; + freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history + + if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = startPosForNgram; + } + } + else{ + stillMatched = false; + } + + startPosForNgram--; + n++; + } + + if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord' + updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA]; + updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord); + } + else{ + 
updatedMatchingStart = (TextLenType) -1; + updatedMatchingLen = 0; + } + + + //estimate the context type information which will be used for KN-smoothing + for(n=2;n<=sentLen;n++){ + startPosForNgram = sentLen - n; + TextLenType w_in2_i1_startPos_in_SA = 0; + TextLenType w_in2_i1_endPos_in_SA = 0; + + if(n>2){ + int indexForW_in2_i1 = (n-3) * sentLen + startPosForNgram + 1; //the location information for w_{i-n+2}^{i-1} of length n-2 + w_in2_i1_startPos_in_SA = table[indexForW_in2_i1].startPosInSA; + w_in2_i1_endPos_in_SA = table[indexForW_in2_i1].endingPosInSA; + } + + int indexForW_in1_i1 = (n-2) * sentLen + startPosForNgram; //the location information of w_{i-n+1}^{i-1} of length n-1 + + this->scanCorpusForContextTypeInfo(n, nextWord, + w_in2_i1_startPos_in_SA, w_in2_i1_endPos_in_SA, + table[indexForW_in1_i1].startPosInSA, table[indexForW_in1_i1].endingPosInSA, + contextTypeInfo[n-1]); + } + + free(table); + + +} + +///given observedFreq of n-gram, return discounted freq using Good-Turing smoothing +double C_SuffixArrayLanguageModel::discountFreq_GT(int n, unsigned int observedFreq) +{ + if(n>=this->maxN){ //do not discount + return (double) observedFreq; + } + + if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq + return (double) observedFreq; + } + + //else, check the discount map + double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1]; + + if(discountedFreq>0){ + return discountedFreq; + } + + //else, no discounting + return (double) observedFreq; +} + + +///Start a new sentence now, clear up the sentence LM state +///Increase the count of 'sentenceProcessedSoFar' +///If LM has processed 'numberOfSentSeenToPurgeCache' sentences +///it is time to check if old entries in the cache should be cleaned +LMState C_SuffixArrayLanguageModel::beginOfSentenceState() +{ + long currentTime; + time(¤tTime); + + this->resetLmStates(); + this->initialLmState(); + + this->sentenceProcessedSoFar++; + 
+ if(this->sentenceProcessedSoFar==this->numberOfSentSeenToPurgeCache){ + //purge the cache + this->purgeCache(currentTime-this->freshTime); + + this->sentenceProcessedSoFar = 0; + } + + return 0; +} + +void C_SuffixArrayLanguageModel::initialLmState() +{ + //add sentence start + S_LMStateInfo sentStartNode; + sentStartNode.posInCorpus = 1; //if corpus is indexed correctly position 1 should be <s> + sentStartNode.len = 1; + + this->allLMStates.push_back(sentStartNode); + this->lmStateInfo2Id.insert(make_pair(sentStartNode, 0)); +} + +void C_SuffixArrayLanguageModel::resetLmStates() +{ + this->buffer.clear(); + this->allLMStates.clear(); + this->lmStateInfo2Id.clear(); +} + +/** +* Purge entries in the cache that are not visited after "lastVisitedTime" +* @param lastVisitedTime Entries in the cache that are older than 'lastVisitedTime' parameter will be purged +**/ +void C_SuffixArrayLanguageModel::purgeCache(long lastVisitedTime) +{ + //cerr<<this->cached_sa_access.size()<<" entries in cache, purged to "; + + map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key>::iterator iter1,iter2; + + iter1 = this->cached_sa_access.begin(); + + while(iter1!=this->cached_sa_access.end()){ + iter2=iter1; + iter2++; + + if(iter1->second.lastTimedUsed<lastVisitedTime){ + this->cached_sa_access.erase(iter1); + } + + iter1=iter2; + } + //cerr<<this->cached_sa_access.size()<<" entries"<<endl; +} + +/** +* Given the current history (as represented by the 'lmState' +* caculate the log prob of nextWord given this history P(nextword|history) +* and return the updated language model state with next word appended to the history +* @param lmState Current language model state +* @param nextWord The vocId of the next word (the word to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState) +{ + + //first check 
if we have already seen this before + map< pair<LMState, IndexType>, S_BufferedLmInfo>::iterator iterBuffer; + iterBuffer = this->buffer.find( make_pair( lmState, nextWord) ); + + if(iterBuffer==this->buffer.end()){ //we haven't seen this lmState+word yet + //search for it in the corpus + S_LMStateInfo lmStateInfo = this->allLMStates[lmState]; + TextLenType updatedMatchingStart; + unsigned char updatedMatchingLen; + + double logProb = this->logProbOfNgramFromCorpusInfo(lmStateInfo.posInCorpus, lmStateInfo.len, nextWord, updatedMatchingStart, updatedMatchingLen); + + + S_LMStateInfo updatedLmStateInfo; + updatedLmStateInfo.posInCorpus = updatedMatchingStart; + updatedLmStateInfo.len = updatedMatchingLen; + + int updatedLmStateId; + map<S_LMStateInfo, int, lt_lmStateInfo>::iterator iterLmStateInfo2Id; + iterLmStateInfo2Id = this->lmStateInfo2Id.find(updatedLmStateInfo); + if(iterLmStateInfo2Id==this->lmStateInfo2Id.end()){ //this updated lm state does not exist yet + this->allLMStates.push_back(updatedLmStateInfo); + updatedLmStateId = this->allLMStates.size()-1; + this->lmStateInfo2Id.insert(make_pair(updatedLmStateInfo, updatedLmStateId)); + } + else{ + updatedLmStateId = iterLmStateInfo2Id->second; + } + + //buffer this + S_BufferedLmInfo bufferedLmInfo; + bufferedLmInfo.logProb = logProb; + bufferedLmInfo.nextState = updatedLmStateId; + + this->buffer.insert(make_pair( make_pair(lmState, nextWord), bufferedLmInfo)); + + //updated next state + nextState = updatedLmStateId; + + return logProb; + } + + nextState = iterBuffer->second.nextState; + + return iterBuffer->second.logProb; +} + + +/** +* Given the history as lmState and append a phrase as a vector of IndexType, +* calculate the LM prob and update the lm state +* @param lmState Current language model state +* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double 
C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState) +{ + double logProb = 0; + for(int i=0;i<phrase.size();i++){ + logProb+=this->logProb(lmState, phrase[i], nextState); + lmState = nextState; + } + + return logProb; +} + +/** +* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s> +**/ +double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState) +{ + LMState dummyNextState; + return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState); +} + +/** +* Extend the current matched n-gram with next word, calculate the prob and update the updated range +* the n-gram is represented by its position in the suffix array and the length +* @param currentMatchStart Starting position of the current matched n-gram in corpus +* @param currentMatchLen Length of the matched n-gram \ +* @param nextWord Vocabulary ID of the next word (the word to be predicted) +* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus +* @param &updatedMatchingLen The length of the extended n-gram +**/ +double C_SuffixArrayLanguageModel::logProbOfNgramFromCorpusInfo(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + long currentTime; + time(¤tTime); + + double logProb; + + //first check if information is already in cache + S_CachedSA_Access_Key accessKey; + accessKey.currentMatchStart = currentMatchStart; + accessKey.currentMatchLen = currentMatchLen; + accessKey.nextWord = nextWord; + + map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key>::iterator iter_cached_sa_access; + + iter_cached_sa_access = this->cached_sa_access.find(accessKey); + + if(iter_cached_sa_access==this->cached_sa_access.end()){ //information not in cache yet + double * freqTable = 
(double *) malloc(sizeof(double)*2*(this->maxN)); + memset(freqTable, 0, 2*this->maxN*sizeof(double)); + + S_ContextTypeInfo * contextTypeInfo = (S_ContextTypeInfo *) malloc(sizeof(S_ContextTypeInfo)*this->maxN); + + switch(this->smoothingStrategy){ + case 'k': //for Modified Kneser-Ney smoothing + + this->calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, contextTypeInfo, updatedMatchingStart, updatedMatchingLen); + logProb = this->calcLogProb_kneserNeySmoothing(freqTable, contextTypeInfo); + break; + default: //all other cases including 'g' (Good-Turing smoothing) + this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen); + logProb = this->calcLogProb(freqTable); + } + + free(freqTable); + free(contextTypeInfo); + + //insert the info into the cache + S_Cached_SA_Access_Info accessInfo; + accessInfo.updatedMatchingStart = updatedMatchingStart; + accessInfo.updatedMatchingLen = updatedMatchingLen; + accessInfo.logProb = logProb; + accessInfo.lastTimedUsed = currentTime; + + this->cached_sa_access.insert(make_pair(accessKey, accessInfo)); + + return logProb; + } + + //otherwise, already exist in the cache, just update the last touched time + updatedMatchingStart = iter_cached_sa_access->second.updatedMatchingStart; + updatedMatchingLen = iter_cached_sa_access->second.updatedMatchingLen; + logProb = iter_cached_sa_access->second.logProb; + + return logProb; +} + +double C_SuffixArrayLanguageModel::calcLogProb(double *freq) +{ + switch(this->interpolationStrategy){ + case 'e': + return this->calcLogProb_equalWeightedInterpolation(freq); + break; + case 'i': + return this->calcLogProb_ibmHeuristicInterpolation(freq); + break; + case 'm': + return this->calcLogProb_maxProbInterpolation(freq); + break; + default: + cerr<<"Unknown interpolation strategy!\n"; + exit(0); + } +} + +double 
C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq) +{ + double prob = 0.0; + + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + prob+=freq[2*i]/freq[2*i+1]; + } + else{ + stillMatched = false; + } + + i++; + } + + return log(prob/(double)this->maxN); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq) +{ + double prob = 0.0; + if(freq[0]==0){ //unknown word + return SALM_LOG_PROB_UNK; + } + + double remainingWeightSum = 1.0; + + //find the first non-zero match + int i = this->maxN - 1; + + while(freq[2*i]==0){ //will stop for sure because freq[0]!=0 + i--; + } + + for(int j=i;j>=0;j--){ + //for (j+1)-gram + double historyFreq = freq[2*j+1]; + double logHistoryFreq = log(historyFreq); + if(logHistoryFreq>1){ + logHistoryFreq = 1.0; //cap it to 1 + } + + double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history + double adjustedWeights = remainingWeightSum * reliability; + + prob+=adjustedWeights * freq[2*i]/freq[2*i+1]; + + remainingWeightSum -= adjustedWeights; + } + + return log(prob); +} + +double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq) +{ + double maxProb = 0.0; + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + double prob=freq[2*i]/freq[2*i+1]; + + if(prob>maxProb){ + maxProb = prob; + } + } + else{ + stillMatched = false; + } + + i++; + } + + return log(maxProb); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +/** +* Follow the implementation described in page 23 of Chen & Goodman tech report (section 4.1.6 and 4.1.7) +* Use notation described in James 2000 pp3 for MODKN-COUNT +**/ +double C_SuffixArrayLanguageModel::calcLogProb_kneserNeySmoothing(double *freq, S_ContextTypeInfo * contextTypeFreq) +{ + 
double prob = 0.0; + int i; + + if(freq[0]>0){ + contextTypeFreq[i]. + } + + //unknown word + return SALM_LOG_PROB_UNK; +} + + +IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord) +{ + return this->voc->returnId(aWord); +} + + +/** +* Scan corpus to collect important context-type information needed for KN-smoothing +* Knowing where n-gram w_(i-n+2)^(i-1) occurs, scan corpus for N_{1+}(dot w_{i-n+2}^i) +* and N_{1+}(dot w_{i-n+2}^{i-1} dot) +* Also, collect type freq of n-grams w_{i-n+1}^{i-1} that occur exactly 1, 2 and 3+ times +* to estimate the discounting factor gammar +* +* @see Chen & Goodman 1998 page 19-20 for detailed description +* +* @param n order of n-gram +* @param w_in1 VocId of w<sub>i-n+1</sub> +* @param w_i VocId of w<sub>i</sub>, the next word to be predicted +* @param leftBoundaryOfSaRangeFor_w_in2_i1 +* @param rightBoundaryOfSaRangeFor_w_in2_i1 [leftBoundaryOfSaRangeFor_w_in2_i1, rightBoundaryOfSaRangeFor_w_in2_i1] is the range of suffix array positions that correspond to the locations of phrase w<sub>i-n+2</sub><sup>i-1</sup> +* @param leftBoundaryOfSaRangeFor_w_in1 +* @param rigthBoundaryOfSaRangeFor_w_i1 [leftBoundaryOfSaRangeFor_w_in1, rigthBoundaryOfSaRangeFor_w_i1] is the range of suffix array positions that correspond to the locations of phrase w<sub>i-n+1</sub><sup>i-1</sup> +* @return S_ContextTypeInfo containing the context type information +**/ +void C_SuffixArrayLanguageModel::scanCorpusForContextTypeInfo(int n, IndexType w_i, TextLenType leftBoundaryOfSaRangeFor_w_in2_i1, TextLenType rightBoundaryOfSaRangeFor_w_in2_i1, TextLenType leftBoundaryOfSaRangeFor_w_in1_i1, TextLenType rigthBoundaryOfSaRangeFor_w_in1_i1, S_ContextTypeInfo & result) +{ + + TextLenType i; + TextLenType posInCorpus; + IndexType nextWordInCorpus; + int n1 = n-1; //this value will be used frequently here + + //first scan the corpus for all the word types that follow w_{i-n+1}^{i-1} + //to collect N1(w_in1^i1 dot) N2, and N3+ info needed + 
result.N1_w_in1_i1_dot = 0; + result.N2_w_in1_i1_dot = 0; + result.N3plus_w_in1_i1_dot = 0; + + int freqOfCurrentType = -1; //freq of 'dot' with current type + IndexType currentNextWordType = 0; + for(i=leftBoundaryOfSaRangeFor_w_in1_i1;i<=rigthBoundaryOfSaRangeFor_w_in1_i1;i++){ + posInCorpus = this->suffix_list[i] + n1; + //suffix_list[i] is the position of w_{i-n+1} in the corpus + //suffix_list[i]+n-1 is hte position of the word (the dot in the equation) that follows w_{i-n+1}^{i-1} + + nextWordInCorpus = this->corpus_list[posInCorpus]; + freqOfCurrentType++; + if(nextWordInCorpus!=currentNextWordType){ + + if(freqOfCurrentType==1){ + result.N1_w_in1_i1_dot++; + } + else if(freqOfCurrentType==2){ + result.N2_w_in1_i1_dot++; + } + else{ //freq of this type is >=3 + result.N3plus_w_in1_i1_dot++; + } + + currentNextWordType = nextWordInCorpus; + freqOfCurrentType=0; + } + } + + //for the last type in the range + freqOfCurrentType++; + + if(freqOfCurrentType==1){ + result.N1_w_in1_i1_dot++; + } + else if(freqOfCurrentType==2){ + result.N2_w_in1_i1_dot++; + } + else{ //freq of this type is >=3 + result.N3plus_w_in1_i1_dot++; + } + + + //step 2, scan the corpus for N_{1+}(dot w_{i-n+2}^{i}) and N_{1+}(dot w_{i-n+2}^{i-1} dot) + IndexType precedingWord; + IndexType followingWord; + if(n==2){ //the special case + result.N1plus_dot_w_in2_i1_dot = this->typeOfBigrams; + + //check if we have the N_1+(dot w_i) information already + map<IndexType, unsigned int>::iterator iterTypeFreqPrecedingWord; + iterTypeFreqPrecedingWord = this->typeFreqPrecedingWord.find(w_i); + + if(iterTypeFreqPrecedingWord==this->typeFreqPrecedingWord.end()){ //does not exist yet + TextLenType startPosInSA = this->level1Buckets[w_i].first; + TextLenType endPosInSA = this->level1Buckets[w_i].last; + + set<IndexType> wordTypePrecedesW_i; + for(i=startPosInSA;i<=endPosInSA;i++){ + posInCorpus = this->suffix_list[i] - 1; + precedingWord = this->corpus_list[posInCorpus]; + + 
wordTypePrecedesW_i.insert(precedingWord); + } + + result.N1plus_dot_w_in2_i = (double) wordTypePrecedesW_i.size(); + + //and save this for future references + this->typeFreqPrecedingWord.insert(make_pair(w_i, wordTypePrecedesW_i.size())); + } + else{ //already has the information in typeFreqPrecedingWord + result.N1plus_dot_w_in2_i = (double) (iterTypeFreqPrecedingWord->second); + } + } + else{ + set<IndexType> wordTypesPrecedesW_in2_i; + set< pair<IndexType, IndexType> > wordTypesSurroundW_in2_i1; + + for(i=leftBoundaryOfSaRangeFor_w_in2_i1;i<=rightBoundaryOfSaRangeFor_w_in2_i1;i++){ + posInCorpus = this->suffix_list[i] -1; //pos of preceding word (w_{i-n+1}) in the corpus + precedingWord = this->corpus_list[posInCorpus]; + + posInCorpus+=n1; //pos of following word w_i in the corpus + followingWord = this->corpus_list[posInCorpus]; + + pair<IndexType, IndexType> tmpPair = make_pair(precedingWord, followingWord); + + //if w_i equals next word, add the preceding word to set + if(followingWord==w_i){ + wordTypesPrecedesW_in2_i.insert(precedingWord); + } + + //add the pair to set + wordTypesSurroundW_in2_i1.insert(tmpPair); + + } + + + result.N1plus_dot_w_in2_i = wordTypesPrecedesW_in2_i.size(); + result.N1plus_dot_w_in2_i1_dot = wordTypesSurroundW_in2_i1.size(); + } + + result.valid = true; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h new file mode 100755 index 0000000..9f9155a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/KN-Smoothed-branch/_SuffixArrayLanguageModel.h @@ -0,0 +1,210 @@ +#if ! 
defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__) +#define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__ + + +#include "_SuffixArraySearchApplicationBase.h" +#include "salm_shared.h" +#include "time.h" + +/** +* \ingroup lm +* Context type information needed in KN-smoothing +**/ +typedef struct s_contextTypeInfo{ + double N1plus_dot_w_in2_i; //Goodman and Chen 98, eq 23 + double N1plus_dot_w_in2_i1_dot; + double N1_w_in1_i1_dot; //Goodman and Chen 98, eq 19 + double N2_w_in1_i1_dot; + double N3plus_w_in1_i1_dot; + bool valid; +}S_ContextTypeInfo; + + +/** +* \ingroup lm +**/ +typedef unsigned int LMState; + + +/** +* \ingroup lm +**/ +typedef struct s_lmStateInfo{ + TextLenType posInCorpus; + unsigned char len; +}S_LMStateInfo; + +/** +* \ingroup lm +**/ +typedef struct s_bufferedLmInfo{ + int nextState; + double logProb; +}S_BufferedLmInfo; + + +/** +* \ingroup lm +**/ +struct lt_lmStateInfo +{ + bool operator()(S_LMStateInfo a, S_LMStateInfo b) const{ + if(a.posInCorpus<b.posInCorpus){ + return true; + } + + if(a.posInCorpus>b.posInCorpus){ + return false; + } + + if(a.len<b.len){ + return true; + } + + return false; + } +}; + + +/** +* \ingroup lm +* structure for elements in the cache for accessing the suffix array for LM prob +**/ +typedef struct s_cached_SA_access_key{ + TextLenType currentMatchStart; + unsigned char currentMatchLen; + IndexType nextWord; +}S_CachedSA_Access_Key; + +typedef struct s_cached_SA_access_info{ + TextLenType updatedMatchingStart; + unsigned char updatedMatchingLen; + double logProb; + long lastTimedUsed; +}S_Cached_SA_Access_Info; + +struct lt_s_cached_SA_access_key +{ + bool operator()(S_CachedSA_Access_Key a, S_CachedSA_Access_Key b) const{ + if(a.currentMatchStart<b.currentMatchStart){ + return true; + } + + if(a.currentMatchStart>b.currentMatchStart){ + return false; + } + + if(a.currentMatchLen<b.currentMatchLen){ + return true; + } + + if(a.currentMatchLen>b.currentMatchLen){ + return false; + } + + 
if(a.nextWord<b.nextWord){ + return true; + } + + return false; + } +}; + + +/** +* \ingroup lm +* C_SuffixArrayLanguageModel inherit the C_SuffixArraySearchApplicationBase class and C_SuffixArrayScanningBase +* to provide functionalities of estimating the likelihood of a sentence given an indexed training corpus +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase +{ + +public: + IndexType returnVocId(C_String aWord); + + /// At the beginning of a sentence, return the LMState and reset the cache + LMState beginOfSentenceState(); + + /// Calculate the log prob of a word predicted by the history LM state + double logProb(LMState lmState, IndexType nextWord, LMState & nextState); + + /// The log prob of a phrase extending the history as a LMState + double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState); + + /// End of sentence + double logProbEnd(LMState lmState); + + /// Constructors + C_SuffixArrayLanguageModel(const char * cfgFileName); + C_SuffixArrayLanguageModel(); + ~C_SuffixArrayLanguageModel(); + + +private: + void scanCorpusForContextTypeInfo(int n, IndexType w_i, TextLenType leftBoundaryOfSaRangeFor_w_in2_i1, TextLenType rightBoundaryOfSaRangeFor_w_in2_i1, TextLenType leftBoundaryOfSaRangeFor_w_in1_i1, TextLenType rigthBoundaryOfSaRangeFor_w_in1_i1, S_ContextTypeInfo & result); + + void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + void calcNgramMatchingInfoTokenFreqContextTypeExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, S_ContextTypeInfo * contextTypeInfo, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + + 
//Log prob calculation + double logProbOfNgramFromCorpusInfo(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + double calcLogProb(double *freq); + double calcLogProb_equalWeightedInterpolation(double *freq); + double calcLogProb_ibmHeuristicInterpolation(double *freq); + double calcLogProb_maxProbInterpolation(double * freq); + double calcLogProb_kneserNeySmoothing(double *freq, S_ContextTypeInfo * contextTypeFreq); + + ///parameter and settings + ///set the interploation strategy + void setParam_interpolationStrategy(char interpolationStrategy); + + ///set the number of sentences processed by the LM before purging the cache + void setParam_numberOfSentSeenToPurgeCache(int numberOfSentSeenToPurgeCache); + + ///set the fresh time thresh for the cache entries + void setParam_freshTime(long freshTime); + + char smoothingStrategy; + char interpolationStrategy; + int maxN; + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + + + ///Discounting + void constructDiscountingMap(); + double discountFreq_GT(int n, unsigned int observedFreq); + + double * Y; // following the notation of Chen&Goodman 98, Eq. 
26 + double * D1; + double * D2; + double * D3plus; + double typeOfBigrams; //will be needed for KN-smoothing + + double *discountingMap; + bool applyDiscounting; + int maxFreqForDiscounting; + S_nGramScanningInfoElement * nGramScanningList; + map<IndexType, unsigned int> typeFreqPrecedingWord; + + ///LM State and related functions + void resetLmStates(); + void initialLmState(); + map< pair<LMState, IndexType>, S_BufferedLmInfo> buffer; + vector<S_LMStateInfo> allLMStates; + map<S_LMStateInfo, int, lt_lmStateInfo> lmStateInfo2Id; + + //caching information for SA access + unsigned int sentenceProcessedSoFar; + long freshTime; + unsigned int numberOfSentSeenToPurgeCache; + map<S_CachedSA_Access_Key, S_Cached_SA_Access_Info, lt_s_cached_SA_access_key> cached_sa_access; + void purgeCache(long lastVisitedTime); + +}; + +#endif diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp new file mode 100755 index 0000000..0a94ff0 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp @@ -0,0 +1,691 @@ +/** +* Revision $Rev: 3815 $ +* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $ +**/ + +#include "_SuffixArrayLanguageModel.h" +#include <iostream> +#include <fstream> +#include <stdlib.h> +#include <memory.h> +#include <cstring> + +#include "math.h" + +using namespace std; + + +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel() +{ + +} + +C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel() +{ + +} + + +/** +* Construct the suffix array language model object +* Using the training data corpusFileNameStem that has been indexed by IndexSA +* Consider at most maxN-gram in language modeling +* For frequencies that are lower than maxFreqForDiscounting, use Good-Turing for discounting +* If maxFreqForDiscounting is set to be 0 or negative value, then discounting is 
turned off. Use MLE to estimate the probability of a word given history +* @param cfgFileName Configuration file that specifies the value of parameters for SALM +* +* Each line in the configuration file is a Keyword Value pair. Legal keywords are: +* CORPUS : corpusFileNameStem The training corpus filename used by IndexSA. Must be specified! +* N : Highest order of n considered for n-gram LM estimation, default value = 5 +* MAX_FREQ_DISC : When Good-Turing discounting is used, n-grams which have frequencies higher than this value will not be discounted. Negative value will disable the discounting. default value = -1. +* INTERPOLATION_STRATEGY : Set strategy to interploate the conditional probabilities of next word given different order of histories +* 'e' default. Equal weighted interpolation of unigram, bigram, trigram... probabiblities +* 'm' for using the maximum probabilty from all histories and use this value as P(next word | history) +* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable +**/ +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName) +{ + + fstream cfgFile; + cfgFile.open(cfgFileName,ios::in); + + if(!cfgFile){ + fprintf(stderr,"Configuration file does not exist! 
quit!!\n"); + exit(0); + } + + //----------------------------------------------------------------------------- + //reading parameters + char paraName[1024]; + char corpusFileNameStem[1024]; + corpusFileNameStem[0]=0; + this->maxFreqForDiscounting=-1; + + this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob + this->maxN = 5; // default value; consider up to 5 words + + while(!cfgFile.eof()){ + cfgFile>>paraName; + + if(strcmp(paraName,"CORPUS")==0){ + cfgFile>>corpusFileNameStem; + } + else if(strcmp(paraName,"N")==0){ + cfgFile>>this->maxN; + } + else if(strcmp(paraName,"MAX_FREQ_DISC")==0){ + cfgFile>>maxFreqForDiscounting; + } + else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){ + cfgFile>>this->interpolationStrategy; + } + + paraName[0]=0; + + } + + //load corpus and suffix array + if(strlen(corpusFileNameStem)==0){ + cerr<<"CORPUS need to be specified in the configuration file. This should be the corpus name used for LM.\n"; + exit(-1); + } + this->loadData_forSearch(corpusFileNameStem, false, true); //call the constructor of the super class to load suffix array for corpusName, with vocabulary, no offset, + + + //if apply discounting construct the discounting map + if(this->maxFreqForDiscounting<=0){ + this->applyDiscounting = false; + } + else{ + this->applyDiscounting = true; + this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. 
Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n"; + exit(0); + } + + this->interpolationStrategy = 'e'; //default: interpolation strategy: equally weighted n-gram conditional prob + +} + + +/** +* Similar to the function in C_SuffixArrayScanningBase +* Scan the corpus to obtain count of counts information +* and construct the discounting using Good-Turing smoothing +**/ +void C_SuffixArrayLanguageModel::constructDiscountingMap() +{ + int i,j; + unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting); + + if(countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. Exit\n"; + exit(0); + } + + //initialize count of counts table + for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){ + countOfCountsTable[c]=0; + } + + //initialize the scanning list + S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + for(i=0;i<this->maxN;i++){ + nGramScanningList[i].freqSoFar=0; + nGramScanningList[i].vocId = 0; + nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( 
+ (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==nGramScanningList[i].vocId)){ //still match + + nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + + for(j=i;j<this->maxN;j++){ + + + if(nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + freqSoFar = nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (j+1)-gram with freq freqSoFar + countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + nGramScanningList[j].freqSoFar = 0; + } + else{ + nGramScanningList[j].freqSoFar = 1; + } + + nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + 
if(validNgramUpSoFar){ + + freqSoFar = nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (i+1)-gram with freq freqSoFar + countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + } + + //now, use Good-Turing discounting to create frequency mapping + //still assign N*Freq table for simplicity, even though that for each N, only maxFreq-1 freq type will be discounted + this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting); + + for(i=0;i<this->maxN;i++){ + //for (i+1)-gram + + unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting; + double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting; + + for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq + //for all (freq+1) ngrams + if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists + discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]); + } + else{ + discountingMapForThisN[freq] = -1; + } + } + + discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency + } + + + free(countOfCountsTable); + +} + +///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1] +///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord +///only need to get freq(w_n | history) of different history +///return in freq table, freq(history+Wn, history) for all the matched n +///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history +/// 3-gram freq, freq of 3-gram history +///freqTable should have length of 2*n +///return the longest match with this updated n-gram +void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType 
currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + vector<IndexType> nGram; + + if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk> + if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram + currentMatchStart++; + currentMatchLen--; + } + + for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){ + nGram.push_back(this->corpus_list[pos]); + } + } + + nGram.push_back(nextWord); + + int sentLen = nGram.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram); + + int startPosForNgram; + int startPosForLongestMatchingWithNextWord; + int cellIndexForLongestMatchingWithNextWord; + + bool stillMatched = true; + bool atLeastOneMatched = false; + + int indexForNgram; + + unsigned int totalOccurrences; + unsigned int totalOccurrencesOfHistory; + + //for unigram + indexForNgram = sentLen - 1; + if(table[indexForNgram].found){ + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + if(this->applyDiscounting){ + freqTable[0] = this->discountFreq(1, totalOccurrences); + } + else{ + freqTable[0] = totalOccurrences; + } + + freqTable[1] = this->corpusSize; + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = sentLen-1; + atLeastOneMatched = true; + } + else{ + stillMatched = false; + } + + int n=2; //considering 2-gram and longer n-gram now + startPosForNgram = sentLen - 2; + while((stillMatched)&&(startPosForNgram>=0)){ + + indexForNgram = (n-1) * sentLen + startPosForNgram; + int indexForHistory = (n-2) * sentLen + startPosForNgram; + + if(table[indexForNgram].found){ + + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + totalOccurrencesOfHistory = 
table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1; + + + if(this->applyDiscounting){ + freqTable[2*n-2] = this->discountFreq(n, totalOccurrences); + } + else{ + freqTable[2*n-2] = (double)totalOccurrences; + } + + freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history + + if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = startPosForNgram; + } + } + else{ + stillMatched = false; + } + + startPosForNgram--; + n++; + } + + if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord' + updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA]; + updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord); + } + else{ + updatedMatchingStart = (TextLenType) -1; + updatedMatchingLen = 0; + } + + free(table); + +} + + +//given observedFreq of n-gram, return discounted freq using Good-Turing smoothing +double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq) +{ + if(n>=this->maxN){ //do not discount + return (double) observedFreq; + } + + if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq + return (double) observedFreq; + } + + //else, check the discount map + double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1]; + + if(discountedFreq>0){ + return discountedFreq; + } + + //else, no discounting + return (double) observedFreq; +} + + +///Start a new sentence now, clear up the sentence LM state +LMState C_SuffixArrayLanguageModel::beginOfSentenceState() +{ + + this->resetLmStates(); + this->initialLmState(); + + return 0; +} + +void C_SuffixArrayLanguageModel::initialLmState() +{ + //add sentence start + S_LMStateInfo sentStartNode; + sentStartNode.locationInCorpus.posInCorpus = 1; //if corpus is 
indexed correctly position 1 should be <s> + sentStartNode.locationInCorpus.len = 1; + sentStartNode.cachedNextWordExtension.clear(); + + this->allLMStates.push_back(sentStartNode); + this->ngramLocation2LmStateId.insert(make_pair(sentStartNode.locationInCorpus, 0)); +} + +void C_SuffixArrayLanguageModel::resetLmStates() +{ + this->allLMStates.clear(); + this->ngramLocation2LmStateId.clear(); +} + + +/** +* Given the current history (as represented by the 'lmState' +* caculate the log prob of nextWord given this history P(nextword|history) +* and return the updated language model state with next word appended to the history +* @param lmState Current language model state +* @param nextWord The vocId of the next word (the word to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState) +{ + if(lmState>=this->allLMStates.size()){ + cerr<<"Invalid LM State: "<<lmState<<endl; + exit(-1); + } + + //first check if we have already seen this 'nextWord' before + map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache; + iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find( nextWord ); + + if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){ //we haven't seen this lmState+word yet + + //search for it in the corpus + S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus; + S_NgramLocationInCorpus updatedNgramLocation; + + double logProb = this->logProbFromFreq( + correspondingNgramLocation.posInCorpus, + correspondingNgramLocation.len, + nextWord, + updatedNgramLocation.posInCorpus, + updatedNgramLocation.len); + + //caching the logprob of 'nextword' given the lmState + int updatedLmStateId; + map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId; + 
iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation); + if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){ //this updated lm state does not exist yet + S_LMStateInfo newLmStateNode; + + newLmStateNode.locationInCorpus = updatedNgramLocation; + newLmStateNode.cachedNextWordExtension.clear(); + + this->allLMStates.push_back(newLmStateNode); + updatedLmStateId = this->allLMStates.size() -1 ; + this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId)); + } + else{ + updatedLmStateId = iterNgramLocation2LmStateId->second; + } + + //cache this + S_CachedLmInfo cachedLmInfo; + cachedLmInfo.logProb = logProb; + cachedLmInfo.nextState = updatedLmStateId; + + this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo)); + + //updated next state + nextState = updatedLmStateId; + + return logProb; + } + + nextState = iterNextWordExtensionCache->second.nextState; + + return iterNextWordExtensionCache->second.logProb; +} + + +/** +* Given the history as lmState and append a phrase as a vector of IndexType, +* calculate the LM prob and update the lm state +* Modification suggested by Erik Peterson (eepter@cs.cmu.edu) to check the size of phrase. +* For cases where phrase is empty, i.e. phrase.size()==0, nextState will not be updated correctly and may cause problems in the calling function. 
+ * @param lmState Current language model state +* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState) +{ + double logProb = 0; + + if (phrase.size() == 0) { + nextState = lmState; + return logProb; + } + + for(int i=0;i<phrase.size();i++){ + logProb+=this->logProb(lmState, phrase[i], nextState); + lmState = nextState; + } + + return logProb; +} + +/** +* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s> +**/ +double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState) +{ + LMState dummyNextState; + return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState); +} + +/** +* Extend the current matched n-gram with next word, calculate the prob and update the updated range +* the n-gram is represented by its position in the suffix array and the length +* @param currentMatchStart Starting position of the current matched n-gram in corpus +* @param currentMatchLen Length of the matched n-gram \ +* @param nextWord Vocabulary ID of the next word (the word to be predicted) +* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus +* @param &updatedMatchingLen The length of the extended n-gram +**/ +double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + + double logProb; + + double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN)); + memset(freqTable, 0, 2*this->maxN*sizeof(double)); + + this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, 
freqTable, updatedMatchingStart, updatedMatchingLen); + + logProb = this->calcLogProb(freqTable); + + free(freqTable); + + return logProb; + +} + +double C_SuffixArrayLanguageModel::calcLogProb(double *freq) +{ + switch(this->interpolationStrategy){ + case 'e': + return this->calcLogProb_equalWeightedInterpolation(freq); + break; + case 'i': + return this->calcLogProb_ibmHeuristicInterpolation(freq); + break; + case 'm': + return this->calcLogProb_maxProbInterpolation(freq); + break; + default: + cerr<<"Unknown interpolation strategy!\n"; + exit(0); + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq) +{ + double prob = 0.0; + + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + prob+=freq[2*i]/freq[2*i+1]; + } + else{ + stillMatched = false; + } + + i++; + } + + return log(prob/(double)this->maxN); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq) +{ + double prob = 0.0; + if(freq[0]==0){ //unknown word + return SALM_LOG_PROB_UNK; + } + + double remainingWeightSum = 1.0; + + //find the first non-zero match + int i = this->maxN - 1; + + while(freq[2*i]==0){ //will stop for sure because freq[0]!=0 + i--; + } + + for(int j=i;j>=0;j--){ + //for (j+1)-gram + double historyFreq = freq[2*j+1]; + double logHistoryFreq = log(historyFreq); + if(logHistoryFreq>1){ + logHistoryFreq = 1.0; //cap it to 1 + } + + double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history + double adjustedWeights = remainingWeightSum * reliability; + + prob+=adjustedWeights * freq[2*i]/freq[2*i+1]; + + remainingWeightSum -= adjustedWeights; + } + + return log(prob); +} + +double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq) +{ + double maxProb = 0.0; + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + 
while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + double prob=freq[2*i]/freq[2*i+1]; + + if(prob>maxProb){ + maxProb = prob; + } + } + else{ + stillMatched = false; + } + + i++; + } + + return log(maxProb); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord) +{ + return this->voc->returnId(aWord); +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ new file mode 100755 index 0000000..5241621 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ @@ -0,0 +1,690 @@ +/** +* Revision $Rev: 3815 $ +* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $ +**/ + +#include "_SuffixArrayLanguageModel.h" +#include <iostream> +#include <fstream> +#include <stdlib.h> +#include <memory.h> + +#include "math.h" + +using namespace std; + + +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel() +{ + +} + +C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel() +{ + +} + + +/** +* Construct the suffix array language model object +* Using the training data corpusFileNameStem that has been indexed by IndexSA +* Consider at most maxN-gram in language modeling +* For frequencies that are lower than maxFreqForDiscounting, use Good-Turing for discounting +* If maxFreqForDiscounting is set to be 0 or negative value, then discounting is turned off. Use MLE to estimate the probability of a word given history +* @param cfgFileName Configuration file that specifies the value of parameters for SALM +* +* Each line in the configuration file is a Keyword Value pair. Legal keywords are: +* CORPUS : corpusFileNameStem The training corpus filename used by IndexSA. Must be specified! 
+* N : Highest order of n considered for n-gram LM estimation, default value = 5 +* MAX_FREQ_DISC : When Good-Turing discounting is used, n-grams which have frequencies higher than this value will not be discounted. Negative value will disable the discounting. default value = -1. +* INTERPOLATION_STRATEGY : Set strategy to interploate the conditional probabilities of next word given different order of histories +* 'e' default. Equal weighted interpolation of unigram, bigram, trigram... probabiblities +* 'm' for using the maximum probabilty from all histories and use this value as P(next word | history) +* 'i' for deleted interpolation with weights determined by a heuristic that favors long n-gram probability when the frequency is reliable +**/ +C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName) +{ + + fstream cfgFile; + cfgFile.open(cfgFileName,ios::in); + + if(!cfgFile){ + fprintf(stderr,"Configuration file does not exist! quit!!\n"); + exit(0); + } + + //----------------------------------------------------------------------------- + //reading parameters + char paraName[1024]; + char corpusFileNameStem[1024]; + corpusFileNameStem[0]=0; + this->maxFreqForDiscounting=-1; + + this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob + this->maxN = 5; // default value; consider up to 5 words + + while(!cfgFile.eof()){ + cfgFile>>paraName; + + if(strcmp(paraName,"CORPUS")==0){ + cfgFile>>corpusFileNameStem; + } + else if(strcmp(paraName,"N")==0){ + cfgFile>>this->maxN; + } + else if(strcmp(paraName,"MAX_FREQ_DISC")==0){ + cfgFile>>maxFreqForDiscounting; + } + else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){ + cfgFile>>this->interpolationStrategy; + } + + paraName[0]=0; + + } + + //load corpus and suffix array + if(strlen(corpusFileNameStem)==0){ + cerr<<"CORPUS need to be specified in the configuration file. 
This should be the corpus name used for LM.\n"; + exit(-1); + } + this->loadData_forSearch(corpusFileNameStem, false, true); //call the constructor of the super class to load suffix array for corpusName, with vocabulary, no offset, + + + //if apply discounting construct the discounting map + if(this->maxFreqForDiscounting<=0){ + this->applyDiscounting = false; + } + else{ + this->applyDiscounting = true; + this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n"; + exit(0); + } + + this->interpolationStrategy = 'e'; //default: interpolation strategy: equally weighted n-gram conditional prob + +} + + +/** +* Similar to the function in C_SuffixArrayScanningBase +* Scan the corpus to obtain count of counts information +* and construct the discounting using Good-Turing smoothing +**/ +void C_SuffixArrayLanguageModel::constructDiscountingMap() +{ + int i,j; + unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting); + + if(countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. 
Exit\n"; + exit(0); + } + + //initialize count of counts table + for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){ + countOfCountsTable[c]=0; + } + + //initialize the scanning list + S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + for(i=0;i<this->maxN;i++){ + nGramScanningList[i].freqSoFar=0; + nGramScanningList[i].vocId = 0; + nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( + (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==nGramScanningList[i].vocId)){ //still match + + nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + + for(j=i;j<this->maxN;j++){ + + + if(nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + freqSoFar = nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count 
for (j+1)-gram with freq freqSoFar + countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + nGramScanningList[j].freqSoFar = 0; + } + else{ + nGramScanningList[j].freqSoFar = 1; + } + + nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ + + freqSoFar = nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){ + //increase the count for (i+1)-gram with freq freqSoFar + countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++; + } + } + } + + //now, use Good-Turing discounting to create frequency mapping + //still assign N*Freq table for simplicity, even though that for each N, only maxFreq-1 freq type will be discounted + this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting); + + for(i=0;i<this->maxN;i++){ + //for (i+1)-gram + + unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting; + double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting; + + for(int 
freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we can not discount maxFreq + //for all (freq+1) ngrams + if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists + discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]); + } + else{ + discountingMapForThisN[freq] = -1; + } + } + + discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency + } + + + free(countOfCountsTable); + +} + +///if currently matched an n-gram at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1] +///get the freq for [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord +///only need to get freq(w_n | history) of different history +///return in freq table, freq(history+Wn, history) for all the matched n +///freq: 1-gram Freq, corpusSize, 2-gram freq, freq of 2-gram history +/// 3-gram freq, freq of 3-gram history +///freqTable should have length of 2*n +///return the longest match with this updated n-gram +void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + vector<IndexType> nGram; + + if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk> + if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram + currentMatchStart++; + currentMatchLen--; + } + + for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){ + nGram.push_back(this->corpus_list[pos]); + } + } + + nGram.push_back(nextWord); + + int sentLen = nGram.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram); + + int startPosForNgram; + int startPosForLongestMatchingWithNextWord; + int 
cellIndexForLongestMatchingWithNextWord; + + bool stillMatched = true; + bool atLeastOneMatched = false; + + int indexForNgram; + + unsigned int totalOccurrences; + unsigned int totalOccurrencesOfHistory; + + //for unigram + indexForNgram = sentLen - 1; + if(table[indexForNgram].found){ + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + if(this->applyDiscounting){ + freqTable[0] = this->discountFreq(1, totalOccurrences); + } + else{ + freqTable[0] = totalOccurrences; + } + + freqTable[1] = this->corpusSize; + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = sentLen-1; + atLeastOneMatched = true; + } + else{ + stillMatched = false; + } + + int n=2; //considering 2-gram and longer n-gram now + startPosForNgram = sentLen - 2; + while((stillMatched)&&(startPosForNgram>=0)){ + + indexForNgram = (n-1) * sentLen + startPosForNgram; + int indexForHistory = (n-2) * sentLen + startPosForNgram; + + if(table[indexForNgram].found){ + + totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1; + totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1; + + + if(this->applyDiscounting){ + freqTable[2*n-2] = this->discountFreq(n, totalOccurrences); + } + else{ + freqTable[2*n-2] = (double)totalOccurrences; + } + + freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history + + if(n<this->maxN){ //new history is at most this->maxFreqForDiscounting-1 words long + cellIndexForLongestMatchingWithNextWord = indexForNgram; + startPosForLongestMatchingWithNextWord = startPosForNgram; + } + } + else{ + stillMatched = false; + } + + startPosForNgram--; + n++; + } + + if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord' + updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA]; + updatedMatchingLen = (unsigned char) 
(sentLen - startPosForLongestMatchingWithNextWord); + } + else{ + updatedMatchingStart = (TextLenType) -1; + updatedMatchingLen = 0; + } + + free(table); + +} + + +//given observedFreq of n-gram, return discounted freq using Good-Turing smoothing +double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq) +{ + if(n>=this->maxN){ //do not discount + return (double) observedFreq; + } + + if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq + return (double) observedFreq; + } + + //else, check the discount map + double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1]; + + if(discountedFreq>0){ + return discountedFreq; + } + + //else, no discounting + return (double) observedFreq; +} + + +///Start a new sentence now, clear up the sentence LM state +LMState C_SuffixArrayLanguageModel::beginOfSentenceState() +{ + + this->resetLmStates(); + this->initialLmState(); + + return 0; +} + +void C_SuffixArrayLanguageModel::initialLmState() +{ + //add sentence start + S_LMStateInfo sentStartNode; + sentStartNode.locationInCorpus.posInCorpus = 1; //if corpus is indexed correctly position 1 should be <s> + sentStartNode.locationInCorpus.len = 1; + sentStartNode.cachedNextWordExtension.clear(); + + this->allLMStates.push_back(sentStartNode); + this->ngramLocation2LmStateId.insert(make_pair(sentStartNode.locationInCorpus, 0)); +} + +void C_SuffixArrayLanguageModel::resetLmStates() +{ + this->allLMStates.clear(); + this->ngramLocation2LmStateId.clear(); +} + + +/** +* Given the current history (as represented by the 'lmState' +* caculate the log prob of nextWord given this history P(nextword|history) +* and return the updated language model state with next word appended to the history +* @param lmState Current language model state +* @param nextWord The vocId of the next word (the word to be predicted) +* @param &nextState Returning the updated language model state when the next word is 
appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState) +{ + if(lmState>=this->allLMStates.size()){ + cerr<<"Invalid LM State: "<<lmState<<endl; + exit(-1); + } + + //first check if we have already seen this 'nextWord' before + map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache; + iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find( nextWord ); + + if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){ //we haven't seen this lmState+word yet + + //search for it in the corpus + S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus; + S_NgramLocationInCorpus updatedNgramLocation; + + double logProb = this->logProbFromFreq( + correspondingNgramLocation.posInCorpus, + correspondingNgramLocation.len, + nextWord, + updatedNgramLocation.posInCorpus, + updatedNgramLocation.len); + + //caching the logprob of 'nextword' given the lmState + int updatedLmStateId; + map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId; + iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation); + if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){ //this updated lm state does not exist yet + S_LMStateInfo newLmStateNode; + + newLmStateNode.locationInCorpus = updatedNgramLocation; + newLmStateNode.cachedNextWordExtension.clear(); + + this->allLMStates.push_back(newLmStateNode); + updatedLmStateId = this->allLMStates.size() -1 ; + this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId)); + } + else{ + updatedLmStateId = iterNgramLocation2LmStateId->second; + } + + //cache this + S_CachedLmInfo cachedLmInfo; + cachedLmInfo.logProb = logProb; + cachedLmInfo.nextState = updatedLmStateId; + + this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo)); + + 
//updated next state + nextState = updatedLmStateId; + + return logProb; + } + + nextState = iterNextWordExtensionCache->second.nextState; + + return iterNextWordExtensionCache->second.logProb; +} + + +/** +* Given the history as lmState and append a phrase as a vector of IndexType, +* calculate the LM prob and update the lm state +* Modification suggested by Erik Peterson (eepter@cs.cmu.edu) to check the size of phrase. +* For cases where phrase is empty, i.e. phrase.size()==0, nextState will not be updated correctly and may cause problems in the calling function. + * @param lmState Current language model state +* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted) +* @param &nextState Returning the updated language model state when the next word is appended +**/ +double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState) +{ + double logProb = 0; + + if (phrase.size() == 0) { + nextState = lmState; + return logProb; + } + + for(int i=0;i<phrase.size();i++){ + logProb+=this->logProb(lmState, phrase[i], nextState); + lmState = nextState; + } + + return logProb; +} + +/** +* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s> +**/ +double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState) +{ + LMState dummyNextState; + return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState); +} + +/** +* Extend the current matched n-gram with next word, calculate the prob and update the updated range +* the n-gram is represented by its position in the suffix array and the length +* @param currentMatchStart Starting position of the current matched n-gram in corpus +* @param currentMatchLen Length of the matched n-gram \ +* @param nextWord Vocabulary ID of the next word (the word to be predicted) +* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its 
starting position in the corpus +* @param &updatedMatchingLen The length of the extended n-gram +**/ +double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen) +{ + + double logProb; + + double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN)); + memset(freqTable, 0, 2*this->maxN*sizeof(double)); + + this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen); + + logProb = this->calcLogProb(freqTable); + + free(freqTable); + + return logProb; + +} + +double C_SuffixArrayLanguageModel::calcLogProb(double *freq) +{ + switch(this->interpolationStrategy){ + case 'e': + return this->calcLogProb_equalWeightedInterpolation(freq); + break; + case 'i': + return this->calcLogProb_ibmHeuristicInterpolation(freq); + break; + case 'm': + return this->calcLogProb_maxProbInterpolation(freq); + break; + default: + cerr<<"Unknown interpolation strategy!\n"; + exit(0); + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq) +{ + double prob = 0.0; + + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + prob+=freq[2*i]/freq[2*i+1]; + } + else{ + stillMatched = false; + } + + i++; + } + + return log(prob/(double)this->maxN); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq) +{ + double prob = 0.0; + if(freq[0]==0){ //unknown word + return SALM_LOG_PROB_UNK; + } + + double remainingWeightSum = 1.0; + + //find the first non-zero match + int i = this->maxN - 1; + + while(freq[2*i]==0){ //will stop for sure because freq[0]!=0 + i--; + } + + for(int j=i;j>=0;j--){ + //for (j+1)-gram + double historyFreq = freq[2*j+1]; + 
double logHistoryFreq = log(historyFreq); + if(logHistoryFreq>1){ + logHistoryFreq = 1.0; //cap it to 1 + } + + double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history + double adjustedWeights = remainingWeightSum * reliability; + + prob+=adjustedWeights * freq[2*i]/freq[2*i+1]; + + remainingWeightSum -= adjustedWeights; + } + + return log(prob); +} + +double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq) +{ + double maxProb = 0.0; + + if(freq[0]>0){ + + int i=0; + bool stillMatched = true; + + while(stillMatched && (i<this->maxN)){ + if(freq[2*i]>0){ + double prob=freq[2*i]/freq[2*i+1]; + + if(prob>maxProb){ + maxProb = prob; + } + } + else{ + stillMatched = false; + } + + i++; + } + + return log(maxProb); + } + else{ //unknown word + return SALM_LOG_PROB_UNK; + } +} + +IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord) +{ + return this->voc->returnId(aWord); +} diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h new file mode 100755 index 0000000..62427e5 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.h @@ -0,0 +1,137 @@ +// Revision $Rev: 3794 $ +// Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ + +#if ! 
defined (__HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__) +#define __HEADER_SUFFIXARRAY_LANGUAGE_MODEL_INCLUDED__ + + +#include "_SuffixArraySearchApplicationBase.h" +#include "salm_shared.h" + +/** +* \ingroup lm +**/ +typedef unsigned int LMState; + + +/** +* \ingroup lm +**/ +typedef struct s_cachedLmInfo{ + int nextState; + double logProb; +}S_CachedLmInfo; + +/** +* \ingroup lm +**/ +typedef struct s_NgramLocationInCorpus{ + TextLenType posInCorpus; + unsigned char len; +}S_NgramLocationInCorpus; + +/** +* \ingroup lm +**/ +typedef struct s_lmStateInfo{ + S_NgramLocationInCorpus locationInCorpus; + map<IndexType, S_CachedLmInfo> cachedNextWordExtension; //cached information of this LMState extended by the next word +}S_LMStateInfo; + +/** +* \ingroup lm +**/ +struct lt_ngramLocationInCorpus +{ + bool operator()(S_NgramLocationInCorpus a, S_NgramLocationInCorpus b) const{ + if(a.posInCorpus<b.posInCorpus){ + return true; + } + + if(a.posInCorpus>b.posInCorpus){ + return false; + } + + if(a.len<b.len){ + return true; + } + + return false; + } +}; + + +/** +* \ingroup lm +* C_SuffixArrayLanguageModel inherit the C_SuffixArraySearchApplicationBase class and C_SuffixArrayScanningBase +* to provide functionalities of estimating the likelihood of a sentence given an indexed training corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_SuffixArrayLanguageModel : public C_SuffixArraySearchApplicationBase +{ + +public: + IndexType returnVocId(C_String aWord); + + /// At the beginning of a sentence, return the LMState and reset the cache + LMState beginOfSentenceState(); + + /// Calculate the log prob of a word predicted by the history LM state + double logProb(LMState lmState, IndexType nextWord, LMState & nextState); + + /// The log prob of a phrase extending the history as a LMState + double logProb(LMState lmState, vector<IndexType> nextPhrase, LMState & nextState); + + /// End of sentence + 
double logProbEnd(LMState lmState); + + ///set the interploation strategy + void setParam_interpolationStrategy(char interpolationStrategy); + + + C_SuffixArrayLanguageModel(const char * cfgFileName); + C_SuffixArrayLanguageModel(); + ~C_SuffixArrayLanguageModel(); + + +private: + + void calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + + //Log prob calculation + double logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen); + double calcLogProb(double *freq); + double calcLogProb_equalWeightedInterpolation(double *freq); + double calcLogProb_ibmHeuristicInterpolation(double *freq); + double calcLogProb_maxProbInterpolation(double * freq); + + char interpolationStrategy; + int maxN; + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + + ///Discounting + void constructDiscountingMap(); + double *discountingMap; + double discountFreq(int n, unsigned int observedFreq); + bool applyDiscounting; + int maxFreqForDiscounting; + S_nGramScanningInfoElement * nGramScanningList; + + + ///LM State and related functions + void resetLmStates(); + void initialLmState(); + + //caching lm prob for each sentence + vector<S_LMStateInfo> allLMStates; + map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus> ngramLocation2LmStateId; + + + +}; + +#endif diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp new file mode 100755 index 0000000..d7c96a2 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/CalcCountOfCounts.cpp @@ -0,0 +1,34 @@ + +#include "_SuffixArrayScanningBase.h" +#include "stdio.h" +#include 
"stdlib.h" +#include <iostream> +#include <fstream> +#include <map> + +using namespace std; + +/** +* Given a corpus indexed by its suffix array, output the count-of-count information +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + //----------------------------------------------------------------------------- + if(argc<4){ + fprintf(stderr,"\nGiven an indexed corpus, output the count of counts for n-grams.\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem maxN maxFreq\n\n",argv[0]); + exit(0); + } + + unsigned int maxN = atoi(argv[2]); + unsigned int maxFreq = atoi(argv[3]); + + C_SuffixArrayScanningBase saObj(argv[1], maxN); + saObj.scanSuffixArrayForCountofCounts(maxFreq); + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp new file mode 100755 index 0000000..8e9544a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/OutputHighFreqNgram.cpp @@ -0,0 +1,70 @@ +#include "_SuffixArrayScanningBase.h" +#include "stdio.h" +#include "stdlib.h" +#include <iostream> +#include <fstream> +#include <map> + +using namespace std; + +/** +* Output n-gram types that have frequencies equal or higher than specified +* +* +* CfgFile Format: +* n1<tab>freq thresh for output n1-gram +* n2<tab>freq thresh for output n2-gram +* ... ... ... 
+* n1<tab>freq thresh for output n1-gram +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + //----------------------------------------------------------------------------- + if(argc<3){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem cfgFile\n\n",argv[0]); + + fprintf(stderr,"\n\tCfgFile Format:"); + fprintf(stderr,"\n\t\tn1<tab>freq thresh for output n1-gram"); + fprintf(stderr,"\n\t\tn2<tab>freq thresh for output n2-gram"); + fprintf(stderr,"\n\t\t... ... ..."); + fprintf(stderr,"\n\t\tn1<tab>freq thresh for output n1-gram\n"); + + + exit(0); + } + + //processing the threshold file + map<int, unsigned int> threshMap; + map<int, unsigned int>::iterator iterThreshMap; + fstream threshFile; + threshFile.open(argv[2]); + int n; + int maxN = 0; + unsigned int thresh; + while(! 
threshFile.eof()){ + threshFile>>n>>thresh; + if(n>maxN){ + maxN=n; + } + iterThreshMap = threshMap.find(n); + if(iterThreshMap==threshMap.end()){ + threshMap.insert(make_pair(n,thresh)); //a little over-kill here, should have a well defined cfg file + } + } + + C_SuffixArrayScanningBase saObj(argv[1], maxN); + iterThreshMap = threshMap.begin(); + while(iterThreshMap!=threshMap.end()){ + saObj.setNgramOutputFreqThresh(iterThreshMap->first, iterThreshMap->second); + iterThreshMap++; + } + + saObj.scanSuffixArrayForHighFreqNgramType(); + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp new file mode 100755 index 0000000..35f9d3d --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/Applications/TypeTokenFreqInCorpus.cpp @@ -0,0 +1,32 @@ +#include "_SuffixArrayScanningBase.h" +#include "stdio.h" +#include "stdlib.h" +#include <iostream> +#include <fstream> +#include <map> + +using namespace std; + +/** +* Given an indexed corpus, output the type/token information of the n-grams in the corpus. 
+* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + //----------------------------------------------------------------------------- + if(argc<3){ + fprintf(stderr,"\nGiven an indexed corpus, output the type token information for n-grams.\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem maxN \n\n",argv[0]); + exit(0); + } + + unsigned int maxN = atoi(argv[2]); + + C_SuffixArrayScanningBase saObj(argv[1], maxN); + saObj.scanSuffixArrayForTypeToken(); + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp new file mode 100755 index 0000000..9050408 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp @@ -0,0 +1,338 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include "_SuffixArrayScanningBase.h" +#include <iostream> +#include <stdlib.h> + +using namespace std; + +C_SuffixArrayScanningBase::C_SuffixArrayScanningBase() +{ + this->countOfCountsTable = 0; //no memory has been allocated + this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough +} + +C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int maxN) +{ + this->countOfCountsTable = 0; //no memory has been allocated + this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough + + //load suffix array + this->loadData(filename, false, true, true); + + this->initializeForScanning(filename, maxN); +} + +void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered) +{ + this->maxFreqConsidered = maxFreqConsidered; +} + + +/** +* Initialize data 
structure needed for scanning after the suffix array has been loaded +**/ +void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN) +{ + this->maxN = maxN; + this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + this->countOfCountsTable = 0; //no memory has been allocated + + //initialize the scanning list + for(int i=0;i<this->maxN;i++){ + this->nGramScanningList[i].freqSoFar=0; + this->nGramScanningList[i].vocId = 0; + this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n"; + exit(0); + } +} + +C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase() +{ + free(this->nGramScanningList); + + if(this->countOfCountsTable!=0){ + free(this->countOfCountsTable); + } + +} + +void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh) +{ + if(n>this->maxN){ + cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl; + exit(0); + } + + this->nGramScanningList[n-1].freqThreshForOutput = freqThresh; +} + +void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType() +{ + this->scanSuffixArray('H'); + +} + +/// Count of counts is the number of n-gram types that occur a certain times in the corpus. 
+/// Count of counts is important information in LM smoothing +/// We scan the corpus for n-gram's type/token frequency and collect information for 1-gram, 2-gram,...and up to maxFreqConsidered-gram +void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered) +{ + this->maxFreqConsidered = maxFreqConsidered; + this->constructCountOfCountsTable(); + + //output the count of counts + cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl; + for(int i=0;i<this->maxN;i++){ + cout<<i+1<<endl; + + unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered; + for(int freq=0;freq<maxFreqConsidered;freq++){ + cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl; + } + } + +} + +///Check from 1-gram to maxN-gram for type-token information +///the process is similar to "scanSuffixArrayForHighFreqNgramType" +void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken() +{ + this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN); + this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN); + + //initialize + for(int n=0;n<maxN;n++){ + this->typeFreq[n]=0; + this->tokenFreq[n]=0; + } + + + //scan the suffix array + this->scanSuffixArray('T'); + + //output + cout<<"n\tType\tToken\n"; + for(int i=0;i<this->maxN;i++){ + cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl; + } +} + +/** +* Allocate memory for count-of-counts table and scan the corpus to fill in count of counts +* memory will be freed in the destructor +**/ +void C_SuffixArrayScanningBase::constructCountOfCountsTable() +{ + if(this->countOfCountsTable!=0){ //if there is already a count of counts table + free(this->countOfCountsTable); + } + + this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered); + + if(this->countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. 
Exit\n"; + exit(0); + } + + for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){ + this->countOfCountsTable[c]=0; + } + + this->scanSuffixArray('C'); + + +} + +/** +* Scan through the indexed corpus and according to the action type, +* perform actions accordingly when seeing a new n-gram type +**/ +void C_SuffixArrayScanningBase::scanSuffixArray(char actionType) +{ + + int i,j; + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( + (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match + + this->nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + C_String tmpPhrase; //for output high freq n-grams + + //prepare the prefix of the n-grams + if(actionType=='H'){ + //common i-gram + for(j=0;j<=i-1;j++){ + if(this->nGramScanningList[j].vocId==0){ //one of the word in the common i-gram is a NULL word, not a valid n-gram + validNgramUpSoFar = false; + } + tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId)); + tmpPhrase.appending(C_String(" ")); + } + } + + + for(j=i;j<this->maxN;j++){ + + + if(this->nGramScanningList[j].vocId==0){ //a NULL 
word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + switch(actionType){ + + case 'C': //count of counts + freqSoFar = this->nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){ + //increase the count for (j+1)-gram with freq freqSoFar + this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++; + } + break; + + case 'H': //output high-freq n-grams + tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId)); + tmpPhrase.appending(C_String(" ")); + + if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){ + cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl; + } + break; + + case 'T': //type-token statistics + if(this->nGramScanningList[j].freqSoFar>0){ + typeFreq[j]++; + } + + tokenFreq[j]+=this->nGramScanningList[j].freqSoFar; + + break; + default: + cerr<<"Unknown action!\n"; + exit(-1); + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + this->nGramScanningList[j].freqSoFar = 0; + } + else{ + this->nGramScanningList[j].freqSoFar = 1; + } + + this->nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + C_String finalTmpString; //for output 
high-freq n-gram type + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(this->nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ + switch(actionType){ + case 'C': //for count-of-counts + freqSoFar = this->nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){ + //increase the count for (i+1)-gram with freq freqSoFar + this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++; + } + break; + + case 'H': //for high-freq n-gram types + finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId)); + finalTmpString.appending(C_String(" ")); + if(this->nGramScanningList[i].freqSoFar>this->nGramScanningList[i].freqThreshForOutput){ + cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl; + } + break; + + case 'T': //for type-token statistics + if(this->nGramScanningList[i].freqSoFar>0){ + typeFreq[i]++; + } + + tokenFreq[i]+=this->nGramScanningList[i].freqSoFar; + break; + + default: + cerr<<"Unknown action!\n"; + exit(-1); + } + } + } + +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~ new file mode 100755 index 0000000..fd8bae8 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.cpp~ @@ -0,0 +1,338 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include "_SuffixArrayScanningBase.h" +#include <iostream> +#include <cstring> + +using namespace std; + +C_SuffixArrayScanningBase::C_SuffixArrayScanningBase() +{ + this->countOfCountsTable = 0; //no memory has been allocated + this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough +} + +C_SuffixArrayScanningBase::C_SuffixArrayScanningBase(const char * filename, unsigned int 
maxN) +{ + this->countOfCountsTable = 0; //no memory has been allocated + this->maxFreqConsidered = 1000; //for freq >1000, no need to discount, MLE is good enough + + //load suffix array + this->loadData(filename, false, true, true); + + this->initializeForScanning(filename, maxN); +} + +void C_SuffixArrayScanningBase::setParam_maxFreqConsidered(int maxFreqConsidered) +{ + this->maxFreqConsidered = maxFreqConsidered; +} + + +/** +* Initialize data structure needed for scanning after the suffix array has been loaded +**/ +void C_SuffixArrayScanningBase::initializeForScanning(const char * filename, unsigned int maxN) +{ + this->maxN = maxN; + this->nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN); + this->countOfCountsTable = 0; //no memory has been allocated + + //initialize the scanning list + for(int i=0;i<this->maxN;i++){ + this->nGramScanningList[i].freqSoFar=0; + this->nGramScanningList[i].vocId = 0; + this->nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output + } + + //get vocID for sentEnd + this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_")); + + if(this->vocIdForSentEnd==0){ + cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_")); + if(this->vocIdForSentStart==0){ + cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n"; + exit(0); + } + + this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_")); + if(this->vocIdForCorpusEnd==0){ + cerr<<"VocID for _END_OF_CORPUS_ can not be found. 
Critical error.\n"; + exit(0); + } +} + +C_SuffixArrayScanningBase::~C_SuffixArrayScanningBase() +{ + free(this->nGramScanningList); + + if(this->countOfCountsTable!=0){ + free(this->countOfCountsTable); + } + +} + +void C_SuffixArrayScanningBase::setNgramOutputFreqThresh(int n, unsigned int freqThresh) +{ + if(n>this->maxN){ + cerr<<"Illegal operation.n="<<n<<" is greater than maxN="<<this->maxN<<endl; + exit(0); + } + + this->nGramScanningList[n-1].freqThreshForOutput = freqThresh; +} + +void C_SuffixArrayScanningBase::scanSuffixArrayForHighFreqNgramType() +{ + this->scanSuffixArray('H'); + +} + +/// Count of counts is the number of n-gram types that occur a certain times in the corpus. +/// Count of counts is important information in LM smoothing +/// We scan the corpus for n-gram's type/token frequency and collect information for 1-gram, 2-gram,...and up to maxFreqConsidered-gram +void C_SuffixArrayScanningBase::scanSuffixArrayForCountofCounts(int maxFreqConsidered) +{ + this->maxFreqConsidered = maxFreqConsidered; + this->constructCountOfCountsTable(); + + //output the count of counts + cout<<this->maxN<<"\t"<<maxFreqConsidered<<endl; + for(int i=0;i<this->maxN;i++){ + cout<<i+1<<endl; + + unsigned int * ccTableForThisN = this->countOfCountsTable + i*maxFreqConsidered; + for(int freq=0;freq<maxFreqConsidered;freq++){ + cout<<freq+1<<"\t"<<ccTableForThisN[freq]<<endl; + } + } + +} + +///Check from 1-gram to maxN-gram for type-token information +///the process is similar to "scanSuffixArrayForHighFreqNgramType" +void C_SuffixArrayScanningBase::scanSuffixArrayForTypeToken() +{ + this->typeFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN); + this->tokenFreq = (unsigned int *) malloc(sizeof(unsigned int)*maxN); + + //initialize + for(int n=0;n<maxN;n++){ + this->typeFreq[n]=0; + this->tokenFreq[n]=0; + } + + + //scan the suffix array + this->scanSuffixArray('T'); + + //output + cout<<"n\tType\tToken\n"; + for(int i=0;i<this->maxN;i++){ + 
cout<<i+1<<"\t"<<typeFreq[i]<<"\t"<<tokenFreq[i]<<endl; + } +} + +/** +* Allocate memory for count-of-counts table and scan the corpus to fill in count of counts +* memory will be freed in the destructor +**/ +void C_SuffixArrayScanningBase::constructCountOfCountsTable() +{ + if(this->countOfCountsTable!=0){ //if there is already a count of counts table + free(this->countOfCountsTable); + } + + this->countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqConsidered); + + if(this->countOfCountsTable==NULL){ + cerr<<"Count of counts table can not be initialized. Exit\n"; + exit(0); + } + + for(int c=0;c<this->maxN*this->maxFreqConsidered;c++){ + this->countOfCountsTable[c]=0; + } + + this->scanSuffixArray('C'); + + +} + +/** +* Scan through the indexed corpus and according to the action type, +* perform actions accordingly when seeing a new n-gram type +**/ +void C_SuffixArrayScanningBase::scanSuffixArray(char actionType) +{ + + int i,j; + bool stillMeaningful = true; + TextLenType saPos=0; + + while(stillMeaningful && ( saPos<this->corpusSize ) ){ + + TextLenType posInCorpus = this->suffix_list[saPos]; + IndexType wordInCorpus = this->corpus_list[posInCorpus]; + + if(wordInCorpus<this->sentIdStart){ //SA positions pointing to sentID are not interesting + + if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams start with <s> and </s>, or <end of corpus> are not interested + + bool quit =false; + i=0; + + while(!quit && (i<this->maxN)){ + wordInCorpus = this->corpus_list[posInCorpus+i]; + if( + (wordInCorpus<this->sentIdStart)&& + (wordInCorpus!=this->vocIdForSentEnd)&& + (wordInCorpus!=this->vocIdForSentStart)&& + (wordInCorpus==this->nGramScanningList[i].vocId)){ //still match + + this->nGramScanningList[i].freqSoFar++; + } + else{ //we will have new (i+1) and longer n-grams soon, before that check if we should increase the count of counts for n 
because of this n-gram type + + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + C_String tmpPhrase; //for output high freq n-grams + + //prepare the prefix of the n-grams + if(actionType=='H'){ + //common i-gram + for(j=0;j<=i-1;j++){ + if(this->nGramScanningList[j].vocId==0){ //one of the word in the common i-gram is a NULL word, not a valid n-gram + validNgramUpSoFar = false; + } + tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId)); + tmpPhrase.appending(C_String(" ")); + } + } + + + for(j=i;j<this->maxN;j++){ + + + if(this->nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ //perform actions depends on actionType + + switch(actionType){ + + case 'C': //count of counts + freqSoFar = this->nGramScanningList[j].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){ + //increase the count for (j+1)-gram with freq freqSoFar + this->countOfCountsTable[j*this->maxFreqConsidered+freqSoFar-1]++; + } + break; + + case 'H': //output high-freq n-grams + tmpPhrase.appending(this->voc->getText(this->nGramScanningList[j].vocId)); + tmpPhrase.appending(C_String(" ")); + + if(this->nGramScanningList[j].freqSoFar>=this->nGramScanningList[j].freqThreshForOutput){ + cout<<tmpPhrase.toString()<<"\t"<<this->nGramScanningList[j].freqSoFar<<endl; + } + break; + + case 'T': //type-token statistics + if(this->nGramScanningList[j].freqSoFar>0){ + typeFreq[j]++; + } + + tokenFreq[j]+=this->nGramScanningList[j].freqSoFar; + + break; + default: + cerr<<"Unknown action!\n"; + exit(-1); + } + } + + //finished output, now clear the list from point of i + if((posInCorpus+j)<this->corpusSize){ + wordInCorpus = this->corpus_list[posInCorpus+j]; + } + else{ + wordInCorpus = 0; //out of bound for corpus + } + + 
if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){ + wordInCorpus=0; //write 0 for <sentId>, <s> and </s> + this->nGramScanningList[j].freqSoFar = 0; + } + else{ + this->nGramScanningList[j].freqSoFar = 1; + } + + this->nGramScanningList[j].vocId = wordInCorpus; + } + + quit=true; //at i+1 gram, already not match, no need to check for longer + } + + i++; + } + } + } + else{ + stillMeaningful = false; //once vocID is getting larger/equal than sentIdStart, everything follows it are <sentId> and no actual text + } + + saPos++; + } + + //at the end of corpus (according to suffix order) + C_String finalTmpString; //for output high-freq n-gram type + bool validNgramUpSoFar = true; + unsigned int freqSoFar; + for(i=0;i<this->maxN;i++){ + if(this->nGramScanningList[i].vocId==0){ //invalide word + validNgramUpSoFar = false; + } + + if(validNgramUpSoFar){ + switch(actionType){ + case 'C': //for count-of-counts + freqSoFar = this->nGramScanningList[i].freqSoFar; + if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqConsidered) ){ + //increase the count for (i+1)-gram with freq freqSoFar + this->countOfCountsTable[i*this->maxFreqConsidered+freqSoFar-1]++; + } + break; + + case 'H': //for high-freq n-gram types + finalTmpString.appending(this->voc->getText(this->nGramScanningList[i].vocId)); + finalTmpString.appending(C_String(" ")); + if(this->nGramScanningList[i].freqSoFar>this->nGramScanningList[i].freqThreshForOutput){ + cout<<finalTmpString.toString()<<"\t"<<this->nGramScanningList[i].freqSoFar<<endl; + } + break; + + case 'T': //for type-token statistics + if(this->nGramScanningList[i].freqSoFar>0){ + typeFreq[i]++; + } + + tokenFreq[i]+=this->nGramScanningList[i].freqSoFar; + break; + + default: + cerr<<"Unknown action!\n"; + exit(-1); + } + } + } + +} diff --git a/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h 
b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h new file mode 100755 index 0000000..c517b72 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArrayScan/_SuffixArrayScanningBase.h @@ -0,0 +1,53 @@ +#if !defined (_HEADER_SUFFIX_ARRAY_SCANNING_BASE_CLASS_) +#define _HEADER_SUFFIX_ARRAY_SCANNING_BASE_CLASS_ + + +#include "_SuffixArrayApplicationBase.h" + + + + +/** +* \ingroup scan +* C_SuffixArrayScanningBase class provides functions to scan through an indexed corpus +* and output information such as the type/token frequency of the data +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_SuffixArrayScanningBase : public C_SuffixArrayApplicationBase +{ +public: + void setNgramOutputFreqThresh(int n, unsigned int freqThresh); + void scanSuffixArrayForHighFreqNgramType(); + void scanSuffixArrayForCountofCounts(int maxFreqConsidered); + void scanSuffixArrayForTypeToken(); + + C_SuffixArrayScanningBase(const char * filename, unsigned int maxN); + C_SuffixArrayScanningBase(); + ~C_SuffixArrayScanningBase(); + +protected: + void setParam_maxFreqConsidered(int maxFreqConsidered); + void constructCountOfCountsTable(); + void initializeForScanning(const char * filename, unsigned int maxN); + + int maxN; + int maxFreqConsidered; + + unsigned int * countOfCountsTable; + + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + +private: + void scanSuffixArray(char actionType); + + S_nGramScanningInfoElement * nGramScanningList; + + + unsigned int * typeFreq; + unsigned int * tokenFreq; +}; + +#endif diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp new file mode 100755 index 0000000..24b8cc4 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp @@ -0,0 +1,130 @@ 
+#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <fstream> +#include <vector> +#include <cstring> + +using namespace std; +int SHOW_DEBUG_INFO = 0; + +typedef struct s_ngram_freq_info{ + C_String ngramText; + vector<IndexType> ngram; + unsigned int freq; +}S_Ngram_Freq_Info; + +/** +* Given several corpora indexed by their suffix array, +* collect counts of n-grams in a list from all the corpora. +* This is useful when a corpus is very large, +* one can split the data into many chunks and sum up the n-gram frquencies. +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //check parameters + if(argc<2){ + cerr<<"\n-------------------------------------------"; + cerr<<"\nUsage:"; + cerr<<"\n\t"<<argv[0]<<" n-gram_list_filename < list of suffix arry used"; + cerr<<"\nNote:"; + cerr<<"\n\tn-gram_list_filename.id_voc must exist first."; + cerr<<"\n-------------------------------------------\n\n"; + + exit(0); + } + + //load vocabulary + char id_voc_filename[1024]; + sprintf(id_voc_filename, "%s.id_voc", argv[1]); + C_IDVocabulary voc(id_voc_filename); + + //load the n-gram list + vector<S_Ngram_Freq_Info> ngramList; + + ifstream NgramListFile; + NgramListFile.open(argv[1]); + char tmpString[4096]; + while(!NgramListFile.eof()){ + + NgramListFile.getline(tmpString, 4096, '\n'); + + if(strlen(tmpString)>0){ + S_Ngram_Freq_Info tmpNode; + tmpNode.ngramText = C_String(tmpString); + tmpNode.freq = 1; + tmpNode.ngram.clear(); + + //conver the n-gram as string to vocId + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + int pos = 0; + int inputLen = strlen(tmpString); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = tmpString[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = '\0'; + 
tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken))); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken))); + } + + ngramList.push_back(tmpNode); + } + tmpString[0]='\0'; + } + cerr<<"Total "<<ngramList.size()<<" ngrams loaded.\n"; + + //loop over all suffix array and collec the n-gram counts + char sa_filename[1024]; + while(! cin.eof()){ + cin>>sa_filename; + + if(strlen(sa_filename)>0){ + cerr<<"Considering "<<sa_filename<<endl; + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(sa_filename, true, true); + + for(int i=0; i<ngramList.size(); i++){ + unsigned int freq; + + freq = sa.freqOfExactPhraseMatch(ngramList[i].ngram); + + ngramList[i].freq+=freq; + } + } + + sa_filename[0]=0; + } + + + for(int m=0;m<ngramList.size();m++){ + cout<<ngramList[m].freq<<"\t"; + cout<<ngramList[m].ngramText.toString()<<"\n"; + } + + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~ new file mode 100755 index 0000000..492b770 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/CollectNgramFreqCount.cpp~ @@ -0,0 +1,129 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <fstream> +#include <vector> + +using namespace std; +int SHOW_DEBUG_INFO = 0; + +typedef struct s_ngram_freq_info{ + C_String ngramText; + vector<IndexType> ngram; + unsigned int freq; +}S_Ngram_Freq_Info; + +/** +* Given several corpora indexed by their suffix array, +* collect counts of n-grams in a list from all the corpora. 
+* This is useful when a corpus is very large, +* one can split the data into many chunks and sum up the n-gram frquencies. +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //check parameters + if(argc<2){ + cerr<<"\n-------------------------------------------"; + cerr<<"\nUsage:"; + cerr<<"\n\t"<<argv[0]<<" n-gram_list_filename < list of suffix arry used"; + cerr<<"\nNote:"; + cerr<<"\n\tn-gram_list_filename.id_voc must exist first."; + cerr<<"\n-------------------------------------------\n\n"; + + exit(0); + } + + //load vocabulary + char id_voc_filename[1024]; + sprintf(id_voc_filename, "%s.id_voc", argv[1]); + C_IDVocabulary voc(id_voc_filename); + + //load the n-gram list + vector<S_Ngram_Freq_Info> ngramList; + + ifstream NgramListFile; + NgramListFile.open(argv[1]); + char tmpString[4096]; + while(!NgramListFile.eof()){ + + NgramListFile.getline(tmpString, 4096, '\n'); + + if(strlen(tmpString)>0){ + S_Ngram_Freq_Info tmpNode; + tmpNode.ngramText = C_String(tmpString); + tmpNode.freq = 1; + tmpNode.ngram.clear(); + + //conver the n-gram as string to vocId + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + int pos = 0; + int inputLen = strlen(tmpString); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = tmpString[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = '\0'; + tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken))); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. 
Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + tmpNode.ngram.push_back(voc.returnId(C_String(tmpToken))); + } + + ngramList.push_back(tmpNode); + } + tmpString[0]='\0'; + } + cerr<<"Total "<<ngramList.size()<<" ngrams loaded.\n"; + + //loop over all suffix array and collec the n-gram counts + char sa_filename[1024]; + while(! cin.eof()){ + cin>>sa_filename; + + if(strlen(sa_filename)>0){ + cerr<<"Considering "<<sa_filename<<endl; + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(sa_filename, true, true); + + for(int i=0; i<ngramList.size(); i++){ + unsigned int freq; + + freq = sa.freqOfExactPhraseMatch(ngramList[i].ngram); + + ngramList[i].freq+=freq; + } + } + + sa_filename[0]=0; + } + + + for(int m=0;m<ngramList.size();m++){ + cout<<ngramList[m].freq<<"\t"; + cout<<ngramList[m].ngramText.toString()<<"\n"; + } + + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp new file mode 100755 index 0000000..9d47f3a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp @@ -0,0 +1,72 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <map> +#include <cstring> + +using namespace std; + +/** +* Given a corpus indexed by its suffix array, filter out the duplicated sentences in the data +* and output the unique sentences within. 
+* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem < original corpus > corpus with uniq sentences\n",argv[0]); + + exit(0); + } + + map< pair<TextLenType, int>, bool> duplicatedSentAlreadyOutput; + map< pair<TextLenType, int>, bool>::iterator iterDuplicatedSentAlreadyOutput; + + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(argv[1], false, true); + + unsigned long totalFilteredSent = 0; + + cerr<<"Filtering duplicated sentences:\n"; + char tmpString[4000]; + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + TextLenType freq = 0; + TextLenType firstOccurrence; + int sentLen; + + freq = sa.freqOfExactPhraseMatchAndFirstOccurrence(tmpString, firstOccurrence, sentLen); + + if(freq>1){ //freq is at least 1, because this is the same corpus + //then there are multiple occurrences of this sentence + //check if we have already output it + iterDuplicatedSentAlreadyOutput = duplicatedSentAlreadyOutput.find(make_pair(firstOccurrence, sentLen)); + + if(iterDuplicatedSentAlreadyOutput == duplicatedSentAlreadyOutput.end()){ //we haven't output it + cout<<tmpString<<endl; + duplicatedSentAlreadyOutput.insert(make_pair(make_pair(firstOccurrence, sentLen), true)); + } + else{ + //it has been output already, ignore it + totalFilteredSent++; + } + } + else{ //freq==1, no duplication + cout<<tmpString<<endl; + } + + } + } + + cerr<<"Total "<<totalFilteredSent<<" duplicated sentences are filtered\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~ new file mode 100755 index 0000000..1278b3f 
--- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FilterDuplicatedSentences.cpp~ @@ -0,0 +1,71 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <map> + +using namespace std; + +/** +* Given a corpus indexed by its suffix array, filter out the duplicated sentences in the data +* and output the unique sentences within. +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem < original corpus > corpus with uniq sentences\n",argv[0]); + + exit(0); + } + + map< pair<TextLenType, int>, bool> duplicatedSentAlreadyOutput; + map< pair<TextLenType, int>, bool>::iterator iterDuplicatedSentAlreadyOutput; + + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(argv[1], false, true); + + unsigned long totalFilteredSent = 0; + + cerr<<"Filtering duplicated sentences:\n"; + char tmpString[4000]; + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + TextLenType freq = 0; + TextLenType firstOccurrence; + int sentLen; + + freq = sa.freqOfExactPhraseMatchAndFirstOccurrence(tmpString, firstOccurrence, sentLen); + + if(freq>1){ //freq is at least 1, because this is the same corpus + //then there are multiple occurrences of this sentence + //check if we have already output it + iterDuplicatedSentAlreadyOutput = duplicatedSentAlreadyOutput.find(make_pair(firstOccurrence, sentLen)); + + if(iterDuplicatedSentAlreadyOutput == duplicatedSentAlreadyOutput.end()){ //we haven't output it + cout<<tmpString<<endl; + duplicatedSentAlreadyOutput.insert(make_pair(make_pair(firstOccurrence, sentLen), true)); + } + else{ + //it has been output already, ignore it 
+ totalFilteredSent++; + } + } + else{ //freq==1, no duplication + cout<<tmpString<<endl; + } + + } + } + + cerr<<"Total "<<totalFilteredSent<<" duplicated sentences are filtered\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp new file mode 100755 index 0000000..3daf337 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp @@ -0,0 +1,47 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <cstring> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + + +/** +* Application main functionL ExactNgramMatchingFreq +* Input from stdin ngrams with each line containing one n-gram +* Search the corpus for the occurrences of each n-gram and output their frequencies in the corpus +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem \n",argv[0]); + + exit(0); + } + + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(argv[1], false, true); //we need vocabulary, but do not need offset information here + + cerr<<"Input N-grams:\n"; + char tmpString[1000]; + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + TextLenType freq = 0; + freq = sa.freqOfExactPhraseMatch(tmpString); + cout<<freq<<": "<<tmpString<<endl; + } + } + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~ new file mode 100755 index 0000000..4c63c0b --- /dev/null +++ 
b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/FrequencyOfNgrams.cpp~ @@ -0,0 +1,46 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + + +/** +* Application main functionL ExactNgramMatchingFreq +* Input from stdin ngrams with each line containing one n-gram +* Search the corpus for the occurrences of each n-gram and output their frequencies in the corpus +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s fileNameStem \n",argv[0]); + + exit(0); + } + + + C_SuffixArraySearchApplicationBase sa; + sa.loadData_forSearch(argv[1], false, true); //we need vocabulary, but do not need offset information here + + cerr<<"Input N-grams:\n"; + char tmpString[1000]; + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + TextLenType freq = 0; + freq = sa.freqOfExactPhraseMatch(tmpString); + cout<<freq<<": "<<tmpString<<endl; + } + } + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp new file mode 100755 index 0000000..421e503 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp @@ -0,0 +1,85 @@ +#include "stdio.h" +#include "stdlib.h" +#include <vector> +#include <iostream> +#include <cstring> +#include "_SuffixArraySearchApplicationBase.h" + +using namespace std; + + +/** +* Return locations of all the embedded n-grams of a sentence in the indexed corpus +* +* Revison $Rev: 3794 $ +* Last modified: 
$LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + + //----------------------------------------------------------------------------- + //check arguments + if(argc<2){ + fprintf(stderr,"\n\nOutput locations of all the matched embedded n-grams of a sentence in an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem [highestFreq maxRet smallestUnit longestUnit] < list of sentences\n\n",argv[0]); + + exit(-1); + } + + + int highFreq; + int maxRet; + int smallestUnit; + int longestUnit; + + C_SuffixArraySearchApplicationBase saObj; + + saObj.loadData_forSearch(argv[1], false, false); + + if(argc>=6){ //if argument of highestFreq, maxRet, smallestUnits are set + highFreq = atoi(argv[2]); + maxRet = atoi(argv[3]); + smallestUnit = atoi(argv[4]); + longestUnit = atoi(argv[5]); + + saObj.setParam_highestFreqThresholdForReport(highFreq); + saObj.setParam_reportMaxOccurrenceOfOneNgram(maxRet); + saObj.setParam_shortestUnitToReport(smallestUnit); + saObj.setParam_longestUnitToReport(longestUnit); + } + + cerr<<"Input sentences:\n"; + + char sentence[10000]; + + while(!cin.eof()){ + cin.getline(sentence,10000,'\n'); + if(strlen(sentence)>0){ + + vector<C_String> sentAsCStringVector = saObj.convertCharStringToCStringVector(sentence); //for later display purpose + + + vector<S_phraseLocationElement> locations; + locations = saObj.findPhrasesInASentence(sentence); + + if(locations.size()==0){ + cout<<"Nothing can be found in the corpus.\n"; + } + else{ + for(int i=0;i<locations.size(); i++){ + cout<<"N-gram ["<<(int)locations[i].posStartInSrcSent<<", "<<(int)locations[i].posEndInSrcSent<<"]: "; + for(int j=locations[i].posStartInSrcSent; j<=locations[i].posEndInSrcSent; j++){ + cout<<sentAsCStringVector[j-1].toString()<<" "; + } + cout<<" found in corpus: "; + cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl; + } + } + cout<<endl; + } + 
} + + + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~ new file mode 100755 index 0000000..cd7a86a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateEmbeddedNgramsInCorpus.cpp~ @@ -0,0 +1,84 @@ +#include "stdio.h" +#include "stdlib.h" +#include <vector> +#include <iostream> +#include "_SuffixArraySearchApplicationBase.h" + +using namespace std; + + +/** +* Return locations of all the embedded n-grams of a sentence in the indexed corpus +* +* Revison $Rev: 3794 $ +* Last modified: $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + + //----------------------------------------------------------------------------- + //check arguments + if(argc<2){ + fprintf(stderr,"\n\nOutput locations of all the matched embedded n-grams of a sentence in an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem [highestFreq maxRet smallestUnit longestUnit] < list of sentences\n\n",argv[0]); + + exit(-1); + } + + + int highFreq; + int maxRet; + int smallestUnit; + int longestUnit; + + C_SuffixArraySearchApplicationBase saObj; + + saObj.loadData_forSearch(argv[1], false, false); + + if(argc>=6){ //if argument of highestFreq, maxRet, smallestUnits are set + highFreq = atoi(argv[2]); + maxRet = atoi(argv[3]); + smallestUnit = atoi(argv[4]); + longestUnit = atoi(argv[5]); + + saObj.setParam_highestFreqThresholdForReport(highFreq); + saObj.setParam_reportMaxOccurrenceOfOneNgram(maxRet); + saObj.setParam_shortestUnitToReport(smallestUnit); + saObj.setParam_longestUnitToReport(longestUnit); + } + + cerr<<"Input sentences:\n"; + + char sentence[10000]; + + while(!cin.eof()){ + cin.getline(sentence,10000,'\n'); + if(strlen(sentence)>0){ + + vector<C_String> sentAsCStringVector = 
saObj.convertCharStringToCStringVector(sentence); //for later display purpose + + + vector<S_phraseLocationElement> locations; + locations = saObj.findPhrasesInASentence(sentence); + + if(locations.size()==0){ + cout<<"Nothing can be found in the corpus.\n"; + } + else{ + for(int i=0;i<locations.size(); i++){ + cout<<"N-gram ["<<(int)locations[i].posStartInSrcSent<<", "<<(int)locations[i].posEndInSrcSent<<"]: "; + for(int j=locations[i].posStartInSrcSent; j<=locations[i].posEndInSrcSent; j++){ + cout<<sentAsCStringVector[j-1].toString()<<" "; + } + cout<<" found in corpus: "; + cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl; + } + } + cout<<endl; + } + } + + + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp new file mode 100755 index 0000000..deb8b81 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp @@ -0,0 +1,67 @@ +#include "stdio.h" +#include "stdlib.h" + +#include "_SuffixArraySearchApplicationBase.h" + +#include <vector> +#include <iostream> +#include <cstring> + +using namespace std; + +/** +* \ingroup search +* +* Locate an n-gram in the indexed corpus, return its locations as <sentId, offsetInSent> pairs +* SentID and offset are all 1-based +* +* Note: +* The offset of the n-gram in a sentence is represented as "char" in the returned structure S_SimplePhraseLocationElement +* To output it as a number, one needs to cast it to integer type for proper display +* +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + //----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nOutput all the locations of an n-gram in an indexed corpus\n"); + 
fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < list of n-grams\n\n",argv[0]); + + exit(-1); + } + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase saObj; + + //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false) + saObj.loadData_forSearch(argv[1], false, false); + + + cerr<<"Input N-grams:\n"; + char tmpString[10000]; + while(!cin.eof()){ + cin.getline(tmpString,10000,'\n'); + if(strlen(tmpString)>0){ + vector<S_SimplePhraseLocationElement> locations; + + locations = saObj.locateExactPhraseInCorpus(tmpString); + + if(locations.size()==0){ + cout<<"No occurrences found.\n"; + } + else{ + for(int i=0;i<locations.size(); i++){ + cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl; + } + } + cout<<endl; + } + } + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~ new file mode 100755 index 0000000..71097f9 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/LocateNgramInCorpus.cpp~ @@ -0,0 +1,66 @@ +#include "stdio.h" +#include "stdlib.h" + +#include "_SuffixArraySearchApplicationBase.h" + +#include <vector> +#include <iostream> + +using namespace std; + +/** +* \ingroup search +* +* Locate an n-gram in the indexed corpus, return its locations as <sentId, offsetInSent> pairs +* SentID and offset are all 1-based +* +* Note: +* The offset of the n-gram in a sentence is represented as "char" in the returned structure S_SimplePhraseLocationElement +* To output it as a number, one needs to cast it to integer type for proper display +* +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + 
//----------------------------------------------------------------------------- + //check parameter + if(argc<2){ + fprintf(stderr,"\nOutput all the locations of an n-gram in an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < list of n-grams\n\n",argv[0]); + + exit(-1); + } + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase saObj; + + //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false) + saObj.loadData_forSearch(argv[1], false, false); + + + cerr<<"Input N-grams:\n"; + char tmpString[10000]; + while(!cin.eof()){ + cin.getline(tmpString,10000,'\n'); + if(strlen(tmpString)>0){ + vector<S_SimplePhraseLocationElement> locations; + + locations = saObj.locateExactPhraseInCorpus(tmpString); + + if(locations.size()==0){ + cout<<"No occurrences found.\n"; + } + else{ + for(int i=0;i<locations.size(); i++){ + cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl; + } + } + cout<<endl; + } + } + + return 0; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp new file mode 100755 index 0000000..e614fdc --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp @@ -0,0 +1,132 @@ +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include <vector> +#include <cstring> + +#include "_SuffixArraySearchApplicationBase.h" + + +#include <time.h> +#include <stdio.h> +#include <map> + +using namespace std; + +/** +* Given the indexed training corpus, analyze the token/type matching ratio of the n-grams in the testing data. 
+* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + fprintf(stderr,"\nOutput the n-gram matching statistics of a testing data given an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + map<int, pair<int, unsigned long> > results4OneSent; + map<int, pair<int, unsigned long> >::iterator iterResult; + + vector<int> nGramTokenCountsInTest; + vector<int> nGramInTestMatched; + vector<double> nGramFreqInTrainMatched; + + int maxSentLen = 4086; + nGramTokenCountsInTest.reserve(maxSentLen); + nGramInTestMatched.reserve(maxSentLen); + nGramFreqInTrainMatched.reserve(maxSentLen); + + //initialize + for(int i=0;i<maxSentLen;i++){ + nGramTokenCountsInTest.push_back(0); + nGramInTestMatched.push_back(0); + nGramFreqInTrainMatched.push_back(0); + } + + char fileName[1000]; + char tmpString[10000]; + + strcpy(fileName, argv[1]); + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(fileName, false, true); + + fprintf(stderr,"Input sentences:\n"); + + long ltime1, ltime2; + + time( <ime1 ); + + int totalSentences = 0; + int matchedSentences = 0; + while(!cin.eof()){ + int sentLen; + cin.getline(tmpString,10000,'\n'); + + if(strlen(tmpString)>0){ + + totalSentences++; + + results4OneSent.clear(); + results4OneSent = SA.returnNGramMatchingStatForOneSent(tmpString, sentLen); + + if(sentLen>maxSentLen){ + cerr<<"Sentence too long, we can not handle it! 
Exit.\n"; + exit(0); + } + + for(int j=1;j<=sentLen;j++){ //j-gram + nGramTokenCountsInTest[j]+=(sentLen-j+1); //number of j-grams in the sentence; + } + + iterResult=results4OneSent.begin(); + while(iterResult!=results4OneSent.end()){ + + nGramInTestMatched[iterResult->first]+=iterResult->second.first; + nGramFreqInTrainMatched[iterResult->first]+=iterResult->second.second; + + if(iterResult->first==sentLen){ //a complete match + matchedSentences++; + } + + iterResult++; + } + } + + tmpString[0]=0; + + } + + int n = 1; + while(nGramInTestMatched[n]!=0){ + int matched = nGramInTestMatched[n]; + int totalInTest = nGramTokenCountsInTest[n]; + cout<<"N="<<n<<":\t"<<matched<<" / "<<totalInTest<<"\t"; + printf("%.1f\t", double(matched)/double(totalInTest)*100.0); + cout<<"OccInTrain= "<<nGramFreqInTrainMatched[n]<<endl; + + n++; + } + + cout<<"\nOut of "<<totalSentences<<" input sentences, "<<matchedSentences<<" can be found in the training data.\n";; + time( <ime2 ); + cout<<"Time cost:"<<ltime2-ltime2<<" seconds\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~ new file mode 100755 index 0000000..d33d3a9 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NGramMatchingStat4TestSet.cpp~ @@ -0,0 +1,131 @@ +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include <vector> + +#include "_SuffixArraySearchApplicationBase.h" + + +#include <time.h> +#include <stdio.h> +#include <map> + +using namespace std; + +/** +* Given the indexed training corpus, analyze the token/type matching ratio of the n-grams in the testing data. 
+* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + fprintf(stderr,"\nOutput the n-gram matching statistics of a testing data given an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + map<int, pair<int, unsigned long> > results4OneSent; + map<int, pair<int, unsigned long> >::iterator iterResult; + + vector<int> nGramTokenCountsInTest; + vector<int> nGramInTestMatched; + vector<double> nGramFreqInTrainMatched; + + int maxSentLen = 4086; + nGramTokenCountsInTest.reserve(maxSentLen); + nGramInTestMatched.reserve(maxSentLen); + nGramFreqInTrainMatched.reserve(maxSentLen); + + //initialize + for(int i=0;i<maxSentLen;i++){ + nGramTokenCountsInTest.push_back(0); + nGramInTestMatched.push_back(0); + nGramFreqInTrainMatched.push_back(0); + } + + char fileName[1000]; + char tmpString[10000]; + + strcpy(fileName, argv[1]); + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(fileName, false, true); + + fprintf(stderr,"Input sentences:\n"); + + long ltime1, ltime2; + + time( <ime1 ); + + int totalSentences = 0; + int matchedSentences = 0; + while(!cin.eof()){ + int sentLen; + cin.getline(tmpString,10000,'\n'); + + if(strlen(tmpString)>0){ + + totalSentences++; + + results4OneSent.clear(); + results4OneSent = SA.returnNGramMatchingStatForOneSent(tmpString, sentLen); + + if(sentLen>maxSentLen){ + cerr<<"Sentence too long, we can not handle it! 
Exit.\n"; + exit(0); + } + + for(int j=1;j<=sentLen;j++){ //j-gram + nGramTokenCountsInTest[j]+=(sentLen-j+1); //number of j-grams in the sentence; + } + + iterResult=results4OneSent.begin(); + while(iterResult!=results4OneSent.end()){ + + nGramInTestMatched[iterResult->first]+=iterResult->second.first; + nGramFreqInTrainMatched[iterResult->first]+=iterResult->second.second; + + if(iterResult->first==sentLen){ //a complete match + matchedSentences++; + } + + iterResult++; + } + } + + tmpString[0]=0; + + } + + int n = 1; + while(nGramInTestMatched[n]!=0){ + int matched = nGramInTestMatched[n]; + int totalInTest = nGramTokenCountsInTest[n]; + cout<<"N="<<n<<":\t"<<matched<<" / "<<totalInTest<<"\t"; + printf("%.1f\t", double(matched)/double(totalInTest)*100.0); + cout<<"OccInTrain= "<<nGramFreqInTrainMatched[n]<<endl; + + n++; + } + + cout<<"\nOut of "<<totalSentences<<" input sentences, "<<matchedSentences<<" can be found in the training data.\n";; + time( <ime2 ); + cout<<"Time cost:"<<ltime2-ltime2<<" seconds\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp new file mode 100755 index 0000000..ca12119 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp @@ -0,0 +1,50 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <cstring> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + +/** +* Given a corpus indexed by its suffix array, input a sentence from STDIN and output the frequencies of its embedded n-grams in the corpus. 
+* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + char tmpString[1000]; + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(argv[1], false, true); + + fprintf(stderr,"Input Sentences:\n"); + + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + SA.displayNgramMatchingFreq4Sent(tmpString); + } + } + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~ new file mode 100755 index 0000000..5e2433b --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreq4Sent.cpp~ @@ -0,0 +1,49 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + +/** +* Given a corpus indexed by its suffix array, input a sentence from STDIN and output the frequencies of its embedded n-grams in the corpus. 
+* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + char tmpString[1000]; + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(argv[1], false, true); + + fprintf(stderr,"Input Sentences:\n"); + + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + SA.displayNgramMatchingFreq4Sent(tmpString); + } + } + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp new file mode 100755 index 0000000..544a230 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp @@ -0,0 +1,144 @@ +#include "stdio.h" +#include "stdlib.h" +#include "float.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <cstring> + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + +///Given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n> +///startingPosInSrcSent starts at 0, n is the n-gram length +void local_oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n) +{ + n = index / sentLen + 1; + posInSrcSent = index % sentLen; +} + +///Given the starting position in src sentence and the length of the n-gram +///calculate the index in the table +///posInSent starts at 0, 
n is the actual len of n-gram, starts at 1 +unsigned int local_twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen) +{ + unsigned int indexInTable = (n-1)*sentLen + posInSent; + + return indexInTable; +} + +/** +* Given a corpus indexed by its suffix array +* calcuate the non-compositionalities of the embedded n-grams in a testing sentence +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + char tmpString[1000]; + double bigN = 1000000; + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(argv[1], false, true); + + fprintf(stderr,"Input Sentences:\n"); + + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + + SA.displayNgramMatchingFreq4Sent(tmpString); + + printf("\n"); + + int sentLen; + + S_sentSearchTableElement * matchingTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen); + + //convert this to frequency table + double * freqTable = (double *) malloc (sizeof(double)*sentLen*sentLen); + + for(unsigned int i=0;i<(sentLen*sentLen);i++){ + //all the short n-grams should all exist and their frequency information should be in table now + unsigned int startPos, n; + double minNc; + int leftNWithMinNc; + + local_oneDimensionTableIndexToTwoDimension(i, sentLen, startPos, n); + + if(matchingTable[i].found){ + double freq = matchingTable[i].endingPosInSA - matchingTable[i].startPosInSA +1; + freqTable[i]=freq; + + + + //consider all splitting method + minNc = DBL_MAX; + + for(unsigned int 
leftN=1;leftN<n;leftN++){ + int index_left = local_twoDimensionIndexToOneDimensionTableIndex(startPos, leftN, sentLen); + int index_right = local_twoDimensionIndexToOneDimensionTableIndex(startPos+leftN, n-leftN, sentLen); + + double leftFreq = freqTable[index_left]; + double rightFreq = freqTable[index_right]; + + double nc = freq*bigN/(leftFreq*rightFreq); + + if(nc<minNc){ + minNc = nc; + leftNWithMinNc = leftN; + } + + } + } + else{ + freqTable[i]=0; + minNc = 0; + } + + if(startPos==0){ + printf("\n%d\t",n); + } + + if(n==1){ + printf("A\t"); //atom word, no way to break it + } + else{ + if(minNc>0){ + printf("%.1f[%d]\t", minNc, leftNWithMinNc); + } + else{ + printf("_\t"); + } + } + } + + printf("\n"); + + + free(matchingTable); + free(freqTable); + + + } + } + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~ new file mode 100755 index 0000000..294724e --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramMatchingFreqAndNonCompositionality4Sent.cpp~ @@ -0,0 +1,145 @@ +#include "stdio.h" +#include "stdlib.h" +#include "float.h" +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <vector> +#include <cstring> + + +using namespace std; + +int SHOW_DEBUG_INFO = 0; + +///Given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n> +///startingPosInSrcSent starts at 0, n is the n-gram length +void local_oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n) +{ + n = index / sentLen + 1; + posInSrcSent = index % sentLen; +} + +///Given the starting position in src sentence and the length of the n-gram +///calculate the index in the table +///posInSent starts at 0, n is the actual len of n-gram, starts at 1 
+unsigned int local_twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen) +{ + unsigned int indexInTable = (n-1)*sentLen + posInSent; + + return indexInTable; +} + +/** +* Given a corpus indexed by its suffix array +* calcuate the non-compositionalities of the embedded n-grams in a testing sentence +* +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing sentences\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + char tmpString[1000]; + double bigN = 1000000; + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(argv[1], false, true); + + fprintf(stderr,"Input Sentences:\n"); + + while(!cin.eof()){ + cin.getline(tmpString,100000,'\n'); + if(strlen(tmpString)>0){ + + SA.displayNgramMatchingFreq4Sent(tmpString); + + printf("\n"); + + int sentLen; + + S_sentSearchTableElement * matchingTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen); + + //convert this to frequency table + double * freqTable = (double *) malloc (sizeof(double)*sentLen*sentLen); + + for(unsigned int i=0;i<(sentLen*sentLen);i++){ + //all the short n-grams should all exist and their frequency information should be in table now + unsigned int startPos, n; + double minNc; + int leftNWithMinNc; + + local_oneDimensionTableIndexToTwoDimension(i, sentLen, startPos, n); + + if(matchingTable[i].found){ + double freq = matchingTable[i].endingPosInSA - matchingTable[i].startPosInSA +1; + freqTable[i]=freq; + + + + //consider all splitting method + minNc = DBL_MAX; + + for(unsigned int leftN=1;leftN<n;leftN++){ + int index_left 
= local_twoDimensionIndexToOneDimensionTableIndex(startPos, leftN, sentLen); + int index_right = local_twoDimensionIndexToOneDimensionTableIndex(startPos+leftN, n-leftN, sentLen); + + double leftFreq = freqTable[index_left]; + double rightFreq = freqTable[index_right]; + + double nc = freq*bigN/(leftFreq*rightFreq); + + if(nc<minNc){ + minNc = nc; + leftNWithMinNc = leftN; + } + + } + } + else{ + freqTable[i]=0; + minNc = 0; + } + + if(startPos==0){ + printf("\n%d\t",n); + } + + if(n==1){ + printf("A\t"); //atom word, no way to break it + } + else{ + if(minNc>0){ + printf("%.1f[%d]\t", minNc, leftNWithMinNc); + } + else{ + printf("_\t"); + } + } + } + + printf("\n"); + + + free(matchingTable); + free(freqTable); + + + } + } + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp new file mode 100755 index 0000000..9697f4a --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp @@ -0,0 +1,178 @@ +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include <vector> + +#include "_String.h" +#include "_SuffixArraySearchApplicationBase.h" + +#include <time.h> +#include <stdio.h> +#include <map> +#include <cstring> + +using namespace std; + + +vector<C_String> convertTextToStringVector(const char * sentText) +{ + + vector<C_String> sentAsStringVect; + + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + + int pos = 0; + + int inputLen = strlen(sentText); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = sentText[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = '\0'; + sentAsStringVect.push_back(C_String(tmpToken)); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = 
thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + sentAsStringVect.push_back(C_String(tmpToken)); + } + + return sentAsStringVect; +} + +/** +* \ingroup search +* +* Given the training corpus indexed by its suffix array, +* output all the n-grams in a testing data that can be found in the training corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + fprintf(stderr,"\nOutput the matched n-gram types a testing data set given an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + map<C_String, double> matchedNgrams; + map<C_String, double>::iterator iterMatchedNgrams; + + + int maxSentLen = 4086; + + + char fileName[1000]; + char tmpString[10000]; + + strcpy(fileName, argv[1]); + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(fileName, false, true); + + cerr<<"Input sentences:\n"; + + long ltime1, ltime2; + + time( <ime1 ); + + int totalSentences = 0; + int matchedSentences = 0; + while(!cin.eof()){ + cin.getline(tmpString,10000,'\n'); + + if(strlen(tmpString)>0){ + vector<C_String> sentAsStringVector = convertTextToStringVector(tmpString); + + int sentLen; + S_sentSearchTableElement * freqTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen); + + if(sentLen!=sentAsStringVector.size()){ + cerr<<"Something wrong, can not proceed.!\n"; + exit(-1); + } + + + //go over the frequency table + for(int startPos = 0; 
startPos<sentLen; startPos++){ + C_String ngram; + bool stillMatching = true; + int n=1; + while(stillMatching & (n<=(sentLen-startPos)) ){ + + ngram.appending(sentAsStringVector[startPos+n-1]); + + int posInFreqTable = (n-1)*sentLen+startPos; + if(freqTable[posInFreqTable].found){ + double frequency = freqTable[posInFreqTable].endingPosInSA - freqTable[posInFreqTable].startPosInSA + 1; + + iterMatchedNgrams = matchedNgrams.find(ngram); + if(iterMatchedNgrams!=matchedNgrams.end()){ //exist already + iterMatchedNgrams->second=frequency; //frequency is not meaningful in this case, just use it because map need some values to be mapped to + } + else{ + matchedNgrams.insert(make_pair(ngram, frequency)); + } + } + else{ + stillMatching = false; + } + + + ngram.appending(C_String(" ")); + + n++; + } + } + + } + + tmpString[0]=0; + + } + + + //now output all the n-grams + iterMatchedNgrams = matchedNgrams.begin(); + while(iterMatchedNgrams != matchedNgrams.end()){ + cout<<(iterMatchedNgrams->first).toString()<<endl; + + iterMatchedNgrams++; + } + + + time( <ime2 ); + cerr<<"Time spent:"<<ltime2-ltime2<<" seconds\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~ new file mode 100755 index 0000000..5418db6 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/Applications/NgramTypeInTestSetMatchedInCorpus.cpp~ @@ -0,0 +1,177 @@ +#include "stdio.h" +#include "stdlib.h" + +#include <string> +#include <iostream> +#include <fstream> +#include <vector> + +#include "_String.h" +#include "_SuffixArraySearchApplicationBase.h" + +#include <time.h> +#include <stdio.h> +#include <map> + +using namespace std; + + +vector<C_String> convertTextToStringVector(const char * sentText) +{ + + vector<C_String> sentAsStringVect; + + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + + int 
pos = 0; + + int inputLen = strlen(sentText); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = sentText[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = '\0'; + sentAsStringVect.push_back(C_String(tmpToken)); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + sentAsStringVect.push_back(C_String(tmpToken)); + } + + return sentAsStringVect; +} + +/** +* \ingroup search +* +* Given the training corpus indexed by its suffix array, +* output all the n-grams in a testing data that can be found in the training corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char* argv[]){ + //----------------------------------------------------------------------------- + //check parameter + + + if(argc<2){ + fprintf(stderr,"\nOutput the matched n-gram types a testing data set given an indexed corpus\n"); + fprintf(stderr,"\nUsage:\n"); + fprintf(stderr,"\n%s corpusFileNameStem < testing data\n\n",argv[0]); + + exit(0); + } + + + //----------------------------------------------------------------------------- + + C_SuffixArraySearchApplicationBase SA; + + map<C_String, double> matchedNgrams; + map<C_String, double>::iterator iterMatchedNgrams; + + + int maxSentLen = 4086; + + + char fileName[1000]; + char tmpString[10000]; + + strcpy(fileName, argv[1]); + + fprintf(stderr,"Loading data...\n"); + SA.loadData_forSearch(fileName, false, true); + + cerr<<"Input sentences:\n"; + + long ltime1, ltime2; + + time( <ime1 ); + + int totalSentences = 0; + int matchedSentences = 0; + while(!cin.eof()){ + cin.getline(tmpString,10000,'\n'); + + if(strlen(tmpString)>0){ + 
vector<C_String> sentAsStringVector = convertTextToStringVector(tmpString); + + int sentLen; + S_sentSearchTableElement * freqTable = SA.constructNgramSearchTable4SentWithLCP(tmpString, sentLen); + + if(sentLen!=sentAsStringVector.size()){ + cerr<<"Something wrong, can not proceed.!\n"; + exit(-1); + } + + + //go over the frequency table + for(int startPos = 0; startPos<sentLen; startPos++){ + C_String ngram; + bool stillMatching = true; + int n=1; + while(stillMatching & (n<=(sentLen-startPos)) ){ + + ngram.appending(sentAsStringVector[startPos+n-1]); + + int posInFreqTable = (n-1)*sentLen+startPos; + if(freqTable[posInFreqTable].found){ + double frequency = freqTable[posInFreqTable].endingPosInSA - freqTable[posInFreqTable].startPosInSA + 1; + + iterMatchedNgrams = matchedNgrams.find(ngram); + if(iterMatchedNgrams!=matchedNgrams.end()){ //exist already + iterMatchedNgrams->second=frequency; //frequency is not meaningful in this case, just use it because map need some values to be mapped to + } + else{ + matchedNgrams.insert(make_pair(ngram, frequency)); + } + } + else{ + stillMatching = false; + } + + + ngram.appending(C_String(" ")); + + n++; + } + } + + } + + tmpString[0]=0; + + } + + + //now output all the n-grams + iterMatchedNgrams = matchedNgrams.begin(); + while(iterMatchedNgrams != matchedNgrams.end()){ + cout<<(iterMatchedNgrams->first).toString()<<endl; + + iterMatchedNgrams++; + } + + + time( <ime2 ); + cerr<<"Time spent:"<<ltime2-ltime2<<" seconds\n"; + + return 1; +} diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp new file mode 100755 index 0000000..ebb2ed5 --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp @@ -0,0 +1,754 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include 
"_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <stdlib.h> +#include <cstring> + +using namespace std; + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase() +{ + + this->reportMaxOccurrenceOfOneNgram = -1; + this->highestFreqThresholdForReport = -1; + this->shortestUnitToReport = 1; + this->longestUnitToReport = -1; //no constraint + + this->level1Buckets = NULL; + this->noLevel1Bucket = false; //by default, build level1 bucket + + this->noOffset = false; //by default, load offset +} + +C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase() +{ + +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter highestFreqThresholdForReport is set so that very high frequent n-grams such as unigram "the" is skipped +* high frequent n-grams occur too often in the corpus and their statistics can often be estimated offline. 
+* Default value = -1 (no effective threshold) +**/ +void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport) +{ + this->highestFreqThresholdForReport = highestFreqThresholdForReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter shortestUnitToReport is set so that short n-grams can be skipped to speed up the process +* Default value = 1 (no effective constraint) +**/ +void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport) +{ + this->shortestUnitToReport = shortestUnitToReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter longestUnitToReport is set to skip long n-gram matches +* +* Default value = -1 (no effective limit, output all the matched n-grams no matter how long they are) +**/ +void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport) +{ + this->longestUnitToReport = longestUnitToReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter reportMaxOccurrenceOfOneNgram is set to output information of only the "first" few occurrences of the matched n-gram +* Since the order is based on the order of the corresponding suffices in the corpus, +* the output occurrences are usually not the first few occurrences of the n-gram in the corpus +**/ +void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram) +{ + this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram; +} + + + +/** +* Load the indexed corpus, suffix array, offset and vocabulary into memory +* Note: if C_SuffixArraySearchApplicationBase will be used in the application to return the 
sentenceId/offset in sentence for the matched n-gram +* then noOffset needs to be set to be false (to load the offset) +**/ +void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset) +{ + + this->loadData(filename, noVoc, noOffset, false); //call the constructor of the super class, load data and build level1Bucket + + if(! this->noOffset){ + TextLenType lastSentId; + unsigned char tmpOffset; + this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset); + this->totalSentNum = lastSentId; + } + else{ + //we do not have offset information, simply travel to the sentence head + TextLenType pos = this->corpusSize-3; + while(this->corpus_list[pos]<this->sentIdStart){ //still actual words + pos--; + } + //at this position, it should be the <sentId> for the last sentence + this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1; + } + cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n"; + +} + + +///return 0 if w = text +///return 1 if w < text +///return 2 if w > text +///given that the prefix of lcp words are the same +char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText) +{ + + IndexType vocInText = this->corpus_list[posInText+lcp]; + + if(vocInWord == vocInText){ + return 0; + } + + if(vocInWord < vocInText){ + return 1; + } + + return 2; +} + +/** Utility function +* Convert an input sentence as char string into a vector of C_String objects +**/ +vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText) +{ + vector<C_String> sentAsStringVector; + + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + + int pos = 0; + + int inputLen = strlen(sentText); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = sentText[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = 
'\0'; + sentAsStringVector.push_back(C_String(tmpToken)); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + sentAsStringVector.push_back(C_String(tmpToken)); + } + + return sentAsStringVector; + +} + +/** +* Utility function: convert a sentence as a vector of C_String to a vector of vocIDs +**/ +vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector) +{ + if(this->noVocabulary){ + cerr<<"Vocabulary not available!\n"; + exit(-1); + } + + vector<IndexType> sentAsVocIdVector; + + for(int i=0;i<sentAsStringVector.size();i++){ + sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i])); + } + return sentAsVocIdVector; +} + + +/** +* Utility function: +* Convert a sentence as character string to a vector of vocIDs +**/ +vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText) +{ + vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText); + return this->convertCStringVectorToVocIdVector(sentAsCStringVector); +} + + +/** +* If know the range where the phrase is, search in this range for it +* position here are all positions in SA, not the positions in the textstring +* +* LCP indicates that all the suffixes in the range has the same prefix with LCP length with the proposed n-gram phrase +* only need to compare the "nextWord" at LCP+1 position +* +* return true if such phrase can be found inside the range, false if not +**/ +bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos) +{ + TextLenType leftPos, rightPos, middlePos; 
+ + //in case the phrase to be searched is beyond the bucket although the first LCP word is the same as this bucket + //e.g. range correspondes to [ab, ad], but we are searching for (aa) + //so first step is to make sure the lcp+next word is still in this range + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){ + //phrase+next word < text corresponding rangeStart, we could not find it inside this range + return false; + } + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){ + //phrase+next word > text corresponding to rangeEnd + return false; + } + + //now we are sure that text(SA[rangeStart]) <= phrase <= text(SA[rangeEnd]) + + + //search for left bound ( the pos in text which is the min(text>=w)) + //at any time, Left<w<=Right (actually Left<=w<=Right) + leftPos = rangeStartPos; + rightPos = rangeEndPos; + while( rightPos > (leftPos+1)){ //at the time when right = left +1, we should stop + + middlePos = (TextLenType)((leftPos + rightPos) / 2); + if(((leftPos + rightPos) % 2) != 0){ + middlePos++; //bias towards right + } + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){ + // phrase <= middlePos in Text, go left + rightPos = middlePos; + } + else{ + leftPos = middlePos; //word > middle, go right + } + + } + //in previous implementation, we can gurantee that Left<w, because we take rangeStartPos-- from original range + //here we can only guarantee that Left<=w, so need to check if Left==w at lcp + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){ + resultStartPos = leftPos; + } + else{ + resultStartPos = rightPos; + } + + //search for right bound ( the value which is the max(text<=w)) + //at any time, Left<w<=Right (actually Left<=w<=Right) + leftPos = rangeStartPos; + rightPos = rangeEndPos; + while( rightPos > (leftPos+1)){ //stop when right = left + 1 + middlePos = (TextLenType) ((leftPos + rightPos) 
/ 2 ); //bias towards left + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){ // phrase >= middlePos in Text, go right + leftPos = middlePos; + } + else{ + rightPos = middlePos; // ==1, phrase < middlePos + } + } + //in previous implementation, we can gurantee that w<Right, because we take rangeEndPos++ from original range + //here we can only guarantee that w<=Right, so need to check if Right==w at lcp + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){ + resultEndPos = rightPos; + } + else{ + resultEndPos = leftPos; + } + + if(resultEndPos>=resultStartPos){ + return true; + } + + return false; //could not find this phrase +} + +///memory allocated here, remember to free the memory when the table is not needed any more in the +///calling function +S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(sentText); + sentLen = sentInVocId.size(); + + return this->constructNgramSearchTable4SentWithLCP(sentInVocId); +} + + +///constructing the n-gram search table +///memory allocated here, remember to free the memory when the table is not needed any more in the +///calling function +/// +///faster than constructNgramSearchTable4Sent because the suffixes in the range given by n-1 gram can +///guaranteed to have the first n-1 words to be the same as the n-1 gram +///only needs to compare the following one word +/// +/// for a sentence as:w1, w2,.... 
+/// cell [i,j] in the table is for n-gram from w_(j-1)...w_(j+i-1), that is a +/// (i+1)-gram starting at position j+1 in sentence +S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement)); + + //for consistency, initialize all cells + for(int c=0;c<(sentLen*sentLen);c++){ + table[c].found = false; + table[c].startPosInSA = 0; + table[c].endingPosInSA = 0; + } + + TextLenType startPos, endPos; + + //initialize word level elements + for(int i=0;i<sentLen;i++){ + IndexType vocId = sentInVocId[i]; + //cout<<vocId<<" "; + if((vocId==0)||(vocId>=this->sentIdStart)){ //vocId ==0 means this word is OOV <unk>, if vocId>=sentIdStart means for this corpus, we don't know this word + table[i].found = false; + } + else{ + table[i].startPosInSA = this->level1Buckets[vocId].first; + table[i].endingPosInSA = this->level1Buckets[vocId].last; + + if(table[i].startPosInSA<=table[i].endingPosInSA){ + table[i].found = true; + } + else{ //because vocabulary is built on top of an existing voc, this corpus may not have all the occurrences of all the words in the voc + table[i].found = false; + } + } + } + + + //filling in the cells in the table row by row + //basically this means we start by looking for smaller units first + //if they are found, search for longer n-grams + for(int n=1;n<sentLen;n++){ //finding n+1 gram. 
when n=sentLen-1, we are search for the occurrence of the whole sent + int levelN_1_0 = (n - 1) * sentLen; //map from two dimensional position to one-dimension + int levelN_0 = n * sentLen; + for(int j=0;j<= (sentLen - 1 - n); j++){ //possible starting point for n+1 gram + //necessary conditions that this n+1 gram exist are: + //the two sub n-gram all exist in the corpus + if( table[levelN_1_0 + j].found && table[levelN_1_0 + j +1].found){ + IndexType nextWord = sentInVocId[j+n]; //the last word of the n+1 gram + + //n+1 gram has to be in the range of the n-gram in SA + startPos = table[levelN_1_0 + j].startPosInSA; + endPos = table[levelN_1_0 + j].endingPosInSA; + + TextLenType foundPosStart = 0; + TextLenType foundPosEnd = 0; + + //the prefix of n words of all suffixes between [startPos, endPos] is the same as the + //prefix of the n words in the proposed n+1 gram, no need to compare + //only need to compare the n+1 word, which is "nextWord" here + if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){ + table[levelN_0 + j].found = true; + table[levelN_0 + j].startPosInSA = foundPosStart; + table[levelN_0 + j].endingPosInSA = foundPosEnd; + } + else{ + table[levelN_0 + j].found = false; + } + + } + else{ + table[levelN_0 + j].found = false; + } + } + } + return table; +} + +void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(sent); + this->displayNgramMatchingFreq4Sent(sentInVocId); +} + +void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + + int i,j; + + //construct the n-gram search table + S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId); + + //show sentence + cout<<"\t"; + for(i=0;i<sentLen;i++){ + cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t"; + } + cout<<endl; + + //show 
frequency of each n-gram + i=0; + bool stillMatch = true; + while(stillMatch &&( i<sentLen)){ + cout<<i+1<<"\t"; + int startForRow = i*sentLen; + bool anyGood = false; + for(j=0;j<= (sentLen - 1 - i); j++){ + if(table[startForRow+j].found){ + //this is for regular case + if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){ //more than one occurrence + cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1; + anyGood = true; + } + else{ + cout<<"0"; + } + + } + else{ + cout<<"0"; + } + cout<<"\t"; + } + + stillMatch = anyGood; + cout<<endl; + i++; + } + + free(table); +} + +///given the pos of a word in corpus, return its offset in the sentence +///and the sentence ID +///offset has to be loaded +///we do not check it here for efficicency purposes +void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset) +{ + offset = this->offset_list[pos]; + sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1; + + offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus +} + +void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen) +{ + offset = this->offset_list[pos]; + sentLen = this->offset_list[pos-offset]; + sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1; + + offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus +} + +vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs) +{ + if(srcSentAsVocIDs.size()>255){ + cerr<<"Sorry, I prefer to handle sentences with less than 255 words. 
Please cut the sentence short and try it again.\n"; + exit(0); + } + + unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs); + + //Now, we know all the n-grams we are looking for + //output the results + vector<S_phraseLocationElement> allFoundNgrams; + S_phraseLocationElement tmpNode; + + int longestUnitToReportForThisSent = sentLen; + if(this->longestUnitToReport!=-1){ + //and if longestUnitToReport is shorter than sentLen + if(this->longestUnitToReport<sentLen){ + longestUnitToReportForThisSent = this->longestUnitToReport; + } + } + + for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){ + int firstPosInRow = r*sentLen; + for(unsigned char c=0; c<= (sentLen - 1 - r); c++){ + if(table[firstPosInRow + c].found){ //at this position the ngram was found + tmpNode.posStartInSrcSent = c + 1; //position starts from 1 + tmpNode.posEndInSrcSent = r + c + 1; + + //now for all ocurrences, find their sentId and realative positions + TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA; + TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA; + + if( (this->highestFreqThresholdForReport <= 0) || //no limit + ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport )) + ){ + // we don't want to retrieve high-freq n-gram which is very time consuming + //and meaningless for translation, such as 1M occurrences of "of the" in the corpus + + + if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){ + //and for each n-gram, report only a limited amount of occurrences + endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1; + } + + TextLenType sentId; + unsigned char posInSent; + for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; 
iterator++ ){ + this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent); + tmpNode.sentIdInCorpus = sentId; + tmpNode.posInSentInCorpus = posInSent; + + allFoundNgrams.push_back(tmpNode); + } + } + } + + } + } + + free(table); + + return allFoundNgrams; +} + +vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent) +{ + //use the vocabulary associated with this corpus to convert words to vocIDs + vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent); + + return this->findPhrasesInASentence(srcSentAsVocIDs); +} + + +bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd) +{ + int phraseLen = phrase.size(); + + //first check if there are any <unk> in the phrase + for(int i=0;i<phrase.size();i++){ + if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){ + return false; //return empty matching result + } + } + + TextLenType currentRangeStart, currentRangeEnd; + TextLenType narrowedRangeStart, narrowedRangeEnd; + IndexType vocId; + + //for word 1 + vocId = phrase[0]; + currentRangeStart = this->level1Buckets[vocId].first; + currentRangeEnd = this->level1Buckets[vocId].last; + + if(currentRangeStart>currentRangeEnd){ + return false; //even this 1-gram does not exist + } + + int posInPhrase = 1; + while( posInPhrase<phraseLen ){ + vocId = phrase[posInPhrase]; + bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd); + + if(! 
stillExist){ + return false; + } + + currentRangeStart = narrowedRangeStart; + currentRangeEnd = narrowedRangeEnd; + + posInPhrase++; + } + + //we find the range of matching phrase, now get the sentId + rangeStart = currentRangeStart; + rangeEnd = currentRangeEnd; + + return true; +} + +///similar to construct the freq table +///but only search for the exact phrase matching +///Important: because locateSentIdFromPos is called which requires the offset information +///Suffix array has to be initialized with offset loaded +///i.e. initilized with loadData_forSearch(corpusName, bool noVoc, noOffset=fase) +///otherwise the program will have segmentation fault +///SALM does not check if offset has been loaded already for efficiency reasons because locateSendIdFromPos() is called frequently +vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase) +{ + vector<S_SimplePhraseLocationElement> matchingResult; + + TextLenType rangeStart, rangeEnd; + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + //we find some match + S_SimplePhraseLocationElement tmpNode; + for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){ + this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus); + matchingResult.push_back(tmpNode); + } + } + + return matchingResult; +} + +vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->locateExactPhraseInCorpus(phraseAsVocIDs); +} + + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase) +{ + TextLenType rangeStart, rangeEnd; + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + return 
rangeEnd - rangeStart + 1; + } + + return 0; +} + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->freqOfExactPhraseMatch(phraseAsVocIDs); +} + + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen) +{ + TextLenType rangeStart, rangeEnd; + + sentLen = phrase.size(); + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + startPosInSA = rangeStart; + return rangeEnd - rangeStart + 1; + } + + return 0; +} + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen); +} + + +TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber() +{ + return this->totalSentNum; +} + +///given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n> +///startingPosInSrcSent starts at 0, n is the n-gram length +void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n) +{ + n = index / sentLen + 1; + posInSrcSent = index % sentLen; +} + +///given the starting position in src sentence and the length of the n-gram +///calculate the index in the table +///posInSent starts at 0, n is the actual len of n-gram, starts at 1 +unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned 
int n, unsigned int sentLen) +{ + unsigned int indexInTable = (n-1)*sentLen + posInSent; + + return indexInTable; +} + +///simple return how many n-grams are matched +unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent); + return this->numberOfMatcedNgram(sentInVocId); +} + +///simply return how many n-grams are matched +unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId); + + unsigned int totalMatched = 0; + + for(unsigned int i=0;i<(sentLen*sentLen);i++){ + if(table[i].found){ + totalMatched++; + } + } + + free(table); + return totalMatched; +} + + +map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent); + return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen); +} + +map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen) +{ + sentLen = sentInVocId.size(); + map<int, pair<int, unsigned long> > nGramMatched; + map<int, pair<int, unsigned long> >::iterator iterNGramMatched; + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId); + + for(int n = 1; n <= sentLen; n++){ + for(int startPos=0; startPos <= (sentLen - n); startPos++){ + int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen); + + if(table[indexInTable].found){ + + unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1; + iterNGramMatched = nGramMatched.find(n); + 
if(iterNGramMatched==nGramMatched.end()){//has not seen this before + nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) )); + } + else{ + iterNGramMatched->second.first++; + iterNGramMatched->second.second+=freqInTraining; + } + } + } + } + + free(table); + + return nGramMatched; +} + diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~ new file mode 100755 index 0000000..94d272c --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.cpp~ @@ -0,0 +1,753 @@ +/** +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ + +#include "_SuffixArraySearchApplicationBase.h" +#include <iostream> +#include <stdlib.h> + +using namespace std; + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_SuffixArraySearchApplicationBase::C_SuffixArraySearchApplicationBase() +{ + + this->reportMaxOccurrenceOfOneNgram = -1; + this->highestFreqThresholdForReport = -1; + this->shortestUnitToReport = 1; + this->longestUnitToReport = -1; //no constraint + + this->level1Buckets = NULL; + this->noLevel1Bucket = false; //by default, build level1 bucket + + this->noOffset = false; //by default, load offset +} + +C_SuffixArraySearchApplicationBase::~C_SuffixArraySearchApplicationBase() +{ + +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter highestFreqThresholdForReport is set so that very high frequent n-grams such as unigram "the" is skipped +* high frequent n-grams occur too often in the corpus and their statistics can often be estimated offline. 
+* Default value = -1 (no effective threshold) +**/ +void C_SuffixArraySearchApplicationBase::setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport) +{ + this->highestFreqThresholdForReport = highestFreqThresholdForReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter shortestUnitToReport is set so that short n-grams can be skipped to speed up the process +* Default value = 1 (no effective constraint) +**/ +void C_SuffixArraySearchApplicationBase::setParam_shortestUnitToReport(int shortestUnitToReport) +{ + this->shortestUnitToReport = shortestUnitToReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter longestUnitToReport is set to skip long n-gram matches +* +* Default value = -1 (no effective limit, output all the matched n-grams no matter how long they are) +**/ +void C_SuffixArraySearchApplicationBase::setParam_longestUnitToReport(int longestUnitToReport) +{ + this->longestUnitToReport = longestUnitToReport; +} + +/** +* When function findPhrasesInASentence(char * sentence) is called to return the locations of all the embedded n-grams in sentence +* parameter reportMaxOccurrenceOfOneNgram is set to output information of only the "first" few occurrences of the matched n-gram +* Since the order is based on the order of the corresponding suffices in the corpus, +* the output occurrences are usually not the first few occurrences of the n-gram in the corpus +**/ +void C_SuffixArraySearchApplicationBase::setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram) +{ + this->reportMaxOccurrenceOfOneNgram = reportMaxOccurrenceOfOneNgram; +} + + + +/** +* Load the indexed corpus, suffix array, offset and vocabulary into memory +* Note: if C_SuffixArraySearchApplicationBase will be used in the application to return the 
sentenceId/offset in sentence for the matched n-gram +* then noOffset needs to be set to be false (to load the offset) +**/ +void C_SuffixArraySearchApplicationBase::loadData_forSearch(const char * filename, bool noVoc, bool noOffset) +{ + + this->loadData(filename, noVoc, noOffset, false); //call the constructor of the super class, load data and build level1Bucket + + if(! this->noOffset){ + TextLenType lastSentId; + unsigned char tmpOffset; + this->locateSendIdFromPos(this->corpusSize - 3, lastSentId, tmpOffset); + this->totalSentNum = lastSentId; + } + else{ + //we do not have offset information, simply travel to the sentence head + TextLenType pos = this->corpusSize-3; + while(this->corpus_list[pos]<this->sentIdStart){ //still actual words + pos--; + } + //at this position, it should be the <sentId> for the last sentence + this->totalSentNum = this->corpus_list[pos] - this->sentIdStart +1; + } + cerr<<"Total: "<<this->totalSentNum<<" sentences loaded.\n"; + +} + + +///return 0 if w = text +///return 1 if w < text +///return 2 if w > text +///given that the prefix of lcp words are the same +char C_SuffixArraySearchApplicationBase::comparePhraseWithTextWithLCP(IndexType vocInWord, int lcp, TextLenType posInText) +{ + + IndexType vocInText = this->corpus_list[posInText+lcp]; + + if(vocInWord == vocInText){ + return 0; + } + + if(vocInWord < vocInText){ + return 1; + } + + return 2; +} + +/** Utility function +* Convert an input sentence as char string into a vector of C_String objects +**/ +vector<C_String> C_SuffixArraySearchApplicationBase::convertCharStringToCStringVector(const char * sentText) +{ + vector<C_String> sentAsStringVector; + + char tmpToken[MAX_TOKEN_LEN]; + memset(tmpToken,0,MAX_TOKEN_LEN); + + int pos = 0; + + int inputLen = strlen(sentText); + + for(int posInInput = 0; posInInput<inputLen; posInInput++){ + char thisChar = sentText[posInInput]; + + if((thisChar==' ')||(thisChar=='\t')){ //delimiters + if(strlen(tmpToken)>0){ + tmpToken[pos] = 
'\0'; + sentAsStringVector.push_back(C_String(tmpToken)); + pos=0; + tmpToken[pos] = '\0'; + } + } + else{ + tmpToken[pos] = thisChar; + pos++; + if(pos>=MAX_TOKEN_LEN){ //we can handle it + fprintf(stderr,"Can't read tokens that exceed length limit %d. Quit.\n", MAX_TOKEN_LEN); + exit(0); + } + } + } + + tmpToken[pos] = '\0'; + if(strlen(tmpToken)>0){ + sentAsStringVector.push_back(C_String(tmpToken)); + } + + return sentAsStringVector; + +} + +/** +* Utility function: convert a sentence as a vector of C_String to a vector of vocIDs +**/ +vector<IndexType> C_SuffixArraySearchApplicationBase::convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector) +{ + if(this->noVocabulary){ + cerr<<"Vocabulary not available!\n"; + exit(-1); + } + + vector<IndexType> sentAsVocIdVector; + + for(int i=0;i<sentAsStringVector.size();i++){ + sentAsVocIdVector.push_back(this->voc->returnId(sentAsStringVector[i])); + } + return sentAsVocIdVector; +} + + +/** +* Utility function: +* Convert a sentence as character string to a vector of vocIDs +**/ +vector<IndexType> C_SuffixArraySearchApplicationBase::convertStringToVocId(const char * sentText) +{ + vector<C_String> sentAsCStringVector = this->convertCharStringToCStringVector(sentText); + return this->convertCStringVectorToVocIdVector(sentAsCStringVector); +} + + +/** +* If know the range where the phrase is, search in this range for it +* position here are all positions in SA, not the positions in the textstring +* +* LCP indicates that all the suffixes in the range has the same prefix with LCP length with the proposed n-gram phrase +* only need to compare the "nextWord" at LCP+1 position +* +* return true if such phrase can be found inside the range, false if not +**/ +bool C_SuffixArraySearchApplicationBase::searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType &resultStartPos, TextLenType &resultEndPos) +{ + TextLenType leftPos, rightPos, middlePos; 
+ + //in case the phrase to be searched is beyond the bucket although the first LCP word is the same as this bucket + //e.g. range correspondes to [ab, ad], but we are searching for (aa) + //so first step is to make sure the lcp+next word is still in this range + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeStartPos])==1){ + //phrase+next word < text corresponding rangeStart, we could not find it inside this range + return false; + } + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rangeEndPos])==2){ + //phrase+next word > text corresponding to rangeEnd + return false; + } + + //now we are sure that text(SA[rangeStart]) <= phrase <= text(SA[rangeEnd]) + + + //search for left bound ( the pos in text which is the min(text>=w)) + //at any time, Left<w<=Right (actually Left<=w<=Right) + leftPos = rangeStartPos; + rightPos = rangeEndPos; + while( rightPos > (leftPos+1)){ //at the time when right = left +1, we should stop + + middlePos = (TextLenType)((leftPos + rightPos) / 2); + if(((leftPos + rightPos) % 2) != 0){ + middlePos++; //bias towards right + } + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 2 ){ + // phrase <= middlePos in Text, go left + rightPos = middlePos; + } + else{ + leftPos = middlePos; //word > middle, go right + } + + } + //in previous implementation, we can gurantee that Left<w, because we take rangeStartPos-- from original range + //here we can only guarantee that Left<=w, so need to check if Left==w at lcp + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[leftPos])==0){ + resultStartPos = leftPos; + } + else{ + resultStartPos = rightPos; + } + + //search for right bound ( the value which is the max(text<=w)) + //at any time, Left<w<=Right (actually Left<=w<=Right) + leftPos = rangeStartPos; + rightPos = rangeEndPos; + while( rightPos > (leftPos+1)){ //stop when right = left + 1 + middlePos = (TextLenType) ((leftPos + rightPos) 
/ 2 ); //bias towards left + + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[middlePos]) != 1 ){ // phrase >= middlePos in Text, go right + leftPos = middlePos; + } + else{ + rightPos = middlePos; // ==1, phrase < middlePos + } + } + //in previous implementation, we can gurantee that w<Right, because we take rangeEndPos++ from original range + //here we can only guarantee that w<=Right, so need to check if Right==w at lcp + if(this->comparePhraseWithTextWithLCP(nextWord, lcp, this->suffix_list[rightPos])==0){ + resultEndPos = rightPos; + } + else{ + resultEndPos = leftPos; + } + + if(resultEndPos>=resultStartPos){ + return true; + } + + return false; //could not find this phrase +} + +///memory allocated here, remember to free the memory when the table is not needed any more in the +///calling function +S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(sentText); + sentLen = sentInVocId.size(); + + return this->constructNgramSearchTable4SentWithLCP(sentInVocId); +} + + +///constructing the n-gram search table +///memory allocated here, remember to free the memory when the table is not needed any more in the +///calling function +/// +///faster than constructNgramSearchTable4Sent because the suffixes in the range given by n-1 gram can +///guaranteed to have the first n-1 words to be the same as the n-1 gram +///only needs to compare the following one word +/// +/// for a sentence as:w1, w2,.... 
+/// cell [i,j] in the table is for n-gram from w_(j-1)...w_(j+i-1), that is a +/// (i+1)-gram starting at position j+1 in sentence +S_sentSearchTableElement * C_SuffixArraySearchApplicationBase::constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + S_sentSearchTableElement * table = (S_sentSearchTableElement *) malloc( sentLen * sentLen * sizeof(S_sentSearchTableElement)); + + //for consistency, initialize all cells + for(int c=0;c<(sentLen*sentLen);c++){ + table[c].found = false; + table[c].startPosInSA = 0; + table[c].endingPosInSA = 0; + } + + TextLenType startPos, endPos; + + //initialize word level elements + for(int i=0;i<sentLen;i++){ + IndexType vocId = sentInVocId[i]; + //cout<<vocId<<" "; + if((vocId==0)||(vocId>=this->sentIdStart)){ //vocId ==0 means this word is OOV <unk>, if vocId>=sentIdStart means for this corpus, we don't know this word + table[i].found = false; + } + else{ + table[i].startPosInSA = this->level1Buckets[vocId].first; + table[i].endingPosInSA = this->level1Buckets[vocId].last; + + if(table[i].startPosInSA<=table[i].endingPosInSA){ + table[i].found = true; + } + else{ //because vocabulary is built on top of an existing voc, this corpus may not have all the occurrences of all the words in the voc + table[i].found = false; + } + } + } + + + //filling in the cells in the table row by row + //basically this means we start by looking for smaller units first + //if they are found, search for longer n-grams + for(int n=1;n<sentLen;n++){ //finding n+1 gram. 
when n=sentLen-1, we are search for the occurrence of the whole sent + int levelN_1_0 = (n - 1) * sentLen; //map from two dimensional position to one-dimension + int levelN_0 = n * sentLen; + for(int j=0;j<= (sentLen - 1 - n); j++){ //possible starting point for n+1 gram + //necessary conditions that this n+1 gram exist are: + //the two sub n-gram all exist in the corpus + if( table[levelN_1_0 + j].found && table[levelN_1_0 + j +1].found){ + IndexType nextWord = sentInVocId[j+n]; //the last word of the n+1 gram + + //n+1 gram has to be in the range of the n-gram in SA + startPos = table[levelN_1_0 + j].startPosInSA; + endPos = table[levelN_1_0 + j].endingPosInSA; + + TextLenType foundPosStart = 0; + TextLenType foundPosEnd = 0; + + //the prefix of n words of all suffixes between [startPos, endPos] is the same as the + //prefix of the n words in the proposed n+1 gram, no need to compare + //only need to compare the n+1 word, which is "nextWord" here + if(this->searchPhraseGivenRangeWithLCP(nextWord, n, startPos, endPos, foundPosStart, foundPosEnd)){ + table[levelN_0 + j].found = true; + table[levelN_0 + j].startPosInSA = foundPosStart; + table[levelN_0 + j].endingPosInSA = foundPosEnd; + } + else{ + table[levelN_0 + j].found = false; + } + + } + else{ + table[levelN_0 + j].found = false; + } + } + } + return table; +} + +void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(const char * sent) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(sent); + this->displayNgramMatchingFreq4Sent(sentInVocId); +} + +void C_SuffixArraySearchApplicationBase::displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + + int i,j; + + //construct the n-gram search table + S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(sentInVocId); + + //show sentence + cout<<"\t"; + for(i=0;i<sentLen;i++){ + cout<<this->voc->getText(sentInVocId[i]).toString()<<"\t"; + } + cout<<endl; + + //show 
frequency of each n-gram + i=0; + bool stillMatch = true; + while(stillMatch &&( i<sentLen)){ + cout<<i+1<<"\t"; + int startForRow = i*sentLen; + bool anyGood = false; + for(j=0;j<= (sentLen - 1 - i); j++){ + if(table[startForRow+j].found){ + //this is for regular case + if(table[startForRow+j].endingPosInSA>=table[startForRow+j].startPosInSA){ //more than one occurrence + cout<<table[startForRow+j].endingPosInSA-table[startForRow+j].startPosInSA + 1; + anyGood = true; + } + else{ + cout<<"0"; + } + + } + else{ + cout<<"0"; + } + cout<<"\t"; + } + + stillMatch = anyGood; + cout<<endl; + i++; + } + + free(table); +} + +///given the pos of a word in corpus, return its offset in the sentence +///and the sentence ID +///offset has to be loaded +///we do not check it here for efficicency purposes +void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset) +{ + offset = this->offset_list[pos]; + sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1; + + offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus +} + +void C_SuffixArraySearchApplicationBase::locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen) +{ + offset = this->offset_list[pos]; + sentLen = this->offset_list[pos-offset]; + sentId = this->corpus_list[pos-offset] - this->sentIdStart + 1; + + offset--; //because <s> is considered in the corpus when indexing the SA, but there is no <s> in the real corpus +} + +vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs) +{ + if(srcSentAsVocIDs.size()>255){ + cerr<<"Sorry, I prefer to handle sentences with less than 255 words. 
Please cut the sentence short and try it again.\n"; + exit(0); + } + + unsigned char sentLen = (unsigned char) srcSentAsVocIDs.size(); + + //construct the n-gram search table + S_sentSearchTableElement * table = constructNgramSearchTable4SentWithLCP(srcSentAsVocIDs); + + //Now, we know all the n-grams we are looking for + //output the results + vector<S_phraseLocationElement> allFoundNgrams; + S_phraseLocationElement tmpNode; + + int longestUnitToReportForThisSent = sentLen; + if(this->longestUnitToReport!=-1){ + //and if longestUnitToReport is shorter than sentLen + if(this->longestUnitToReport<sentLen){ + longestUnitToReportForThisSent = this->longestUnitToReport; + } + } + + for(unsigned char r = this->shortestUnitToReport - 1; r< longestUnitToReportForThisSent; r++){ + int firstPosInRow = r*sentLen; + for(unsigned char c=0; c<= (sentLen - 1 - r); c++){ + if(table[firstPosInRow + c].found){ //at this position the ngram was found + tmpNode.posStartInSrcSent = c + 1; //position starts from 1 + tmpNode.posEndInSrcSent = r + c + 1; + + //now for all ocurrences, find their sentId and realative positions + TextLenType startPosInSA = table[firstPosInRow + c].startPosInSA; + TextLenType endPosInSA = table[firstPosInRow + c].endingPosInSA; + + if( (this->highestFreqThresholdForReport <= 0) || //no limit + ( (this->highestFreqThresholdForReport > 0 ) && ( (endPosInSA - startPosInSA) < this->highestFreqThresholdForReport )) + ){ + // we don't want to retrieve high-freq n-gram which is very time consuming + //and meaningless for translation, such as 1M occurrences of "of the" in the corpus + + + if((this->reportMaxOccurrenceOfOneNgram > 0) && ( (endPosInSA - startPosInSA +1) > this->reportMaxOccurrenceOfOneNgram) ){ + //and for each n-gram, report only a limited amount of occurrences + endPosInSA = startPosInSA + this->reportMaxOccurrenceOfOneNgram - 1; + } + + TextLenType sentId; + unsigned char posInSent; + for(TextLenType iterator =startPosInSA; iterator <=endPosInSA; 
iterator++ ){ + this->locateSendIdFromPos(this->suffix_list[iterator], sentId, posInSent); + tmpNode.sentIdInCorpus = sentId; + tmpNode.posInSentInCorpus = posInSent; + + allFoundNgrams.push_back(tmpNode); + } + } + } + + } + } + + free(table); + + return allFoundNgrams; +} + +vector<S_phraseLocationElement> C_SuffixArraySearchApplicationBase::findPhrasesInASentence(const char * srcSent) +{ + //use the vocabulary associated with this corpus to convert words to vocIDs + vector<IndexType> srcSentAsVocIDs = this->convertStringToVocId(srcSent); + + return this->findPhrasesInASentence(srcSentAsVocIDs); +} + + +bool C_SuffixArraySearchApplicationBase::locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd) +{ + int phraseLen = phrase.size(); + + //first check if there are any <unk> in the phrase + for(int i=0;i<phrase.size();i++){ + if((phrase[i]==0)||(phrase[i]>=this->sentIdStart)){ + return false; //return empty matching result + } + } + + TextLenType currentRangeStart, currentRangeEnd; + TextLenType narrowedRangeStart, narrowedRangeEnd; + IndexType vocId; + + //for word 1 + vocId = phrase[0]; + currentRangeStart = this->level1Buckets[vocId].first; + currentRangeEnd = this->level1Buckets[vocId].last; + + if(currentRangeStart>currentRangeEnd){ + return false; //even this 1-gram does not exist + } + + int posInPhrase = 1; + while( posInPhrase<phraseLen ){ + vocId = phrase[posInPhrase]; + bool stillExist = this->searchPhraseGivenRangeWithLCP(vocId, posInPhrase, currentRangeStart, currentRangeEnd, narrowedRangeStart, narrowedRangeEnd); + + if(! 
stillExist){ + return false; + } + + currentRangeStart = narrowedRangeStart; + currentRangeEnd = narrowedRangeEnd; + + posInPhrase++; + } + + //we find the range of matching phrase, now get the sentId + rangeStart = currentRangeStart; + rangeEnd = currentRangeEnd; + + return true; +} + +///similar to construct the freq table +///but only search for the exact phrase matching +///Important: because locateSentIdFromPos is called which requires the offset information +///Suffix array has to be initialized with offset loaded +///i.e. initilized with loadData_forSearch(corpusName, bool noVoc, noOffset=fase) +///otherwise the program will have segmentation fault +///SALM does not check if offset has been loaded already for efficiency reasons because locateSendIdFromPos() is called frequently +vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(vector<IndexType> & phrase) +{ + vector<S_SimplePhraseLocationElement> matchingResult; + + TextLenType rangeStart, rangeEnd; + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + //we find some match + S_SimplePhraseLocationElement tmpNode; + for(TextLenType saPos = rangeStart; saPos <= rangeEnd; saPos++){ + this->locateSendIdFromPos(this->suffix_list[saPos], tmpNode.sentIdInCorpus, tmpNode.posInSentInCorpus); + matchingResult.push_back(tmpNode); + } + } + + return matchingResult; +} + +vector<S_SimplePhraseLocationElement> C_SuffixArraySearchApplicationBase::locateExactPhraseInCorpus(const char *phrase) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->locateExactPhraseInCorpus(phraseAsVocIDs); +} + + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(vector<IndexType> & phrase) +{ + TextLenType rangeStart, rangeEnd; + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + return 
rangeEnd - rangeStart + 1; + } + + return 0; +} + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatch(const char *phrase) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->freqOfExactPhraseMatch(phraseAsVocIDs); +} + + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen) +{ + TextLenType rangeStart, rangeEnd; + + sentLen = phrase.size(); + + if(this->locateSAPositionRangeForExactPhraseMatch(phrase, rangeStart, rangeEnd)){ + startPosInSA = rangeStart; + return rangeEnd - rangeStart + 1; + } + + return 0; +} + +TextLenType C_SuffixArraySearchApplicationBase::freqOfExactPhraseMatchAndFirstOccurrence(const char *phrase, TextLenType & startPosInSA, int & sentLen) +{ + //use the vocabulary associated with this corpus to convert words to vocIds + vector<IndexType> phraseAsVocIDs = this->convertStringToVocId(phrase); + + return this->freqOfExactPhraseMatchAndFirstOccurrence(phraseAsVocIDs, startPosInSA, sentLen); +} + + +TextLenType C_SuffixArraySearchApplicationBase::returnTotalSentNumber() +{ + return this->totalSentNum; +} + +///given src sentence length, convert the index in one-dimensional table to pair<startingPosInSrcSent, n> +///startingPosInSrcSent starts at 0, n is the n-gram length +void C_SuffixArraySearchApplicationBase::oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n) +{ + n = index / sentLen + 1; + posInSrcSent = index % sentLen; +} + +///given the starting position in src sentence and the length of the n-gram +///calculate the index in the table +///posInSent starts at 0, n is the actual len of n-gram, starts at 1 +unsigned int C_SuffixArraySearchApplicationBase::twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned 
int n, unsigned int sentLen) +{ + unsigned int indexInTable = (n-1)*sentLen + posInSent; + + return indexInTable; +} + +///simple return how many n-grams are matched +unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(const char *srcSent) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent); + return this->numberOfMatcedNgram(sentInVocId); +} + +///simply return how many n-grams are matched +unsigned int C_SuffixArraySearchApplicationBase::numberOfMatcedNgram(vector<IndexType> & sentInVocId) +{ + int sentLen = sentInVocId.size(); + + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId); + + unsigned int totalMatched = 0; + + for(unsigned int i=0;i<(sentLen*sentLen);i++){ + if(table[i].found){ + totalMatched++; + } + } + + free(table); + return totalMatched; +} + + +map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen) +{ + vector<IndexType> sentInVocId = this->convertStringToVocId(srcSent); + return this->returnNGramMatchingStatForOneSent(sentInVocId, sentLen); +} + +map<int, pair<int, unsigned long> > C_SuffixArraySearchApplicationBase::returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int &sentLen) +{ + sentLen = sentInVocId.size(); + map<int, pair<int, unsigned long> > nGramMatched; + map<int, pair<int, unsigned long> >::iterator iterNGramMatched; + + //construct the n-gram search table + S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(sentInVocId); + + for(int n = 1; n <= sentLen; n++){ + for(int startPos=0; startPos <= (sentLen - n); startPos++){ + int indexInTable = this->twoDimensionIndexToOneDimensionTableIndex(startPos, n, sentLen); + + if(table[indexInTable].found){ + + unsigned long freqInTraining = table[indexInTable].endingPosInSA - table[indexInTable].startPosInSA + 1; + iterNGramMatched = nGramMatched.find(n); + 
if(iterNGramMatched==nGramMatched.end()){//has not seen this before + nGramMatched.insert(make_pair(n, make_pair(1, freqInTraining) )); + } + else{ + iterNGramMatched->second.first++; + iterNGramMatched->second.second+=freqInTraining; + } + } + } + } + + free(table); + + return nGramMatched; +} + diff --git a/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h new file mode 100755 index 0000000..2c0070d --- /dev/null +++ b/Src/SuffixArrayApplications/SuffixArraySearch/_SuffixArraySearchApplicationBase.h @@ -0,0 +1,127 @@ +#if !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_) +#define __SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_ + +#include "_SuffixArrayApplicationBase.h" +/** +* \ingroup search +* Used by locateExactPhraseInCorpus() to return the location of an matched n-gram in the corpus +* as a pair of <sentenceId, offset pos in sentence> +**/ +typedef struct simplePhraseLocationElement +{ + TextLenType sentIdInCorpus; + unsigned char posInSentInCorpus; +}S_SimplePhraseLocationElement; + +/** +* \ingroup search +* Used by findPhraseInASentence() to return the location of an embedded n-gram in the corpus +* <posStartInSrcSent, posEndInSrcSent> represents the embedded n-gram in the sentence +* <sentIdInCorpus, posInSentInCorpus> represents the location in the corpus +**/ +typedef struct phraseLocationElement +{ + unsigned char posStartInSrcSent; + unsigned char posEndInSrcSent; + TextLenType sentIdInCorpus; + unsigned char posInSentInCorpus; +}S_phraseLocationElement; + +/** +* \ingroup search +**/ +typedef struct phraseLocationWithSrcSentElement +{ + int srcPosStart; + int srcPosEnd; + TextLenType sentId; + TextLenType posInSent; + vector<C_String> sentence; +}S_phraseLocationWithSrcSentElement; + +/** +* \ingroup search +**/ +typedef struct sentSearchTableElement +{ + bool found; + TextLenType startPosInSA; + TextLenType endingPosInSA; 
+}S_sentSearchTableElement; + + +/** +* \ingroup search +* Base class for suffix array search applications +* Provides functions to search n-grams in the corpus +* Including the frequency of the n-gram and the actual location (sentenceID+offset in sentence) +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_SuffixArraySearchApplicationBase : public C_SuffixArrayApplicationBase +{ +public: + void loadData_forSearch(const char * filename, bool noVoc, bool noOffset); + + unsigned int numberOfMatcedNgram(const char * srcSent); + unsigned int numberOfMatcedNgram(vector<IndexType> & sentInVocId); + + TextLenType freqOfExactPhraseMatch(const char * phrase); + TextLenType freqOfExactPhraseMatch(vector<IndexType> & phrase); + + TextLenType freqOfExactPhraseMatchAndFirstOccurrence(const char * phrase, TextLenType & startPosInSA, int & sentLen); + TextLenType freqOfExactPhraseMatchAndFirstOccurrence(vector<IndexType> & phrase, TextLenType & startPosInSA, int & sentLen); + + vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(const char * phrase); + vector<S_SimplePhraseLocationElement> locateExactPhraseInCorpus(vector<IndexType> & phrase); + + vector<S_phraseLocationElement> findPhrasesInASentence(const char * srcSent); + vector<S_phraseLocationElement> findPhrasesInASentence(vector<IndexType> & srcSentAsVocIDs); + + void displayNgramMatchingFreq4Sent(const char *); + void displayNgramMatchingFreq4Sent(vector<IndexType> & sentInVocId); + + map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(const char * srcSent, int & sentLen); + map<int, pair<int, unsigned long> > returnNGramMatchingStatForOneSent(vector<IndexType> & sentInVocId, int & sentLen); + + S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP(const char * sentText, int & sentLen); + S_sentSearchTableElement * constructNgramSearchTable4SentWithLCP( vector<IndexType> & sentInVocId); + + void 
setParam_reportMaxOccurrenceOfOneNgram(int reportMaxOccurrenceOfOneNgram); + void setParam_highestFreqThresholdForReport(int highestFreqThresholdForReport); + void setParam_longestUnitToReport(int longestUnitToReport); + void setParam_shortestUnitToReport(int shortestUnitToReport); + + TextLenType returnTotalSentNumber(); + + vector<IndexType> convertStringToVocId(const char * sentText); + vector<C_String> convertCharStringToCStringVector(const char * sentText); + vector<IndexType> convertCStringVectorToVocIdVector(vector<C_String> & sentAsStringVector); + + + C_SuffixArraySearchApplicationBase(); + virtual ~C_SuffixArraySearchApplicationBase(); + +protected: + bool locateSAPositionRangeForExactPhraseMatch(vector<IndexType> & phrase, TextLenType & rangeStart, TextLenType & rangeEnd); + + bool searchPhraseGivenRangeWithLCP(IndexType nextWord, int lcp, TextLenType rangeStartPos, TextLenType rangeEndPos, TextLenType & resultStartPos, TextLenType & resultEndPos); + char comparePhraseWithTextWithLCP(IndexType, int, TextLenType); + + void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset); + void locateSendIdFromPos(TextLenType pos, TextLenType & sentId, unsigned char & offset, unsigned char & sentLen); + + + unsigned int twoDimensionIndexToOneDimensionTableIndex(unsigned int posInSent, unsigned int n, unsigned int sentLen); + void oneDimensionTableIndexToTwoDimension(unsigned int index, unsigned int sentLen, unsigned int &posInSrcSent, unsigned int &n); + + int reportMaxOccurrenceOfOneNgram; + int highestFreqThresholdForReport; + int longestUnitToReport; + int shortestUnitToReport; + + TextLenType totalSentNum; +}; + +#endif // !defined(__SUFFIXARRAYSEARCHAPPLICATIONBASE_H__INCLUDED_) diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp new file mode 100755 index 0000000..91962fe --- /dev/null +++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp 
@@ -0,0 +1,314 @@ +/** +* Revision $Rev: 3815 $ +* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $ +**/ + +#include "_SuffixArrayApplicationBase.h" + +#include "malloc.h" +#include "time.h" + +#include <iostream> +#include <fstream> +#include <stdlib.h> + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_SuffixArrayApplicationBase::C_SuffixArrayApplicationBase() +{ + this->level1Buckets = NULL; + this->noVocabulary = false; //by default, still load the vocabulary + this->noOffset = false; //by default, load offset + this->noLevel1Bucket = false; //by default, construct level1 bucket +} + +C_SuffixArrayApplicationBase::~C_SuffixArrayApplicationBase() +{ + if(this->level1Buckets!=NULL){ + free(this->level1Buckets); + } + + //not necessary too + free(this->corpus_list); + free(this->suffix_list); + + if(! this->noOffset){ + free(this->offset_list); + } + + if(! this->noVocabulary){ + delete(this->voc); + } +} + +/** +* Load the indexed corpus, suffix array, vocabulary, offset into memory for follow up applications +* It is optional to load vocabulary, offset depends on the argument. +* In the case when the testing data shares the same vocabulary as the training data and only vocIDs are used to represent the sentence/n-grams +* then vocabulary which maps between vocId and the word text can be skipped to save some memory. +* +* If the suffix array object does not need to locate the sentence id of an occurred n-gram, then offset information is not needed. +* +* Be very careful here, the suffix array class does not check if offset has been loaded in the search function to make it efficient +* you need to know what the suffix array class will be used (whether offset is needed) and load it properly +* @param fileNameStem The filename of the corpus. 
This should be the same filename used in IndexSA +* @param noVoc If set to be 'true', vocabulary will not be loaded +* @param noOffset If set to be 'true', the offset information will not be loaded. <sentId, offsetInSent> information for an n-gram's occurrences can not be calculated. +* @param noLevel1Bucket Level1Bucket is used to speed up the search at the cost of additional memory. For applications which do not need to locate n-grams in the corpus (such as the corpus scanning application), then there is no need to create Level1Bucket +**/ +void C_SuffixArrayApplicationBase::loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket) +{ + long ltime1, ltime2; + + this->noVocabulary = noVoc; + this->noOffset = noOffset; + this->noLevel1Bucket = noLevel1Bucket; + + + char tmpString[1000]; + + //the order of loading the data is important, do not change + if(! this->noVocabulary){ + time( <ime1 ); + cerr<<"Loading Vocabulary...\n"; + sprintf(tmpString,"%s.id_voc",fileNameStem); + this->loadVoc(tmpString); + time( <ime2); + cerr<<"Vocabulary loaded in "<<ltime2-ltime1<<" seconds.\n"; + } + + time( <ime1 ); + cerr<<"Loading corpus...\n"; + sprintf(tmpString,"%s.sa_corpus",fileNameStem); + this->loadCorpusAndInitMem(tmpString); + time( <ime2); + cerr<<"Corpus loaded in "<<ltime2-ltime1<<" seconds.\n"; + + time( <ime1 ); + cerr<<"Loading suffix...\n"; + sprintf(tmpString,"%s.sa_suffix",fileNameStem); + this->loadSuffix(tmpString); + time( <ime2); + cerr<<"Suffix loaded in "<<ltime2-ltime1<<" seconds.\n"; + + if(! 
this->noOffset){ + time( <ime1 ); + cerr<<"Loading offset...\n"; + sprintf(tmpString,"%s.sa_offset",fileNameStem); + this->loadOffset(tmpString); + time( <ime2); + cerr<<"Offset loaded in "<<ltime2-ltime1<<" seconds.\n"; + } +} + +void C_SuffixArrayApplicationBase::loadVoc(const char *filename) +{ + this->voc = new C_IDVocabulary(filename); +} + +void C_SuffixArrayApplicationBase::loadCorpusAndInitMem(const char *filename) +{ + unsigned int dwRead = 0; + FILE * CorpusInputFile = fopen(filename, "rb"); + + if(!CorpusInputFile){ + cerr<<"Corpus file: "<<filename<<" does not exist or can not be opened!\n"; + exit(0); + } + + //first, read the size of the corpus + dwRead = fread( &(this->corpusSize), sizeof(TextLenType), 1, CorpusInputFile); + + //allocate memory for all data structure + this->corpus_list = (IndexType *) malloc(sizeof(IndexType)*this->corpusSize); + if(! this->corpus_list){ + cerr<<"Can not allocate memory to load the corpus!\n"; + exit(0); + } + + this->suffix_list = (TextLenType *) malloc(sizeof(TextLenType)*this->corpusSize); + if(! this->suffix_list){ + cerr<<"Can not allocate memory to load the suffix!\n"; + exit(0); + } + + if(! this->noOffset){ + this->offset_list = (unsigned char *) malloc(sizeof(unsigned char)*this->corpusSize); + if(! this->offset_list){ + cerr<<"Can not allocate memory to load the offset!\n"; + exit(0); + } + } + + //read the corpus file + unsigned int totalRead = 0; + unsigned int remaining = this->corpusSize; + unsigned int oneBatchReadSize; + char * currentPosInCorpusList = (char *) this->corpus_list; + while(! 
feof(CorpusInputFile) && (totalRead<this->corpusSize)){ + oneBatchReadSize = SIZE_ONE_READ; + if(remaining<SIZE_ONE_READ){ + oneBatchReadSize = remaining; + } + + dwRead = fread( currentPosInCorpusList, sizeof(IndexType), oneBatchReadSize, CorpusInputFile); + + totalRead+=dwRead; + remaining-=dwRead; + + currentPosInCorpusList+=sizeof(IndexType)*dwRead; + } + if(totalRead!=this->corpusSize){ + cerr<<"Expecting "<<this->corpusSize<<" words from the corpus, read-in "<<totalRead<<endl; + exit(0); + } + fclose(CorpusInputFile); + + this->sentIdStart = this->corpus_list[0]; + this->vocIdForSentStart = this->corpus_list[1]; + this->vocIdForCorpusEnd = this->corpus_list[this->corpusSize-1]; + this->vocIdForSentEnd = this->corpus_list[this->corpusSize-2]; + + if(! this->noLevel1Bucket){ + //in this corpus, we will have at most sentIdStart-1 word types + //the index in the array correspond to the vocId, 0 is for <unk> and the last one is for <sentIdStart-1> which is the largest vocId observed in the data + this->level1Buckets = (S_level1BucketElement *) malloc(sizeof(S_level1BucketElement)* this->sentIdStart); + + //initialize the level1 buckets + for(IndexType i=0;i<this->sentIdStart;i++){ + this->level1Buckets[i].first = (TextLenType) -1; + this->level1Buckets[i].last = 0; + } + } +} + +void C_SuffixArrayApplicationBase::loadSuffix(const char *filename) +{ + unsigned int dwRead = 0; + FILE * SuffixInputFile = fopen(filename, "rb"); + if(!SuffixInputFile){ + cerr<<"Suffix file: "<<filename<<" does not exist!"<<endl; + exit(0); + } + + //first, read in the size of the suffix array + TextLenType suffixArraySize; + dwRead = fread( &suffixArraySize, sizeof(TextLenType), 1, SuffixInputFile); + + if(suffixArraySize!=this->corpusSize){ + cerr<<"Something wrong, the suffix array size is different from the corpus size.\n"; + cerr<<"Corpus has "<<this->corpusSize<<" words and suffix array reported: "<<suffixArraySize<<endl; + exit(0); + } + + //read all the suffix into memory + 
unsigned int totalRead = 0; + unsigned int remaining = suffixArraySize; + unsigned int oneBatchReadSize; + char * currentPosInSuffixList = (char *) this->suffix_list; + while(! feof(SuffixInputFile) && (totalRead<suffixArraySize)){ + oneBatchReadSize = SIZE_ONE_READ; + if(remaining<SIZE_ONE_READ){ + oneBatchReadSize = remaining; + } + + dwRead = fread( currentPosInSuffixList, sizeof(TextLenType), oneBatchReadSize, SuffixInputFile); + + totalRead+=dwRead; + remaining -= dwRead; + + currentPosInSuffixList+=sizeof(TextLenType)*dwRead; + } + if(totalRead!=suffixArraySize){ + cerr<<"Expecting "<<suffixArraySize<<" words from the suffix list, read-in "<<totalRead<<endl; + exit(0); + } + + fclose(SuffixInputFile); + + if(! this->noLevel1Bucket){ + //build level-1 bucket + cerr<<"Initialize level-1 buckets...\n"; + IndexType currentVocId = 0; + IndexType vocId; + TextLenType pos; + TextLenType lastSaIndex = 0; + + for(TextLenType i=0; i<suffixArraySize; i++){ + pos = this->suffix_list[i]; + + //for level1 bucket + vocId = this->corpus_list[pos]; + + if(vocId<this->sentIdStart){ //is a meaningful word type + if(vocId!=currentVocId){ + this->level1Buckets[currentVocId].last = lastSaIndex; //for first word which is <unk> this does not matter + this->level1Buckets[vocId].first = i; + + currentVocId=vocId; + } + + lastSaIndex = i; + } + } + + //for the last word type + this->level1Buckets[currentVocId].last = lastSaIndex; + } + else{ + this->level1Buckets = NULL; + } +} + +void C_SuffixArrayApplicationBase::loadOffset(const char *filename) +{ + unsigned int dwRead = 0; + FILE * OffsetInputFile = fopen(filename, "rb"); + + if(!OffsetInputFile){ + cerr<<"Offset file: "<<filename<<" does not exist!"<<endl; + exit(0); + } + + //first, read the size of the corpus + TextLenType offsetListLen; + dwRead = fread( &offsetListLen, sizeof(TextLenType), 1, OffsetInputFile); + if(offsetListLen!=this->corpusSize){ + cerr<<"Text length is inconsistent with the length of the offset.\n"; + 
exit(0); + } + + //read all the suffix into memory + unsigned int totalRead = 0; + unsigned int remaining = offsetListLen; + unsigned int oneBatchReadSize; + char * currentOffsetListPos = (char *) this->offset_list; + while(! feof(OffsetInputFile) && (totalRead < offsetListLen)){ + oneBatchReadSize = SIZE_ONE_READ; + + if(remaining<SIZE_ONE_READ){ + oneBatchReadSize = remaining; + } + + dwRead = fread( currentOffsetListPos, sizeof(unsigned char), oneBatchReadSize, OffsetInputFile); + + totalRead+=dwRead; + remaining-=dwRead; + + currentOffsetListPos+=sizeof(unsigned char)*dwRead; + + } + if(totalRead!=offsetListLen){ + cerr<<"Expecting "<<offsetListLen<<" words from the offset list, read-in "<<totalRead<<endl; + exit(0); + } + fclose(OffsetInputFile); + +} + +TextLenType C_SuffixArrayApplicationBase::returnCorpusSize() +{ + return this->corpusSize; +} diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~ new file mode 100755 index 0000000..bd17287 --- /dev/null +++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.cpp~ @@ -0,0 +1,313 @@ +/** +* Revision $Rev: 3815 $ +* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $ +**/ + +#include "_SuffixArrayApplicationBase.h" + +#include "malloc.h" +#include "time.h" + +#include <iostream> +#include <fstream> + +////////////////////////////////////////////////////////////////////// +// Construction/Destruction +////////////////////////////////////////////////////////////////////// + +C_SuffixArrayApplicationBase::C_SuffixArrayApplicationBase() +{ + this->level1Buckets = NULL; + this->noVocabulary = false; //by default, still load the vocabulary + this->noOffset = false; //by default, load offset + this->noLevel1Bucket = false; //by default, construct level1 bucket +} + +C_SuffixArrayApplicationBase::~C_SuffixArrayApplicationBase() +{ + if(this->level1Buckets!=NULL){ + free(this->level1Buckets); + } + + 
//not necessary too + free(this->corpus_list); + free(this->suffix_list); + + if(! this->noOffset){ + free(this->offset_list); + } + + if(! this->noVocabulary){ + delete(this->voc); + } +} + +/** +* Load the indexed corpus, suffix array, vocabulary, offset into memory for follow up applications +* It is optional to load vocabulary, offset depends on the argument. +* In the case when the testing data shares the same vocabulary as the training data and only vocIDs are used to represent the sentence/n-grams +* then vocabulary which maps between vocId and the word text can be skipped to save some memory. +* +* If the suffix array object does not need to locate the sentence id of an occurred n-gram, then offset information is not needed. +* +* Be very careful here, the suffix array class does not check if offset has been loaded in the search function to make it efficient +* you need to know what the suffix array class will be used (whether offset is needed) and load it properly +* @param fileNameStem The filename of the corpus. This should be the same filename used in IndexSA +* @param noVoc If set to be 'true', vocabulary will not be loaded +* @param noOffset If set to be 'true', the offset information will not be loaded. <sentId, offsetInSent> information for an n-gram's occurrences can not be calculated. +* @param noLevel1Bucket Level1Bucket is used to speed up the search at the cost of additional memory. For applications which do not need to locate n-grams in the corpus (such as the corpus scanning application), then there is no need to create Level1Bucket +**/ +void C_SuffixArrayApplicationBase::loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket) +{ + long ltime1, ltime2; + + this->noVocabulary = noVoc; + this->noOffset = noOffset; + this->noLevel1Bucket = noLevel1Bucket; + + + char tmpString[1000]; + + //the order of loading the data is important, do not change + if(! 
this->noVocabulary){ + time( <ime1 ); + cerr<<"Loading Vocabulary...\n"; + sprintf(tmpString,"%s.id_voc",fileNameStem); + this->loadVoc(tmpString); + time( <ime2); + cerr<<"Vocabulary loaded in "<<ltime2-ltime1<<" seconds.\n"; + } + + time( <ime1 ); + cerr<<"Loading corpus...\n"; + sprintf(tmpString,"%s.sa_corpus",fileNameStem); + this->loadCorpusAndInitMem(tmpString); + time( <ime2); + cerr<<"Corpus loaded in "<<ltime2-ltime1<<" seconds.\n"; + + time( <ime1 ); + cerr<<"Loading suffix...\n"; + sprintf(tmpString,"%s.sa_suffix",fileNameStem); + this->loadSuffix(tmpString); + time( <ime2); + cerr<<"Suffix loaded in "<<ltime2-ltime1<<" seconds.\n"; + + if(! this->noOffset){ + time( <ime1 ); + cerr<<"Loading offset...\n"; + sprintf(tmpString,"%s.sa_offset",fileNameStem); + this->loadOffset(tmpString); + time( <ime2); + cerr<<"Offset loaded in "<<ltime2-ltime1<<" seconds.\n"; + } +} + +void C_SuffixArrayApplicationBase::loadVoc(const char *filename) +{ + this->voc = new C_IDVocabulary(filename); +} + +void C_SuffixArrayApplicationBase::loadCorpusAndInitMem(const char *filename) +{ + unsigned int dwRead = 0; + FILE * CorpusInputFile = fopen(filename, "rb"); + + if(!CorpusInputFile){ + cerr<<"Corpus file: "<<filename<<" does not exist or can not be opened!\n"; + exit(0); + } + + //first, read the size of the corpus + dwRead = fread( &(this->corpusSize), sizeof(TextLenType), 1, CorpusInputFile); + + //allocate memory for all data structure + this->corpus_list = (IndexType *) malloc(sizeof(IndexType)*this->corpusSize); + if(! this->corpus_list){ + cerr<<"Can not allocate memory to load the corpus!\n"; + exit(0); + } + + this->suffix_list = (TextLenType *) malloc(sizeof(TextLenType)*this->corpusSize); + if(! this->suffix_list){ + cerr<<"Can not allocate memory to load the suffix!\n"; + exit(0); + } + + if(! this->noOffset){ + this->offset_list = (unsigned char *) malloc(sizeof(unsigned char)*this->corpusSize); + if(! 
this->offset_list){ + cerr<<"Can not allocate memory to load the offset!\n"; + exit(0); + } + } + + //read the corpus file + unsigned int totalRead = 0;
+ unsigned int remaining = this->corpusSize;
+ unsigned int oneBatchReadSize; + char * currentPosInCorpusList = (char *) this->corpus_list; + while(! feof(CorpusInputFile) && (totalRead<this->corpusSize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+ + dwRead = fread( currentPosInCorpusList, sizeof(IndexType), oneBatchReadSize, CorpusInputFile);
+
+ totalRead+=dwRead;
+ remaining-=dwRead;
+ + currentPosInCorpusList+=sizeof(IndexType)*dwRead; + } + if(totalRead!=this->corpusSize){ + cerr<<"Expecting "<<this->corpusSize<<" words from the corpus, read-in "<<totalRead<<endl; + exit(0); + } + fclose(CorpusInputFile); + + this->sentIdStart = this->corpus_list[0]; + this->vocIdForSentStart = this->corpus_list[1]; + this->vocIdForCorpusEnd = this->corpus_list[this->corpusSize-1]; + this->vocIdForSentEnd = this->corpus_list[this->corpusSize-2]; + + if(! this->noLevel1Bucket){ + //in this corpus, we will have at most sentIdStart-1 word types + //the index in the array correspond to the vocId, 0 is for <unk> and the last one is for <sentIdStart-1> which is the largest vocId observed in the data + this->level1Buckets = (S_level1BucketElement *) malloc(sizeof(S_level1BucketElement)* this->sentIdStart); + + //initialize the level1 buckets + for(IndexType i=0;i<this->sentIdStart;i++){ + this->level1Buckets[i].first = (TextLenType) -1; + this->level1Buckets[i].last = 0; + } + } +} + +void C_SuffixArrayApplicationBase::loadSuffix(const char *filename) +{ + unsigned int dwRead = 0; + FILE * SuffixInputFile = fopen(filename, "rb"); + if(!SuffixInputFile){ + cerr<<"Suffix file: "<<filename<<" does not exist!"<<endl; + exit(0); + } + + //first, read in the size of the suffix array + TextLenType suffixArraySize; + dwRead = fread( &suffixArraySize, sizeof(TextLenType), 1, SuffixInputFile); + + if(suffixArraySize!=this->corpusSize){ + cerr<<"Something wrong, the suffix array size is different from the corpus size.\n"; + cerr<<"Corpus has "<<this->corpusSize<<" words and suffix array reported: "<<suffixArraySize<<endl; + exit(0); + } + + //read all the suffix into memory + unsigned int totalRead = 0;
+ unsigned int remaining = suffixArraySize;
+ unsigned int oneBatchReadSize; + char * currentPosInSuffixList = (char *) this->suffix_list; + while(! feof(SuffixInputFile) && (totalRead<suffixArraySize)){
+ oneBatchReadSize = SIZE_ONE_READ;
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+ + dwRead = fread( currentPosInSuffixList, sizeof(TextLenType), oneBatchReadSize, SuffixInputFile);
+ + totalRead+=dwRead;
+ remaining -= dwRead;
+ + currentPosInSuffixList+=sizeof(TextLenType)*dwRead; + } + if(totalRead!=suffixArraySize){ + cerr<<"Expecting "<<suffixArraySize<<" words from the suffix list, read-in "<<totalRead<<endl; + exit(0); + } + + fclose(SuffixInputFile); + + if(! this->noLevel1Bucket){ + //build level-1 bucket + cerr<<"Initialize level-1 buckets...\n"; + IndexType currentVocId = 0; + IndexType vocId; + TextLenType pos; + TextLenType lastSaIndex = 0; + + for(TextLenType i=0; i<suffixArraySize; i++){ + pos = this->suffix_list[i]; + + //for level1 bucket + vocId = this->corpus_list[pos]; + + if(vocId<this->sentIdStart){ //is a meaningful word type + if(vocId!=currentVocId){ + this->level1Buckets[currentVocId].last = lastSaIndex; //for first word which is <unk> this does not matter + this->level1Buckets[vocId].first = i; + + currentVocId=vocId; + } + + lastSaIndex = i; + } + } + + //for the last word type + this->level1Buckets[currentVocId].last = lastSaIndex; + } + else{ + this->level1Buckets = NULL; + } +} + +void C_SuffixArrayApplicationBase::loadOffset(const char *filename) +{ + unsigned int dwRead = 0; + FILE * OffsetInputFile = fopen(filename, "rb"); + + if(!OffsetInputFile){ + cerr<<"Offset file: "<<filename<<" does not exist!"<<endl; + exit(0); + } + + //first, read the size of the corpus + TextLenType offsetListLen; + dwRead = fread( &offsetListLen, sizeof(TextLenType), 1, OffsetInputFile); + if(offsetListLen!=this->corpusSize){ + cerr<<"Text length is inconsistent with the length of the offset.\n"; + exit(0); + } + + //read all the suffix into memory + unsigned int totalRead = 0;
+ unsigned int remaining = offsetListLen;
+ unsigned int oneBatchReadSize; + char * currentOffsetListPos = (char *) this->offset_list; + while(! feof(OffsetInputFile) && (totalRead < offsetListLen)){ + oneBatchReadSize = SIZE_ONE_READ;
+
+ if(remaining<SIZE_ONE_READ){
+ oneBatchReadSize = remaining;
+ }
+
+ dwRead = fread( currentOffsetListPos, sizeof(unsigned char), oneBatchReadSize, OffsetInputFile); +
+ totalRead+=dwRead;
+ remaining-=dwRead;
+ + currentOffsetListPos+=sizeof(unsigned char)*dwRead; + + } + if(totalRead!=offsetListLen){ + cerr<<"Expecting "<<offsetListLen<<" words from the offset list, read-in "<<totalRead<<endl; + exit(0); + } + fclose(OffsetInputFile); + +} + +TextLenType C_SuffixArrayApplicationBase::returnCorpusSize() +{ + return this->corpusSize; +} diff --git a/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h new file mode 100755 index 0000000..74fad4e --- /dev/null +++ b/Src/SuffixArrayApplications/_SuffixArrayApplicationBase.h @@ -0,0 +1,58 @@ +#if !defined(__SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_) +#define __SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_ + +#include "salm_shared.h" +#include "_IDVocabulary.h" +#include "_String.h" + +using namespace std; + +typedef struct level1BucketElement +{ + TextLenType first; + TextLenType last; +} S_level1BucketElement; + + +/** +* Base class of Suffix Array applications +* Providing functions to load the suffix array and initialize the required vocIDs +* Revision $Rev: 3665 $ +* Last Modified $LastChangedDate: 2007-06-16 15:40:59 -0400 (Sat, 16 Jun 2007) $ +**/ +class C_SuffixArrayApplicationBase +{ +public: + void loadData(const char *fileNameStem, bool noVoc, bool noOffset, bool noLevel1Bucket); + TextLenType returnCorpusSize(); + + C_SuffixArrayApplicationBase(); + virtual ~C_SuffixArrayApplicationBase(); + +protected: + TextLenType corpusSize; + + void loadVoc(const char * filename); + void loadOffset(const char * filename); + void loadSuffix(const char * filename); + void loadCorpusAndInitMem(const char * filename); + + bool noVocabulary; + bool noOffset; + bool noLevel1Bucket; + + C_IDVocabulary * voc; + IndexType sentIdStart; + IndexType vocIdForSentStart; + IndexType vocIdForSentEnd; + IndexType vocIdForCorpusEnd; + + IndexType * corpus_list; + unsigned char * offset_list; + TextLenType * suffix_list; + + S_level1BucketElement * level1Buckets; + +}; + +#endif // 
!defined(__SUFFIXARRAYAPPLICATIONBASE_H__INCLUDED_) diff --git a/Src/Utils/InitializeVocabulary.cpp b/Src/Utils/InitializeVocabulary.cpp new file mode 100755 index 0000000..b749568 --- /dev/null +++ b/Src/Utils/InitializeVocabulary.cpp @@ -0,0 +1,30 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_IDVocabulary.h" + +#include <iostream> + +using namespace std; + +/** +* \ingroup utils +* Intialize an empty vocabulary with reserved words +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + if(argc<2){ + cerr<<"\nUsage:"; + cerr<<"\n\t"<<argv[0]<<" vocabularyFileName\n\n"; + exit(0); + } + + C_IDVocabulary voc; + + voc.addingReservedWords(); + voc.outputToFile(argv[1]); + + return 0; + +} diff --git a/Src/Utils/UpdateUniversalVoc.cpp b/Src/Utils/UpdateUniversalVoc.cpp new file mode 100755 index 0000000..02ea6cb --- /dev/null +++ b/Src/Utils/UpdateUniversalVoc.cpp @@ -0,0 +1,28 @@ +#include "stdio.h" +#include "stdlib.h" +#include "_UniversalVocabulary.h" + +#include <iostream> + +using namespace std; + +/** +* \ingroup utils +* Update the universal vocabulary with words in corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +int main(int argc, char * argv[]){ + if(argc<3){ + cerr<<"\nUsage:"; + cerr<<"\n\t"<<argv[0]<<" universal_voc corpusFileName\n\n"; + exit(0); + } + + C_UniversalVocabulary universalVoc(argv[1]); + + universalVoc.updateWithNewCorpus(argv[2]); + + return 1; +} diff --git a/Src/Utils/_UniversalVocabulary.cpp b/Src/Utils/_UniversalVocabulary.cpp new file mode 100755 index 0000000..3be91d2 --- /dev/null +++ b/Src/Utils/_UniversalVocabulary.cpp @@ -0,0 +1,118 @@ +#include "_UniversalVocabulary.h" +#include "malloc.h" +#include <string> +#include <fstream> +#include <iostream> +#include <cstring> +#include <stdlib.h> + +using namespace std; + 
+C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName) +{ + int fileNameSize=strlen(universalVocFileName); + fileNameSize++; + + this->universalCorpusFileName = (char *) malloc(sizeof(char)*fileNameSize); + sprintf(this->universalCorpusFileName,"%s\0", universalVocFileName); + + this->universalVoc = new C_IDVocabulary(universalVocFileName); + +} + +C_UniversalVocabulary::~C_UniversalVocabulary() +{ + free(this->universalCorpusFileName); + delete(this->universalVoc); +} + + +/** +* Update the universal vocabulary with words in a new corpus +* Output the updated universal vocabulary +* Output the vocabulary needed for the new corpus too +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName) +{ + + ifstream textStream; + textStream.open(newCorpusFileName); + + if(textStream==NULL){ + fprintf(stderr,"Corpus file %s does not exist. 
Exit!\n",newCorpusFileName); + exit(-1); + } + + + //add reserved words from universal voc + for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){ + C_String reservedWordText = this->universalVoc->getText(vocId); + this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId)); + } + + string aLine; + unsigned int sentNumber = 1; + unsigned int corpusSize = 0; + + char * thisToken; + char delimit[] =" \t\r\n"; + map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus; + + + getline(textStream, aLine); + while(!textStream.eof()){ + + if(aLine.length()>0){ + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + C_String thisWord(thisToken); + + //check if this word has already been seen + iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord); + + if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){ + //new type + IndexType vocId = this->universalVoc->getId(thisWord); + this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId)); + } + + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + + } + + getline(textStream, aLine); + } + + + //now output the updated universal vocabulary + this->universalVoc->outputToFile(this->universalCorpusFileName); + + //output the vocabulary needed for the new corpus + char vocabularyForNewCorpusFileName[1024]; + sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName); + + ofstream outputVocFile; + outputVocFile.open(vocabularyForNewCorpusFileName); + + if(!outputVocFile){ + cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n"; + exit(-1); + } + + iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin(); + while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){ + outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl; + 
iterWordsUsedInTheNewCorpus++; + } + + outputVocFile.close(); +} diff --git a/Src/Utils/_UniversalVocabulary.cpp~ b/Src/Utils/_UniversalVocabulary.cpp~ new file mode 100755 index 0000000..50a7396 --- /dev/null +++ b/Src/Utils/_UniversalVocabulary.cpp~ @@ -0,0 +1,117 @@ +#include "_UniversalVocabulary.h" +#include "malloc.h" +#include <string> +#include <fstream> +#include <iostream> +#include <cstring> + +using namespace std; + +C_UniversalVocabulary::C_UniversalVocabulary(const char * universalVocFileName) +{ + int fileNameSize=strlen(universalVocFileName); + fileNameSize++; + + this->universalCorpusFileName = (char *) malloc(sizeof(char)*fileNameSize); + sprintf(this->universalCorpusFileName,"%s\0", universalVocFileName); + + this->universalVoc = new C_IDVocabulary(universalVocFileName); + +} + +C_UniversalVocabulary::~C_UniversalVocabulary() +{ + free(this->universalCorpusFileName); + delete(this->universalVoc); +} + + +/** +* Update the universal vocabulary with words in a new corpus +* Output the updated universal vocabulary +* Output the vocabulary needed for the new corpus too +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +void C_UniversalVocabulary::updateWithNewCorpus(const char * newCorpusFileName) +{ + + ifstream textStream; + textStream.open(newCorpusFileName); + + if(textStream==NULL){ + fprintf(stderr,"Corpus file %s does not exist. 
Exit!\n",newCorpusFileName); + exit(-1); + } + + + //add reserved words from universal voc + for(IndexType vocId=1; vocId<=NUMBER_OF_RESERVED_WORDS_IN_VOC; vocId++){ + C_String reservedWordText = this->universalVoc->getText(vocId); + this->wordsUsedInTheNewCorpus.insert(make_pair(reservedWordText, vocId)); + } + + string aLine; + unsigned int sentNumber = 1; + unsigned int corpusSize = 0; + + char * thisToken; + char delimit[] =" \t\r\n"; + map<C_String, IndexType, ltstr>::iterator iterWordsUsedInTheNewCorpus; + + + getline(textStream, aLine); + while(!textStream.eof()){ + + if(aLine.length()>0){ + + thisToken = strtok((char*) aLine.c_str(), delimit ); + while( thisToken != NULL ) { + + C_String thisWord(thisToken); + + //check if this word has already been seen + iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.find(thisWord); + + if(iterWordsUsedInTheNewCorpus == this->wordsUsedInTheNewCorpus.end()){ + //new type + IndexType vocId = this->universalVoc->getId(thisWord); + this->wordsUsedInTheNewCorpus.insert(make_pair(thisWord, vocId)); + } + + + // While there are tokens in "string" + // Get next token: + thisToken = strtok( NULL, delimit); + } + + } + + getline(textStream, aLine); + } + + + //now output the updated universal vocabulary + this->universalVoc->outputToFile(this->universalCorpusFileName); + + //output the vocabulary needed for the new corpus + char vocabularyForNewCorpusFileName[1024]; + sprintf(vocabularyForNewCorpusFileName, "%s.id_voc", newCorpusFileName); + + ofstream outputVocFile; + outputVocFile.open(vocabularyForNewCorpusFileName); + + if(!outputVocFile){ + cerr<<"Can not open "<<vocabularyForNewCorpusFileName<<" to write vocabulary\n"; + exit(-1); + } + + iterWordsUsedInTheNewCorpus = this->wordsUsedInTheNewCorpus.begin(); + while(iterWordsUsedInTheNewCorpus!=this->wordsUsedInTheNewCorpus.end()){ + outputVocFile<<iterWordsUsedInTheNewCorpus->first.toString()<<"\t"<<iterWordsUsedInTheNewCorpus->second<<endl; + 
iterWordsUsedInTheNewCorpus++; + } + + outputVocFile.close(); +} diff --git a/Src/Utils/_UniversalVocabulary.h b/Src/Utils/_UniversalVocabulary.h new file mode 100755 index 0000000..2df4954 --- /dev/null +++ b/Src/Utils/_UniversalVocabulary.h @@ -0,0 +1,38 @@ +#if !defined (__HEADER_UNIVERSAL_VOC_INCLUDED__) +#define __HEADER_UNIVERSAL_VOC_INCLUDED__ + +#include "salm_shared.h" +#include "_IDVocabulary.h" +#include "_String.h" + +#include <map> + +using namespace std; + +/** +* \ingroup utils +* Universal Vocabulary class provides function to update the univeral vocabulary +* with the words in a new corpus +* and output the vocabulary needed for the new corpus +* +* Revision $Rev: 3794 $ +* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $ +**/ +class C_UniversalVocabulary{ + +public: + void updateWithNewCorpus(const char * newCorpusFileName); + + C_UniversalVocabulary(const char * universalVocFileName); + ~C_UniversalVocabulary(); + +private: + char * universalCorpusFileName; + C_IDVocabulary * universalVoc; + + map<C_String, IndexType, ltstr> wordsUsedInTheNewCorpus; + +}; + + +#endif |