github.com/moses-smt/salm.git

Diffstat (limited to 'Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~')
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~  |  690
1 files changed, 690 insertions, 0 deletions
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~
new file mode 100755
index 0000000..5241621
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~
@@ -0,0 +1,690 @@
+/**
+* Revision $Rev: 3815 $
+* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $
+**/
+
+#include "_SuffixArrayLanguageModel.h"
+#include <iostream>
+#include <fstream>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+
+using namespace std;
+
+
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel()
+{
+
+}
+
+C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel()
+{
+
+}
+
+
+/**
+* Construct the suffix array language model object
+* using the training data corpusFileNameStem that has been indexed by IndexSA.
+* Consider n-grams up to length maxN in language modeling.
+* Frequencies lower than maxFreqForDiscounting are discounted with Good-Turing smoothing.
+* If maxFreqForDiscounting is set to 0 or a negative value, discounting is turned off and MLE is used to estimate the probability of a word given its history.
+* @param cfgFileName Configuration file that specifies the values of the SALM parameters
+*
+* Each line in the configuration file is a Keyword Value pair. Legal keywords are:
+* CORPUS : corpusFileNameStem  The training corpus filename used by IndexSA. Must be specified!
+* N : Highest order of n considered for n-gram LM estimation, default value = 5
+* MAX_FREQ_DISC : When Good-Turing discounting is used, n-grams with frequencies higher than this value will not be discounted. A negative value disables discounting. Default value = -1.
+* INTERPOLATION_STRATEGY : Strategy used to interpolate the conditional probabilities of the next word given histories of different orders
+*   'e' default. Equally weighted interpolation of unigram, bigram, trigram... probabilities
+*   'm' use the maximum probability over all histories as P(next word | history)
+*   'i' deleted interpolation with weights determined by a heuristic that favors long n-gram probabilities when the frequency is reliable
+**/
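+// An example configuration file might look like this (sketch; the corpus path
+// is hypothetical and must be the name stem previously indexed by IndexSA):
+//
+//   CORPUS                  /data/lm/corpus.en
+//   N                       5
+//   MAX_FREQ_DISC           50
+//   INTERPOLATION_STRATEGY  e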
+C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName)
+{
+
+ fstream cfgFile;
+ cfgFile.open(cfgFileName,ios::in);
+
+ if(!cfgFile){
+ cerr<<"Cannot open configuration file "<<cfgFileName<<". Exiting.\n";
+ exit(-1);
+ }
+
+ //-----------------------------------------------------------------------------
+ //reading parameters
+ char paraName[1024];
+ char corpusFileNameStem[1024];
+ corpusFileNameStem[0]=0;
+ this->maxFreqForDiscounting=-1;
+
+ this->interpolationStrategy = 'e'; //default interpolation strategy: equally weighted n-gram conditional prob
+ this->maxN = 5; // default value; consider up to 5 words
+
+ while(cfgFile>>paraName){ //read keyword/value pairs until input is exhausted
+
+ if(strcmp(paraName,"CORPUS")==0){
+ cfgFile>>corpusFileNameStem;
+ }
+ else if(strcmp(paraName,"N")==0){
+ cfgFile>>this->maxN;
+ }
+ else if(strcmp(paraName,"MAX_FREQ_DISC")==0){
+ cfgFile>>maxFreqForDiscounting;
+ }
+ else if(strcmp(paraName,"INTERPOLATION_STRATEGY")==0){
+ cfgFile>>this->interpolationStrategy;
+ }
+ }
+
+ //load corpus and suffix array
+ if(strlen(corpusFileNameStem)==0){
+ cerr<<"CORPUS needs to be specified in the configuration file. This should be the corpus name used for the LM.\n";
+ exit(-1);
+ }
+ this->loadData_forSearch(corpusFileNameStem, false, true); //method of the base class: load the corpus with its suffix array and vocabulary, no offset
+
+
+ //if discounting applies, construct the discounting map
+ if(this->maxFreqForDiscounting<=0){
+ this->applyDiscounting = false;
+ }
+ else{
+ this->applyDiscounting = true;
+ this->constructDiscountingMap(); //scan the corpus and construct the count of counts table and then discounting map
+ }
+
+ //get the vocIDs of the sentence and corpus boundary markers
+ this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
+
+ if(this->vocIdForSentEnd==0){
+ cerr<<"VocID for _END_OF_SENTENCE_ cannot be found. Critical error.\n";
+ exit(-1);
+ }
+
+ this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
+ if(this->vocIdForSentStart==0){
+ cerr<<"VocID for _SENTENCE_START_ cannot be found. Critical error.\n";
+ exit(-1);
+ }
+
+ this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
+ if(this->vocIdForCorpusEnd==0){
+ cerr<<"VocID for _END_OF_CORPUS_ cannot be found. Critical error.\n";
+ exit(-1);
+ }
+
+}
+
+
+/**
+* Similar to the scanning function in C_SuffixArrayScanningBase:
+* scan the corpus to obtain count-of-counts information
+* and construct the discounting map using Good-Turing smoothing
+**/
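+// Good-Turing sketch: an n-gram observed r times receives the discounted count
+// r* = (r+1) * N_{r+1} / N_r, where N_r is the number of distinct n-grams seen
+// exactly r times. E.g. (illustrative counts) with N_1 = 1000 singletons and
+// N_2 = 400 doubletons, a singleton's count becomes 2 * 400 / 1000 = 0.8.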
+void C_SuffixArrayLanguageModel::constructDiscountingMap()
+{
+ int i,j;
+ unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting);
+
+ if(countOfCountsTable==NULL){
+ cerr<<"Count of counts table cannot be allocated. Exiting.\n";
+ exit(-1);
+ }
+
+ //initialize count of counts table
+ for(int c=0;c<this->maxN*this->maxFreqForDiscounting;c++){
+ countOfCountsTable[c]=0;
+ }
+
+ //initialize the scanning list
+ S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
+ for(i=0;i<this->maxN;i++){
+ nGramScanningList[i].freqSoFar=0;
+ nGramScanningList[i].vocId = 0;
+ nGramScanningList[i].freqThreshForOutput = (unsigned int) -1; //default, do not output
+ }
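+
+ //Scanning sketch: entries 0..i of the window together hold the (i+1)-gram
+ //prefix of the current suffix; freqSoFar accumulates while consecutive
+ //suffixes share that prefix, and when it changes the count of counts for
+ //the ended (i+1)-gram type is updated and the window entry restarts.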
+
+ bool stillMeaningful = true;
+ TextLenType saPos=0;
+
+ while(stillMeaningful && ( saPos<this->corpusSize ) ){
+
+ TextLenType posInCorpus = this->suffix_list[saPos];
+ IndexType wordInCorpus = this->corpus_list[posInCorpus];
+
+ if(wordInCorpus<this->sentIdStart){ //SA positions pointing to a sentID are not interesting
+
+ if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){ //n-grams starting with <s>, </s> or <end of corpus> are not of interest
+
+ bool quit =false;
+ i=0;
+
+ while(!quit && (i<this->maxN)){
+ wordInCorpus = this->corpus_list[posInCorpus+i];
+ if(
+ (wordInCorpus<this->sentIdStart)&&
+ (wordInCorpus!=this->vocIdForSentEnd)&&
+ (wordInCorpus!=this->vocIdForSentStart)&&
+ (wordInCorpus==nGramScanningList[i].vocId)){ //still match
+
+ nGramScanningList[i].freqSoFar++;
+ }
+ else{ //a new (i+1)-gram (and longer) starts here; before moving on, update the count of counts for the n-grams that just ended
+
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+
+ for(j=i;j<this->maxN;j++){
+
+
+ if(nGramScanningList[j].vocId==0){ //a NULL word, then this n-gram and longer ones in the scan window are invalid
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){ //update the count of counts for this (j+1)-gram type
+
+ freqSoFar = nGramScanningList[j].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (j+1)-gram with freq freqSoFar
+ countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+
+ //finished counting, now reset the scanning window entry at position j
+ if((posInCorpus+j)<this->corpusSize){
+ wordInCorpus = this->corpus_list[posInCorpus+j];
+ }
+ else{
+ wordInCorpus = 0; //out of bound for corpus
+ }
+
+ if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
+ wordInCorpus=0; //write 0 for <sentId>, <s> and </s>
+ nGramScanningList[j].freqSoFar = 0;
+ }
+ else{
+ nGramScanningList[j].freqSoFar = 1;
+ }
+
+ nGramScanningList[j].vocId = wordInCorpus;
+ }
+
+ quit=true; //the (i+1)-gram already does not match; no need to check longer n-grams
+ }
+
+ i++;
+ }
+ }
+ }
+ else{
+ stillMeaningful = false; //once the vocID reaches sentIdStart, everything that follows in suffix order is a <sentId>, not actual text
+ }
+
+ saPos++;
+ }
+
+ //at the end of corpus (according to suffix order)
+ bool validNgramUpSoFar = true;
+ unsigned int freqSoFar;
+ for(i=0;i<this->maxN;i++){
+ if(nGramScanningList[i].vocId==0){ //invalid word; this n-gram and longer ones are invalid
+ validNgramUpSoFar = false;
+ }
+
+ if(validNgramUpSoFar){
+
+ freqSoFar = nGramScanningList[i].freqSoFar;
+ if( (freqSoFar > 0) && ( freqSoFar <= this->maxFreqForDiscounting) ){
+ //increase the count for (i+1)-gram with freq freqSoFar
+ countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++;
+ }
+ }
+ }
+
+ //now, use Good-Turing discounting to create frequency mapping
+ //still allocate an N*maxFreq table for simplicity, even though for each n only maxFreq-1 frequency types will be discounted
+ this->discountingMap = (double *) malloc(sizeof(double) * this->maxN * this->maxFreqForDiscounting);
+
+ for(i=0;i<this->maxN;i++){
+ //for (i+1)-gram
+
+ unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
+ double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting;
+
+ for(int freq=0;freq<(this->maxFreqForDiscounting-1);freq++){ //only goes to maxFreq-1, because we cannot discount maxFreq
+ //for all (freq+1) ngrams
+ if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){ //both freq exists
+ discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]);
+ }
+ else{
+ discountingMapForThisN[freq] = -1;
+ }
+ }
+
+ discountingMapForThisN[this->maxFreqForDiscounting-1] = -1; //won't be used, just for consistency
+ }
+
+
+ free(countOfCountsTable);
+
+}
+
+///If an n-gram is currently matched at corpus position [currentMatchStart, currentMatchStart+currentMatchLen-1],
+///extend it with 'nextWord' and collect freq(history+w_n) and freq(history) for every matched history length.
+///Results are returned in freqTable, which must have length 2*maxN and is laid out as:
+///  freqTable[0] = unigram freq of nextWord, freqTable[1] = corpusSize,
+///  freqTable[2n-2] = freq of the n-gram, freqTable[2n-1] = freq of its history, for each order n >= 2.
+///Also returns the longest match of the extended n-gram via updatedMatchingStart/updatedMatchingLen.
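+///Example (sketch): if the current match is "u v" and nextWord is w, then
+///freqTable = { f(w), corpusSize, f(v w), f(v), f(u v w), f(u v) }.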
+void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+ vector<IndexType> nGram;
+
+ if(currentMatchStart!=(TextLenType) -1){ //-1 will be <unk>
+ if(currentMatchLen==this->maxN){ //we consider only up to this->maxN for the extended n-gram
+ currentMatchStart++;
+ currentMatchLen--;
+ }
+
+ for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
+ nGram.push_back(this->corpus_list[pos]);
+ }
+ }
+
+ nGram.push_back(nextWord);
+
+ int sentLen = nGram.size();
+
+ //construct the n-gram search table
+ S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);
+
+ int startPosForNgram;
+ int startPosForLongestMatchingWithNextWord;
+ int cellIndexForLongestMatchingWithNextWord;
+
+ bool stillMatched = true;
+ bool atLeastOneMatched = false;
+
+ int indexForNgram;
+
+ unsigned int totalOccurrences;
+ unsigned int totalOccurrencesOfHistory;
+
+ //for unigram
+ indexForNgram = sentLen - 1;
+ if(table[indexForNgram].found){
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ if(this->applyDiscounting){
+ freqTable[0] = this->discountFreq(1, totalOccurrences);
+ }
+ else{
+ freqTable[0] = totalOccurrences;
+ }
+
+ freqTable[1] = this->corpusSize;
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = sentLen-1;
+ atLeastOneMatched = true;
+ }
+ else{
+ stillMatched = false;
+ }
+
+ int n=2; //considering 2-gram and longer n-gram now
+ startPosForNgram = sentLen - 2;
+ while((stillMatched)&&(startPosForNgram>=0)){
+
+ indexForNgram = (n-1) * sentLen + startPosForNgram;
+ int indexForHistory = (n-2) * sentLen + startPosForNgram;
+
+ if(table[indexForNgram].found){
+
+ totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
+ totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;
+
+
+ if(this->applyDiscounting){
+ freqTable[2*n-2] = this->discountFreq(n, totalOccurrences);
+ }
+ else{
+ freqTable[2*n-2] = (double)totalOccurrences;
+ }
+
+ freqTable[2*n-1] = (double) totalOccurrencesOfHistory; //do not discount the history
+
+ if(n<this->maxN){ //the new history is at most this->maxN-1 words long
+ cellIndexForLongestMatchingWithNextWord = indexForNgram;
+ startPosForLongestMatchingWithNextWord = startPosForNgram;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ startPosForNgram--;
+ n++;
+ }
+
+ if(atLeastOneMatched){ //at least one n-gram can be matched with 'nextWord'
+ updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
+ updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
+ }
+ else{
+ updatedMatchingStart = (TextLenType) -1;
+ updatedMatchingLen = 0;
+ }
+
+ free(table);
+
+}
+
+
+//given observedFreq of n-gram, return discounted freq using Good-Turing smoothing
+double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq)
+{
+ if(n>=this->maxN){ //do not discount
+ return (double) observedFreq;
+ }
+
+ if(observedFreq>=(this->maxFreqForDiscounting-1)){ //no discounting for high freq
+ return (double) observedFreq;
+ }
+
+ //else, check the discount map
+ double discountedFreq = this->discountingMap[ (n-1) * this->maxFreqForDiscounting + observedFreq -1];
+
+ if(discountedFreq>0){
+ return discountedFreq;
+ }
+
+ //else, no discounting
+ return (double) observedFreq;
+}
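+
+// Lookup sketch: for a bigram (n=2) observed once, the lookup above reads
+// discountingMap[1*maxFreqForDiscounting + 0]; a value <= 0 there means no
+// reliable Good-Turing estimate was available, so the observed count is kept.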
+
+
+///Start a new sentence: clear the cached sentence LM states and return the initial state
+LMState C_SuffixArrayLanguageModel::beginOfSentenceState()
+{
+
+ this->resetLmStates();
+ this->initialLmState();
+
+ return 0;
+}
+
+void C_SuffixArrayLanguageModel::initialLmState()
+{
+ //add sentence start
+ S_LMStateInfo sentStartNode;
+ sentStartNode.locationInCorpus.posInCorpus = 1; //if corpus is indexed correctly position 1 should be <s>
+ sentStartNode.locationInCorpus.len = 1;
+ sentStartNode.cachedNextWordExtension.clear();
+
+ this->allLMStates.push_back(sentStartNode);
+ this->ngramLocation2LmStateId.insert(make_pair(sentStartNode.locationInCorpus, 0));
+}
+
+void C_SuffixArrayLanguageModel::resetLmStates()
+{
+ this->allLMStates.clear();
+ this->ngramLocation2LmStateId.clear();
+}
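+
+// Design note: an LMState is simply an index into allLMStates. Each entry
+// pins the matched history to a concrete corpus location (posInCorpus, len)
+// and caches, per next word, the log prob and successor state, so repeated
+// queries for the same (state, word) pair avoid another suffix-array search.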
+
+
+/**
+* Given the current history (as represented by 'lmState'),
+* calculate the log prob of nextWord given this history, P(nextWord|history),
+* and return the updated language model state with the next word appended to the history
+* @param lmState Current language model state
+* @param nextWord The vocId of the next word (the word to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState)
+{
+ if(lmState>=this->allLMStates.size()){
+ cerr<<"Invalid LM State: "<<lmState<<endl;
+ exit(-1);
+ }
+
+ //first check if we have already seen this 'nextWord' before
+ map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache;
+ iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find( nextWord );
+
+ if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){ //we haven't seen this lmState+word yet
+
+ //search for it in the corpus
+ S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus;
+ S_NgramLocationInCorpus updatedNgramLocation;
+
+ double logProb = this->logProbFromFreq(
+ correspondingNgramLocation.posInCorpus,
+ correspondingNgramLocation.len,
+ nextWord,
+ updatedNgramLocation.posInCorpus,
+ updatedNgramLocation.len);
+
+ //caching the logprob of 'nextword' given the lmState
+ int updatedLmStateId;
+ map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId;
+ iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation);
+ if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){ //this updated lm state does not exist yet
+ S_LMStateInfo newLmStateNode;
+
+ newLmStateNode.locationInCorpus = updatedNgramLocation;
+ newLmStateNode.cachedNextWordExtension.clear();
+
+ this->allLMStates.push_back(newLmStateNode);
+ updatedLmStateId = this->allLMStates.size() -1 ;
+ this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId));
+ }
+ else{
+ updatedLmStateId = iterNgramLocation2LmStateId->second;
+ }
+
+ //cache this
+ S_CachedLmInfo cachedLmInfo;
+ cachedLmInfo.logProb = logProb;
+ cachedLmInfo.nextState = updatedLmStateId;
+
+ this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo));
+
+ //updated next state
+ nextState = updatedLmStateId;
+
+ return logProb;
+ }
+
+ nextState = iterNextWordExtensionCache->second.nextState;
+
+ return iterNextWordExtensionCache->second.logProb;
+}
+
+
+/**
+* Given the history as lmState and append a phrase as a vector of IndexType,
+* calculate the LM prob and update the lm state
+* Modification suggested by Erik Peterson (eepter@cs.cmu.edu) to check the size of phrase.
+* For cases where phrase is empty, i.e. phrase.size()==0, nextState will not be updated correctly and may cause problems in the calling function.
+* @param lmState Current language model state
+* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted)
+* @param &nextState Returning the updated language model state when the next word is appended
+**/
+double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState)
+{
+ double logProb = 0;
+
+ if (phrase.size() == 0) {
+ nextState = lmState;
+ return logProb;
+ }
+
+ for(size_t i=0;i<phrase.size();i++){
+ logProb+=this->logProb(lmState, phrase[i], nextState);
+ lmState = nextState;
+ }
+
+ return logProb;
+}
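+
+// Usage sketch (hypothetical vocIds): scoring a two-word phrase in one call:
+//   vector<IndexType> phrase;
+//   phrase.push_back(lm.returnVocId(C_String("new")));
+//   phrase.push_back(lm.returnVocId(C_String("york")));
+//   LMState next;
+//   double lp = lm.logProb(state, phrase, next);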
+
+/**
+* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s>
+**/
+double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState)
+{
+ LMState dummyNextState;
+ return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState);
+}
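+
+/*
+  Typical sentence-scoring loop (sketch, assuming an indexed corpus and a
+  configuration file "salm.cfg"; names are illustrative):
+
+    C_SuffixArrayLanguageModel lm("salm.cfg");
+    LMState state = lm.beginOfSentenceState();
+    LMState next;
+    double total = 0.0;
+    for(size_t i=0; i<sentence.size(); i++){   //sentence: vector<IndexType>
+      total += lm.logProb(state, sentence[i], next);
+      state = next;
+    }
+    total += lm.logProbEnd(state);             //account for </s>
+*/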
+
+/**
+* Extend the current matched n-gram with next word, calculate the prob and update the updated range
+* the n-gram is represented by its position in the suffix array and the length
+* @param currentMatchStart Starting position of the current matched n-gram in corpus
+* @param currentMatchLen Length of the current matched n-gram
+* @param nextWord Vocabulary ID of the next word (the word to be predicted)
+* @param &updatedMatchingStart If the extended n-gram (the current matched n-gram extended with the 'nextword') exists in the corpus, return its starting position in the corpus
+* @param &updatedMatchingLen The length of the extended n-gram
+**/
+double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
+{
+
+ double logProb;
+
+ double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN));
+ memset(freqTable, 0, 2*this->maxN*sizeof(double));
+
+ this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen);
+
+ logProb = this->calcLogProb(freqTable);
+
+ free(freqTable);
+
+ return logProb;
+
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb(double *freq)
+{
+ switch(this->interpolationStrategy){
+ case 'e':
+ return this->calcLogProb_equalWeightedInterpolation(freq);
+ case 'i':
+ return this->calcLogProb_ibmHeuristicInterpolation(freq);
+ case 'm':
+ return this->calcLogProb_maxProbInterpolation(freq);
+ default:
+ cerr<<"Unknown interpolation strategy!\n";
+ exit(-1);
+ }
+}
+
+double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq)
+{
+ double prob = 0.0;
+
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ prob+=freq[2*i]/freq[2*i+1];
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(prob/(double)this->maxN);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
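+
+// Worked example (illustrative counts, maxN = 3): if f(w)=200 with corpusSize=10^6,
+// f(v w)=20 with f(v)=100, and f(u v w)=5 with f(u v)=10, then
+// P(w|u v) = (200/10^6 + 20/100 + 5/10) / 3 = 0.7002/3 ~= 0.2334.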
+
+double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq)
+{
+ double prob = 0.0;
+ if(freq[0]==0){ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+
+ double remainingWeightSum = 1.0;
+
+ //find the first non-zero match
+ int i = this->maxN - 1;
+
+ while(freq[2*i]==0){ //will stop for sure because freq[0]!=0
+ i--;
+ }
+
+ for(int j=i;j>=0;j--){
+ //for (j+1)-gram
+ double historyFreq = freq[2*j+1];
+ double logHistoryFreq = log(historyFreq);
+ if(logHistoryFreq>1){
+ logHistoryFreq = 1.0; //cap it to 1
+ }
+
+ double reliability = 0.1*logHistoryFreq+0.3; //heuristics for reliability of the history
+ double adjustedWeights = remainingWeightSum * reliability;
+
+ prob+=adjustedWeights * freq[2*j]/freq[2*j+1]; //probability contributed by the (j+1)-gram
+
+ remainingWeightSum -= adjustedWeights;
+ }
+
+ return log(prob);
+}
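+
+// Weighting sketch (illustrative): if the longest matched history occurs 3
+// times, log(3) ~= 1.1 is capped to 1, giving reliability 0.1*1+0.3 = 0.4;
+// that order receives weight 0.4, the next shorter order 0.6 * its own
+// reliability, and so on. Any weight mass left after the unigram is dropped.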
+
+double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq)
+{
+ double maxProb = 0.0;
+
+ if(freq[0]>0){
+
+ int i=0;
+ bool stillMatched = true;
+
+ while(stillMatched && (i<this->maxN)){
+ if(freq[2*i]>0){
+ double prob=freq[2*i]/freq[2*i+1];
+
+ if(prob>maxProb){
+ maxProb = prob;
+ }
+ }
+ else{
+ stillMatched = false;
+ }
+
+ i++;
+ }
+
+ return log(maxProb);
+ }
+ else{ //unknown word
+ return SALM_LOG_PROB_UNK;
+ }
+}
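+
+// With the counts from the equal-weight example above, this strategy would
+// return log(0.5), since the trigram ratio 5/10 is the largest.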
+
+IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord)
+{
+ return this->voc->returnId(aWord);
+}