Diffstat (limited to 'Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~')
-rwxr-xr-x  Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~  690
1 file changed, 690 insertions(+), 0 deletions(-)
diff --git a/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~
new file mode 100755
index 0000000..5241621
--- /dev/null
+++ b/Src/SuffixArrayApplications/SuffixArrayLanguageModel/_SuffixArrayLanguageModel.cpp~
@@ -0,0 +1,690 @@

/**
* Revision $Rev: 3815 $
* Last Modified $LastChangedDate: 2007-07-06 14:31:12 -0400 (Fri, 06 Jul 2007) $
**/

#include "_SuffixArrayLanguageModel.h"
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string.h>    //for strcmp
#include <memory.h>

#include <math.h>

using namespace std;


C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel()
{

}

C_SuffixArrayLanguageModel::~C_SuffixArrayLanguageModel()
{

}


/**
* Construct the suffix array language model object
* using the training data corpusFileNameStem that has been indexed by IndexSA.
* Consider at most maxN-grams in language modeling.
* For frequencies lower than maxFreqForDiscounting, use Good-Turing discounting.
* If maxFreqForDiscounting is set to 0 or a negative value, discounting is turned off and MLE is used to estimate the probability of a word given its history.
* @param cfgFileName Configuration file that specifies the parameter values for SALM
*
* Each line in the configuration file is a Keyword Value pair. Legal keywords are:
* CORPUS : corpusFileNameStem The training corpus filename used by IndexSA. Must be specified!
* N : Highest order of n considered for n-gram LM estimation, default value = 5
* MAX_FREQ_DISC : When Good-Turing discounting is used, n-grams with frequencies higher than this value will not be discounted. A negative value disables discounting. Default value = -1.
* INTERPOLATION_STRATEGY : Strategy used to interpolate the conditional probabilities of the next word given histories of different orders
*     'e' (default) equally weighted interpolation of unigram, bigram, trigram... probabilities
*     'm' use the maximum probability over all histories as P(next word | history)
*     'i' deleted interpolation with weights determined by a heuristic that favors long n-gram probabilities when the frequency is reliable
**/
C_SuffixArrayLanguageModel::C_SuffixArrayLanguageModel(const char * cfgFileName)
{
    fstream cfgFile;
    cfgFile.open(cfgFileName, ios::in);

    if(!cfgFile){
        fprintf(stderr, "Configuration file does not exist! quit!!\n");
        exit(0);
    }

    //-----------------------------------------------------------------------------
    //reading parameters
    char paraName[1024];
    char corpusFileNameStem[1024];
    corpusFileNameStem[0] = 0;
    this->maxFreqForDiscounting = -1;

    this->interpolationStrategy = 'e';    //default interpolation strategy: equally weighted n-gram conditional prob
    this->maxN = 5;    //default value; consider up to 5 words

    while(!cfgFile.eof()){
        cfgFile>>paraName;

        if(strcmp(paraName, "CORPUS")==0){
            cfgFile>>corpusFileNameStem;
        }
        else if(strcmp(paraName, "N")==0){
            cfgFile>>this->maxN;
        }
        else if(strcmp(paraName, "MAX_FREQ_DISC")==0){
            cfgFile>>maxFreqForDiscounting;
        }
        else if(strcmp(paraName, "INTERPOLATION_STRATEGY")==0){
            cfgFile>>this->interpolationStrategy;
        }

        paraName[0] = 0;
    }

    //load corpus and suffix array
    if(strlen(corpusFileNameStem)==0){
        cerr<<"CORPUS needs to be specified in the configuration file. This should be the corpus name used for the LM.\n";
        exit(-1);
    }
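
/*
 * A minimal example configuration file for the constructor above
 * (editor's illustration; the corpus stem is hypothetical and must be
 * the name previously passed to IndexSA):
 *
 *     CORPUS                  /data/corpus.en
 *     N                       5
 *     MAX_FREQ_DISC           20
 *     INTERPOLATION_STRATEGY  i
 */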
    this->loadData_forSearch(corpusFileNameStem, false, true);    //let the superclass load the suffix array for the corpus, with vocabulary and no offset

    //if discounting applies, construct the discounting map
    if(this->maxFreqForDiscounting<=0){
        this->applyDiscounting = false;
    }
    else{
        this->applyDiscounting = true;
        this->constructDiscountingMap();    //scan the corpus, construct the count-of-counts table and then the discounting map
    }

    //get vocId for sentEnd
    this->vocIdForSentEnd = this->voc->returnId(C_String("_END_OF_SENTENCE_"));
    if(this->vocIdForSentEnd==0){
        cerr<<"VocID for _END_OF_SENTENCE_ can not be found. Critical error.\n";
        exit(0);
    }

    this->vocIdForSentStart = this->voc->returnId(C_String("_SENTENCE_START_"));
    if(this->vocIdForSentStart==0){
        cerr<<"VocID for _SENTENCE_START_ can not be found. Critical error.\n";
        exit(0);
    }

    this->vocIdForCorpusEnd = this->voc->returnId(C_String("_END_OF_CORPUS_"));
    if(this->vocIdForCorpusEnd==0){
        cerr<<"VocID for _END_OF_CORPUS_ can not be found. Critical error.\n";
        exit(0);
    }
}


/**
* Similar to the function in C_SuffixArrayScanningBase.
* Scan the corpus to obtain count-of-counts information
* and construct the discounting map using Good-Turing smoothing.
**/
void C_SuffixArrayLanguageModel::constructDiscountingMap()
{
    int i, j;
    unsigned int * countOfCountsTable = (unsigned int *) malloc(sizeof(unsigned int)*this->maxN*this->maxFreqForDiscounting);

    if(countOfCountsTable==NULL){
        cerr<<"Count of counts table can not be initialized. Exit\n";
        exit(0);
    }

    //initialize count of counts table
    for(int c=0; c<this->maxN*this->maxFreqForDiscounting; c++){
        countOfCountsTable[c] = 0;
    }

    //initialize the scanning list
    S_nGramScanningInfoElement * nGramScanningList = (S_nGramScanningInfoElement *) malloc(sizeof(S_nGramScanningInfoElement)*this->maxN);
    for(i=0; i<this->maxN; i++){
        nGramScanningList[i].freqSoFar = 0;
        nGramScanningList[i].vocId = 0;
        nGramScanningList[i].freqThreshForOutput = (unsigned int) -1;    //default: do not output
    }

    bool stillMeaningful = true;
    TextLenType saPos = 0;

    while(stillMeaningful && (saPos<this->corpusSize)){

        TextLenType posInCorpus = this->suffix_list[saPos];
        IndexType wordInCorpus = this->corpus_list[posInCorpus];

        if(wordInCorpus<this->sentIdStart){    //SA positions pointing to sentIDs are not interesting

            if((wordInCorpus!=this->vocIdForSentStart)&&(wordInCorpus!=this->vocIdForSentEnd)&&(wordInCorpus!=this->vocIdForCorpusEnd)){    //n-grams starting with <s>, </s> or <end of corpus> are not of interest

                bool quit = false;
                i = 0;

                while(!quit && (i<this->maxN)){
                    wordInCorpus = this->corpus_list[posInCorpus+i];
                    if(
                        (wordInCorpus<this->sentIdStart)&&
                        (wordInCorpus!=this->vocIdForSentEnd)&&
                        (wordInCorpus!=this->vocIdForSentStart)&&
                        (wordInCorpus==nGramScanningList[i].vocId)){    //still a match

                        nGramScanningList[i].freqSoFar++;
                    }
                    else{    //new (i+1)-grams and longer n-grams will start here; before that, check whether the count of counts should be increased for this n-gram type

                        bool validNgramUpSoFar = true;
                        unsigned int freqSoFar;

                        for(j=i; j<this->maxN; j++){

                            if(nGramScanningList[j].vocId==0){    //a NULL word, so this n-gram and all longer ones in the scan window are invalid
                                validNgramUpSoFar = false;
                            }

                            if(validNgramUpSoFar){
                                freqSoFar = nGramScanningList[j].freqSoFar;
                                if((freqSoFar > 0) && (freqSoFar <= this->maxFreqForDiscounting)){
                                    //increase the count for (j+1)-grams with frequency freqSoFar
                                    countOfCountsTable[j*this->maxFreqForDiscounting+freqSoFar-1]++;
                                }
                            }

                            //finished updating the counts, now clear the list from position i on
                            if((posInCorpus+j)<this->corpusSize){
                                wordInCorpus = this->corpus_list[posInCorpus+j];
                            }
                            else{
                                wordInCorpus = 0;    //out of bounds for the corpus
                            }

                            if((wordInCorpus==0)||(wordInCorpus>=this->sentIdStart)||(wordInCorpus==this->vocIdForSentEnd)||(wordInCorpus==this->vocIdForSentStart)){
                                wordInCorpus = 0;    //write 0 for <sentId>, <s> and </s>
                                nGramScanningList[j].freqSoFar = 0;
                            }
                            else{
                                nGramScanningList[j].freqSoFar = 1;
                            }

                            nGramScanningList[j].vocId = wordInCorpus;
                        }

                        quit = true;    //the (i+1)-gram already failed to match, no need to check longer ones
                    }

                    i++;
                }
            }
        }
        else{
            stillMeaningful = false;    //once a vocId is larger than or equal to sentIdStart, everything that follows is a <sentId> and no actual text
        }

        saPos++;
    }

    //at the end of the corpus (according to suffix order)
    bool validNgramUpSoFar = true;
    unsigned int freqSoFar;
    for(i=0; i<this->maxN; i++){
        if(nGramScanningList[i].vocId==0){    //invalid word
            validNgramUpSoFar = false;
        }

        if(validNgramUpSoFar){
            freqSoFar = nGramScanningList[i].freqSoFar;
            if((freqSoFar > 0) && (freqSoFar <= this->maxFreqForDiscounting)){
                //increase the count for (i+1)-grams with frequency freqSoFar
                countOfCountsTable[i*this->maxFreqForDiscounting+freqSoFar-1]++;
            }
        }
    }

    //now use Good-Turing discounting to create the frequency mapping
    //still allocate an N*Freq table for simplicity, even though for each N only maxFreq-1 frequency types will be discounted
    this->discountingMap = (double *) malloc(sizeof(double)*this->maxN*this->maxFreqForDiscounting);

    for(i=0; i<this->maxN; i++){
        //for (i+1)-grams

        unsigned int * ccTableForThisN = countOfCountsTable + i*this->maxFreqForDiscounting;
        double * discountingMapForThisN = this->discountingMap + i*this->maxFreqForDiscounting;

        for(int freq=0; freq<(this->maxFreqForDiscounting-1); freq++){    //only goes up to maxFreq-1, because maxFreq itself can not be discounted
            //for all n-grams with frequency (freq+1)
            if((ccTableForThisN[freq]>0)&&(ccTableForThisN[freq+1]>0)){    //both frequencies exist
                discountingMapForThisN[freq] = (double)(ccTableForThisN[freq+1]*(freq+2))/(double)(ccTableForThisN[freq]);
            }
            else{
                discountingMapForThisN[freq] = -1;
            }
        }

        discountingMapForThisN[this->maxFreqForDiscounting-1] = -1;    //won't be used, just for consistency
    }

    free(countOfCountsTable);
    free(nGramScanningList);
}
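
/*
 * Illustration of the Good-Turing mapping built above (hypothetical
 * counts): if among bigrams there are n_1 = 1000 types with frequency 1
 * and n_2 = 400 types with frequency 2, an observed bigram count r = 1
 * is discounted to
 *
 *     r* = (r+1) * n_{r+1} / n_r = 2 * 400 / 1000 = 0.8
 *
 * which is exactly discountingMapForThisN[0] = (ccTableForThisN[1]*2) /
 * ccTableForThisN[0] for i = 1 (bigrams) in the loop above.
 */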
///If an n-gram is currently matched at corpus positions [currentMatchStart, currentMatchStart+currentMatchLen-1],
///get the frequency of [currentMatchStart, currentMatchStart+currentMatchLen-1] + nextWord.
///Only freq(w_n | history) is needed for the different histories;
///return in freqTable the pairs freq(history+Wn, history) for all matched n:
///    1-gram freq, corpusSize,
///    2-gram freq, freq of 2-gram history,
///    3-gram freq, freq of 3-gram history, ...
///freqTable should have length 2*n.
///Return the longest match including this updated n-gram.
void C_SuffixArrayLanguageModel::calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, double *freqTable, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
{
    vector<IndexType> nGram;

    if(currentMatchStart!=(TextLenType) -1){    //-1 stands for <unk>
        if(currentMatchLen==this->maxN){    //consider only up to this->maxN words for the extended n-gram
            currentMatchStart++;
            currentMatchLen--;
        }

        for(TextLenType pos=currentMatchStart; pos<(currentMatchStart+currentMatchLen); pos++){
            nGram.push_back(this->corpus_list[pos]);
        }
    }

    nGram.push_back(nextWord);

    int sentLen = nGram.size();

    //construct the n-gram search table
    S_sentSearchTableElement * table = this->constructNgramSearchTable4SentWithLCP(nGram);

    int startPosForNgram;
    int startPosForLongestMatchingWithNextWord;
    int cellIndexForLongestMatchingWithNextWord;

    bool stillMatched = true;
    bool atLeastOneMatched = false;

    int indexForNgram;

    unsigned int totalOccurrences;
    unsigned int totalOccurrencesOfHistory;

    //for unigram
    indexForNgram = sentLen - 1;
    if(table[indexForNgram].found){
        totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
        if(this->applyDiscounting){
            freqTable[0] = this->discountFreq(1, totalOccurrences);
        }
        else{
            freqTable[0] = totalOccurrences;
        }

        freqTable[1] = this->corpusSize;
        cellIndexForLongestMatchingWithNextWord = indexForNgram;
        startPosForLongestMatchingWithNextWord = sentLen-1;
        atLeastOneMatched = true;
    }
    else{
        stillMatched = false;
    }

    int n = 2;    //considering 2-grams and longer n-grams now
    startPosForNgram = sentLen - 2;
    while(stillMatched && (startPosForNgram>=0)){

        indexForNgram = (n-1)*sentLen + startPosForNgram;
        int indexForHistory = (n-2)*sentLen + startPosForNgram;

        if(table[indexForNgram].found){

            totalOccurrences = table[indexForNgram].endingPosInSA - table[indexForNgram].startPosInSA + 1;
            totalOccurrencesOfHistory = table[indexForHistory].endingPosInSA - table[indexForHistory].startPosInSA + 1;

            if(this->applyDiscounting){
                freqTable[2*n-2] = this->discountFreq(n, totalOccurrences);
            }
            else{
                freqTable[2*n-2] = (double)totalOccurrences;
            }

            freqTable[2*n-1] = (double) totalOccurrencesOfHistory;    //do not discount the history

            if(n<this->maxN){    //the new history is at most this->maxN-1 words long
                cellIndexForLongestMatchingWithNextWord = indexForNgram;
                startPosForLongestMatchingWithNextWord = startPosForNgram;
            }
        }
        else{
            stillMatched = false;
        }

        startPosForNgram--;
        n++;
    }

    if(atLeastOneMatched){    //at least one n-gram could be matched with 'nextWord'
        updatedMatchingStart = this->suffix_list[table[cellIndexForLongestMatchingWithNextWord].startPosInSA];
        updatedMatchingLen = (unsigned char) (sentLen - startPosForLongestMatchingWithNextWord);
    }
    else{
        updatedMatchingStart = (TextLenType) -1;
        updatedMatchingLen = 0;
    }

    free(table);
}
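
/*
 * Layout of freqTable (editor's illustration): for a query where the
 * current match is "w1 w2" and nextWord is w3, nGram = [w1, w2, w3] and
 * on return
 *
 *     freqTable[0] = c(w3)          freqTable[1] = corpusSize
 *     freqTable[2] = c(w2 w3)       freqTable[3] = c(w2)
 *     freqTable[4] = c(w1 w2 w3)    freqTable[5] = c(w1 w2)
 *
 * i.e. entry 2n-2 holds the (possibly discounted) n-gram count and
 * entry 2n-1 the raw count of its history, as consumed by calcLogProb().
 */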
//given the observed frequency of an n-gram, return the discounted frequency using Good-Turing smoothing
double C_SuffixArrayLanguageModel::discountFreq(int n, unsigned int observedFreq)
{
    if(n>=this->maxN){    //do not discount
        return (double) observedFreq;
    }

    if(observedFreq>=(this->maxFreqForDiscounting-1)){    //no discounting for high frequencies
        return (double) observedFreq;
    }

    //otherwise, check the discounting map
    double discountedFreq = this->discountingMap[(n-1)*this->maxFreqForDiscounting + observedFreq - 1];

    if(discountedFreq>0){
        return discountedFreq;
    }

    //else, no discounting
    return (double) observedFreq;
}


///Starting a new sentence now, clear up the sentence LM state
LMState C_SuffixArrayLanguageModel::beginOfSentenceState()
{
    this->resetLmStates();
    this->initialLmState();

    return 0;
}

void C_SuffixArrayLanguageModel::initialLmState()
{
    //add sentence start
    S_LMStateInfo sentStartNode;
    sentStartNode.locationInCorpus.posInCorpus = 1;    //if the corpus is indexed correctly, position 1 should be <s>
    sentStartNode.locationInCorpus.len = 1;
    sentStartNode.cachedNextWordExtension.clear();

    this->allLMStates.push_back(sentStartNode);
    this->ngramLocation2LmStateId.insert(make_pair(sentStartNode.locationInCorpus, 0));
}

void C_SuffixArrayLanguageModel::resetLmStates()
{
    this->allLMStates.clear();
    this->ngramLocation2LmStateId.clear();
}
/**
* Given the current history (as represented by 'lmState'),
* calculate the log prob of nextWord given this history, P(nextWord|history),
* and return the updated language model state with the next word appended to the history.
* @param lmState Current language model state
* @param nextWord The vocId of the next word (the word to be predicted)
* @param &nextState Returns the updated language model state when the next word is appended
**/
double C_SuffixArrayLanguageModel::logProb(LMState lmState, IndexType nextWord, LMState & nextState)
{
    if(lmState>=this->allLMStates.size()){
        cerr<<"Invalid LM State: "<<lmState<<endl;
        exit(-1);
    }

    //first check if we have already seen this 'nextWord' before
    map< IndexType, S_CachedLmInfo>::iterator iterNextWordExtensionCache;
    iterNextWordExtensionCache = this->allLMStates[lmState].cachedNextWordExtension.find(nextWord);

    if(iterNextWordExtensionCache==this->allLMStates[lmState].cachedNextWordExtension.end()){    //we haven't seen this lmState+word yet

        //search for it in the corpus
        S_NgramLocationInCorpus correspondingNgramLocation = this->allLMStates[lmState].locationInCorpus;
        S_NgramLocationInCorpus updatedNgramLocation;

        double logProb = this->logProbFromFreq(
            correspondingNgramLocation.posInCorpus,
            correspondingNgramLocation.len,
            nextWord,
            updatedNgramLocation.posInCorpus,
            updatedNgramLocation.len);

        //cache the logprob of 'nextWord' given the lmState
        int updatedLmStateId;
        map<S_NgramLocationInCorpus, int, lt_ngramLocationInCorpus>::iterator iterNgramLocation2LmStateId;
        iterNgramLocation2LmStateId = this->ngramLocation2LmStateId.find(updatedNgramLocation);
        if(iterNgramLocation2LmStateId==this->ngramLocation2LmStateId.end()){    //this updated lm state does not exist yet
            S_LMStateInfo newLmStateNode;

            newLmStateNode.locationInCorpus = updatedNgramLocation;
            newLmStateNode.cachedNextWordExtension.clear();

            this->allLMStates.push_back(newLmStateNode);
            updatedLmStateId = this->allLMStates.size() - 1;
            this->ngramLocation2LmStateId.insert(make_pair(updatedNgramLocation, updatedLmStateId));
        }
        else{
            updatedLmStateId = iterNgramLocation2LmStateId->second;
        }

        //cache this
        S_CachedLmInfo cachedLmInfo;
        cachedLmInfo.logProb = logProb;
        cachedLmInfo.nextState = updatedLmStateId;

        this->allLMStates[lmState].cachedNextWordExtension.insert(make_pair(nextWord, cachedLmInfo));

        //update next state
        nextState = updatedLmStateId;

        return logProb;
    }

    nextState = iterNextWordExtensionCache->second.nextState;

    return iterNextWordExtensionCache->second.logProb;
}
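
/*
 * Caching behavior (editor's sketch; 'lm' and 'w' are hypothetical):
 * a repeated query for the same (lmState, nextWord) pair is answered
 * from cachedNextWordExtension without touching the suffix array:
 *
 *     LMState s = lm.beginOfSentenceState();
 *     LMState t1, t2;
 *     double p1 = lm.logProb(s, w, t1);    //computed via logProbFromFreq, then cached
 *     double p2 = lm.logProb(s, w, t2);    //cache hit: p2==p1 and t2==t1
 */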
/**
* Given the history as lmState, append a phrase as a vector of IndexType,
* calculate the LM prob and update the LM state.
* Modification suggested by Erik Peterson (eepter@cs.cmu.edu) to check the size of the phrase:
* when the phrase is empty, i.e. phrase.size()==0, nextState would not be updated correctly and could cause problems in the calling function.
* @param lmState Current language model state
* @param phrase A vector of vocIds of the next phrase (the phrase to be predicted)
* @param &nextState Returns the updated language model state when the phrase is appended
**/
double C_SuffixArrayLanguageModel::logProb(LMState lmState, vector<IndexType> phrase, LMState & nextState)
{
    double logProb = 0;

    if(phrase.size() == 0){
        nextState = lmState;
        return logProb;
    }

    for(int i=0; i<phrase.size(); i++){
        logProb += this->logProb(lmState, phrase[i], nextState);
        lmState = nextState;
    }

    return logProb;
}

/**
* At the end of a sentence, call logProbEnd() to extend the lmState with the sentence end symbol </s>
**/
double C_SuffixArrayLanguageModel::logProbEnd(LMState lmState)
{
    LMState dummyNextState;
    return this->logProb(lmState, this->vocIdForSentEnd, dummyNextState);
}

/**
* Extend the currently matched n-gram with the next word, calculate the prob and update the matched range.
* The n-gram is represented by its starting position in the corpus and its length.
* @param currentMatchStart Starting position of the currently matched n-gram in the corpus
* @param currentMatchLen Length of the matched n-gram
* @param nextWord Vocabulary ID of the next word (the word to be predicted)
* @param &updatedMatchingStart If the extended n-gram (the currently matched n-gram extended with 'nextWord') exists in the corpus, returns its starting position in the corpus
* @param &updatedMatchingLen The length of the extended n-gram
**/
double C_SuffixArrayLanguageModel::logProbFromFreq(TextLenType currentMatchStart, unsigned char currentMatchLen, IndexType nextWord, TextLenType &updatedMatchingStart, unsigned char &updatedMatchingLen)
{
    double logProb;

    double * freqTable = (double *) malloc(sizeof(double)*2*(this->maxN));
    memset(freqTable, 0, 2*this->maxN*sizeof(double));

    this->calcNgramMatchingInfoTokenFreqOnlyExtendingCurrentMatch(currentMatchStart, currentMatchLen, nextWord, freqTable, updatedMatchingStart, updatedMatchingLen);

    logProb = this->calcLogProb(freqTable);

    free(freqTable);

    return logProb;
}

double C_SuffixArrayLanguageModel::calcLogProb(double *freq)
{
    switch(this->interpolationStrategy){
    case 'e':
        return this->calcLogProb_equalWeightedInterpolation(freq);
    case 'i':
        return this->calcLogProb_ibmHeuristicInterpolation(freq);
    case 'm':
        return this->calcLogProb_maxProbInterpolation(freq);
    default:
        cerr<<"Unknown interpolation strategy!\n";
        exit(0);
    }
}

double C_SuffixArrayLanguageModel::calcLogProb_equalWeightedInterpolation(double *freq)
{
    double prob = 0.0;

    if(freq[0]>0){

        int i = 0;
        bool stillMatched = true;

        while(stillMatched && (i<this->maxN)){
            if(freq[2*i]>0){
                prob += freq[2*i]/freq[2*i+1];
            }
            else{
                stillMatched = false;
            }

            i++;
        }

        return log(prob/(double)this->maxN);
    }
    else{    //unknown word
        return SALM_LOG_PROB_UNK;
    }
}
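
/*
 * Worked example for the equally weighted interpolation above
 * (hypothetical counts, maxN = 3): with
 * freqTable = {20, 1000000, 5, 100, 2, 5},
 *
 *     prob = 20/1000000 + 5/100 + 2/5 = 0.45002
 *     return log(0.45002 / 3) = log(0.150007) ~ -1.897
 *
 * Unmatched higher orders stop the loop early, but the sum is always
 * divided by maxN.
 */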
double C_SuffixArrayLanguageModel::calcLogProb_ibmHeuristicInterpolation(double *freq)
{
    double prob = 0.0;
    if(freq[0]==0){    //unknown word
        return SALM_LOG_PROB_UNK;
    }

    double remainingWeightSum = 1.0;

    //find the first non-zero match
    int i = this->maxN - 1;

    while(freq[2*i]==0){    //will terminate for sure because freq[0]!=0
        i--;
    }

    for(int j=i; j>=0; j--){
        //for (j+1)-grams
        double historyFreq = freq[2*j+1];
        double logHistoryFreq = log(historyFreq);
        if(logHistoryFreq>1){
            logHistoryFreq = 1.0;    //cap it at 1
        }

        double reliability = 0.1*logHistoryFreq + 0.3;    //heuristic for the reliability of the history
        double adjustedWeights = remainingWeightSum * reliability;

        prob += adjustedWeights * freq[2*j]/freq[2*j+1];    //weight the (j+1)-gram probability

        remainingWeightSum -= adjustedWeights;
    }

    return log(prob);
}

double C_SuffixArrayLanguageModel::calcLogProb_maxProbInterpolation(double *freq)
{
    double maxProb = 0.0;

    if(freq[0]>0){

        int i = 0;
        bool stillMatched = true;

        while(stillMatched && (i<this->maxN)){
            if(freq[2*i]>0){
                double prob = freq[2*i]/freq[2*i+1];

                if(prob>maxProb){
                    maxProb = prob;
                }
            }
            else{
                stillMatched = false;
            }

            i++;
        }

        return log(maxProb);
    }
    else{    //unknown word
        return SALM_LOG_PROB_UNK;
    }
}

IndexType C_SuffixArrayLanguageModel::returnVocId(C_String aWord)
{
    return this->voc->returnId(aWord);
}
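
For reference, a minimal driver showing the intended calling sequence (an editorial sketch, not part of the file above; the configuration file name "salm.cfg" and the example words are hypothetical, and the words are assumed to occur in the indexed corpus vocabulary):

    #include <stdio.h>

    #include "_SuffixArrayLanguageModel.h"

    int main()
    {
        //build the LM from a configuration file; "salm.cfg" is a placeholder
        C_SuffixArrayLanguageModel lm("salm.cfg");

        LMState state = lm.beginOfSentenceState();    //resets the per-sentence LM states
        LMState nextState;
        double total = 0.0;

        const char * words[] = { "this", "is", "a", "test" };
        for(int i = 0; i < 4; i++){
            IndexType vocId = lm.returnVocId(C_String(words[i]));    //0 if the word is not in the vocabulary
            total += lm.logProb(state, vocId, nextState);    //log P(word | history)
            state = nextState;
        }

        total += lm.logProbEnd(state);    //extend with </s>

        printf("Sentence log prob: %f\n", total);
        return 0;
    }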