author | Hieu Hoang <hieu@hoang.co.uk> | 2013-06-28 17:19:30 +0400 |
---|---|---|
committer | Hieu Hoang <hieu@hoang.co.uk> | 2013-06-28 17:19:30 +0400 |
commit | fa4b92fc0af49079827fb6d71eae86fc97332121 (patch) | |
tree | 3f14600ca2dc8fc8de14ed7552ca4864d5c7baf8 | |
parent | 8c19c2ba8ae59e4cdfefaa442e9825ba9b8bf39f (diff) | |
parent | c963338476a40d5b1071c0c253332ed6ebd0b427 (diff) | |
Merge branch 'master' into nadir_osm
25 files changed, 1140 insertions, 435 deletions
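The bulk of this merge reworks how the dynamic suffix array phrase table scores sampled phrase pairs (see the new `GatherCands()` hunk in `BilingualDynSuffixArray.cpp` below). For orientation, here is a minimal, hypothetical sketch of how the five feature values appear to be finalized from raw sample counts in that hunk; the function name `FinalizeFeatures` and all parameter names are invented for illustration and are not part of the commit.

```cpp
// Hypothetical sketch (not from this commit): finalizing the five phrase-table
// features the way the new GatherCands() hunk does. All names are invented.
//   jointCount   - times this phrase pair was extracted from the sample
//   totalPhrases - total phrase pairs extracted from the sample
//   targetCount  - occurrences of the target phrase in the whole target corpus
//   sampleRate   - sampled occurrences / all occurrences of the source phrase
//   maxLex1/2    - best lexical weights seen for the pair (GetLexicalWeight)
#include <cmath>
#include <vector>

std::vector<float> FinalizeFeatures(float jointCount, float totalPhrases,
                                    float targetCount, float sampleRate,
                                    float maxLex1, float maxLex2)
{
  std::vector<float> feats(5);
  // target-side occurrences not explained by the (rate-scaled) joint count
  float x = targetCount - jointCount * sampleRate;
  feats[0] = std::log(jointCount) - std::log(jointCount + x); // bwd phrase prob
  feats[1] = std::log(maxLex1);                               // lex weight 1
  feats[2] = std::log(jointCount) - std::log(totalPhrases);   // fwd phrase prob
  feats[3] = std::log(maxLex2);                               // lex weight 2
  feats[4] = 1.0f;                                            // phrase penalty
  return feats;
}
```

In the actual hunk these values are written in place into the `Scores` vector held in `pstats`, and the backward-probability correction relies on the `DynSuffixArray::GetCount()` method added in the same commit.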
diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 086623c0e..593955b4d 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1572,6 +1572,16 @@ <locationURI>virtual:/virtual</locationURI> </link> <link> + <name>TranslationModel/WordCoocTable.cpp</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.cpp</locationURI> + </link> + <link> + <name>TranslationModel/WordCoocTable.h</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.h</locationURI> + </link> + <link> <name>TranslationModel/fuzzy-match</name> <type>2</type> <locationURI>virtual:/virtual</locationURI> diff --git a/moses/FF/DistortionScoreProducer.cpp b/moses/FF/DistortionScoreProducer.cpp index 67a1185cb..36618b579 100644 --- a/moses/FF/DistortionScoreProducer.cpp +++ b/moses/FF/DistortionScoreProducer.cpp @@ -22,7 +22,8 @@ struct DistortionState_traditional : public FFState { }; DistortionScoreProducer::DistortionScoreProducer(const std::string &line) - : StatefulFeatureFunction("Distortion", 1, line) { + : StatefulFeatureFunction("Distortion", 1, line) +{ ReadParameters(); } diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp index b35b98648..7c1c98652 100644 --- a/moses/FF/FeatureFunction.cpp +++ b/moses/FF/FeatureFunction.cpp @@ -102,8 +102,8 @@ void FeatureFunction::SetParameter(const std::string& key, const std::string& va void FeatureFunction::ReadParameters() { while (!m_args.empty()) { - const vector<string> &args = m_args[0]; - SetParameter(args[0], args[1]); + const vector<string> &args = m_args[0]; + SetParameter(args[0], args[1]); m_args.erase(m_args.begin()); } diff --git a/moses/FF/PhrasePenalty.cpp b/moses/FF/PhrasePenalty.cpp index ac1aa47b8..8b54939a7 100644 --- a/moses/FF/PhrasePenalty.cpp +++ b/moses/FF/PhrasePenalty.cpp @@ -5,14 +5,15 @@ namespace Moses { PhrasePenalty::PhrasePenalty(const std::string &line) -: StatelessFeatureFunction("PhrasePenalty",1, line) { + : StatelessFeatureFunction("PhrasePenalty",1, line) +{ ReadParameters(); } void PhrasePenalty::Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const { scoreBreakdown.Assign(this, 1.0f); } diff --git a/moses/FF/PhrasePenalty.h b/moses/FF/PhrasePenalty.h index 5263f15ef..7ebc20659 100644 --- a/moses/FF/PhrasePenalty.h +++ b/moses/FF/PhrasePenalty.h @@ -11,13 +11,13 @@ public: PhrasePenalty(const std::string &line); bool IsUseable(const FactorMask &mask) const { - return true; + return true; } virtual void Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; }; } //namespace diff --git a/moses/FF/UnknownWordPenaltyProducer.cpp b/moses/FF/UnknownWordPenaltyProducer.cpp index adc4c4d6b..4ba033e58 100644 --- a/moses/FF/UnknownWordPenaltyProducer.cpp +++ b/moses/FF/UnknownWordPenaltyProducer.cpp @@ -7,7 +7,8 @@ using namespace std; namespace Moses { UnknownWordPenaltyProducer::UnknownWordPenaltyProducer(const std::string &line) - : 
StatelessFeatureFunction("UnknownWordPenalty",1, line) { + : StatelessFeatureFunction("UnknownWordPenalty",1, line) +{ m_tuneable = false; ReadParameters(); } diff --git a/moses/FF/WordPenaltyProducer.cpp b/moses/FF/WordPenaltyProducer.cpp index 054852349..8d77d68fd 100644 --- a/moses/FF/WordPenaltyProducer.cpp +++ b/moses/FF/WordPenaltyProducer.cpp @@ -7,7 +7,8 @@ using namespace std; namespace Moses { WordPenaltyProducer::WordPenaltyProducer(const std::string &line) - : StatelessFeatureFunction("WordPenalty",1, line) { + : StatelessFeatureFunction("WordPenalty",1, line) +{ ReadParameters(); } diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index e22a89c6d..bcd7c1519 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -275,13 +275,15 @@ bool Parameter::LoadParam(int argc, char* argv[]) } // overwrite parameters with values from switches - for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++) { + for(PARAM_STRING::const_iterator iterParam = m_description.begin(); + iterParam != m_description.end(); iterParam++) { const string paramName = iterParam->first; OverwriteParam("-" + paramName, paramName, argc, argv); } // ... also shortcuts - for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); iterParam != m_abbreviation.end(); iterParam++) { + for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); + iterParam != m_abbreviation.end(); iterParam++) { const string paramName = iterParam->first; const string paramShortName = iterParam->second; OverwriteParam("-" + paramShortName, paramName, argc, argv); @@ -294,7 +296,8 @@ bool Parameter::LoadParam(int argc, char* argv[]) verbose = Scan<int>(m_setting["verbose"][0]); if (verbose >= 1) { // only if verbose TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl); - for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; iterParam != m_setting.end(); iterParam++) { + for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; + iterParam != m_setting.end(); iterParam++) { TRACE_ERR( "\t" << iterParam->first << ": "); for ( size_t i = 0; i < iterParam->second.size(); i++ ) TRACE_ERR( iterParam->second[i] << " "); @@ -303,7 +306,8 @@ bool Parameter::LoadParam(int argc, char* argv[]) } // convert old weights args to new format - if (!isParamSpecified("feature")) + // WHAT IS GOING ON HERE??? 
- UG + if (!isParamSpecified("feature")) // UG ConvertWeightArgs(); CreateWeightsMap(); WeightOverwrite(); @@ -331,11 +335,11 @@ std::vector<float> &Parameter::GetWeights(const std::string &name) { std::vector<float> &ret = m_weights[name]; - cerr << "WEIGHT " << name << "="; - for (size_t i = 0; i < ret.size(); ++i) { - cerr << ret[i] << ","; - } - cerr << endl; + // cerr << "WEIGHT " << name << "="; + // for (size_t i = 0; i < ret.size(); ++i) { + // cerr << ret[i] << ","; + // } + // cerr << endl; return ret; } @@ -357,7 +361,10 @@ void Parameter::SetWeight(const std::string &name, size_t ind, const vector<floa newWeights.push_back(line); } -void Parameter::AddWeight(const std::string &name, size_t ind, const std::vector<float> &weights) +void +Parameter:: +AddWeight(const std::string &name, size_t ind, + const std::vector<float> &weights) { PARAM_VEC &newWeights = m_setting["weight"]; @@ -478,6 +485,12 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) case Compact: ptType = "PhraseDictionaryCompact"; break; + case SuffixArray: + ptType = "PhraseDictionarySuffixArray"; + break; + case DSuffixArray: + ptType = "PhraseDictionaryDynSuffixArray"; + break; default: break; } @@ -502,6 +515,9 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) ++currOldInd; } + + // cerr << weights.size() << " PHRASE TABLE WEIGHTS " + // << __FILE__ << ":" << __LINE__ << endl; AddWeight(ptType, ptInd, weights); // actual pt @@ -527,7 +543,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) ptLine << "num-features=" << numScoreComponent << " "; ptLine << "table-limit=" << maxTargetPhrase[currDict] << " "; - if (implementation == SuffixArray) { + if (implementation == SuffixArray || implementation == DSuffixArray) { ptLine << "target-path=" << token[5] << " "; ptLine << "alignment-path=" << token[6] << " "; } diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index bc4231b70..a9eb14ba3 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -700,8 +700,8 @@ bool StaticData::LoadData(Parameter *parameter) SetWeights(model, weights); } else if (feature == "PhrasePenalty") { PhrasePenalty* model = new PhrasePenalty(line); - vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); - SetWeights(model, weights); + vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); + SetWeights(model, weights); } #ifdef HAVE_SYNLM @@ -1177,7 +1177,6 @@ void StaticData::LoadFeatureFunctions() } } - // load phrase table for (size_t i = 0; i < m_phraseDictionary.size(); ++i) { PhraseDictionary *pt = m_phraseDictionary[i]; pt->Load(); diff --git a/moses/TargetPhraseCollection.cpp b/moses/TargetPhraseCollection.cpp index 73f6d0543..9a28dbd5d 100644 --- a/moses/TargetPhraseCollection.cpp +++ b/moses/TargetPhraseCollection.cpp @@ -35,11 +35,11 @@ struct CompareTargetPhrase { void TargetPhraseCollection::NthElement(size_t tableLimit) { - vector<TargetPhrase*>::iterator - iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit) ?m_collection.end() : m_collection.begin() + tableLimit; - - //std::sort(m_collection.begin(), m_collection.end(), CompareTargetPhrase()); - std::nth_element(m_collection.begin(), iterMiddle, m_collection.end(), CompareTargetPhrase()); + vector<TargetPhrase*>::iterator nth; + nth = (tableLimit && tableLimit <= m_collection.size() + ? 
m_collection.begin() + tableLimit + : m_collection.end()); + std::nth_element(m_collection.begin(), nth, m_collection.end(), CompareTargetPhrase()); } void TargetPhraseCollection::Prune(bool adhereTableLimit, size_t tableLimit) diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp index 35b5b37e6..aea2491e9 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.cpp +++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp @@ -3,6 +3,11 @@ #include "moses/FactorCollection.h" #include "moses/StaticData.h" #include "moses/TargetPhrase.h" + +#include "moses/generic/sorting/NBestList.h" +#include "moses/generic/sampling/Sampling.h" + +#include <boost/foreach.hpp> #include <iomanip> using namespace std; @@ -10,43 +15,48 @@ using namespace std; namespace Moses { -BilingualDynSuffixArray::BilingualDynSuffixArray(): +BilingualDynSuffixArray:: +BilingualDynSuffixArray(): m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()), - m_maxSampleSize(20) + m_maxSampleSize(20), m_maxPTEntries(20) { m_srcSA = 0; m_trgSA = 0; - m_srcCorpus = new std::vector<wordID_t>(); - m_trgCorpus = new std::vector<wordID_t>(); - m_srcVocab = new Vocab(false); - m_trgVocab = new Vocab(false); + m_srcCorpus = new vector<wordID_t>(); + m_trgCorpus = new vector<wordID_t>(); + m_srcVocab = new Vocab(false); + m_trgVocab = new Vocab(false); m_scoreCmp = 0; } -BilingualDynSuffixArray::~BilingualDynSuffixArray() +BilingualDynSuffixArray:: +~BilingualDynSuffixArray() { - if(m_srcSA) delete m_srcSA; - if(m_trgSA) delete m_trgSA; - if(m_srcVocab) delete m_srcVocab; - if(m_trgVocab) delete m_trgVocab; + if(m_srcSA) delete m_srcSA; + if(m_trgSA) delete m_trgSA; + if(m_srcVocab) delete m_srcVocab; + if(m_trgVocab) delete m_trgVocab; if(m_srcCorpus) delete m_srcCorpus; if(m_trgCorpus) delete m_trgCorpus; - if(m_scoreCmp) delete m_scoreCmp; + if(m_scoreCmp) delete m_scoreCmp; } -bool BilingualDynSuffixArray::Load( - const std::vector<FactorType>& inputFactors, - const std::vector<FactorType>& outputFactors, - std::string source, std::string target, std::string alignments, - const std::vector<float> &weight) +bool +BilingualDynSuffixArray:: +Load( + const vector<FactorType>& inputFactors, + const vector<FactorType>& outputFactors, + string source, string target, string alignments, + const vector<float> &weight) { m_inputFactors = inputFactors; m_outputFactors = outputFactors; - m_scoreCmp = new ScoresComp(weight); + // m_scoreCmp = new ScoresComp(weight); InputFileStream sourceStrme(source); InputFileStream targetStrme(target); cerr << "Loading source corpus...\n"; + // Input and Output are 'Factor directions' (whatever that is) defined in Typedef.h LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab); cerr << "Loading target corpus...\n"; LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); @@ -57,88 +67,69 @@ bool BilingualDynSuffixArray::Load( m_srcSA = new DynSuffixArray(m_srcCorpus); if(!m_srcSA) return false; cerr << "Building Target Suffix Array...\n"; - //m_trgSA = new DynSuffixArray(m_trgCorpus); - //if(!m_trgSA) return false; - cerr << "\t(Skipped. 
Not used)\n"; + m_trgSA = new DynSuffixArray(m_trgCorpus); + if(!m_trgSA) return false; InputFileStream alignStrme(alignments); cerr << "Loading Alignment File...\n"; LoadRawAlignments(alignStrme); + cerr << m_srcSntBreaks.size() << " " + << m_trgSntBreaks.size() << " " + << m_rawAlignments.size() << endl; //LoadAlignments(alignStrme); cerr << "Building frequent word cache...\n"; CacheFreqWords(); - return true; -} - -bool BilingualDynSuffixArray::LoadTM( - const std::vector<FactorType>& inputFactors, - const std::vector<FactorType>& outputFactors, - std::string source, std::string target, std::string alignments, - const std::vector<float> &weight) -{ - m_inputFactors = inputFactors; - m_outputFactors = outputFactors; - - m_scoreCmp = new ScoresComp(weight); - InputFileStream sourceStrme(source); - InputFileStream targetStrme(target); - - cerr << "Loading target corpus...\n"; - LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); - - cerr << "Loading source corpus...\n"; - LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab); - CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size()); - - // build suffix arrays and auxilliary arrays - cerr << "Building Source Suffix Array...\n"; - m_srcSA = new DynSuffixArray(m_srcCorpus); - if(!m_srcSA) return false; - cerr << "Building Target Suffix Array...\n"; - //m_trgSA = new DynSuffixArray(m_trgCorpus); - //if(!m_trgSA) return false; - cerr << "\t(Skipped. Not used)\n"; - - InputFileStream alignStrme(alignments); - cerr << "Loading Alignment File...\n"; - LoadRawAlignments(alignStrme); - //LoadAlignments(alignStrme); - cerr << "Building frequent word cache...\n"; - CacheFreqWords(); + wordID_t const* s = &(*m_srcCorpus)[0]; + wordID_t const* t = &(*m_trgCorpus)[0]; + for (size_t sid = 0; sid < m_srcSntBreaks.size(); ++sid) { + wordID_t const* se = s + GetSourceSentenceSize(sid); + wordID_t const* te = t + GetTargetSentenceSize(sid); + vector<short> const& a = m_rawAlignments[sid]; + m_wrd_cooc.Count(vector<wordID_t>(s,se), + vector<wordID_t>(t,te), a, + m_srcVocab->GetkOOVWordID(), + m_trgVocab->GetkOOVWordID()); + s = se; + t = te; + } + if (m_srcSntBreaks.size() != m_trgSntBreaks.size() || + m_rawAlignments.size() != m_trgSntBreaks.size()) { + cerr << "FATAL ERROR: Line counts don't match!\n" + << "Source side text corpus: " << m_srcSntBreaks.size() << "\n" + << "Target side text corpus: " << m_trgSntBreaks.size() << "\n" + << "Word alignments: " << m_rawAlignments.size() << endl; + exit(1); + } return true; - } -int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align) +int +BilingualDynSuffixArray:: +LoadRawAlignments(InputFileStream& align) { // stores the alignments in the raw file format - std::string line; - std::vector<int> vtmp; - int lineNum = 1; + string line; + // vector<int> vtmp; + // int lineNum = 0; while(getline(align, line)) { - if (lineNum % 10000 == 0) - cerr << lineNum; - Utils::splitToInt(line, vtmp, "- "); - CHECK(vtmp.size() % 2 == 0); - std::vector<short> vAlgn; // store as short ints for memory - for (std::vector<int>::const_iterator itr = vtmp.begin(); - itr != vtmp.end(); ++itr) { - vAlgn.push_back(short(*itr)); - } - m_rawAlignments.push_back(vAlgn); - ++lineNum; + // if (++lineNum % 10000 == 0) cerr << lineNum << endl; + LoadRawAlignments(line); } return m_rawAlignments.size(); } -int BilingualDynSuffixArray::LoadRawAlignments(string& align) + + +int +BilingualDynSuffixArray:: +LoadRawAlignments(string& align) { // stores the alignments 
in the raw file format vector<int> vtmp; Utils::splitToInt(align, vtmp, "- "); CHECK(vtmp.size() % 2 == 0); vector<short> vAlgn; // store as short ints for memory - for (std::vector<int>::const_iterator itr = vtmp.begin(); + for (vector<int>::const_iterator itr = vtmp.begin(); itr != vtmp.end(); ++itr) { vAlgn.push_back(short(*itr)); } @@ -146,48 +137,56 @@ int BilingualDynSuffixArray::LoadRawAlignments(string& align) return m_rawAlignments.size(); } -int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align) -{ - std::string line; - std::vector<int> vtmp; - int sntIndex(0); - - while(getline(align, line)) { - Utils::splitToInt(line, vtmp, "- "); - CHECK(vtmp.size() % 2 == 0); - - int sourceSize = GetSourceSentenceSize(sntIndex); - int targetSize = GetTargetSentenceSize(sntIndex); - - SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence - for(int i=0; i < (int)vtmp.size(); i+=2) { - int sourcePos = vtmp[i]; - int targetPos = vtmp[i+1]; - CHECK(sourcePos < sourceSize); - CHECK(targetPos < targetSize); - - curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word - curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word - } - curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence - curSnt.trgSnt = m_trgCorpus + sntIndex; - m_alignments.push_back(curSnt); - - sntIndex++; - } - return m_alignments.size(); -} - -SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const +// int +// BilingualDynSuffixArray:: +// LoadAlignments(InputFileStream& align) +// { +// string line; +// vector<int> vtmp; +// int sntIndex = 0; + +// while(getline(align, line)) +// { +// Utils::splitToInt(line, vtmp, "- "); +// CHECK(vtmp.size() % 2 == 0); + +// int sourceSize = GetSourceSentenceSize(sntIndex); +// int targetSize = GetTargetSentenceSize(sntIndex); + +// SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence +// for(int i=0; i < (int)vtmp.size(); i+=2) +// { +// int sourcePos = vtmp[i]; +// int targetPos = vtmp[i+1]; +// CHECK(sourcePos < sourceSize); +// CHECK(targetPos < targetSize); + +// curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word +// curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word +// } +// curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence +// curSnt.trgSnt = m_trgCorpus + sntIndex; +// m_alignments.push_back(curSnt); + +// sntIndex++; +// } +// return m_alignments.size(); +// } + +SentenceAlignment +BilingualDynSuffixArray:: +GetSentenceAlignment(const int sntIndex, bool trg2Src) const { // retrieves the alignments in the format used by SentenceAlignment.Extract() - int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex); - int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex); - std::vector<short> alignment = m_rawAlignments.at(sntIndex); + int t = GetTargetSentenceSize(sntIndex); + int s = GetSourceSentenceSize(sntIndex); + int sntGiven = trg2Src ? t : s; + int sntExtract = trg2Src ? 
s : t; SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence - for(size_t i=0; i < alignment.size(); i+=2) { - int sourcePos = alignment[i]; - int targetPos = alignment[i+1]; + vector<short> const& a = m_rawAlignments.at(sntIndex); + for(size_t i=0; i < a.size(); i+=2) { + int sourcePos = a[i]; + int targetPos = a[i+1]; if(trg2Src) { curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word @@ -202,32 +201,50 @@ SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntInd return curSnt; } -bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex, - const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const +bool +BilingualDynSuffixArray:: +ExtractPhrases(const int& sntIndex, + const int& wordIndex, + const int& sourceSize, + vector<PhrasePair*>& phrasePairs, + bool trg2Src) const { /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src * parameter */ SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src); // get span of phrase in source sentence int beginSentence = m_srcSntBreaks[sntIndex]; - int rightIdx = wordIndex - beginSentence - ,leftIdx = rightIdx - sourceSize + 1; + int rightIdx = wordIndex - beginSentence; + int leftIdx = rightIdx - sourceSize + 1; return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence } -int BilingualDynSuffixArray::LoadCorpus(FactorDirection direction, InputFileStream& corpus, const FactorList& factors, - std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray, - Vocab* vocab) +void +BilingualDynSuffixArray:: +CleanUp(const InputType& source) { - std::string line, word; + //m_wordPairCache.clear(); +} + +int +BilingualDynSuffixArray:: +LoadCorpus(FactorDirection direction, + InputFileStream & corpus, + const FactorList & factors, + vector<wordID_t> & cArray, + vector<wordID_t> & sntArray, + Vocab* vocab) +{ + string line, word; int sntIdx(0); -// corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking). - const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); + // corpus.seekg(0); Seems needless -> commented out to allow + // loading of gzipped corpora (gzfilebuf doesn't support seeking). 
+ const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); while(getline(corpus, line)) { sntArray.push_back(sntIdx); Phrase phrase(ARRAY_SIZE_INCR); // parse phrase - phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL); + phrase.CreateFromString( direction, factors, line, factorDelimiter, NULL); // store words in vocabulary and corpus for( size_t i = 0; i < phrase.GetSize(); ++i) { cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) ); @@ -239,7 +256,9 @@ int BilingualDynSuffixArray::LoadCorpus(FactorDirection direction, InputFileStre return cArray.size(); } -bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const +bool +BilingualDynSuffixArray:: +GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const { // looks up the SA vocab ids for the current src phrase size_t phraseSize = src.GetSize(); @@ -251,87 +270,131 @@ bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &outp return false; } else { output.SetId(pos, arrayId); - //cerr << arrayId << " "; } } return true; } -pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const +pair<float, float> +BilingualDynSuffixArray:: +GetLexicalWeight(const PhrasePair& pp) const { - //return pair<float, float>(1, 1); - float srcLexWeight(1.0), trgLexWeight(1.0); - std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words - //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex]; - const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex); - std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache; - // for each source word - for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) { - float srcSumPairProbs(0); - wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs - const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx); - // for each target word aligned to this source word in this alignment - if(srcWordAlignments.size() == 0) { // get p(NULL|src) - pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID()); - itrCache = m_wordPairCache.find(wordpair); - if(itrCache == m_wordPairCache.end()) { // if not in cache - CacheWordProbs(srcWord); - itrCache = m_wordPairCache.find(wordpair); // search cache again - } - CHECK(itrCache != m_wordPairCache.end()); - srcSumPairProbs += itrCache->second.first; - targetProbs[wordpair] = itrCache->second.second; - } else { // extract p(trg|src) - for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word - int trgIdx = srcWordAlignments[i]; - wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); - // get probability of this source->target word pair - pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord); - itrCache = m_wordPairCache.find(wordpair); - if(itrCache == m_wordPairCache.end()) { // if not in cache - CacheWordProbs(srcWord); - itrCache = m_wordPairCache.find(wordpair); // search cache again - } - CHECK(itrCache != m_wordPairCache.end()); - srcSumPairProbs += itrCache->second.first; - targetProbs[wordpair] = itrCache->second.second; - } - } - float srcNormalizer = srcWordAlignments.size() < 2 ? 
1.0 : 1.0 / float(srcWordAlignments.size()); - srcLexWeight *= (srcNormalizer * srcSumPairProbs); - } // end for each source word - for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) { - float trgSumPairProbs(0); - wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); - for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr - = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) { - if(trgItr->first.second == trgWord) - trgSumPairProbs += trgItr->second; - } - if(trgSumPairProbs == 0) continue; // currently don't store target-side SA - int noAligned = alignment.numberAligned.at(trgIdx); - float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned); - trgLexWeight *= (trgNormalizer * trgSumPairProbs); + // sp,tp: sum of link probabilities + // sc,tc: count of links + int src_size = pp.GetSourceSize(); + int trg_size = pp.GetTargetSize(); + vector<float> sp(src_size, 0), tp(trg_size, 0); + vector<int> sc(src_size,0), tc(trg_size,0); + wordID_t const* sw = &(m_srcCorpus->at(m_srcSntBreaks.at(pp.m_sntIndex))); + wordID_t const* tw = &(m_trgCorpus->at(m_trgSntBreaks.at(pp.m_sntIndex))); + vector<short> const & a = m_rawAlignments.at(pp.m_sntIndex); + for (size_t i = 0; i < a.size(); i += 2) { + int s = a[i], t = a.at(i+1), sx, tx; + // sx, tx: local positions within phrase pair + + if (s < pp.m_startSource || t < pp.m_startTarget) continue; + if ((sx = s - pp.m_startSource) >= src_size) continue; + if ((tx = t - pp.m_startTarget) >= trg_size) continue; + sp[sx] += m_wrd_cooc.pbwd(sw[s],tw[t]); + tp[tx] += m_wrd_cooc.pfwd(sw[s],tw[t]); + ++sc[sx]; + ++tc[tx]; + } + pair<float,float> ret(1,1); + wordID_t null_trg = m_trgVocab->GetkOOVWordID(); + wordID_t null_src = m_srcVocab->GetkOOVWordID(); + for (size_t i = 0, k = pp.m_startSource; i < sp.size(); ++i, ++k) { + if (sc[i]) ret.first *= sp[i]/sc[i]; + else ret.first *= m_wrd_cooc.pbwd(sw[k], null_trg); + } + for (size_t i = 0, k = pp.m_startTarget; i < tp.size(); ++i, ++k) { + if (tc[i]) ret.second *= tp[i]/tc[i]; + else ret.second *= m_wrd_cooc.pfwd(null_src,tw[k]); } - // TODO::Need to get p(NULL|trg) - return pair<float, float>(srcLexWeight, trgLexWeight); + return ret; + + // //return pair<float, float>(1, 1); + // float srcLexWeight(1.0), trgLexWeight(1.0); + // map<pair<wordID_t, wordID_t>, float> targetProbs; + // // collects sum of target probs given source words + + // //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex]; + // const SentenceAlignment alignment = GetSentenceAlignment(phrasepair.m_sntIndex); + // map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache; + // // for each source word + // for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) { + // float srcSumPairProbs(0); + // wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs + // const vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx); + // // for each target word aligned to this source word in this alignment + // if(srcWordAlignments.size() == 0) { // get p(NULL|src) + // pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID()); + // itrCache = m_wordPairCache.find(wordpair); + // if(itrCache == m_wordPairCache.end()) { // if not in cache + // CacheWordProbs(srcWord); + // itrCache = m_wordPairCache.find(wordpair); // search cache again + // } + // CHECK(itrCache != m_wordPairCache.end()); + // srcSumPairProbs += 
itrCache->second.first; + // targetProbs[wordpair] = itrCache->second.second; + // } + // else { // extract p(trg|src) + // for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word + // int trgIdx = srcWordAlignments[i]; + // wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); + // // get probability of this source->target word pair + // pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord); + // itrCache = m_wordPairCache.find(wordpair); + // if(itrCache == m_wordPairCache.end()) { // if not in cache + // CacheWordProbs(srcWord); + // itrCache = m_wordPairCache.find(wordpair); // search cache again + // } + // CHECK(itrCache != m_wordPairCache.end()); + // srcSumPairProbs += itrCache->second.first; + // targetProbs[wordpair] = itrCache->second.second; + // } + // } + // float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size()); + // srcLexWeight *= (srcNormalizer * srcSumPairProbs); + // } // end for each source word + + // for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) { + // float trgSumPairProbs(0); + // wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); + // for (map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr + // = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) { + // if(trgItr->first.second == trgWord) + // trgSumPairProbs += trgItr->second; + // } + // if(trgSumPairProbs == 0) continue; // currently don't store target-side SA + // int noAligned = alignment.numberAligned.at(trgIdx); + // float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned); + // trgLexWeight *= (trgNormalizer * trgSumPairProbs); + // } + + // // TODO::Need to get p(NULL|trg) + + // return pair<float, float>(srcLexWeight, trgLexWeight); } -void BilingualDynSuffixArray::CacheFreqWords() const + +void +BilingualDynSuffixArray:: +CacheFreqWords() const { - std::multimap<int, wordID_t> wordCnts; + multimap<int, wordID_t> wordCnts; // for each source word in vocab Vocab::Word2Id::const_iterator it; for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) { // get its frequency wordID_t srcWord = it->second; - std::vector<wordID_t> sword(1, srcWord), wrdIndices; + vector<wordID_t> sword(1, srcWord), wrdIndices; m_srcSA->GetCorpusIndex(&sword, &wrdIndices); if(wrdIndices.size() >= 1000) { // min count wordCnts.insert(make_pair(wrdIndices.size(), srcWord)); } } int numSoFar(0); - std::multimap<int, wordID_t>::reverse_iterator ritr; + multimap<int, wordID_t>::reverse_iterator ritr; for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) { m_freqWordsCached.insert(ritr->second); CacheWordProbs(ritr->second); @@ -339,20 +402,23 @@ void BilingualDynSuffixArray::CacheFreqWords() const } cerr << "\tCached " << m_freqWordsCached.size() << " source words\n"; } -void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const + +void +BilingualDynSuffixArray:: +CacheWordProbs(wordID_t srcWord) const { - std::map<wordID_t, int> counts; - std::vector<wordID_t> sword(1, srcWord), wrdIndices; + map<wordID_t, int> counts; + vector<wordID_t> sword(1, srcWord), wrdIndices; bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices); CHECK(ret); - std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks); + vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks); float denom(0); // for each occurrence of this word for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { int sntIdx 
= sntIndexes.at(snt); // get corpus index for sentence CHECK(sntIdx != -1); int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence - const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word + const vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word if(srcAlg.size() == 0) { ++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word ++denom; @@ -366,7 +432,7 @@ void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const } // now we've gotten counts of all target words aligned to this source word // get probs and cache all pairs - for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin(); + for(map<wordID_t, int>::const_iterator itrCnt = counts.begin(); itrCnt != counts.end(); ++itrCnt) { pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first); float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg) @@ -375,7 +441,9 @@ void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const } } -SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const +SAPhrase +BilingualDynSuffixArray:: +TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const { // takes sentence indexes and looks up vocab IDs SAPhrase phraseIds(phrasepair.GetTargetSize()); @@ -388,7 +456,9 @@ SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepa return phraseIds; } -TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const +TargetPhrase* +BilingualDynSuffixArray:: +GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const { TargetPhrase* targetPhrase = new TargetPhrase(); for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words @@ -401,106 +471,211 @@ TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase, return targetPhrase; } -void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const +/// Gather translation candidates for source phrase /src/ and store raw +// phrase pair statistics in /pstats/. 
Return the sample rate +// (number of samples considered / total number of hits) and total number of +// phrase pairs +pair<float,float> +BilingualDynSuffixArray:: +GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const { - //cerr << "phrase is \"" << src << endl; - size_t sourceSize = src.GetSize(); - SAPhrase localIDs(sourceSize); - if(!GetLocalVocabIDs(src, localIDs)) return; - float totalTrgPhrases(0); - std::map<SAPhrase, int> phraseCounts; - //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from - std::map<SAPhrase, pair<float, float> > lexicalWeights; - std::map<SAPhrase, pair<float, float> >::iterator itrLexW; - std::vector<unsigned> wrdIndices; - // extract sentence IDs from SA and return rightmost index of phrases - if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return; - SampleSelection(wrdIndices); - std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks); - // for each sentence with this phrase - for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { - std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence - int sntIndex = sntIndexes.at(snt); // get corpus index for sentence - if(sntIndex == -1) continue; // bad flag set by GetSntIndexes() - ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs); - //cerr << "extracted " << phrasePairs.size() << endl; - totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair - std::vector<PhrasePair*>::iterator iterPhrasePair; - for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) { - SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair); - phraseCounts[phrase]++; // count each unique phrase - // NOTE::Correct but slow to extract lexical weight here. could do - // it later for only the top phrases chosen by phrase prob p(e|f) - pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair - itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached - if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first)) - itrLexW->second = lexWeight; // if this lex weight is greater save it - else lexicalWeights[phrase] = lexWeight; // else save + typedef map<SAPhrase, vector<float> >::iterator pstat_iter; + typedef map<SAPhrase, vector<float> >::value_type pstat_entry; + pair<float,float> ret(0,0); + float& sampleRate = ret.first; + float& totalPhrases = ret.second; + size_t srcSize = src.GetSize(); + SAPhrase localIDs(srcSize); + vector<unsigned> wrdIndices; + if(!GetLocalVocabIDs(src, localIDs) || + !m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) + return ret; // source phrase contains OOVs + + // select a sample of the occurrences for phrase extraction + size_t m1 = wrdIndices.size(); + SampleSelection(wrdIndices); // careful! SampleSelection alters wrdIndices! 
+ sampleRate = float(wrdIndices.size())/m1; + + // determine the sentences in which these phrases occur + vector<int> sntIndices = GetSntIndexes(wrdIndices, srcSize, m_srcSntBreaks); + for(size_t s = 0; s < sntIndices.size(); ++s) { + int sntStart = sntIndices.at(s); + if(sntStart == -1) continue; // marked as bad by GetSntIndexes() + vector<PhrasePair*> phrasePairs; + ExtractPhrases(sntStart, wrdIndices[s], srcSize, phrasePairs); + totalPhrases += phrasePairs.size(); + vector<PhrasePair*>::iterator p; + for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p) { + assert(*p); + pair<float, float> lex = GetLexicalWeight(**p); + pstat_entry entry(TrgPhraseFromSntIdx(**p), Scores(5)); + pair<pstat_iter, bool> foo = pstats.insert(entry); + Scores& feats = foo.first->second; + if (foo.second) { + feats[0] = 1; // count + feats[1] = lex.first; + feats[3] = lex.second; + } else { + feats[0] += 1; + feats[1] = max(feats[1],lex.first); + feats[3] = max(feats[3],lex.second); + } + delete *p; } - // done with sentence. delete SA phrase pairs - RemoveAllInColl(phrasePairs); } // done with all sentences - // convert to moses phrase pairs - std::map<SAPhrase, int>::const_iterator iterPhrases; - std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp); - // get scores of all phrases - for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) { - float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases; - itrLexW = lexicalWeights.find(iterPhrases->first); - CHECK(itrLexW != lexicalWeights.end()); - Scores scoreVector(3); - scoreVector[0] = std::log(trg2SrcMLE); - scoreVector[1] = std::log(itrLexW->second.first); - scoreVector[2] = 1; // exp(1); - phraseScores.insert(make_pair(scoreVector, &iterPhrases->first)); - } - // return top scoring phrases - std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr; - for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) { - Scores scoreVector = ritr->first; - TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second, src); - target.push_back(make_pair( scoreVector, targetPhrase)); - if(target.size() == m_maxSampleSize) break; + BOOST_FOREACH(pstat_entry & e, pstats) { + Scores& feats = e.second; + // 0: bwd phrase prob + // 1: lex 1 + // 2: fwd phrase prob + // 3: lex 2 + // 4: phrase penalty + float x = m_trgSA->GetCount(e.first.words)-feats[0] * sampleRate; + feats[4] = 1; + feats[3] = log(feats[3]); + feats[2] = log(feats[0]) - log(totalPhrases); + feats[1] = log(feats[1]); + feats[0] = log(feats[0]) - log(feats[0] + x); } + return ret; } -std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices, - const int sourceSize, const std::vector<unsigned>& sntBreaks) const +// void +// BilingualDynSuffixArray:: +// GetTargetPhrasesByLexicalWeight +// (const Phrase& src, vector< pair<Scores, TargetPhrase*> > & target) +// const +// { +// size_t sourceSize = src.GetSize(); +// SAPhrase localIDs(sourceSize); +// if(!GetLocalVocabIDs(src, localIDs)) +// return; // source phrase contains OOVs + +// float totalTrgPhrases(0); +// map<SAPhrase, int> phraseCounts; +// map<SAPhrase, pair<float, float> > lexicalWeights; +// map<SAPhrase, pair<float, float> >::iterator itrLexW; + +// // find all occurrences of the phrase in the corpus; +// // wrdIndices stores the rightmost position +// vector<unsigned> wrdIndices; +// if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) +// return; // none found + +// // select a sample of the occurrences for 
phrase extraction +// size_t m1 = wrdIndices.size(); +// SampleSelection(wrdIndices); +// float sampleRate = float(wrdIndices.size())/m1; + +// // determine the sentences in which these phrases occur +// vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks); + +// // for each sentence with this phrase +// for(size_t s = 0; s < sntIndexes.size(); ++s) +// { +// vector<PhrasePair*> phrasePairs; +// int sntIndex = sntIndexes.at(s); +// if(sntIndex == -1) continue; // bad flag set by GetSntIndexes() +// ExtractPhrases(sntIndex, wrdIndices[s], sourceSize, phrasePairs); +// totalTrgPhrases += phrasePairs.size(); +// vector<PhrasePair*>::iterator p; +// for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p) +// { +// PhrasePair* px = *p; +// assert(px); +// SAPhrase phrase = TrgPhraseFromSntIdx(*px); +// phraseCounts[phrase]++; // count each unique phrase +// // NOTE::Correct but slow to extract lexical weight here. could do +// // it later for only the top phrases chosen by phrase prob p(e|f) +// pair<float, float> lexWeight = GetLexicalWeight(**p); +// itrLexW = lexicalWeights.find(phrase); +// if((itrLexW != lexicalWeights.end()) && +// (itrLexW->second.first < lexWeight.first)) +// itrLexW->second = lexWeight; // if this lex weight is greater save it +// else lexicalWeights[phrase] = lexWeight; // else save +// } +// // done with sentence. delete SA phrase pairs +// BOOST_FOREACH(PhrasePair* p, phrasePairs) delete p; +// } // done with all sentences + +// cerr << "Done extracting ... " << endl; + +// // convert to moses phrase pairs +// map<SAPhrase, int>::iterator pcnt; +// BetterPhrase better(*m_scoreCmp); +// NBestList<pair<Scores,SAPhrase const*>,BetterPhrase> nbest(m_maxPTEntries,better); +// for(pcnt = phraseCounts.begin(); pcnt != phraseCounts.end(); ++pcnt) +// { +// float tmarginal = (m_trgSA->GetCount(pcnt->first.words) * sampleRate); +// float pfwd = pcnt->second / totalTrgPhrases; +// float pbwd = pcnt->second / tmarginal; +// pair<float, float> lexWeight = lexicalWeights[pcnt->first]; +// pair<Scores, SAPhrase const*> entry; +// entry.first.resize(5); +// entry.first[0] = pbwd; +// entry.first[1] = itrLexW->second.first; +// entry.first[2] = pfwd; +// entry.first[3] = itrLexW->second.second; +// entry.first[4] = 2.718; // exp(1); +// entry.second = &pcnt->first; +// nbest.add(entry); +// } + +// // return top scoring phrases +// for (size_t n = 0; n < nbest.size(); ++n) +// { +// pair<Scores, SAPhrase const*> e = nbest[n]; +// target.push_back(make_pair(e.first,GetMosesFactorIDs(*e.second, src))); +// } +// } + +vector<int> +BilingualDynSuffixArray:: +GetSntIndexes(vector<unsigned>& wrdIndices, + const int sourceSize, + const vector<unsigned>& sntBreaks) const { - std::vector<unsigned>::const_iterator vit; - std::vector<int> sntIndexes; + vector<unsigned>::const_iterator vit; + vector<int> sntIndices; for(size_t i=0; i < wrdIndices.size(); ++i) { - vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]); + vit = upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]); int index = int(vit - sntBreaks.begin()) - 1; // check for phrases that cross sentence boundaries if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index)) - sntIndexes.push_back(-1); // set bad flag + sntIndices.push_back(-1); // set bad flag else - sntIndexes.push_back(index); // store the index of the sentence in the corpus + sntIndices.push_back(index); // store the index of the sentence in the corpus } - return sntIndexes; + return sntIndices; } -int 
BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample, - int sampleSize) const +int +BilingualDynSuffixArray:: +SampleSelection(vector<unsigned>& sample, int sampleSize) const { // only use top 'sampleSize' number of samples - if(sample.size() > (size_t)sampleSize) - sample.erase(sample.begin()+sampleSize, sample.end()); + vector<unsigned> s; + randomSample<unsigned>(s,sampleSize,sample.size()); + for (size_t i = 0; i < s.size(); ++i) + s[i] = sample[s[i]]; + sample.swap(s); return sample.size(); } -void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) +void +BilingualDynSuffixArray:: +addSntPair(string& source, string& target, string& alignment) { vuint_t srcFactor, trgFactor; - cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl; - const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); + cerr << "source, target, alignment = " << source << ", " + << target << ", " << alignment << endl; + const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size(); cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl; Phrase sphrase(ARRAY_SIZE_INCR); sphrase.CreateFromString(Input, m_inputFactors, source, factorDelimiter, NULL); m_srcVocab->MakeOpen(); - std::vector<wordID_t> sIDs(sphrase.GetSize()); + vector<wordID_t> sIDs(sphrase.GetSize()); // store words in vocabulary and corpus for(int i = sphrase.GetSize()-1; i >= 0; --i) { sIDs[i] = m_srcVocab->GetWordID(sphrase.GetWord(i)); // get vocab id backwards @@ -515,7 +690,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& Phrase tphrase(ARRAY_SIZE_INCR); tphrase.CreateFromString(Output, m_outputFactors, target, factorDelimiter, NULL); m_trgVocab->MakeOpen(); - std::vector<wordID_t> tIDs(tphrase.GetSize()); + vector<wordID_t> tIDs(tphrase.GetSize()); for(int i = tphrase.GetSize()-1; i >= 0; --i) { tIDs[i] = m_trgVocab->GetWordID(tphrase.GetWord(i)); // get vocab id } @@ -529,18 +704,26 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& cerr << "gets to 2\n"; m_srcSA->Insert(&srcFactor, oldSrcCrpSize); cerr << "gets to 3\n"; - //m_trgSA->Insert(&trgFactor, oldTrgCrpSize); + m_trgSA->Insert(&trgFactor, oldTrgCrpSize); LoadRawAlignments(alignment); m_trgVocab->MakeClosed(); + + m_wrd_cooc.Count(sIDs,tIDs, m_rawAlignments.back(), + m_srcVocab->GetkOOVWordID(), + m_trgVocab->GetkOOVWordID()); + //for(size_t i=0; i < sphrase.GetSize(); ++i) //ClearWordInCache(sIDs[i]); } -void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) + +void +BilingualDynSuffixArray:: +ClearWordInCache(wordID_t srcWord) { if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end()) return; - std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it, + map<pair<wordID_t, wordID_t>, pair<float, float> >::iterator it, first, last; for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) { if(it->first.first == srcWord) { // all source words grouped @@ -553,18 +736,23 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) m_wordPairCache.erase(first, last); } } -SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize) - :m_sntIndex(sntIndex) - ,numberAligned(targetSize, 0) - ,alignedList(sourceSize) + +SentenceAlignment:: +SentenceAlignment(int sntIndex, int 
sourceSize, int targetSize) + : m_sntIndex(sntIndex) + , numberAligned(targetSize, 0) + , alignedList(sourceSize) { - for(int i=0; i < sourceSize; ++i) { - std::vector<int> trgWrd; - alignedList[i] = trgWrd; - } + // What is the code below supposed to accomplish??? UG. + // for(int i=0; i < sourceSize; ++i) { + // vector<int> trgWrd; + // alignedList[i] = trgWrd; + // } } -bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const +bool +SentenceAlignment:: +Extract(int maxPhraseLength, vector<PhrasePair*> &ret, int startSource, int endSource) const { // foreign = target, F=T // english = source, E=S @@ -572,7 +760,7 @@ bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &r int minTarget = 9999; int maxTarget = -1; - std::vector< int > usedTarget = numberAligned; + vector< int > usedTarget = numberAligned; for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) { for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) { int targetPos = alignedList[sourcePos][ind]; @@ -626,4 +814,42 @@ bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &r } +int +BilingualDynSuffixArray:: +GetSourceSentenceSize(size_t sentenceId) const +{ + return (sentenceId==m_srcSntBreaks.size()-1) ? + m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : + m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); +} + +int +BilingualDynSuffixArray:: +GetTargetSentenceSize(size_t sentenceId) const +{ + return (sentenceId==m_trgSntBreaks.size()-1) ? + m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : + m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId); +} + +BetterPhrase:: +BetterPhrase(ScoresComp const& sc) + : cmp(sc) {} + +// bool +// BetterPhrase:: +// operator()(pair<Scores, TargetPhrase const*> const& a, +// pair<Scores, TargetPhrase const*> const& b) const +// { +// return cmp(b.first,a.first); +// } + +bool +BetterPhrase:: +operator()(pair<Scores, SAPhrase const*> const& a, + pair<Scores, SAPhrase const*> const& b) const +{ + return cmp(b.first,a.first); +} + }// end namepsace diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h index 7a4d47eea..71f032507 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.h +++ b/moses/TranslationModel/BilingualDynSuffixArray.h @@ -5,23 +5,29 @@ #include "moses/TranslationModel/DynSAInclude/vocab.h" #include "moses/TranslationModel/DynSAInclude/types.h" #include "moses/TranslationModel/DynSAInclude/utils.h" +#include "moses/TranslationModel/WordCoocTable.h" #include "moses/InputFileStream.h" #include "moses/FactorTypeSet.h" #include "moses/TargetPhrase.h" +#include <boost/dynamic_bitset.hpp> +#include "moses/TargetPhraseCollection.h" +#include <map> +using namespace std; namespace Moses { +class PhraseDictionaryDynSuffixArray; /** @todo ask Abbey Levenberg */ class SAPhrase { public: - std::vector<wordID_t> words; + vector<wordID_t> words; SAPhrase(size_t phraseSize) - :words(phraseSize) { - } + :words(phraseSize) + {} void SetId(size_t pos, wordID_t id) { CHECK(pos < words.size()); @@ -43,12 +49,16 @@ public: , m_endTarget(endTarget) , m_startSource(startSource) , m_endSource(endSource) - , m_sntIndex(sntIndex) { - } + , m_sntIndex(sntIndex) + {} size_t GetTargetSize() const { return m_endTarget - m_startTarget + 1; } + + size_t GetSourceSize() const { + return m_endSource - m_startSource + 1; + } }; /** @todo ask Abbey Levenberg @@ -58,32 +68,43 @@ 
class SentenceAlignment public: SentenceAlignment(int sntIndex, int sourceSize, int targetSize); int m_sntIndex; - std::vector<wordID_t>* trgSnt; - std::vector<wordID_t>* srcSnt; - std::vector<int> numberAligned; - std::vector< std::vector<int> > alignedList; - bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const; + vector<wordID_t>* trgSnt; + vector<wordID_t>* srcSnt; + vector<int> numberAligned; + vector< vector<int> > alignedList; + bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret, + int startSource, int endSource) const; }; + class ScoresComp { public: - ScoresComp(const std::vector<float>& weights): m_weights(weights) {} + ScoresComp(const vector<float>& weights): m_weights(weights) {} bool operator()(const Scores& s1, const Scores& s2) const { return s1[0] < s2[0]; // just p(e|f) as approximation - /*float score1(0), score2(0); - int idx1(0), idx2(0); - for (Scores::const_iterator itr = s1.begin(); - itr != s1.end(); ++itr) { - score1 += log(*itr * m_weights.at(idx1++)); - } - for (Scores::const_iterator itr = s2.begin(); - itr != s2.end(); ++itr) { - score2 += log(*itr * m_weights.at(idx2++)); - } - return score1 < score2;*/ + // float score1(0), score2(0); + // int idx1(0), idx2(0); + // for (Scores::const_iterator itr = s1.begin(); + // itr != s1.end(); ++itr) { + // score1 += log(*itr * m_weights.at(idx1++)); + // } + // for (Scores::const_iterator itr = s2.begin(); + // itr != s2.end(); ++itr) { + // score2 += log(*itr * m_weights.at(idx2++)); + // } + // return score1 < score2; } private: - const std::vector<float>& m_weights; + const vector<float>& m_weights; +}; + +struct BetterPhrase { + ScoresComp const& cmp; + BetterPhrase(ScoresComp const& sc); + // bool operator()(pair<Scores, TargetPhrase const*> const& a, + // pair<Scores, TargetPhrase const*> const& b) const; + bool operator()(pair<Scores, SAPhrase const*> const& a, + pair<Scores, SAPhrase const*> const& b) const; }; /** @todo ask Abbey Levenberg @@ -93,66 +114,70 @@ class BilingualDynSuffixArray public: BilingualDynSuffixArray(); ~BilingualDynSuffixArray(); - bool Load( const std::vector<FactorType>& inputFactors, - const std::vector<FactorType>& outputTactors, - std::string source, std::string target, std::string alignments, - const std::vector<float> &weight); - bool LoadTM( const std::vector<FactorType>& inputFactors, - const std::vector<FactorType>& outputTactors, - std::string source, std::string target, std::string alignments, - const std::vector<float> &weight); - void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const; + bool Load( const vector<FactorType>& inputFactors, + const vector<FactorType>& outputTactors, + string source, string target, string alignments, + const vector<float> &weight); + // bool LoadTM( const vector<FactorType>& inputFactors, + // const vector<FactorType>& outputTactors, + // string source, string target, string alignments, + // const vector<float> &weight); + void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair<Scores, TargetPhrase*> >& target) const; + + void CleanUp(const InputType& source); void addSntPair(string& source, string& target, string& alignment); + pair<float,float> + GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const; + + TargetPhrase* + GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const; + private: - DynSuffixArray* m_srcSA; - DynSuffixArray* m_trgSA; - std::vector<wordID_t>* m_srcCorpus; 
- std::vector<wordID_t>* m_trgCorpus; - std::vector<FactorType> m_inputFactors; - std::vector<FactorType> m_outputFactors; - std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks; + + mutable WordCoocTable m_wrd_cooc; + DynSuffixArray * m_srcSA; + DynSuffixArray * m_trgSA; + vector<wordID_t>* m_srcCorpus; + vector<wordID_t>* m_trgCorpus; + vector<FactorType> m_inputFactors; + vector<FactorType> m_outputFactors; + + vector<unsigned> m_srcSntBreaks, m_trgSntBreaks; Vocab* m_srcVocab, *m_trgVocab; ScoresComp* m_scoreCmp; - std::vector<SentenceAlignment> m_alignments; - std::vector<std::vector<short> > m_rawAlignments; + vector<SentenceAlignment> m_alignments; + vector<vector<short> > m_rawAlignments; - mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache; - mutable std::set<wordID_t> m_freqWordsCached; + mutable map<pair<wordID_t, wordID_t>, pair<float, float> > m_wordPairCache; + mutable set<wordID_t> m_freqWordsCached; const size_t m_maxPhraseLength, m_maxSampleSize; - - int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors, - std::vector<wordID_t>&, std::vector<wordID_t>&, + const size_t m_maxPTEntries; + int LoadCorpus(FactorDirection direction, + InputFileStream&, const vector<FactorType>& factors, + vector<wordID_t>&, vector<wordID_t>&, Vocab*); int LoadAlignments(InputFileStream& aligs); int LoadRawAlignments(InputFileStream& aligs); int LoadRawAlignments(string& aligs); - bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const; + bool ExtractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const; SentenceAlignment GetSentenceAlignment(const int, bool=false) const; - int SampleSelection(std::vector<unsigned>&, int = 300) const; + int SampleSelection(vector<unsigned>&, int = 300) const; - std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const; - TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const; + vector<int> GetSntIndexes(vector<unsigned>&, int, const vector<unsigned>&) const; SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const; bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const; void CacheWordProbs(wordID_t) const; void CacheFreqWords() const; void ClearWordInCache(wordID_t); - std::pair<float, float> GetLexicalWeight(const PhrasePair&) const; + pair<float, float> GetLexicalWeight(const PhrasePair&) const; + + int GetSourceSentenceSize(size_t sentenceId) const; + int GetTargetSentenceSize(size_t sentenceId) const; - int GetSourceSentenceSize(size_t sentenceId) const { - return (sentenceId==m_srcSntBreaks.size()-1) ? - m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : - m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); - } - int GetTargetSentenceSize(size_t sentenceId) const { - return (sentenceId==m_trgSntBreaks.size()-1) ? 
- m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : - m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId); - } }; } // end namespace #endif diff --git a/moses/TranslationModel/DynSuffixArray.cpp b/moses/TranslationModel/DynSuffixArray.cpp index c5fddf3f0..a5fc32dcc 100644 --- a/moses/TranslationModel/DynSuffixArray.cpp +++ b/moses/TranslationModel/DynSuffixArray.cpp @@ -1,5 +1,6 @@ #include "DynSuffixArray.h" #include <iostream> +#include <boost/foreach.hpp> using namespace std; @@ -215,8 +216,37 @@ void DynSuffixArray::Substitute(vuint_t* /* newSents */, unsigned /* newIndex */ return; } +ComparePosition:: +ComparePosition(vuint_t const& crp, vuint_t const& sfa) + : m_crp(crp), m_sfa(sfa) { } + +bool +ComparePosition:: +operator()(unsigned const& i, vector<wordID_t> const& phrase) const +{ + unsigned const* x = &m_crp.at(i); + unsigned const* e = &m_crp.back(); + size_t k = 0; + for (; k < phrase.size() && x < e; ++k, ++x) + if (*x != phrase[k]) return *x < phrase[k]; + return (x == e && k < phrase.size()); +} + +bool +ComparePosition:: +operator()(vector<wordID_t> const& phrase, unsigned const& i) const +{ + unsigned const* x = &m_crp.at(i); + unsigned const* e = &m_crp.back(); + size_t k = 0; + for (; k < phrase.size() && x < e; ++k, ++x) + if (*x != phrase[k]) return phrase[k] < *x; + return false; // (k == phrase.size() && x < e); +} + bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices) { + // DOES THIS EVEN WORK WHEN A DynSuffixArray has been saved and reloaded???? pair<vuint_t::iterator,vuint_t::iterator> bounds; indices->clear(); size_t phrasesize = phrase->size(); @@ -251,6 +281,16 @@ bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices) return (indices->size() > 0); } +size_t +DynSuffixArray:: +GetCount(vuint_t const& phrase) const +{ + ComparePosition cmp(*m_corpus, *m_SA); + vuint_t::const_iterator lb = lower_bound(m_SA->begin(), m_SA->end(), phrase, cmp); + vuint_t::const_iterator ub = upper_bound(m_SA->begin(), m_SA->end(), phrase, cmp); + return ub-lb; +} + void DynSuffixArray::Save(FILE* fout) { fWriteVector(fout, *m_SA); diff --git a/moses/TranslationModel/DynSuffixArray.h b/moses/TranslationModel/DynSuffixArray.h index 024d6daf1..62f719d57 100644 --- a/moses/TranslationModel/DynSuffixArray.h +++ b/moses/TranslationModel/DynSuffixArray.h @@ -11,9 +11,25 @@ namespace Moses { - +using namespace std; typedef std::vector<unsigned> vuint_t; + +/// compare position /i/ in the suffix array /m_sfa/ into corpus /m_crp/ +/// against reference phrase /phrase/ +// added by Ulrich Germann +class ComparePosition +{ + vuint_t const& m_crp; + vuint_t const& m_sfa; + +public: + ComparePosition(vuint_t const& crp, vuint_t const& sfa); + bool operator()(unsigned const& i, vector<wordID_t> const& phrase) const; + bool operator()(vector<wordID_t> const& phrase, unsigned const& i) const; +}; + + /** @todo ask Abbey Levenberg */ class DynSuffixArray @@ -30,6 +46,8 @@ public: void Delete(unsigned, unsigned); void Substitute(vuint_t*, unsigned); + size_t GetCount(vuint_t const& phrase) const; + private: vuint_t* m_SA; vuint_t* m_ISA; @@ -46,10 +64,10 @@ private: void PrintAuxArrays() { std::cerr << "SA\tISA\tF\tL\n"; for(size_t i=0; i < m_SA->size(); ++i) - std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" << m_F->at(i) << "\t" << m_L->at(i) << std::endl; + std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" + << m_F->at(i) << "\t" << m_L->at(i) << std::endl; } }; - } //end namespace #endif diff --git 
a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README new file mode 100644 index 000000000..19a7f8779 --- /dev/null +++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README @@ -0,0 +1,4 @@ +Specifying Dynamic Suffix Array-based Phrase Tables in moses.ini + +[ttable-file] +14 0 0 5 <source language text file> <target language text file> <file with alignment info in symal format> diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp index 61305b958..160362f4c 100644 --- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp +++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp @@ -3,83 +3,103 @@ #include "moses/StaticData.h" #include "moses/TargetPhrase.h" #include <iomanip> - +#include <boost/foreach.hpp> using namespace std; namespace Moses { -PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line) - :PhraseDictionary("PhraseDictionaryDynSuffixArray", line) +PhraseDictionaryDynSuffixArray:: +PhraseDictionaryDynSuffixArray(const std::string &line) + : PhraseDictionary("PhraseDictionaryDynSuffixArray", line) ,m_biSA(new BilingualDynSuffixArray()) { ReadParameters(); } -PhraseDictionaryDynSuffixArray::~PhraseDictionaryDynSuffixArray() -{ - delete m_biSA; -} void PhraseDictionaryDynSuffixArray::Load() { SetFeaturesToApply(); - const StaticData &staticData = StaticData::Instance(); - vector<float> weight = staticData.GetWeights(this); - - m_biSA->Load( m_input, m_output, m_source, m_target, m_alignments, weight); + vector<float> weight = StaticData::Instance().GetWeights(this); + m_biSA->Load(m_input, m_output, m_source, m_target, m_alignments, weight); } -const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCollection(const Phrase& src) const +PhraseDictionaryDynSuffixArray:: +~PhraseDictionaryDynSuffixArray() { - TargetPhraseCollection *ret = new TargetPhraseCollection(); - std::vector< std::pair< Scores, TargetPhrase*> > trg; - // extract target phrases and their scores from suffix array - m_biSA->GetTargetPhrasesByLexicalWeight( src, trg); + delete m_biSA; +} - std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr; - for(itr = trg.begin(); itr != trg.end(); ++itr) { - Scores scoreVector = itr->first; - TargetPhrase *targetPhrase = itr->second; - //std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),NegateScore); - std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); +void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "source") { + m_source = value; + } else if (key == "target") { + m_target = value; + } else if (key == "alignment") { + m_alignments = value; + } else { + PhraseDictionary::SetParameter(key, value); + } +} - targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); - targetPhrase->Evaluate(src); +const TargetPhraseCollection* +PhraseDictionaryDynSuffixArray:: +GetTargetPhraseCollection(const Phrase& src) const +{ + typedef map<SAPhrase, vector<float> >::value_type pstat_entry; + map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics + m_biSA->GatherCands(src,pstats); - //cout << *targetPhrase << "\t" << std::setprecision(8) << scoreVector[2] << endl; - ret->Add(targetPhrase); + TargetPhraseCollection *ret = new TargetPhraseCollection(); + BOOST_FOREACH(pstat_entry & e, pstats) { + TargetPhrase* tp = 
m_biSA->GetMosesFactorIDs(e.first, src); + tp->GetScoreBreakdown().Assign(this,e.second); + ret->Add(tp); } - ret->NthElement(m_tableLimit); // sort the phrases for the dcoder + // return ret; + // TargetPhraseCollection *ret = new TargetPhraseCollection(); + // std::vector< std::pair< Scores, TargetPhrase*> > trg; + // + // // extract target phrases and their scores from suffix array + // m_biSA->GetTargetPhrasesByLexicalWeight(src, trg); + // + // std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr; + // for(itr = trg.begin(); itr != trg.end(); ++itr) { + // Scores scoreVector = itr->first; + // TargetPhrase *targetPhrase = itr->second; + // std::transform(scoreVector.begin(),scoreVector.end(), + // scoreVector.begin(),FloorScore); + // targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); + // targetPhrase->Evaluate(); + // ret->Add(targetPhrase); + // } + ret->NthElement(m_tableLimit); // sort the phrases for the decoder return ret; } -void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment) +void +PhraseDictionaryDynSuffixArray:: +insertSnt(string& source, string& target, string& alignment) { m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache } -void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */) -{ - // need to implement -- -} -ChartRuleLookupManager *PhraseDictionaryDynSuffixArray::CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&) +void +PhraseDictionaryDynSuffixArray:: +deleteSnt(unsigned /* idx */, unsigned /* num2Del */) { - throw "Chart decoding not supported by PhraseDictionaryDynSuffixArray"; + // need to implement -- } -void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value) +ChartRuleLookupManager* +PhraseDictionaryDynSuffixArray:: +CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&) { - if (key == "source") { - m_source = value; - } else if (key == "target") { - m_target = value; - } else if (key == "alignment") { - m_alignments = value; - } else { - PhraseDictionary::SetParameter(key, value); - } + CHECK(false); + return 0; } }// end namepsace diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h index 2c9a9ceca..8cd761350 100644 --- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h +++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h @@ -17,21 +17,19 @@ class PhraseDictionaryDynSuffixArray: public PhraseDictionary public: PhraseDictionaryDynSuffixArray(const std::string &line); ~PhraseDictionaryDynSuffixArray(); - + bool InitDictionary(); void Load(); - // functions below required by base class const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const; void insertSnt(string&, string&, string&); void deleteSnt(unsigned, unsigned); ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&); - void SetParameter(const std::string& key, const std::string& value); - private: BilingualDynSuffixArray *m_biSA; std::string m_source, m_target, m_alignments; + std::vector<float> m_weight; }; } // end namespace diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp index 2f05e92b6..06e931fab 100644 --- 
a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp @@ -31,7 +31,8 @@ using namespace std; namespace Moses { PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line) - : MyBase("PhraseDictionaryOnDisk", line) { + : MyBase("PhraseDictionaryOnDisk", line) +{ ReadParameters(); } diff --git a/moses/TranslationModel/RuleTable/Trie.h b/moses/TranslationModel/RuleTable/Trie.h index 265747260..0d363f747 100644 --- a/moses/TranslationModel/RuleTable/Trie.h +++ b/moses/TranslationModel/RuleTable/Trie.h @@ -48,12 +48,6 @@ public: void Load(); - // Required by PhraseDictionary. - virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const { - CHECK(false); - return NULL; - } - private: friend class RuleTableLoader; diff --git a/moses/TranslationModel/WordCoocTable.cpp b/moses/TranslationModel/WordCoocTable.cpp new file mode 100644 index 000000000..c3223942e --- /dev/null +++ b/moses/TranslationModel/WordCoocTable.cpp @@ -0,0 +1,72 @@ +#include "moses/TranslationModel/WordCoocTable.h" +using namespace std; +namespace Moses +{ + +WordCoocTable:: +WordCoocTable() +{ + m_cooc.reserve(1000000); + m_marg1.reserve(1000000); + m_marg2.reserve(1000000); +} + +WordCoocTable:: +WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2) + : m_cooc(VocabSize1), m_marg1(VocabSize1,0), m_marg2(VocabSize2, 0) +{} + +void +WordCoocTable:: +Count(size_t const a, size_t const b) +{ + while (a >= m_marg1.size()) { + m_cooc.push_back(my_map_t()); + m_marg1.push_back(0); + } + while (b >= m_marg2.size()) + m_marg2.push_back(0); + ++m_marg1[a]; + ++m_marg2[b]; + ++m_cooc[a][b]; +} + +uint32_t +WordCoocTable:: +GetJoint(size_t const a, size_t const b) const +{ + if (a >= m_marg1.size() || b >= m_marg2.size()) return 0; + my_map_t::const_iterator m = m_cooc.at(a).find(b); + if (m == m_cooc[a].end()) return 0; + return m->second; +} + +uint32_t +WordCoocTable:: +GetMarg1(size_t const x) const +{ + return x >= m_marg1.size() ? 0 : m_marg1[x]; +} + +uint32_t +WordCoocTable:: +GetMarg2(size_t const x) const +{ + return x >= m_marg2.size() ? 
0 : m_marg2[x]; +} + +float +WordCoocTable:: +pfwd(size_t const a, size_t const b) const +{ + return float(GetJoint(a,b))/GetMarg1(a); +} + +float +WordCoocTable:: +pbwd(size_t const a, size_t const b) const +{ + // cerr << "at " << __FILE__ << ":" << __LINE__ << endl; + return float(GetJoint(a,b))/GetMarg2(b); +} +} diff --git a/moses/TranslationModel/WordCoocTable.h b/moses/TranslationModel/WordCoocTable.h new file mode 100644 index 000000000..60d788bd3 --- /dev/null +++ b/moses/TranslationModel/WordCoocTable.h @@ -0,0 +1,72 @@ +#ifndef moses_WordCoocTable_h +#define moses_WordCoocTable_h + +#include "moses/TranslationModel/DynSAInclude/vocab.h" +#include "moses/TranslationModel/DynSAInclude/types.h" +#include "moses/TranslationModel/DynSAInclude/utils.h" +#include "moses/InputFileStream.h" +#include "moses/FactorTypeSet.h" +#include "moses/TargetPhrase.h" +#include <boost/dynamic_bitset.hpp> +#include <map> + +namespace Moses +{ + +using namespace std; + +#ifndef bitvector +typedef boost::dynamic_bitset<uint64_t> bitvector; +#endif + + +/** + * Stores word cooccurrence counts + * @todo ask Uli Germann + */ +class WordCoocTable +{ + typedef map<wordID_t,uint32_t> my_map_t; + vector<my_map_t> m_cooc; + vector<uint32_t> m_marg1; + vector<uint32_t> m_marg2; +public: + WordCoocTable(); + WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2); + uint32_t GetJoint(size_t const a, size_t const b) const; + uint32_t GetMarg1(size_t const x) const; + uint32_t GetMarg2(size_t const x) const; + float pfwd(size_t const a, size_t const b) const; + float pbwd(size_t const a, size_t const b) const; + void + Count(size_t const a, size_t const b); + + template<typename idvec, typename alnvec> + void + Count(idvec const& s1, idvec const& s2, alnvec const& aln, + wordID_t const NULL1, wordID_t const NULL2); + +}; + +template<typename idvec, typename alnvec> +void +WordCoocTable:: +Count(idvec const& s1, idvec const& s2, alnvec const& aln, + wordID_t const NULL1, wordID_t const NULL2) +{ + boost::dynamic_bitset<uint64_t> check1(s1.size()), check2(s2.size()); + check1.set(); + check2.set(); + for (size_t i = 0; i < aln.size(); i += 2) { + Count(s1[aln[i]], s2[aln[i+1]]); + check1.reset(aln[i]); + check2.reset(aln[i+1]); + } + for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i)) + Count(s1[i], NULL2); + for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i)) + Count(NULL1, s2[i]); +} + +} +#endif diff --git a/moses/TypeDef.h b/moses/TypeDef.h index 2b98b5bc3..af3a47b23 100644 --- a/moses/TypeDef.h +++ b/moses/TypeDef.h @@ -121,6 +121,7 @@ enum PhraseTableImplementation { ,FuzzyMatch = 11 ,Compact = 12 ,Interpolated = 13 + ,DSuffixArray = 14 }; enum InputTypeEnum { diff --git a/moses/generic/sampling/Sampling.h b/moses/generic/sampling/Sampling.h new file mode 100644 index 000000000..69796c96e --- /dev/null +++ b/moses/generic/sampling/Sampling.h @@ -0,0 +1,51 @@ +#ifndef __sampling_h +#define __sampling_h + +// Utility functions for proper sub-sampling. +// (c) 2007-2012 Ulrich Germann + + +namespace Moses +{ + +inline +size_t +randInt(size_t N) +{ + return N*(rand()/(RAND_MAX+1.)); +} + +// select a random sample of size /s/ without restitution from the range of +// integers [0,N); +template<typename idx_t> +void +randomSample(vector<idx_t>& v, size_t s, size_t N) +{ + // see also Knuth: Art of Computer Programming Vol. 2, p. 
142 + + s = min(s,N); + v.resize(s); + + // the first option tries to be a bit more efficient than O(N) in picking + // the samples. The threshold is an ad-hoc, off-the-cuff guess. I still + // need to figure out the optimal break-even point between a linear sweep + // and repeatedly picking random numbers with the risk of hitting the same + // number many times. + if (s*10<N) { + boost::dynamic_bitset<uint64_t> check(N,0); + for (size_t i = 0; i < v.size(); i++) { + size_t x = randInt(N); + while (check[x]) x = randInt(N); + check[x]=true; + v[i] = x; + } + } else { + size_t m=0; + for (size_t t = 0; m <= s && t < N; t++) + if (s==N || randInt(N-t) < s-m) v[m++] = t; + } +} + +}; + +#endif diff --git a/moses/generic/sorting/NBestList.h b/moses/generic/sorting/NBestList.h new file mode 100644 index 000000000..9c58baa88 --- /dev/null +++ b/moses/generic/sorting/NBestList.h @@ -0,0 +1,85 @@ +#ifndef __n_best_list_h +#define __n_best_list_h +#include <algorithm> +#include "moses/generic/sorting/VectorIndexSorter.h" + +// NBest List; (c) 2007-2012 Ulrich Germann +// +// The 'trick' used in this implementation is to maintain a heap of size <= N +// such that the lowest-scoring item is on top of the heap. For each incoming +// item we can then determine easily if it is in the top N. + +namespace Moses +{ +using namespace std; + +template<typename THINGY, typename CMP> +class + NBestList +{ + vector<uint32_t> m_heap; + vector<THINGY> m_list; + VectorIndexSorter<THINGY, CMP, uint32_t> m_better; + mutable vector<uint32_t> m_order; + mutable bool m_changed; +public: + NBestList(size_t const max_size, CMP const& cmp); + NBestList(size_t const max_size); + bool add(THINGY const& item); + THINGY const& operator[](int i) const; + size_t size() const { + return m_heap.size(); + } +}; + +template<typename THINGY, typename CMP> +NBestList<THINGY,CMP>:: +NBestList(size_t const max_size, CMP const& cmp) + : m_better(m_list, cmp), m_changed(false) +{ + m_heap.reserve(max_size); +} + +template<typename THINGY, typename CMP> +NBestList<THINGY,CMP>:: +NBestList(size_t const max_size) + : m_better(m_heap), m_changed(false) +{ + m_heap.reserve(max_size); +} + +template<typename THINGY, typename CMP> +bool +NBestList<THINGY,CMP>:: +add(THINGY const& item) +{ + if (m_heap.size() == m_heap.capacity()) { + if (m_better.Compare(item, m_list[m_heap.at(0)])) { + pop_heap(m_heap.begin(),m_heap.end(),m_better); + m_list[m_heap.back()] = item; + } else return false; + } else { + m_list.push_back(item); + m_heap.push_back(m_heap.size()); + } + push_heap(m_heap.begin(),m_heap.end(),m_better); + return m_changed = true; +} + +template<typename THINGY, typename CMP> +THINGY const& +NBestList<THINGY,CMP>:: +operator[](int i) const +{ + if (m_changed) { + m_order.assign(m_heap.begin(),m_heap.end()); + for (size_t k = m_heap.size(); k != 0; --k) + pop_heap(m_order.begin(), m_order.begin()+k); + m_changed = false; + } + if (i < 0) i += m_order.size(); + return m_list[m_order.at(i)]; +} + +} +#endif diff --git a/moses/generic/sorting/VectorIndexSorter.h b/moses/generic/sorting/VectorIndexSorter.h new file mode 100644 index 000000000..a689c417b --- /dev/null +++ b/moses/generic/sorting/VectorIndexSorter.h @@ -0,0 +1,69 @@ +#ifndef __vector_index_sorter_h +#define __vector_index_sorter_h + +// VectorIndexSorter; (c) 2007-2012 Ulrich Germann + +// A VectorIndexSorter is a function object for sorting indices into a vector +// of objects (instead of sorting the vector itself). 
+//
+// typical use:
+// vector<thingy> my_vector;
+// VectorIndexSorter<thingy,less<thingy>,int> sorter(my_vector);
+// vector<int> order;
+// sorter.GetOrder(order);
+
+namespace Moses
+{
+template<typename VAL, typename COMP = greater<VAL>, typename IDX_T=size_t>
+class
+  VectorIndexSorter : public binary_function<IDX_T const&, IDX_T const&, bool>
+{
+  vector<VAL> const& m_vecref;
+  boost::shared_ptr<COMP> m_comp;
+public:
+
+  COMP const& Compare;
+  VectorIndexSorter(vector<VAL> const& v, COMP const& comp)
+    : m_vecref(v), Compare(comp)
+  { }
+
+  VectorIndexSorter(vector<VAL> const& v)
+    : m_vecref(v), m_comp(new COMP()), Compare(*m_comp)
+  { }
+
+  bool operator()(IDX_T const& a, IDX_T const& b) const {
+    bool fwd = Compare(m_vecref.at(a), m_vecref.at(b));
+    bool bwd = Compare(m_vecref[b], m_vecref[a]);
+    return (fwd == bwd ? a < b : fwd);
+  }
+
+  boost::shared_ptr<vector<IDX_T> >
+  GetOrder() const;
+
+  void
+  GetOrder(vector<IDX_T> & order) const;
+
+};
+
+template<typename VAL, typename COMP, typename IDX_T>
+boost::shared_ptr<vector<IDX_T> >
+VectorIndexSorter<VAL,COMP,IDX_T>::
+GetOrder() const
+{
+  boost::shared_ptr<vector<IDX_T> > ret(new vector<IDX_T>(m_vecref.size()));
+  GetOrder(*ret);
+  return ret;
+}
+
+template<typename VAL, typename COMP, typename IDX_T>
+void
+VectorIndexSorter<VAL,COMP,IDX_T>::
+GetOrder(vector<IDX_T> & order) const
+{
+  order.resize(m_vecref.size());
+  for (IDX_T i = 0; i < IDX_T(m_vecref.size()); ++i) order[i] = i;
+  sort(order.begin(), order.end(), *this);
+}
+
+}
+#endif
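
The new ComparePosition comparator and DynSuffixArray::GetCount in this patch count the occurrences of a phrase by binary-searching the suffix array with lower_bound/upper_bound rather than enumerating positions. The standalone sketch below illustrates the same idea on plain vectors of word IDs; it does not use the Moses classes, the names SuffixVsPhrase and CountOccurrences are illustrative only, and the suffix array is assumed to be fully sorted over the corpus.

#include <algorithm>
#include <cstddef>
#include <vector>

typedef unsigned wid_t;

// Compare a suffix (starting at corpus position i) against a query phrase,
// mirroring the two asymmetric comparisons ComparePosition provides.
struct SuffixVsPhrase {
  const std::vector<wid_t>& crp;   // the corpus as one flat sequence of word IDs
  explicit SuffixVsPhrase(const std::vector<wid_t>& c) : crp(c) {}

  // suffix < phrase ?
  bool operator()(unsigned i, const std::vector<wid_t>& p) const {
    for (size_t k = 0; k < p.size(); ++k) {
      if (i + k >= crp.size()) return true;        // suffix is a proper prefix
      if (crp[i + k] != p[k]) return crp[i + k] < p[k];
    }
    return false;                                   // phrase is a prefix of the suffix
  }
  // phrase < suffix ?
  bool operator()(const std::vector<wid_t>& p, unsigned i) const {
    for (size_t k = 0; k < p.size(); ++k) {
      if (i + k >= crp.size()) return false;
      if (crp[i + k] != p[k]) return p[k] < crp[i + k];
    }
    return false;
  }
};

// Number of occurrences of phrase in corpus, given a sorted suffix array sa.
size_t CountOccurrences(const std::vector<wid_t>& corpus,
                        const std::vector<unsigned>& sa,
                        const std::vector<wid_t>& phrase) {
  SuffixVsPhrase cmp(corpus);
  std::vector<unsigned>::const_iterator lb =
      std::lower_bound(sa.begin(), sa.end(), phrase, cmp);
  std::vector<unsigned>::const_iterator ub =
      std::upper_bound(sa.begin(), sa.end(), phrase, cmp);
  return ub - lb;                                   // matches form one contiguous block
}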
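
PhraseDictionaryDynSuffixArray.README and the new DSuffixArray = 14 value in TypeDef.h together describe how the suffix-array phrase table is requested through the legacy [ttable-file] syntax. A hypothetical moses.ini fragment following that format might look like the lines below; the three file names are placeholders, not files shipped with Moses, and the leading 14 selects the DSuffixArray implementation while the trailing entries name the source text, target text, and symal-format alignment file as stated in the README.

[ttable-file]
14 0 0 5 corpus.fr corpus.en corpus.fr-en.symal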
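
WordCoocTable accumulates joint counts m_cooc[a][b] plus the two marginals from symal-style alignments (unaligned words are paired with a NULL token), and pfwd/pbwd then return joint-over-marginal ratios. The following minimal sketch of the same bookkeeping is self-contained and assumes small integer word IDs; the names CoocCounts, src, trg, and aln are illustrative, not the Moses API.

#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

typedef unsigned wid_t;

// Minimal stand-in for WordCoocTable: joint counts plus the two marginals.
struct CoocCounts {
  std::vector<std::map<wid_t, unsigned> > joint;  // joint[a][b]
  std::vector<unsigned> marg1, marg2;             // source / target marginals

  void Count(wid_t a, wid_t b) {
    if (a >= marg1.size()) { joint.resize(a + 1); marg1.resize(a + 1, 0); }
    if (b >= marg2.size()) marg2.resize(b + 1, 0);
    ++marg1[a]; ++marg2[b]; ++joint[a][b];
  }
  // Forward lexical probability estimate, joint(a,b) / marg1(a), like pfwd().
  float pfwd(wid_t a, wid_t b) const {
    std::map<wid_t, unsigned>::const_iterator m = joint[a].find(b);
    return m == joint[a].end() ? 0.f : float(m->second) / marg1[a];
  }
};

int main() {
  // Hypothetical sentence pair of word IDs with a symal-style alignment given
  // as flattened (source index, target index) pairs, as in
  // WordCoocTable::Count(s1, s2, aln, NULL1, NULL2).
  wid_t src[] = {3, 7, 5};
  wid_t trg[] = {11, 9};
  unsigned aln[] = {0, 0, 1, 1};                  // 0-0 1-1; src word 2 is unaligned
  const wid_t NULL2 = 0;                          // ID reserved for the NULL token

  CoocCounts c;
  for (size_t i = 0; i + 1 < sizeof(aln) / sizeof(*aln); i += 2)
    c.Count(src[aln[i]], trg[aln[i + 1]]);
  c.Count(src[2], NULL2);                         // unaligned source word -> NULL

  std::cout << c.pfwd(3, 11) << std::endl;        // prints 1: ID 3 always co-occurs with 11
}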
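
randomSample in the new Sampling.h mixes two strategies: when the requested sample is much smaller than the population it repeatedly draws random indices and rejects duplicates, otherwise it makes a single linear sweep (Knuth's selection sampling, TAOCP Vol. 2). A standalone sketch of the same two-regime idea follows; it uses a std::set where the original uses a boost::dynamic_bitset, rand() is assumed to be an acceptable randomness source for illustration, and the names RandInt and RandomSample are illustrative only.

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <set>
#include <vector>

// Uniform integer in [0, N), mirroring randInt() in Sampling.h.
inline size_t RandInt(size_t N) {
  return size_t(N * (std::rand() / (RAND_MAX + 1.0)));
}

// Draw s distinct indices from [0, N) without replacement.
void RandomSample(std::vector<size_t>& v, size_t s, size_t N) {
  s = std::min(s, N);
  v.resize(s);
  if (s * 10 < N) {
    // Small sample: rejection sampling against a set of already-drawn indices.
    std::set<size_t> seen;
    for (size_t i = 0; i < s; ++i) {
      size_t x = RandInt(N);
      while (!seen.insert(x).second) x = RandInt(N);
      v[i] = x;
    }
  } else {
    // Otherwise sweep 0..N-1 once, keeping index t with probability
    // (s - m) / (N - t), where m indices have been kept so far (Knuth, Alg. S).
    size_t m = 0;
    for (size_t t = 0; m < s && t < N; ++t)
      if (s == N || RandInt(N - t) < s - m) v[m++] = t;
  }
}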
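
NBestList keeps at most N items by maintaining a heap whose top is the current worst entry; a new candidate only displaces that entry when the comparator ranks it higher, and VectorIndexSorter lets the heap hold indices instead of the items themselves. The self-contained sketch below shows the same bounded-heap trick on raw float scores (higher is better) rather than the Moses templates; the class name TopN and the sample scores are illustrative only.

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

// Keep the N largest scores seen so far; the smallest kept score sits on top
// of a min-heap so it can be evicted cheaply, as described in NBestList.h.
class TopN {
  size_t m_max;
  std::vector<float> m_heap;   // min-heap via std::greater
public:
  explicit TopN(size_t n) : m_max(n) {}

  bool Add(float score) {
    if (m_heap.size() < m_max) {
      m_heap.push_back(score);
      std::push_heap(m_heap.begin(), m_heap.end(), std::greater<float>());
      return true;
    }
    if (score <= m_heap.front()) return false;    // not better than current worst
    std::pop_heap(m_heap.begin(), m_heap.end(), std::greater<float>());
    m_heap.back() = score;                        // replace the evicted worst
    std::push_heap(m_heap.begin(), m_heap.end(), std::greater<float>());
    return true;
  }

  // Kept scores in descending order, like iterating an n-best list.
  std::vector<float> Sorted() const {
    std::vector<float> out(m_heap);
    std::sort(out.begin(), out.end(), std::greater<float>());
    return out;
  }
};

int main() {
  TopN best(3);
  float scores[] = {0.2f, 0.9f, 0.1f, 0.5f, 0.7f};
  for (size_t i = 0; i < 5; ++i) best.Add(scores[i]);
  std::vector<float> top = best.Sorted();
  for (size_t i = 0; i < top.size(); ++i) std::cout << top[i] << " ";  // 0.9 0.7 0.5
  std::cout << std::endl;
}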