// PDTAimp: implementation of the members of Moses::PDTAimp defined in this file.
//
// NOTE(review): this copy of the file looks damaged and should be restored from
// version control before it is compiled or edited further:
//   * line breaks have been collapsed, so every `//` comment below runs to the
//     end of its physical line and swallows the statements that follow it;
//   * text that originally sat between '<' and '>' appears to be missing
//     (e.g. `std::vector srcString(...)`, `std::ostream_iterator(std::cerr, ...)`,
//     `for(size_t i=0; i piter;`), which also removed at least one function
//     signature and several loop headers.
// The summaries below describe only what is still visible in the text.
//
// PDTAimp::PDTAimp(PhraseDictionaryTreeAdaptor *p)
//   Stores p in m_obj, sets m_dict to 0, useCache to 1 and the totalE/distinctE
//   counters to 0. m_numInputScores stays 0 unless InputFeature::InstancePtr()
//   is non-null AND m_obj is the first entry of PhraseDictionary::GetColl(); in
//   that case it is set to the input feature's GetNumScoreComponents().
//
// PDTAimp::~PDTAimp()
//   Calls CleanUp(), deletes m_dict, and when the verbose level is >= 2 writes
//   candidate and path statistics (pathCN, pathExplored, ...) through TRACE_ERR.
//   CleanUp() asserts m_dict, so this presumably requires that Create() ran
//   first -- TODO confirm.
//
// void PDTAimp::CleanUp()
//   Asserts m_dict and calls m_dict->FreeMemory(). The rest of the body is not
//   visible in this copy.
//
// Phrase lookup routine (signature missing from this copy; takes `src`, returns `ret`)
//   With useCache set, inserts (src, ret) into m_cache and returns the cached
//   entry if the key was already present; without useCache it only reads from a
//   non-empty m_cache. Otherwise it asks m_dict->GetTargetCandidates() for
//   candidates, maps the scores through TransformScore and FloorScore, assigns
//   the sparse features (commented in the code as already being in log space),
//   builds each TargetPhrase with CreateTargetPhrase() and prunes the list with
//   PruneTargetCandidates(). An empty result is returned as a reset pointer; a
//   non-empty one is stored in the cache (if enabled) and in m_tgtColls.
//
// void PDTAimp::Create(input, output, filePath, weight)
//   Allocates a new PhraseDictionaryTree into m_dict, records input/output in
//   m_input/m_output and forwards StaticData's NeedAlignmentInfo() flag.
//   Throws via UTIL_THROW2 if filePath + ".binphr.idx" does not exist, then
//   loads the table with m_dict->Read(filePath) and calls exit(1) if that
//   fails. `weight` is not referenced in the visible text.
//
// void PDTAimp::CacheSource(ConfusionNet const& src)
//   Asserts m_dict, gathers path statistics for the confusion net, then walks
//   the net depth-first using a std::vector of State as a stack, seeded with one
//   State per start position at m_dict->GetRoot(). For each arc it either stays
//   on the current prefix-tree node (epsilon) or follows m_dict->Extend(); it
//   sums the arc's dense scores into the running input scores, pushes a new
//   State while the range can still grow, and collects target candidates per
//   covered range in cov2cand. Each candidate's score is the weighted
//   translation score plus the weighted input score minus
//   tokens.size() * word-penalty weight. Finally every range's candidates are
//   turned into TargetPhrases, pruned, stored in m_rangeCache / m_tgtColls, and
//   m_dict->FreeMemory() is called.
//
// void PDTAimp::CreateTargetPhrase(...) const
//   Adds one Word to targetPhrase per entry of factorStrings, using
//   factorDelimiter to separate factors. Most of the remaining body is not
//   visible here.
//
// PDTAimp::PruneTargetCandidates(tCands, costs, sourcePhrases) const
//   Throws via UTIL_THROW_IF2 when tCands and sourcePhrases differ in size.
//   Keeps the first n entries after NTH_ELEMENT3 partitions `costs`, where n is
//   m_obj->m_tableLimit if that is > 0 and smaller than costs.size(), otherwise
//   costs.size() (a limit of 0 means "no limit"). Each kept candidate is copied
//   into a heap-allocated TargetPhrase and handed to rv->Add() together with
//   its source phrase.
#include "PDTAimp.h" namespace Moses { PDTAimp::PDTAimp(PhraseDictionaryTreeAdaptor *p) : m_dict(0), m_obj(p), useCache(1), totalE(0), distinctE(0) { m_numInputScores = 0; m_inputFeature = InputFeature::InstancePtr(); if (m_inputFeature) { const PhraseDictionary *firstPt = PhraseDictionary::GetColl()[0]; if (firstPt == m_obj) { m_numInputScores = m_inputFeature->GetNumScoreComponents(); } } } PDTAimp::~PDTAimp() { CleanUp(); delete m_dict; if (StaticData::Instance().GetVerboseLevel() >= 2) { TRACE_ERR("tgt candidates stats: total="<(std::cerr," \t")); TRACE_ERR("\n"); } if(pathCN.size()) { TRACE_ERR("CN (full): "); std::transform(pathCN.begin()+1 ,pathCN.end() ,std::ostream_iterator(std::cerr," \t") ,Exp); TRACE_ERR("\n"); } if(pathExplored.size()) { TRACE_ERR("CN (explored): "); std::copy(pathExplored.begin()+1,pathExplored.end(), std::ostream_iterator(std::cerr," \t")); TRACE_ERR("\n"); } } } void PDTAimp::CleanUp() { assert(m_dict); m_dict->FreeMemory(); // for(size_t i=0; i piter; if(useCache) { piter=m_cache.insert(std::make_pair(src, ret)); if(!piter.second) return piter.first->second; } else if (m_cache.size()) { MapSrc2Tgt::const_iterator i=m_cache.find(src); return (i!=m_cache.end() ? 
i->second : ret); } std::vector srcString(src.GetSize()); // convert source Phrase into vector of strings for(size_t i=0; i cands; std::vector wacands; m_dict->GetTargetCandidates(srcString,cands,wacands); if(cands.empty()) { return ret; } //TODO: Multiple models broken here std::vector weights = StaticData::Instance().GetWeights(m_obj); std::vector tCands; tCands.reserve(cands.size()); std::vector > costs; costs.reserve(cands.size()); std::vector sourcePhrases; sourcePhrases.reserve(cands.size()); // convert into TargetPhrases std::string fd = m_obj->options()->output.factor_delimiter; for(size_t i=0; i scoreVector(probVector.size()); std::transform(probVector.begin(),probVector.end(),scoreVector.begin(), TransformScore); std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(), FloorScore); //sparse features. //These are already in log-space for (size_t j = 0; j < cands[i].fnames.size(); ++j) { targetPhrase.GetScoreBreakdown().Assign(m_obj, *cands[i].fnames[j], cands[i].fvalues[j]); } CreateTargetPhrase(targetPhrase,factorStrings, fd, scoreVector, Scores(0), &wacands[i], &src); costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size())); tCands.push_back(targetPhrase); sourcePhrases.push_back(src); } ret = PruneTargetCandidates(tCands,costs, sourcePhrases); if(ret->IsEmpty()) { ret.reset(); } else { if(useCache) piter.first->second = ret; m_tgtColls.push_back(ret); } return ret; } void PDTAimp::Create(const std::vector &input , const std::vector &output , const std::string &filePath , const std::vector &weight ) { // set my members m_dict=new PhraseDictionaryTree(); m_input=input; m_output=output; const StaticData &staticData = StaticData::Instance(); m_dict->NeedAlignmentInfo(staticData.NeedAlignmentInfo()); std::string binFname=filePath+".binphr.idx"; if(!FileExists(binFname.c_str())) { UTIL_THROW2( "bin ttable does not exist"); //TRACE_ERR( "bin ttable does not exist -> create it\n"); //InputFileStream in(filePath); 
//m_dict->Create(in,filePath); } VERBOSE(1,"reading bin ttable\n"); // m_dict->Read(filePath); bool res=m_dict->Read(filePath); if (!res) { std::cerr << "bin ttable was read in a wrong way\n"; exit(1); } } void PDTAimp::CacheSource(ConfusionNet const& src) { assert(m_dict); const size_t srcSize=src.GetSize(); std::vector exploredPaths(srcSize+1,0); std::vector exPathsD(srcSize+1,-1.0); // collect some statistics std::vector cnDepths(srcSize,0); for(size_t i=0; i=0.0 ? addLogScale(pd,exPathsD[len]) : pd); } // update global statistics if(pathCN.size()<=srcSize) pathCN.resize(srcSize+1,-1.0); for(size_t len=1; len<=srcSize; ++len) pathCN[len]=pathCN[len]>=0.0 ? addLogScale(pathCN[len],exPathsD[len]) : exPathsD[len]; if(path1Best.size()<=srcSize) path1Best.resize(srcSize+1,0); for(size_t len=1; len<=srcSize; ++len) path1Best[len]+=srcSize-len+1; if (StaticData::Instance().GetVerboseLevel() >= 2 && exPathsD.size()) { TRACE_ERR("path stats for current CN: \nCN (full): "); std::transform(exPathsD.begin()+1 ,exPathsD.end() ,std::ostream_iterator(std::cerr," ") ,Exp); TRACE_ERR("\n"); } typedef std::map E2Costs; std::map cov2cand; std::vector stack; for(Position i=0 ; i < srcSize ; ++i) stack.push_back(State(i, i, m_dict->GetRoot(), std::vector(m_numInputScores,0.0))); std::vector weightTrans = StaticData::Instance().GetWeights(m_obj); std::vector weightInput = StaticData::Instance().GetWeights(m_inputFeature); float weightWP = StaticData::Instance().GetWeightWordPenalty(); while(!stack.empty()) { State curr(stack.back()); stack.pop_back(); UTIL_THROW_IF2(curr.end() >= srcSize, "Error"); const ConfusionNet::Column &currCol=src[curr.end()]; // in a given column, loop over all possibilities for(size_t colidx=0; colidx0) continue; // At a given node in the prefix tree, look to see if w defines an edge to // another node (Extend). Stay at the same node if w==EPSILON PPtr nextP = (isEpsilon ? 
curr.ptr : m_dict->Extend(curr.ptr,s)); if(nextP) { // w is a word that should be considered Range newRange(curr.begin(),curr.end()+src.GetColumnIncrement(curr.end(),colidx)); //add together the link scores from the current state and the new arc float inputScoreSum = 0; std::vector newInputScores(m_numInputScores,0.0); if (m_numInputScores) { std::transform(currCol[colidx].second.denseScores.begin(), currCol[colidx].second.denseScores.end(), curr.GetScores().begin(), newInputScores.begin(), std::plus()); //we need to sum up link weights (excluding realWordCount, which isn't in numLinkParams) //if the sum is too low, then we won't expand this. //TODO: dodgy! shouldn't we consider weights here? what about zero-weight params? inputScoreSum = std::accumulate(newInputScores.begin(),newInputScores.begin()+m_numInputScores,0.0); } Phrase newSrc(curr.src); if(!isEpsilon) newSrc.AddWord(w); if(newRange.secondLOWEST_SCORE) { // if there is more room to grow, add a new state onto the queue // to be explored that represents [begin, curEnd+) stack.push_back(State(newRange,nextP,newInputScores)); stack.back().src=newSrc; } std::vector tcands; // now, look up the target candidates (aprx. 
TargetPhraseCollection) for // the current path through the CN m_dict->GetTargetCandidates(nextP,tcands); if(newRange.second>=exploredPaths.size()+newRange.first) exploredPaths.resize(newRange.second-newRange.first+1,0); ++exploredPaths[newRange.second-newRange.first]; totalE+=tcands.size(); if(tcands.size()) { E2Costs& e2costs=cov2cand[newRange]; Phrase const* srcPtr=uniqSrcPhr(newSrc); for(size_t i=0; i transcores(m_obj->GetNumScoreComponents()); UTIL_THROW_IF2(transcores.size() != weightTrans.size(), "Incorrect number of translation scores"); //put in phrase table scores, logging as we insert std::transform(tcands[i].scores.begin() ,tcands[i].scores.end() ,transcores.begin() ,TransformScore); //tally up float score=std::inner_product(transcores.begin(), transcores.end(), weightTrans.begin(), 0.0f); // input feature score +=std::inner_product(newInputScores.begin(), newInputScores.end(), weightInput.begin(), 0.0f); //count word penalty score-=tcands[i].tokens.size() * weightWP; std::pair p=e2costs.insert(std::make_pair(tcands[i].tokens,TScores())); if(p.second) ++distinctE; TScores & scores=p.first->second; if(p.second || scores.total= 2 && exploredPaths.size()) { TRACE_ERR("CN (explored): "); std::copy(exploredPaths.begin()+1,exploredPaths.end(), std::ostream_iterator(std::cerr," ")); TRACE_ERR("\n"); } if(pathExplored.size()::const_iterator i=cov2cand.begin(); i!=cov2cand.end(); ++i) { assert(i->first.firstfirst.second>0); assert(static_cast(i->first.second-1)first.first].size()); assert(m_rangeCache[i->first.first][i->first.second-1]==0); std::vector tCands; tCands.reserve(i->second.size()); std::vector > costs; costs.reserve(i->second.size()); std::vector sourcePhrases; sourcePhrases.reserve(i->second.size()); for(E2Costs::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) { TScores const & scores=j->second; TargetPhrase targetPhrase(m_obj); CreateTargetPhrase(targetPhrase , j ->first , m_obj->options()->output.factor_delimiter , scores.transScore , 
scores.inputScores , NULL , scores.src); costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size())); tCands.push_back(targetPhrase); sourcePhrases.push_back(*scores.src); //std::cerr << i->first.first << "-" << i->first.second << ": " << targetPhrase << std::endl; } TargetPhraseCollectionWithSourcePhrase::shared_ptr rv = PruneTargetCandidates(tCands, costs, sourcePhrases); if(rv->IsEmpty()) rv.reset(); else { m_rangeCache[i->first.first][i->first.second-1]=rv; m_tgtColls.push_back(rv); } } // free memory m_dict->FreeMemory(); } void PDTAimp::CreateTargetPhrase(TargetPhrase& targetPhrase, StringTgtCand::Tokens const& factorStrings, std::string const& factorDelimiter, Scores const& transVector, Scores const& inputVector, const std::string *alignmentString, Phrase const* srcPtr) const { FactorCollection &factorCollection = FactorCollection::Instance(); for(size_t k=0; k word(*factorStrings[k], factorDelimiter); Word& w=targetPhrase.AddWord(); for(size_t l=0; lGetFeaturesToApply()); } TargetPhraseCollectionWithSourcePhrase::shared_ptr PDTAimp::PruneTargetCandidates (const std::vector & tCands, std::vector >& costs, const std::vector &sourcePhrases) const { // convert into TargetPhraseCollection UTIL_THROW_IF2(tCands.size() != sourcePhrases.size(), "Number of target phrases must equal number of source phrases"); TargetPhraseCollectionWithSourcePhrase::shared_ptr rv; rv.reset(new TargetPhraseCollectionWithSourcePhrase); // set limit to tableLimit or actual size, whatever is smaller std::vector >::iterator nth = costs.begin() + ((m_obj->m_tableLimit>0 && // 0 indicates no limit m_obj->m_tableLimit < costs.size()) ? 
m_obj->m_tableLimit : costs.size()); // find the nth phrase according to future cost NTH_ELEMENT3(costs.begin(),nth ,costs.end()); // add n top phrases to the return list for(std::vector >::iterator it = costs.begin(); it != nth; ++it) { size_t ind = it->second; TargetPhrase *targetPhrase = new TargetPhrase(tCands[ind]); const Phrase &sourcePhrase = sourcePhrases[ind]; rv->Add(targetPhrase, sourcePhrase); } return rv; } }