// $Id$
// vim:tabstop=2

#pragma once

// standard headers used directly in this file
#include <sys/stat.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <iostream>
#include <iterator>
#include <map>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

#include "StaticData.h" // needed for factor splitter

inline bool existsFile(const char* filePath) {
	struct stat mystat;
	return (stat(filePath,&mystat)==0);
}

// returns log(exp(x)+exp(y)) without leaving the log domain;
// anchor on the larger argument so that exp() sees a non-positive
// exponent and cannot overflow
inline double addLogScale(double x,double y)
{
	if(x>y) return addLogScale(y,x);
	else return y+log(1.0+exp(x-y));
}

inline double Exp(double x)
{
	return exp(x);
}
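// Illustrative sketch, not part of the original Moses code: addLogScale adds
// two probabilities that are stored in the log domain, i.e. given x=log(a)
// and y=log(b) it returns log(a+b) while never materialising a or b, so sums
// of tiny probabilities do not underflow the way exp(x)+exp(y) would. The
// function name below is hypothetical and exists only for exposition.
inline void addLogScaleExample()
{
	double la = -1000.0;                  // log(a); exp(-1000.0) underflows to 0.0
	double lb = -1001.0;                  // log(b)
	double naive  = log(exp(la)+exp(lb)); // log(0.0+0.0) == -inf: information lost
	double stable = addLogScale(la,lb);   // ~ -999.687, i.e. log(a+b) as intended
	std::cerr << "naive=" << naive << " stable=" << stable << "\n";
}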
class PDTAimp {
	// only these classes are allowed to instantiate this class
	friend class PhraseDictionaryTreeAdaptor;

protected:
	PDTAimp(PhraseDictionaryTreeAdaptor *p,unsigned nis)
		: m_languageModels(0),m_weightWP(0.0),m_dict(0),
			m_obj(p),useCache(1),m_numInputScores(nis),totalE(0),distinctE(0) {}

public:
	std::vector<float> m_weights;
	LMList const* m_languageModels;
	float m_weightWP;
	std::vector<FactorType> m_input,m_output;
	PhraseDictionaryTree *m_dict;
	typedef std::vector<TargetPhraseCollection const*> vTPC;
	mutable vTPC m_tgtColls;

	typedef std::map<Phrase,TargetPhraseCollection const*> MapSrc2Tgt;
	mutable MapSrc2Tgt m_cache;
	PhraseDictionaryTreeAdaptor *m_obj;
	int useCache;

	std::vector<vTPC> m_rangeCache;
	unsigned m_numInputScores;

	UniqueObjectManager<Phrase> uniqSrcPhr;

	size_t totalE,distinctE;
	std::vector<size_t> path1Best,pathExplored;
	std::vector<double> pathCN;

	~PDTAimp()
	{
		CleanUp();
		delete m_dict;

		if (StaticData::Instance().GetVerboseLevel() >= 2)
			{
				TRACE_ERR("tgt candidates stats:  total="<<totalE<<";  distinct="
									<<distinctE<<" ("<<distinctE/(0.01*totalE)<<"%);  duplicates="
									<<totalE-distinctE<<" ("<<(totalE-distinctE)/(0.01*totalE)<<"%)\n");

				TRACE_ERR("\npath statistics\n");

				if(path1Best.size())
					{
						TRACE_ERR("1-best:        ");
						std::copy(path1Best.begin()+1,path1Best.end(),
											std::ostream_iterator<size_t>(std::cerr," \t"));
						TRACE_ERR("\n");
					}
				if(pathCN.size())
					{
						TRACE_ERR("CN (full):     ");
						std::transform(pathCN.begin()+1
													 ,pathCN.end()
													 ,std::ostream_iterator<double>(std::cerr," \t")
													 ,Exp);
						TRACE_ERR("\n");
					}
				if(pathExplored.size())
					{
						TRACE_ERR("CN (explored): ");
						std::copy(pathExplored.begin()+1,pathExplored.end(),
											std::ostream_iterator<size_t>(std::cerr," \t"));
						TRACE_ERR("\n");
					}
			}
	}

	void Factors2String(Word const& w,std::string& s) const
	{
		s=w.GetString(m_input,false);
	}

	void CleanUp()
	{
		assert(m_dict);
		m_dict->FreeMemory();
		for(size_t i=0;i<m_tgtColls.size();++i) delete m_tgtColls[i];
		m_tgtColls.clear();
		m_cache.clear();
		m_rangeCache.clear();
		uniqSrcPhr.clear();
	}

	void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)
	{
		std::pair<MapSrc2Tgt::iterator,bool> p
			=m_cache.insert(std::make_pair(source,static_cast<TargetPhraseCollection const*>(0)));
		if(p.second || p.first->second==0)
			{
				TargetPhraseCollection *ptr=new TargetPhraseCollection;
				ptr->Add(new TargetPhrase(targetPhrase));
				p.first->second=ptr;
				m_tgtColls.push_back(ptr);
			}
		else VERBOSE(2, "WARNING: you added an already existing phrase!\n");
	}

	TargetPhraseCollection const*
	GetTargetPhraseCollection(Phrase const &src) const
	{
		assert(m_dict);
		if(src.GetSize()==0) return 0;

		std::pair<MapSrc2Tgt::iterator,bool> piter;
		if(useCache)
			{
				piter=m_cache.insert(std::make_pair(src,static_cast<TargetPhraseCollection const*>(0)));
				if(!piter.second) return piter.first->second;
			}
		else if (m_cache.size())
			{
				MapSrc2Tgt::const_iterator i=m_cache.find(src);
				return (i!=m_cache.end() ? i->second : 0);
			}

		// convert source Phrase into vector of strings
		std::vector<std::string> srcString(src.GetSize());
		for(size_t i=0;i<srcString.size();++i)
			Factors2String(src.GetWord(i),srcString[i]);

		// get target phrases in string representation
		std::vector<StringTgtCand> cands;
		m_dict->GetTargetCandidates(srcString,cands);
		if(cands.empty())
			{
				return 0;
			}

		std::vector<TargetPhrase> tCands; tCands.reserve(cands.size());
		std::vector<std::pair<float,size_t> > costs; costs.reserve(cands.size());

		// convert into TargetPhrases
		for(size_t i=0;i<cands.size();++i)
			{
				TargetPhrase targetPhrase(Output);

				StringTgtCand::first_type const& factorStrings=cands[i].first;
				StringTgtCand::second_type const& probVector=cands[i].second;

				std::vector<float> scoreVector(probVector.size());
				std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
											 TransformScore);
				std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
											 FloorScore);
				CreateTargetPhrase(targetPhrase,factorStrings,scoreVector);
				costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),
																			 tCands.size()));
				tCands.push_back(targetPhrase);
			}

		TargetPhraseCollection *rv=PruneTargetCandidates(tCands,costs);
		if(rv->IsEmpty())
			{
				delete rv;
				return 0;
			}
		else
			{
				if(useCache) piter.first->second=rv;
				m_tgtColls.push_back(rv);
				return rv;
			}
	}

	void Create(const std::vector<FactorType> &input
							, const std::vector<FactorType> &output
							, const std::string &filePath
							, const std::vector<float> &weight
							, const LMList &languageModels
							, float weightWP
						 )
	{
		// set my members
		m_dict=new PhraseDictionaryTree(weight.size()-m_numInputScores);
		m_input=input;
		m_output=output;
		m_languageModels=&languageModels;
		m_weightWP=weightWP;
		m_weights=weight;

		std::string binFname=filePath+".binphr.idx";
		if(!existsFile(binFname.c_str())) {
			TRACE_ERR( "bin ttable does not exist -> create it\n");
			InputFileStream in(filePath);
			m_dict->Create(in,filePath);
		}
		TRACE_ERR( "reading bin ttable\n");
		m_dict->Read(filePath);
	}

	typedef PhraseDictionaryTree::PrefixPtr PPtr;
	typedef unsigned short Position;
	typedef std::pair<Position,Position> Range;

	struct State {
		PPtr ptr;
		Range range;
		float score;
		Position realWords;
		Phrase src;

		State() : range(0,0),score(0.0),realWords(0),src(Input) {}
		State(Position b,Position e,const PPtr& v,float sc=0.0,Position rw=0)
			: ptr(v),range(b,e),score(sc),realWords(rw),src(Input) {}
		State(Range const& r,const PPtr& v,float sc=0.0,Position rw=0)
			: ptr(v),range(r),score(sc),realWords(rw),src(Input) {}

		Position begin() const {return range.first;}
		Position end() const {return range.second;}
		float GetScore() const {return score;}

		friend std::ostream& operator<<(std::ostream& out,State const& s) {
			out<<" R=("<<s.begin()<<","<<s.end()<<"),";
			return out;
		}
	};

	void CreateTargetPhrase(TargetPhrase& targetPhrase,
													StringTgtCand::first_type const& factorStrings,
													StringTgtCand::second_type const& scoreVector,
													Phrase const* srcPtr=0) const
	{
		FactorCollection &factorCollection = FactorCollection::Instance();

		for(size_t k=0;k<factorStrings.size();++k)
			{
				std::vector<std::string> factors=TokenizeMultiCharSeparator(*factorStrings[k],StaticData::Instance().GetFactorDelimiter());
				Word& w=targetPhrase.AddWord();
				for(size_t l=0;l<m_output.size();++l)
					w[m_output[l]]=factorCollection.AddFactor(Output, m_output[l], factors[l]);
			}
		targetPhrase.SetScore(m_obj, scoreVector, m_weights, m_weightWP, *m_languageModels);
		targetPhrase.SetSourcePhrase(srcPtr);
	}

	TargetPhraseCollection* PruneTargetCandidates(std::vector<TargetPhrase> const & tCands,
																								std::vector<std::pair<float,size_t> >& costs) const
	{
		// convert into TargetPhraseCollection
		TargetPhraseCollection *rv=new TargetPhraseCollection;

		// set limit to tableLimit or actual size, whatever is smaller
		std::vector<std::pair<float,size_t> >::iterator nth =
			costs.begin() + ((m_obj->m_tableLimit>0 && // 0 indicates no limit
												m_obj->m_tableLimit < costs.size()) ?
											 m_obj->m_tableLimit : costs.size());

		// find the nth phrase according to future cost
		std::nth_element(costs.begin(),nth,costs.end());

		// add n top phrases to the return list
		for(std::vector<std::pair<float,size_t> >::iterator
					it = costs.begin(); it != nth; ++it)
			rv->Add(new TargetPhrase(tCands[it->second]));

		return rv;
	}

	// POD for target phrase scores
	struct TScores {
		float total;
		StringTgtCand::second_type trans;
		Phrase const* src;

		TScores() : total(0.0),src(0) {}
	};
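	// Illustrative sketch, not used anywhere in Moses: PruneTargetCandidates
	// relies on std::nth_element, which partitions the vector around the n-th
	// cheapest entry in O(n) average time instead of fully sorting it; the kept
	// range [begin,nth) therefore holds the best candidates, but in no
	// particular order. The helper below (a hypothetical name, added only for
	// exposition) shows the same selection pattern on plain cost/index pairs.
	static size_t SelectCheapestExample(std::vector<std::pair<float,size_t> > costs,
																			size_t limit)
	{
		std::vector<std::pair<float,size_t> >::iterator nth =
			costs.begin() + ((limit>0 && limit<costs.size()) ? limit : costs.size());
		std::nth_element(costs.begin(),nth,costs.end()); // partial selection, no full sort
		return static_cast<size_t>(nth-costs.begin());   // how many candidates survive pruning
	}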
	void CacheSource(ConfusionNet const& src)
	{
		assert(m_dict);
		const size_t srcSize=src.GetSize();

		std::vector<size_t> exploredPaths(srcSize+1,0);
		std::vector<double> exPathsD(srcSize+1,-1.0);

		// collect some statistics: log of the number of CN paths per span length
		std::vector<size_t> cnDepths(srcSize,0);
		for(size_t i=0;i<srcSize;++i) cnDepths[i]=src[i].size();

		for(size_t len=1;len<=srcSize;++len)
			for(size_t i=0;i+len<=srcSize;++i)
				{
					double pd=0.0;
					for(size_t k=i;k<i+len;++k) pd+=log(1.0*cnDepths[k]);
					exPathsD[len]=(exPathsD[len]>=0.0 ? addLogScale(pd,exPathsD[len]) : pd);
				}

		// update global statistics
		if(pathCN.size()<=srcSize) pathCN.resize(srcSize+1,-1.0);
		for(size_t len=1;len<=srcSize;++len)
			pathCN[len]=pathCN[len]>=0.0 ? addLogScale(pathCN[len],exPathsD[len]) : exPathsD[len];

		if(path1Best.size()<=srcSize) path1Best.resize(srcSize+1,0);
		for(size_t len=1;len<=srcSize;++len) path1Best[len]+=srcSize-len+1;

		if (StaticData::Instance().GetVerboseLevel() >= 2 && exPathsD.size())
			{
				TRACE_ERR("path stats for current CN: \nCN (full):     ");
				std::transform(exPathsD.begin()+1
											 ,exPathsD.end()
											 ,std::ostream_iterator<double>(std::cerr," ")
											 ,Exp);
				TRACE_ERR("\n");
			}

		typedef StringTgtCand::first_type sPhrase;
		typedef std::map<StringTgtCand::first_type,TScores> E2Costs;

		std::map<Range,E2Costs> cov2cand;
		std::vector<State> stack;
		for(Position i=0 ; i < srcSize ; ++i)
			stack.push_back(State(i, i, m_dict->GetRoot()));

		while(!stack.empty())
			{
				State curr(stack.back());
				stack.pop_back();

				assert(curr.end()<srcSize);
				const ConfusionNet::Column &currCol=src[curr.end()];
				// in a given column, loop over all possibilities
				for(size_t colidx=0;colidx<currCol.size();++colidx)
					{
						const Word& w=currCol[colidx].first; // w = the colidx-th possibility in this column
						std::string s;
						Factors2String(w,s);
						bool isEpsilon=(s=="" || s==EPSILON);

						// do not start with epsilon (except at first position)
						if(isEpsilon && curr.begin()==curr.end() && curr.begin()>0) continue;

						// At a given node in the prefix tree, look to see if w defines an edge to
						// another node (Extend). Stay at the same node if w==EPSILON
						PPtr nextP = (isEpsilon ? curr.ptr : m_dict->Extend(curr.ptr,s));
						unsigned newRealWords=curr.realWords + (isEpsilon ? 0 : 1);

						if(nextP) // w is a word that should be considered
							{
								Range newRange(curr.begin(),curr.end()+src.GetColumnIncrement(curr.end(),colidx));
								float newScore=curr.GetScore()+currCol[colidx].second; // CN score
								Phrase newSrc(curr.src);
								if(!isEpsilon) newSrc.AddWord(w);
								if(newRange.second<srcSize && newScore>LOWEST_SCORE)
									{
										// if there is more room to grow, add a new state onto the queue
										// to be explored that represents [begin, curEnd+)
										stack.push_back(State(newRange,nextP,newScore,newRealWords));
										stack.back().src=newSrc;
									}

								std::vector<StringTgtCand> tcands;
								// now, look up the target candidates (aprx. TargetPhraseCollection) for
								// the current path through the CN
								m_dict->GetTargetCandidates(nextP,tcands);

								if(newRange.second>=exploredPaths.size()+newRange.first)
									exploredPaths.resize(newRange.second-newRange.first+1,0);
								++exploredPaths[newRange.second-newRange.first];

								totalE+=tcands.size();

								if(tcands.size())
									{
										E2Costs& e2costs=cov2cand[newRange];
										Phrase const* srcPtr=uniqSrcPhr(newSrc);
										for(size_t i=0;i<tcands.size();++i)
											{
												// input scores come first - they are already in log
												// space, so drop them in directly
												std::vector<float> nscores(tcands[i].second.size()+m_numInputScores,0.0);

												switch(m_numInputScores)
													{
													case 2: nscores[1]= -1.0f * newRealWords; // do not use -newRealWords ! -- RZ
													case 1: nscores[0]= newScore; // intentional fall-through
													case 0: break;
													default:
														TRACE_ERR("ERROR: too many model scaling factors for input weights 'weight-i' : "<<m_numInputScores<<"\n");
														abort();
													}

												// append the phrase table scores, logging as we insert
												std::transform(tcands[i].second.begin(),tcands[i].second.end(),
																			 nscores.begin()+m_numInputScores,TransformScore);

												// tally up the weighted score, including the word penalty
												float score=std::inner_product(nscores.begin(),nscores.end(),m_weights.begin(),0.0f);
												score-=tcands[i].first.size() * m_weightWP;

												std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(tcands[i].first,TScores()));

												if(p.second) ++distinctE;

												TScores & scores=p.first->second;
												if(p.second || scores.total<score)
													{
														// new entry, or a better score for an existing one
														scores.total=score;
														scores.trans=nscores;
														scores.src=srcPtr;
													}
											}
									}
							}
					}
			}

		if (StaticData::Instance().GetVerboseLevel() >= 2 && exploredPaths.size())
			{
				TRACE_ERR("CN (explored): ");
				std::copy(exploredPaths.begin()+1,exploredPaths.end(),
									std::ostream_iterator<size_t>(std::cerr," "));
				TRACE_ERR("\n");
			}

		if(pathExplored.size()<exploredPaths.size())
			pathExplored.resize(exploredPaths.size(),0);
		for(size_t len=1;len<=srcSize;++len)
			pathExplored[len]+=exploredPaths[len];

		m_rangeCache.resize(src.GetSize(),vTPC(src.GetSize(),0));

		for(std::map<Range,E2Costs>::const_iterator i=cov2cand.begin();i!=cov2cand.end();++i)
			{
				assert(i->first.first<m_rangeCache.size());
				assert(i->first.second>0);
				assert(static_cast<size_t>(i->first.second-1)<m_rangeCache[i->first.first].size());
				assert(m_rangeCache[i->first.first][i->first.second-1]==0);

				std::vector<TargetPhrase> tCands; tCands.reserve(i->second.size());
				std::vector<std::pair<float,size_t> > costs; costs.reserve(i->second.size());

				for(E2Costs::const_iterator j=i->second.begin();j!=i->second.end();++j)
					{
						TScores const & scores=j->second;
						TargetPhrase targetPhrase(Output);
						CreateTargetPhrase(targetPhrase,j->first,scores.trans,scores.src);
						costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
						tCands.push_back(targetPhrase);
						//std::cerr << i->first.first << "-" << i->first.second << ": " << targetPhrase << std::endl;
					}

				TargetPhraseCollection *rv=PruneTargetCandidates(tCands,costs);

				if(rv->IsEmpty())
					delete rv;
				else
					{
						m_rangeCache[i->first.first][i->first.second-1]=rv;
						m_tgtColls.push_back(rv);
					}
			}
		// free memory
		m_dict->FreeMemory();
	}
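	// Illustrative sketch, not part of the original API: the statistics block
	// at the top of CacheSource counts, in log space, how many distinct CN
	// paths of each length exist. For column depths d[0..n-1], the number of
	// paths covering [i,i+len) is d[i]*...*d[i+len-1]; summing those counts
	// over i can overflow even a double for long networks, so the sum is kept
	// as a log via addLogScale. The helper name below is hypothetical.
	static double CountLogPathsExample(std::vector<size_t> const& depths, size_t len)
	{
		double total = -1.0; // "empty" marker, exactly as exPathsD uses above
		for(size_t i=0; i+len<=depths.size(); ++i)
			{
				double pd = 0.0;
				for(size_t k=i; k<i+len; ++k) pd += log(1.0*depths[k]);
				total = (total>=0.0 ? addLogScale(pd,total) : pd);
			}
		return total; // log of the number of CN paths of length len
	}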
	size_t GetNumInputScores() const {return m_numInputScores;}
};
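/* Illustrative usage sketch (comment only, not part of the original file):
   PDTAimp is the private implementation behind PhraseDictionaryTreeAdaptor,
   which is the only class allowed to construct it (see the friend
   declaration above). With hypothetical variable names, the expected
   lifecycle is roughly

     PDTAimp imp(adaptor, numInputScores);
     imp.Create(input, output, "phrase-table", weights, lms, weightWP);
     imp.CacheSource(confusionNet);     // pre-computes candidates per source range
     const TargetPhraseCollection *tpc = imp.GetTargetPhraseCollection(srcPhrase);
     imp.CleanUp();                     // frees cached collections and the dictionary's buffers

   Create() binarises the text phrase table on first use (producing the
   .binphr.* files next to filePath) and reads the binary version on
   subsequent runs. */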