// NOTE(review): this copy of the file looks damaged -- confirm against version
// control before changing any code below.
//   * Line breaks are gone: the text sits on four physical lines, so every
//     `//` comment now swallows the rest of its physical line (the first
//     physical line starts with `//`, which turns all of it into a comment).
//   * Text that would sit between `<` and `>` appears to be missing:
//     `#include` with no header name, `template` with no parameter list,
//     `std::vector` / `PrefixTreeF` / `FilePtr` / `ObjectPool` with no
//     arguments, and loops cut mid-condition such as `for(size_t i=0;isize());}`.
//   As written this will not compile, and the logic that disappeared in those
//   gaps cannot be recovered from the text below.
//
// What is still readable:
//   * operator<< for std::vector: streams each element followed by ' '.
//   * TgtCand: holds an IPhrase `e` and Scores `sc`; writeBin/readBin hand both
//     to fWriteVector/fReadVector, in that order.
//   * TgtCands: vector-derived candidate list; writeBin stores size() as an
//     `unsigned` via fWrite before the (lost) per-element loop.
//   * PDTimp: owns `data`, `srcOffsets`, the FILE handles `os`/`ot`, the
//     vocabularies `sv`/`tv` and `pPool`. The destructor closes `os` and `ot`
//     when non-null and then calls FreeMemory(), which calls free() on every
//     element of `data` and resets `pPool`.
//   * PDTimp::GetTargetCandidates(const IPhrase&, TgtCands&): leaves tgtCands
//     untouched when the phrase is empty, when f[0] >= data.size(), when
//     data[f[0]] tests false, or when find(f) yields InvalidOffT; otherwise
//     seeks `ot` to the returned offset and calls tgtCands.readBin(ot).
//   * PDTimp::GetTargetCandidates(PPtr, TgtCands&): asserts p, returns at once
//     for a root pointer or an InvalidOffT offset, else seeks and reads as above.
//   * PDTimp::Extend: returns p unchanged when w is empty or equals EPSILON, and
//     a default-constructed PPtr when sv.index(w) is InvalidLabelId.
//   * PDTimp::Read(fn): uses the suffixes ".binphr.srctree", ".binphr.tgtdata",
//     ".binphr.idx", ".binphr.srcvoc", ".binphr.tgtvoc"; reads srcOffsets from
//     the .idx file, opens the srctree/tgtdata files with "rb", and resizes
//     `data` to srcOffsets.size(). The remainder of the body is lost.
//   * PhraseDictionaryTree::Create(inFile, out): reads inFile with getline,
//     splits each line on "|||", asserts the first line has 3 or 5 fields, and
//     calls UserMessage::Add + abort() when a later line has a different count.
//     Source words go through imp->sv.add, target words through imp->tv.add.
//     Scores come from field 2 (3-field input) or field 4 (5-field input); a
//     score that is not > 0 is replaced by 1.0e-38. Both vocabularies are
//     written at the end and the function returns 1.
//   * The remaining PhraseDictionaryTree members (destructor, FreeMemory, Read,
//     GetRoot, Extend, GetTargetCandidates, PrintTargetCandidates) forward to
//     `imp`; GetScoreProducerDescription returns "Phrase dictionary tree".
// $Id$ // vim:tabstop=2 #include "PhraseDictionaryTree.h" #include #include #include #include #include #include #include "PrefixTree.h" #include "File.h" #include "ObjectPool.h" #include "LVoc.h" #include "TypeDef.h" #include "Util.h" template std::ostream& operator<<(std::ostream& out,const std::vector& x) { out<::const_iterator iend=x.end(); for(typename std::vector::const_iterator i=x.begin();i!=iend;++i) out<<*i<<' '; return out; } typedef std::vector Scores; typedef PrefixTreeF PTF; class TgtCand { IPhrase e; Scores sc; public: TgtCand() {} TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {} TgtCand(FILE* f) {readBin(f);} const IPhrase& GetPhrase() const {return e;} const Scores& GetScores() const {return sc;} void writeBin(FILE* f) const {fWriteVector(f,e);fWriteVector(f,sc);} void readBin(FILE* f) {fReadVector(f,e);fReadVector(f,sc);} }; class TgtCands : public std::vector { typedef std::vector MyBase; public: TgtCands() : MyBase() {} void writeBin(FILE* f) const { unsigned s=size();fWrite(f,s); for(size_t i=0;isize());} bool isRoot() const {return root;} PTF const* ptr() const {return p;} }; PhraseDictionaryTree::PrefixPtr::operator bool() const { return imp && imp->isValid(); } struct PDTimp { typedef PrefixTreeF PTF; typedef FilePtr CPT; typedef std::vector Data; typedef LVoc WordVoc; Data data; std::vector srcOffsets; FILE *os,*ot; WordVoc sv,tv; ObjectPool pPool; // a comparison with the Boost MemPools might be useful PDTimp() : os(0),ot(0) {PTF::setDefault(InvalidOffT);} ~PDTimp() {if(os) fClose(os);if(ot) fClose(ot);FreeMemory();} void FreeMemory() { for(Data::iterator i=data.begin();i!=data.end();++i) (*i).free(); pPool.reset(); } int Read(const std::string& fn); void GetTargetCandidates(const IPhrase& f,TgtCands& tgtCands) { if(f.empty()) return; if(f[0]>=data.size()) return; if(!data[f[0]]) return; assert(data[f[0]]->findKey(f[0])size()); OFF_T tCandOffset=data[f[0]]->find(f); if(tCandOffset==InvalidOffT) return; fSeek(ot,tCandOffset); 
tgtCands.readBin(ot); } typedef PhraseDictionaryTree::PrefixPtr PPtr; void GetTargetCandidates(PPtr p,TgtCands& tgtCands) { assert(p); if(p.imp->isRoot()) return; OFF_T tCandOffset=p.imp->ptr()->getData(p.imp->idx); if(tCandOffset==InvalidOffT) return; fSeek(ot,tCandOffset); tgtCands.readBin(ot); } void PrintTgtCand(const TgtCands& tcands,std::ostream& out) const; // convert target candidates from internal data structure to the external one void ConvertTgtCand(const TgtCands& tcands,std::vector& rv) const { for(TgtCands::const_iterator i=tcands.begin();i!=tcands.end();++i) { const IPhrase& iphrase=i->GetPhrase(); std::vector vs; vs.reserve(iphrase.size()); for(size_t j=0;jGetScores())); } } PPtr GetRoot() { return PPtr(pPool.get(PPimp(0,0,1))); } PPtr Extend(PPtr p,const std::string& w) { assert(p); if(w.empty() || w==EPSILON) return p; LabelId wi=sv.index(w); if(wi==InvalidLabelId) return PPtr(); // unknown word else if(p.imp->isRoot()) { if(wifindKeyPtr(wi)); return PPtr(pPool.get(PPimp(data[wi],data[wi]->findKey(wi),0))); } } else if(PTF const* nextP=p.imp->ptr()->getPtr(p.imp->idx)) { return PPtr(pPool.get(PPimp(nextP,nextP->findKey(wi),0))); } return PPtr(); } }; //////////////////////////////////////////////////////////// // // member functions of PDTimp // //////////////////////////////////////////////////////////// int PDTimp::Read(const std::string& fn) { std::string ifs(fn+".binphr.srctree"), ift(fn+".binphr.tgtdata"), ifi(fn+".binphr.idx"), ifsv(fn+".binphr.srcvoc"), iftv(fn+".binphr.tgtvoc"); FILE *ii=fOpen(ifi.c_str(),"rb"); fReadVector(ii,srcOffsets); fClose(ii); os=fOpen(ifs.c_str(),"rb"); ot=fOpen(ift.c_str(),"rb"); data.resize(srcOffsets.size()); for(size_t i=0;i abort \n\n"); abort(); } } PhraseDictionaryTree::~PhraseDictionaryTree() { delete imp; } void PhraseDictionaryTree::FreeMemory() const { imp->FreeMemory(); } void PhraseDictionaryTree:: GetTargetCandidates(const std::vector& src, std::vector& rv) const { IPhrase f(src.size()); for(size_t 
i=0;isv.index(src[i]); if(f[i]==InvalidLabelId) return; } TgtCands tgtCands; imp->GetTargetCandidates(f,tgtCands); imp->ConvertTgtCand(tgtCands,rv); } void PhraseDictionaryTree:: PrintTargetCandidates(const std::vector& src, std::ostream& out) const { IPhrase f(src.size()); for(size_t i=0;isv.index(src[i]); if(f[i]==InvalidLabelId) { TRACE_ERR("the source phrase '"<GetTargetCandidates(f,tcand); out<<"there are "<PrintTgtCand(tcand,out); } int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out) { std::string line; size_t count = 0; std::string ofn(out+".binphr.srctree"), oft(out+".binphr.tgtdata"), ofi(out+".binphr.idx"), ofsv(out+".binphr.srcvoc"), oftv(out+".binphr.tgtvoc"); FILE *os=fOpen(ofn.c_str(),"wb"), *ot=fOpen(oft.c_str(),"wb"); typedef PrefixTreeSA PSA; PSA *psa=new PSA;PSA::setDefault(InvalidOffT); LabelId currFirstWord=InvalidLabelId; IPhrase currF; TgtCands tgtCands; std::vector vo; size_t lnc=0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info while(getline(inFile, line)) { ++lnc; std::vector tokens = TokenizeMultiCharSeparator( line , "|||" ); if (numElement == NOT_FOUND) { // init numElement numElement = tokens.size(); assert(numElement == 3 || numElement == 5); } else if (tokens.size() != numElement) { std::stringstream strme; strme << "Syntax error at line " << lnc << " : " << line; UserMessage::Add(strme.str()); abort(); } IPhrase f,e;Scores sc; std::vector wordVec = Tokenize(tokens[0]); for (size_t i = 0 ; i < wordVec.size() ; ++i) f.push_back(imp->sv.add(wordVec[i])); wordVec = Tokenize(tokens[1]); for (size_t i = 0 ; i < wordVec.size() ; ++i) e.push_back(imp->tv.add(wordVec[i])); // while(is>>w && w!="|||") sc.push_back(atof(w.c_str())); // Mauro: to handle 0 probs in phrase tables std::vector scoreVector = Tokenize(tokens[(numElement==3) ? 
2 : 4]); for (size_t i = 0 ; i < scoreVector.size() ; ++i) { float tmp = scoreVector[i]; sc.push_back(((tmp>0.0)?tmp:(float)1.0e-38)); } if(f.empty()) { TRACE_ERR("WARNING: empty source phrase in line '"<insert(f); if(d==InvalidOffT) d=fTell(ot); else { TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" <=vo.size()) vo.resize(currFirstWord+1,InvalidOffT); vo[currFirstWord]=fTell(os); pf.create(*psa,os); // clear delete psa;psa=new PSA; currFirstWord=f[0]; } // insert src phrase in prefix tree assert(psa); PSA::Data& d=psa->insert(f); if(d==InvalidOffT) d=fTell(ot); else { TRACE_ERR("ERROR: xsource phrase already inserted (B)!\nline(" << lnc << "): '" <=vo.size()) vo.resize(currFirstWord+1,InvalidOffT); vo[currFirstWord]=fTell(os); pf.create(*psa,os); delete psa;psa=0; TRACE_ERR("distinct source phrases: "< inv; for(size_t i=0;isv.Write(ofsv); imp->tv.Write(oftv); return 1; } int PhraseDictionaryTree::Read(const std::string& fn) { TRACE_ERR("size of OFF_T "<Read(fn); } PhraseDictionaryTree::PrefixPtr PhraseDictionaryTree::GetRoot() const { return imp->GetRoot(); } PhraseDictionaryTree::PrefixPtr PhraseDictionaryTree::Extend(PrefixPtr p, const std::string& w) const { return imp->Extend(p,w); } void PhraseDictionaryTree::PrintTargetCandidates(PrefixPtr p,std::ostream& out) const { TgtCands tcand; imp->GetTargetCandidates(p,tcand); out<<"there are "<PrintTgtCand(tcand,out); } void PhraseDictionaryTree:: GetTargetCandidates(PrefixPtr p, std::vector& rv) const { TgtCands tcands; imp->GetTargetCandidates(p,tcands); imp->ConvertTgtCand(tcands,rv); } std::string PhraseDictionaryTree::GetScoreProducerDescription() const { return "Phrase dictionary tree"; }