diff options
author | zens <zens@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-07-24 22:33:08 +0400 |
---|---|---|
committer | zens <zens@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-07-24 22:33:08 +0400 |
commit | 422418008ea4ce09774460e835402b029ec1825b (patch) | |
tree | 96e96980f42a97f2aa7143ae3f440ce68c8060f8 | |
parent | 90124bd40335a9e19ebd7a4eb563857d01fc1c3b (diff) |
- confusion net:
  * more robust read functions
  * correct treatment of epsilons
  * code cleanup
- parameter: fixed check for binary phrase table
- staticData: do not read input phrases in case of binary phrase table
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@260 1f5c12ca-751b-0410-a591-d2e778427230
-rw-r--r-- | misc/Makefile | 2 | ||||
-rw-r--r-- | misc/processPhraseTable.cpp | 2 | ||||
-rw-r--r-- | moses/src/ConfusionNet.cpp | 66 | ||||
-rwxr-xr-x | moses/src/Parameter.cpp | 36 | ||||
-rwxr-xr-x | moses/src/Parameter.h | 3 | ||||
-rw-r--r-- | moses/src/PhraseDictionaryTree.cpp | 19 | ||||
-rw-r--r-- | moses/src/PhraseDictionaryTreeAdaptor.cpp | 7 | ||||
-rwxr-xr-x | moses/src/StaticData.cpp | 41 | ||||
-rwxr-xr-x | moses/src/TypeDef.h | 1 |
9 files changed, 124 insertions, 53 deletions
diff --git a/misc/Makefile b/misc/Makefile index bfe173fcf..a21823f27 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -13,7 +13,7 @@ default: processPhraseTable processPhraseTable.o: processPhraseTable.cpp $(CXX) $(CXXFLAGS) $(INCLUDES) $< -c -o $@ -MOSESLIB =$(HOME)/workspace/moses/src/libmoses.a +MOSESLIB =../moses/src/libmoses.a processPhraseTable: processPhraseTable.o $(MOSESLIB) $(CXX) $(LDFLAGS) $^ -o $@ $(LIBS) diff --git a/misc/processPhraseTable.cpp b/misc/processPhraseTable.cpp index ec5af4f99..d7becb8a6 100644 --- a/misc/processPhraseTable.cpp +++ b/misc/processPhraseTable.cpp @@ -174,7 +174,7 @@ int main(int argc,char **argv) { for(size_t i=0;i<pdicts.size();++i) weights.push_back(std::vector<float>(noScoreComponent,1/(1.0*noScoreComponent))); - while(net.Read(std::cin,factorOrder,cn-1)) { + while(net.ReadF(std::cin,factorOrder,cn-1)) { net.Print(std::cerr); GenerateCandidates(net,pdicts,weights,verb); } diff --git a/moses/src/ConfusionNet.cpp b/moses/src/ConfusionNet.cpp index 235ae87a4..02bc2e7ad 100644 --- a/moses/src/ConfusionNet.cpp +++ b/moses/src/ConfusionNet.cpp @@ -7,49 +7,73 @@ #include "PhraseDictionaryTreeAdaptor.h" #include "TranslationOptionCollectionConfusionNet.h" -ConfusionNet::ConfusionNet(FactorCollection* p) : InputType(),m_factorCollection(p) {} +ConfusionNet::ConfusionNet(FactorCollection* p) + : InputType(),m_factorCollection(p) {} void ConfusionNet::SetFactorCollection(FactorCollection *p) { m_factorCollection=p; } -bool ConfusionNet::ReadF(std::istream& in,const std::vector<FactorType>& factorOrder,int format) { - std::cerr<<"cn read with format "<<format<<"\n"; +bool ConfusionNet::ReadF(std::istream& in, + const std::vector<FactorType>& factorOrder, + int format) +{ + TRACE_ERR("read confusion net with format "<<format<<"\n"); switch(format) { case 0: return ReadFormat0(in,factorOrder); case 1: return ReadFormat1(in,factorOrder); default: - std::cerr<<"ERROR: unknown format '"<<format<<"' in ConfusionNet::Read\n"; + 
std::cerr<<"ERROR: unknown format '"<<format + <<"' in ConfusionNet::Read\n"; } return 0; } -int ConfusionNet::Read(std::istream& in,const std::vector<FactorType>& factorOrder, FactorCollection &factorCollection) +int ConfusionNet::Read(std::istream& in, + const std::vector<FactorType>& factorOrder, + FactorCollection &factorCollection) { SetFactorCollection(&factorCollection); return ReadF(in,factorOrder,0); } -void ConfusionNet::String2Word(const std::string& s,Word& w,const std::vector<FactorType>& factorOrder) { +void ConfusionNet::String2Word(const std::string& s,Word& w, + const std::vector<FactorType>& factorOrder) +{ std::vector<std::string> factorStrVector = Tokenize(s, "|"); for(size_t i=0;i<factorOrder.size();++i) - w.SetFactor(factorOrder[i],m_factorCollection->AddFactor(Input,factorOrder[i],factorStrVector[i])); + w.SetFactor(factorOrder[i], + m_factorCollection->AddFactor(Input,factorOrder[i], + factorStrVector[i])); } -bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& factorOrder) { +bool ConfusionNet::ReadFormat0(std::istream& in, + const std::vector<FactorType>& factorOrder) +{ assert(m_factorCollection); Clear(); std::string line; while(getline(in,line)) { std::istringstream is(line); - std::string word;float costs; + std::string word;double prob; Column col; - while(is>>word>>costs) { + while(is>>word>>prob) { Word w; String2Word(word,w,factorOrder); - col.push_back(std::make_pair(w,costs)); + if(prob<0.0) + { + std::cerr<<"WARN: negative prob: "<<prob<<" ->set to 0.0\n"; + prob=0.0; + } + else if (prob>1.0) + { + std::cerr<<"WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n"; + prob=1.0; + } + col.push_back(std::make_pair(w,std::max(static_cast<float>(log(prob)), + LOWEST_SCORE))); } if(col.size()) { data.push_back(col); @@ -59,7 +83,9 @@ bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& f } return !data.empty(); } -bool ConfusionNet::ReadFormat1(std::istream& in,const 
std::vector<FactorType>& factorOrder) { +bool ConfusionNet::ReadFormat1(std::istream& in, + const std::vector<FactorType>& factorOrder) +{ assert(m_factorCollection); Clear(); std::string line; @@ -110,15 +136,21 @@ std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn) cn.Print(out);return out; } -TargetPhraseCollection const* ConfusionNet::CreateTargetPhraseCollection(PhraseDictionaryBase const& d,const WordsRange& r) const +TargetPhraseCollection const* ConfusionNet:: +CreateTargetPhraseCollection(PhraseDictionaryBase const& d, + const WordsRange& r) const { - if(PhraseDictionaryTreeAdaptor const* pdict=dynamic_cast<PhraseDictionaryTreeAdaptor const*>(&d)) + if(PhraseDictionaryTreeAdaptor const* pdict= + dynamic_cast<PhraseDictionaryTreeAdaptor const*>(&d)) return pdict->GetTargetPhraseCollection(*this,r); - std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!\n" - "has to be PhraseDictionaryTreeAdaptor\n"; + + std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!" + " Has to be PhraseDictionaryTreeAdaptor\n"; abort(); } -TranslationOptionCollection* ConfusionNet::CreateTranslationOptionCollection() const + +TranslationOptionCollection* +ConfusionNet::CreateTranslationOptionCollection() const { return new TranslationOptionCollectionConfusionNet(*this); } diff --git a/moses/src/Parameter.cpp b/moses/src/Parameter.cpp index 6b5273b3c..2b1473774 100755 --- a/moses/src/Parameter.cpp +++ b/moses/src/Parameter.cpp @@ -97,8 +97,15 @@ bool Parameter::Validate() // do files exist? // phrase tables - if (ret) - ret = FilesExist("ttable-file", 3); + if (ret) + { + std::vector<std::string> ext; + // standard phrase table extension (i.e. 
full name has to be specified) + ext.push_back(""); + // alternative file extension for binary phrase table format: + ext.push_back(".binphr.idx"); + ret = FilesExist("ttable-file", 3,ext); + } // generation tables if (ret) ret = FilesExist("generation-file", 2); @@ -109,7 +116,7 @@ bool Parameter::Validate() return ret; } -bool Parameter::FilesExist(const string ¶mName, size_t tokenizeIndex) +bool Parameter::FilesExist(const string ¶mName, size_t tokenizeIndex,std::vector<std::string> const& extensions) { using namespace boost::filesystem; @@ -135,14 +142,21 @@ bool Parameter::FilesExist(const string ¶mName, size_t tokenizeIndex) return false; } const string &pathStr = vec[tokenizeIndex]; - path filePath(pathStr, native); - if (!exists(filePath)) - { - stringstream errorMsg(""); - errorMsg << "File " << pathStr << " does not exists"; - UserMessage::Add(errorMsg.str()); - return false; - } + + bool fileFound=0; + for(size_t i=0;i<extensions.size() && !fileFound;++i) + { + path filePath(pathStr+extensions[i], native); + fileFound|=exists(filePath); + } + if(!fileFound) + { + stringstream errorMsg(""); + errorMsg << "File " << pathStr << " does not exists"; + UserMessage::Add(errorMsg.str()); + return false; + } + } return true; } diff --git a/moses/src/Parameter.h b/moses/src/Parameter.h index a8211b3bc..a43e52c03 100755 --- a/moses/src/Parameter.h +++ b/moses/src/Parameter.h @@ -41,7 +41,7 @@ protected: std::string FindParam(const std::string ¶mSwitch, int argc, char* argv[]); void OverwriteParam(const std::string ¶mSwitch, const std::string ¶mName, int argc, char* argv[]); bool ReadConfigFile( std::string filePath ); - bool FilesExist(const std::string ¶mName, size_t tokenizeIndex); + bool FilesExist(const std::string ¶mName, size_t tokenizeIndex,std::vector<std::string> const& fileExtension=std::vector<std::string>(1,"")); bool Validate(); @@ -59,5 +59,6 @@ public: { return m_setting[paramName]; } + }; diff --git a/moses/src/PhraseDictionaryTree.cpp 
b/moses/src/PhraseDictionaryTree.cpp index c5b448867..5860db893 100644 --- a/moses/src/PhraseDictionaryTree.cpp +++ b/moses/src/PhraseDictionaryTree.cpp @@ -5,6 +5,7 @@ #include <sstream> #include <iostream> #include <fstream> +#include <ext/hash_map> #include "PrefixTree.h" #include "File.h" @@ -29,6 +30,12 @@ typedef std::vector<LabelId> IPhrase; typedef std::vector<float> Scores; typedef PrefixTreeF<LabelId,off_t> PTF; +namespace __gnu_cxx { + template <> struct hash<std::string> { + size_t operator()(const std::string& s) const {return __gnu_cxx::__stl_hash_string(s.c_str());} + }; +} + template<typename A,typename B=std::map<A,LabelId> > class LVoc { typedef A Key; @@ -132,7 +139,8 @@ struct PDTimp { typedef PrefixTreeF<LabelId,off_t> PTF; typedef FilePtr<PTF> CPT; typedef std::vector<CPT> Data; - typedef LVoc<std::string> WordVoc; + // typedef LVoc<std::string> WordVoc; + typedef LVoc<std::string,__gnu_cxx::hash_map<std::string,LabelId> > WordVoc; Data data; std::vector<off_t> srcOffsets; @@ -215,7 +223,7 @@ struct PDTimp { PPtr Extend(PPtr p,const std::string& w) { assert(p); - if(w.empty()) return p; + if(w.empty() || w==EPSILON) return p; LabelId wi=sv.index(w); if(wi==InvalidLabelId) return PPtr(); else if(p.imp->isRoot()) @@ -291,6 +299,13 @@ PhraseDictionaryTree::PhraseDictionaryTree(size_t noScoreComponent, : Dictionary(noScoreComponent),imp(new PDTimp),m_inFactorType(ift),m_outFactorType(oft) { imp->m_factorCollection=fc; + if(sizeof(off_t)!=8) + { + std::cerr<<"ERROR: size of type 'off_t' has to be 64 bit!\n" + "use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n" + " -> abort \n\n"; + abort(); + } } PhraseDictionaryTree::~PhraseDictionaryTree() diff --git a/moses/src/PhraseDictionaryTreeAdaptor.cpp b/moses/src/PhraseDictionaryTreeAdaptor.cpp index fd514db99..0a78cdfe6 100644 --- a/moses/src/PhraseDictionaryTreeAdaptor.cpp +++ b/moses/src/PhraseDictionaryTreeAdaptor.cpp @@ -35,6 +35,7 @@ struct PDTAimp { : 
m_languageModels(0),m_weightWP(0.0),m_factorCollection(0),m_dict(0), m_obj(p),useCache(1) {} + // convert FactorArray into string void Factors2String(FactorArray const& w,std::string& s) const { for(size_t j=0;j<m_input.size();++j) @@ -44,6 +45,7 @@ struct PDTAimp { } } + // free temporary memory void CleanUp() { assert(m_dict); @@ -54,6 +56,7 @@ struct PDTAimp { m_rangeCache.clear(); } + // add phrase pair till next CleanUp, should be used only for unknowns void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase) { assert(GetTargetPhraseCollection(source)==0); @@ -70,12 +73,14 @@ struct PDTAimp { else std::cerr<<"WARNING: you added an already existing phrase!\n"; } + // access with full source phrase TargetPhraseCollection const* GetTargetPhraseCollection(Phrase const &src) const { assert(m_dict); if(src.GetSize()==0) return 0; + // look up cache std::pair<MapSrc2Tgt::iterator,bool> piter; if(useCache) { @@ -84,6 +89,8 @@ struct PDTAimp { } else if (m_cache.size()) { + // cache is also used for unknowns, so even if the cache is disabled + // there may be entries MapSrc2Tgt::const_iterator i=m_cache.find(src); return (i!=m_cache.end() ? 
i->second : 0); } diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp index 7da13e37b..44323a3e4 100755 --- a/moses/src/StaticData.cpp +++ b/moses/src/StaticData.cpp @@ -418,30 +418,31 @@ void StaticData::LoadPhraseTables(bool filter + PROJECT_NAME + "--" + inputFileHash + "--" + phraseTableHash + ".txt"; - bool filterPhrase; - if (filter) - { - boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native); - if (boost::filesystem::exists(tempFile)) - { // load filtered file instead - filterPhrase = false; - filePath = hashFilePath; - } - else - { // load original file & create has file - filterPhrase = true; - } - } - else - { // load original file - filterPhrase = false; - } - TRACE_ERR(filePath << endl); timer.check("Start loading PhraseTable"); - if (!boost::filesystem::exists(filePath+".binphr.idx")) { + bool filterPhrase; + if (filter) + { + boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native); + if (boost::filesystem::exists(tempFile)) + { // load filtered file instead + filterPhrase = false; + filePath = hashFilePath; + } + else + { // load original file & create has file + filterPhrase = true; + } + } + else + { // load original file + filterPhrase = false; + } + TRACE_ERR(filePath << endl); + + TRACE_ERR("using standard phrase tables"); PhraseDictionary *pd=new PhraseDictionary(noScoreComponent); pd->Load(input diff --git a/moses/src/TypeDef.h b/moses/src/TypeDef.h index f1abf17e3..c109f7e12 100755 --- a/moses/src/TypeDef.h +++ b/moses/src/TypeDef.h @@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #define SENTENCE_START "<s>" #define SENTENCE_END "</s>" #define UNKNOWN_FACTOR "UNK" +#define EPSILON "*EPS*" #define NOT_FOUND std::numeric_limits<size_t>::max() #define MAX_NGRAM_SIZE 20 |