- confusion net:

  * more robust read functions
  * correct treatment of epsilons
  * code cleanup
- parameter: fixed check for binary phrase table
- staticData: do not read input phrases in case of binary phrase table

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@260 1f5c12ca-751b-0410-a591-d2e778427230
author: zens <zens@1f5c12ca-751b-0410-a591-d2e778427230> 2006-07-24 22:33:08 +0400
committer: zens <zens@1f5c12ca-751b-0410-a591-d2e778427230> 2006-07-24 22:33:08 +0400
commit: 422418008ea4ce09774460e835402b029ec1825b (patch)
tree: 96e96980f42a97f2aa7143ae3f440ce68c8060f8
parent: 90124bd40335a9e19ebd7a4eb563857d01fc1c3b (diff)
9 files changed, 124 insertions, 53 deletions
diff --git a/misc/Makefile b/misc/Makefile
index bfe173fcf..a21823f27 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -13,7 +13,7 @@ default: processPhraseTable
 processPhraseTable.o: processPhraseTable.cpp
 	$(CXX) $(CXXFLAGS) $(INCLUDES) $< -c -o $@
 
-MOSESLIB =$(HOME)/workspace/moses/src/libmoses.a
+MOSESLIB =../moses/src/libmoses.a
 
 processPhraseTable: processPhraseTable.o $(MOSESLIB)
 	$(CXX) $(LDFLAGS) $^ -o $@ $(LIBS)
diff --git a/misc/processPhraseTable.cpp b/misc/processPhraseTable.cpp
index ec5af4f99..d7becb8a6 100644
--- a/misc/processPhraseTable.cpp
+++ b/misc/processPhraseTable.cpp
@@ -174,7 +174,7 @@ int main(int argc,char **argv) {
 						for(size_t i=0;i<pdicts.size();++i)
 							weights.push_back(std::vector<float>(noScoreComponent,1/(1.0*noScoreComponent)));
 
-						while(net.Read(std::cin,factorOrder,cn-1)) {
+						while(net.ReadF(std::cin,factorOrder,cn-1)) {
 							net.Print(std::cerr);
 							GenerateCandidates(net,pdicts,weights,verb);
 						}
diff --git a/moses/src/ConfusionNet.cpp b/moses/src/ConfusionNet.cpp
index 235ae87a4..02bc2e7ad 100644
--- a/moses/src/ConfusionNet.cpp
+++ b/moses/src/ConfusionNet.cpp
@@ -7,49 +7,73 @@
 #include "PhraseDictionaryTreeAdaptor.h"
 #include "TranslationOptionCollectionConfusionNet.h"
 
-ConfusionNet::ConfusionNet(FactorCollection* p) : InputType(),m_factorCollection(p) {}
+ConfusionNet::ConfusionNet(FactorCollection* p) 
+	: InputType(),m_factorCollection(p) {}
 
 void ConfusionNet::SetFactorCollection(FactorCollection *p) 
 {
 	m_factorCollection=p;
 }
-bool ConfusionNet::ReadF(std::istream& in,const std::vector<FactorType>& factorOrder,int format) {
-	std::cerr<<"cn read with format "<<format<<"\n";
+bool ConfusionNet::ReadF(std::istream& in,
+												 const std::vector<FactorType>& factorOrder,
+												 int format) 
+{
+	TRACE_ERR("read confusion net with format "<<format<<"\n");
 	switch(format) 
 		{
 		case 0: return ReadFormat0(in,factorOrder);
 		case 1: return ReadFormat1(in,factorOrder);
 		default: 
-			std::cerr<<"ERROR: unknown format '"<<format<<"' in ConfusionNet::Read\n";
+			std::cerr<<"ERROR: unknown format '"<<format
+							 <<"' in ConfusionNet::Read\n";
 		}
 	return 0;
 }
 
-int ConfusionNet::Read(std::istream& in,const std::vector<FactorType>& factorOrder, FactorCollection &factorCollection) 
+int ConfusionNet::Read(std::istream& in,
+											 const std::vector<FactorType>& factorOrder, 
+											 FactorCollection &factorCollection) 
 {
 	SetFactorCollection(&factorCollection);
 	return ReadF(in,factorOrder,0);
 }
 
 
-void ConfusionNet::String2Word(const std::string& s,Word& w,const std::vector<FactorType>& factorOrder) {
+void ConfusionNet::String2Word(const std::string& s,Word& w,
+															 const std::vector<FactorType>& factorOrder) 
+{
 	std::vector<std::string> factorStrVector = Tokenize(s, "|");
 	for(size_t i=0;i<factorOrder.size();++i)
-		w.SetFactor(factorOrder[i],m_factorCollection->AddFactor(Input,factorOrder[i],factorStrVector[i]));
+		w.SetFactor(factorOrder[i],
+								m_factorCollection->AddFactor(Input,factorOrder[i],
+																							factorStrVector[i]));
 }
 
-bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& factorOrder) {
+bool ConfusionNet::ReadFormat0(std::istream& in,
+															 const std::vector<FactorType>& factorOrder) 
+{
 	assert(m_factorCollection);
 	Clear();
 	std::string line;
 	while(getline(in,line)) {
 		std::istringstream is(line);
-		std::string word;float costs;
+		std::string word;double prob;
 		Column col;
-		while(is>>word>>costs) {
+		while(is>>word>>prob) {
 			Word w;
 			String2Word(word,w,factorOrder);
-			col.push_back(std::make_pair(w,costs));
+			if(prob<0.0) 
+				{
+					std::cerr<<"WARN: negative prob: "<<prob<<" ->set to 0.0\n";
+					prob=0.0;
+				}
+			else if (prob>1.0)
+				{
+					std::cerr<<"WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n";
+					prob=1.0;
+				}
+			col.push_back(std::make_pair(w,std::max(static_cast<float>(log(prob)),
+																							LOWEST_SCORE)));
 		}
 		if(col.size()) {
 			data.push_back(col);
@@ -59,7 +83,9 @@ bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& f
 	}
 	return !data.empty();
 }
-bool ConfusionNet::ReadFormat1(std::istream& in,const std::vector<FactorType>& factorOrder) {
+bool ConfusionNet::ReadFormat1(std::istream& in,
+															 const std::vector<FactorType>& factorOrder) 
+{
 	assert(m_factorCollection);
 	Clear();
 	std::string line;
@@ -110,15 +136,21 @@ std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
 	cn.Print(out);return out;
 }
 
-TargetPhraseCollection const* ConfusionNet::CreateTargetPhraseCollection(PhraseDictionaryBase const& d,const WordsRange& r) const 
+TargetPhraseCollection const* ConfusionNet::
+CreateTargetPhraseCollection(PhraseDictionaryBase const& d,
+														 const WordsRange& r) const 
 {
-	if(PhraseDictionaryTreeAdaptor const* pdict=dynamic_cast<PhraseDictionaryTreeAdaptor const*>(&d))
+	if(PhraseDictionaryTreeAdaptor const* pdict=
+		 dynamic_cast<PhraseDictionaryTreeAdaptor const*>(&d))
 		return pdict->GetTargetPhraseCollection(*this,r);
-	std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!\n"
-		"has to be PhraseDictionaryTreeAdaptor\n";
+
+	std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!"
+		"  Has to be PhraseDictionaryTreeAdaptor\n";
 	abort();
 }
-TranslationOptionCollection* ConfusionNet::CreateTranslationOptionCollection() const 
+
+TranslationOptionCollection* 
+ConfusionNet::CreateTranslationOptionCollection() const 
 {
 	return new TranslationOptionCollectionConfusionNet(*this);
 }
diff --git a/moses/src/Parameter.cpp b/moses/src/Parameter.cpp
index 6b5273b3c..2b1473774 100755
--- a/moses/src/Parameter.cpp
+++ b/moses/src/Parameter.cpp
@@ -97,8 +97,15 @@ bool Parameter::Validate()
 
   // do files exist?
 	// phrase tables
-	if (ret)
-		ret = FilesExist("ttable-file", 3);
+	if (ret) 
+		{
+			std::vector<std::string> ext;
+			// standard phrase table extension (i.e. full name has to be specified)
+			ext.push_back("");
+			// alternative file extension for binary phrase table format:
+			ext.push_back(".binphr.idx");
+			ret = FilesExist("ttable-file", 3,ext);
+		}
 	// generation tables
 	if (ret)
 		ret = FilesExist("generation-file", 2);
@@ -109,7 +116,7 @@ bool Parameter::Validate()
 	return ret;
 }
 
-bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex)
+bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex,std::vector<std::string> const& extensions)
 {
 	using namespace boost::filesystem;
 	
@@ -135,14 +142,21 @@ bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex)
 			return false;
 		}
 		const string &pathStr = vec[tokenizeIndex];
-		path filePath(pathStr, native);
-		if (!exists(filePath))
-		{
-			stringstream errorMsg("");
-			errorMsg << "File " << pathStr << " does not exists";
-			UserMessage::Add(errorMsg.str());
-			return false;
-		}
+
+		bool fileFound=0;
+		for(size_t i=0;i<extensions.size() && !fileFound;++i)
+			{
+				path filePath(pathStr+extensions[i], native);
+				fileFound|=exists(filePath);
+			}
+		if(!fileFound)
+			{
+				stringstream errorMsg("");
+				errorMsg << "File " << pathStr << " does not exists";
+				UserMessage::Add(errorMsg.str());
+				return false;
+			}
+			
 	}
 	return true;
 }
diff --git a/moses/src/Parameter.h b/moses/src/Parameter.h
index a8211b3bc..a43e52c03 100755
--- a/moses/src/Parameter.h
+++ b/moses/src/Parameter.h
@@ -41,7 +41,7 @@ protected:
 	std::string FindParam(const std::string &paramSwitch, int argc, char* argv[]);
 	void OverwriteParam(const std::string &paramSwitch, const std::string &paramName, int argc, char* argv[]);
 	bool ReadConfigFile( std::string filePath );
-	bool FilesExist(const std::string &paramName, size_t tokenizeIndex);
+	bool FilesExist(const std::string &paramName, size_t tokenizeIndex,std::vector<std::string> const& fileExtension=std::vector<std::string>(1,""));
 
 	bool Validate();
 
@@ -59,5 +59,6 @@ public:
 	{
 		return m_setting[paramName];
 	}
+
 };
 
diff --git a/moses/src/PhraseDictionaryTree.cpp b/moses/src/PhraseDictionaryTree.cpp
index c5b448867..5860db893 100644
--- a/moses/src/PhraseDictionaryTree.cpp
+++ b/moses/src/PhraseDictionaryTree.cpp
@@ -5,6 +5,7 @@
 #include <sstream>
 #include <iostream>
 #include <fstream>
+#include <ext/hash_map>
 
 #include "PrefixTree.h"
 #include "File.h"
@@ -29,6 +30,12 @@ typedef std::vector<LabelId> IPhrase;
 typedef std::vector<float> Scores;
 typedef PrefixTreeF<LabelId,off_t> PTF;
 
+namespace __gnu_cxx {
+	template <> struct hash<std::string> {
+		size_t operator()(const std::string& s) const {return __gnu_cxx::__stl_hash_string(s.c_str());}
+	};
+}
+
 template<typename A,typename B=std::map<A,LabelId> >
 class LVoc {
   typedef A Key;
@@ -132,7 +139,8 @@ struct PDTimp {
   typedef PrefixTreeF<LabelId,off_t> PTF;
 	typedef FilePtr<PTF> CPT;
   typedef std::vector<CPT> Data;
-	typedef LVoc<std::string> WordVoc;
+	//	typedef LVoc<std::string> WordVoc;
+	typedef LVoc<std::string,__gnu_cxx::hash_map<std::string,LabelId> > WordVoc;
 
   Data data;
   std::vector<off_t> srcOffsets;
@@ -215,7 +223,7 @@ struct PDTimp {
 	PPtr Extend(PPtr p,const std::string& w) 
 	{
 		assert(p);
-		if(w.empty()) return p;
+		if(w.empty() || w==EPSILON) return p;
 		LabelId wi=sv.index(w);
 		if(wi==InvalidLabelId) return PPtr();
 		else if(p.imp->isRoot()) 
@@ -291,6 +299,13 @@ PhraseDictionaryTree::PhraseDictionaryTree(size_t noScoreComponent,
 	: Dictionary(noScoreComponent),imp(new PDTimp),m_inFactorType(ift),m_outFactorType(oft) 
 {
 	imp->m_factorCollection=fc;
+	if(sizeof(off_t)!=8)
+		{
+			std::cerr<<"ERROR: size of type 'off_t' has to be 64 bit!\n"
+				"use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n"
+				" -> abort \n\n";
+			abort();
+		}
 }
 
 PhraseDictionaryTree::~PhraseDictionaryTree() 
diff --git a/moses/src/PhraseDictionaryTreeAdaptor.cpp b/moses/src/PhraseDictionaryTreeAdaptor.cpp
index fd514db99..0a78cdfe6 100644
--- a/moses/src/PhraseDictionaryTreeAdaptor.cpp
+++ b/moses/src/PhraseDictionaryTreeAdaptor.cpp
@@ -35,6 +35,7 @@ struct PDTAimp {
 		: m_languageModels(0),m_weightWP(0.0),m_factorCollection(0),m_dict(0),
 			m_obj(p),useCache(1) {}
 
+	// convert FactorArray into string
 	void Factors2String(FactorArray const& w,std::string& s) const 
 	{
 		for(size_t j=0;j<m_input.size();++j)
@@ -44,6 +45,7 @@ struct PDTAimp {
 			}
 	}
 
+	// free temporary memory
 	void CleanUp() 
 	{
 		assert(m_dict);
@@ -54,6 +56,7 @@ struct PDTAimp {
 		m_rangeCache.clear();
 	}
 
+	// add phrase pair till next CleanUp, should be used only for unknowns
 	void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase) 
 	{
 		assert(GetTargetPhraseCollection(source)==0);
@@ -70,12 +73,14 @@ struct PDTAimp {
 		else std::cerr<<"WARNING: you added an already existing phrase!\n";
 	}
 
+	// access with full source phrase
 	TargetPhraseCollection const* 
 	GetTargetPhraseCollection(Phrase const &src) const
 	{
 		assert(m_dict);
 		if(src.GetSize()==0) return 0;
 
+		// look up cache
 		std::pair<MapSrc2Tgt::iterator,bool> piter;
 		if(useCache) 
 			{
@@ -84,6 +89,8 @@ struct PDTAimp {
 			}
 		else if (m_cache.size()) 
 			{
+				// cache is also used for unknowns, so even if the cache is disabled
+				// there may be entries
 				MapSrc2Tgt::const_iterator i=m_cache.find(src);
 				return (i!=m_cache.end() ? i->second : 0);
 			}
diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp
index 7da13e37b..44323a3e4 100755
--- a/moses/src/StaticData.cpp
+++ b/moses/src/StaticData.cpp
@@ -418,30 +418,31 @@ void StaticData::LoadPhraseTables(bool filter
 															+ PROJECT_NAME + "--"
 															+ inputFileHash + "--" 
 															+ phraseTableHash + ".txt";
-			bool filterPhrase;
-			if (filter)
-			{
-				boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native);
-				if (boost::filesystem::exists(tempFile))
-				{ // load filtered file instead
-					filterPhrase = false;
-					filePath = hashFilePath;
-				}
-				else
-				{ // load original file & create has file
-					filterPhrase = true;
-				}
-			}
-			else
-			{ // load original file
-				filterPhrase = false;
-			}
-			TRACE_ERR(filePath << endl);
 
 			timer.check("Start loading PhraseTable");
-
 			if (!boost::filesystem::exists(filePath+".binphr.idx")) 
 				{
+					bool filterPhrase;
+					if (filter)
+						{
+							boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native);
+							if (boost::filesystem::exists(tempFile))
+								{ // load filtered file instead
+									filterPhrase = false;
+									filePath = hashFilePath;
+								}
+							else
+								{ // load original file & create has file
+									filterPhrase = true;
+								}
+						}
+					else
+						{ // load original file
+							filterPhrase = false;
+						}
+					TRACE_ERR(filePath << endl);
+
+
 					TRACE_ERR("using standard phrase tables");
 					PhraseDictionary *pd=new PhraseDictionary(noScoreComponent);
 					pd->Load(input
diff --git a/moses/src/TypeDef.h b/moses/src/TypeDef.h
index f1abf17e3..c109f7e12 100755
--- a/moses/src/TypeDef.h
+++ b/moses/src/TypeDef.h
@@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #define SENTENCE_START	"<s>"
 #define SENTENCE_END		"</s>"
 #define UNKNOWN_FACTOR	"UNK"
+#define EPSILON         "*EPS*"
 
 #define NOT_FOUND 			std::numeric_limits<size_t>::max()
 #define MAX_NGRAM_SIZE  20
author	zens <zens@1f5c12ca-751b-0410-a591-d2e778427230>	2006-07-24 22:33:08 +0400
committer	zens <zens@1f5c12ca-751b-0410-a591-d2e778427230>	2006-07-24 22:33:08 +0400
commit	422418008ea4ce09774460e835402b029ec1825b (patch)
tree	96e96980f42a97f2aa7143ae3f440ce68c8060f8
parent	90124bd40335a9e19ebd7a4eb563857d01fc1c3b (diff)