Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author zens <zens@1f5c12ca-751b-0410-a591-d2e778427230> 2006-07-24 22:33:08 +0400
committer zens <zens@1f5c12ca-751b-0410-a591-d2e778427230> 2006-07-24 22:33:08 +0400
commit 422418008ea4ce09774460e835402b029ec1825b (patch)
tree 96e96980f42a97f2aa7143ae3f440ce68c8060f8
parent 90124bd40335a9e19ebd7a4eb563857d01fc1c3b (diff)
- confusion net:
  * more robust read functions
  * correct treatment of epsilons
  * code cleanup
- parameter: fixed check for binary phrase table
- staticData: do not read input phrases in case of binary phrase table

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@260 1f5c12ca-751b-0410-a591-d2e778427230
-rw-r--r-- misc/Makefile 2
-rw-r--r-- misc/processPhraseTable.cpp 2
-rw-r--r-- moses/src/ConfusionNet.cpp 66
-rwxr-xr-x moses/src/Parameter.cpp 36
-rwxr-xr-x moses/src/Parameter.h 3
-rw-r--r-- moses/src/PhraseDictionaryTree.cpp 19
-rw-r--r-- moses/src/PhraseDictionaryTreeAdaptor.cpp 7
-rwxr-xr-x moses/src/StaticData.cpp 41
-rwxr-xr-x moses/src/TypeDef.h 1
9 files changed, 124 insertions, 53 deletions
diff --git a/misc/Makefile b/misc/Makefile
index bfe173fcf..a21823f27 100644
--- a/misc/Makefile
+++ b/misc/Makefile
@@ -13,7 +13,7 @@ default: processPhraseTable
processPhraseTable.o: processPhraseTable.cpp
$(CXX) $(CXXFLAGS) $(INCLUDES) $< -c -o $@
-MOSESLIB =$(HOME)/workspace/moses/src/libmoses.a
+MOSESLIB =../moses/src/libmoses.a
processPhraseTable: processPhraseTable.o $(MOSESLIB)
$(CXX) $(LDFLAGS) $^ -o $@ $(LIBS)
diff --git a/misc/processPhraseTable.cpp b/misc/processPhraseTable.cpp
index ec5af4f99..d7becb8a6 100644
--- a/misc/processPhraseTable.cpp
+++ b/misc/processPhraseTable.cpp
@@ -174,7 +174,7 @@ int main(int argc,char **argv) {
for(size_t i=0;i<pdicts.size();++i)
weights.push_back(std::vector<float>(noScoreComponent,1/(1.0*noScoreComponent)));
- while(net.Read(std::cin,factorOrder,cn-1)) {
+ while(net.ReadF(std::cin,factorOrder,cn-1)) {
net.Print(std::cerr);
GenerateCandidates(net,pdicts,weights,verb);
}
diff --git a/moses/src/ConfusionNet.cpp b/moses/src/ConfusionNet.cpp
index 235ae87a4..02bc2e7ad 100644
--- a/moses/src/ConfusionNet.cpp
+++ b/moses/src/ConfusionNet.cpp
@@ -7,49 +7,73 @@
#include "PhraseDictionaryTreeAdaptor.h"
#include "TranslationOptionCollectionConfusionNet.h"
-ConfusionNet::ConfusionNet(FactorCollection* p) : InputType(),m_factorCollection(p) {}
+ConfusionNet::ConfusionNet(FactorCollection* p)
+ : InputType(),m_factorCollection(p) {}
void ConfusionNet::SetFactorCollection(FactorCollection *p)
{
m_factorCollection=p;
}
-bool ConfusionNet::ReadF(std::istream& in,const std::vector<FactorType>& factorOrder,int format) {
- std::cerr<<"cn read with format "<<format<<"\n";
+bool ConfusionNet::ReadF(std::istream& in,
+ const std::vector<FactorType>& factorOrder,
+ int format)
+{
+ TRACE_ERR("read confusion net with format "<<format<<"\n");
switch(format)
{
case 0: return ReadFormat0(in,factorOrder);
case 1: return ReadFormat1(in,factorOrder);
default:
- std::cerr<<"ERROR: unknown format '"<<format<<"' in ConfusionNet::Read\n";
+ std::cerr<<"ERROR: unknown format '"<<format
+ <<"' in ConfusionNet::Read\n";
}
return 0;
}
-int ConfusionNet::Read(std::istream& in,const std::vector<FactorType>& factorOrder, FactorCollection &factorCollection)
+int ConfusionNet::Read(std::istream& in,
+ const std::vector<FactorType>& factorOrder,
+ FactorCollection &factorCollection)
{
SetFactorCollection(&factorCollection);
return ReadF(in,factorOrder,0);
}
-void ConfusionNet::String2Word(const std::string& s,Word& w,const std::vector<FactorType>& factorOrder) {
+void ConfusionNet::String2Word(const std::string& s,Word& w,
+ const std::vector<FactorType>& factorOrder)
+{
std::vector<std::string> factorStrVector = Tokenize(s, "|");
for(size_t i=0;i<factorOrder.size();++i)
- w.SetFactor(factorOrder[i],m_factorCollection->AddFactor(Input,factorOrder[i],factorStrVector[i]));
+ w.SetFactor(factorOrder[i],
+ m_factorCollection->AddFactor(Input,factorOrder[i],
+ factorStrVector[i]));
}
-bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& factorOrder) {
+bool ConfusionNet::ReadFormat0(std::istream& in,
+ const std::vector<FactorType>& factorOrder)
+{
assert(m_factorCollection);
Clear();
std::string line;
while(getline(in,line)) {
std::istringstream is(line);
- std::string word;float costs;
+ std::string word;double prob;
Column col;
- while(is>>word>>costs) {
+ while(is>>word>>prob) {
Word w;
String2Word(word,w,factorOrder);
- col.push_back(std::make_pair(w,costs));
+ if(prob<0.0)
+ {
+ std::cerr<<"WARN: negative prob: "<<prob<<" ->set to 0.0\n";
+ prob=0.0;
+ }
+ else if (prob>1.0)
+ {
+ std::cerr<<"WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n";
+ prob=1.0;
+ }
+ col.push_back(std::make_pair(w,std::max(static_cast<float>(log(prob)),
+ LOWEST_SCORE)));
}
if(col.size()) {
data.push_back(col);
@@ -59,7 +83,9 @@ bool ConfusionNet::ReadFormat0(std::istream& in,const std::vector<FactorType>& f
}
return !data.empty();
}
-bool ConfusionNet::ReadFormat1(std::istream& in,const std::vector<FactorType>& factorOrder) {
+bool ConfusionNet::ReadFormat1(std::istream& in,
+ const std::vector<FactorType>& factorOrder)
+{
assert(m_factorCollection);
Clear();
std::string line;
@@ -110,15 +136,21 @@ std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
cn.Print(out);return out;
}
-TargetPhraseCollection const* ConfusionNet::CreateTargetPhraseCollection(PhraseDictionaryBase const& d,const WordsRange& r) const
+TargetPhraseCollection const* ConfusionNet::
+CreateTargetPhraseCollection(PhraseDictionaryBase const& d,
+ const WordsRange& r) const
{
- if(PhraseDictionaryTreeAdaptor const* pdict=dynamic_cast<PhraseDictionaryTreeAdaptor const*>(&d))
+ if(PhraseDictionaryTreeAdaptor const* pdict=
+ dynamic_cast<PhraseDictionaryTreeAdaptor const*>(&d))
return pdict->GetTargetPhraseCollection(*this,r);
- std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!\n"
- "has to be PhraseDictionaryTreeAdaptor\n";
+
+ std::cerr<<"ERROR: wrong phrase dictionary type for confusion net decoding!"
+ " Has to be PhraseDictionaryTreeAdaptor\n";
abort();
}
-TranslationOptionCollection* ConfusionNet::CreateTranslationOptionCollection() const
+
+TranslationOptionCollection*
+ConfusionNet::CreateTranslationOptionCollection() const
{
return new TranslationOptionCollectionConfusionNet(*this);
}
diff --git a/moses/src/Parameter.cpp b/moses/src/Parameter.cpp
index 6b5273b3c..2b1473774 100755
--- a/moses/src/Parameter.cpp
+++ b/moses/src/Parameter.cpp
@@ -97,8 +97,15 @@ bool Parameter::Validate()
// do files exist?
// phrase tables
- if (ret)
- ret = FilesExist("ttable-file", 3);
+ if (ret)
+ {
+ std::vector<std::string> ext;
+ // standard phrase table extension (i.e. full name has to be specified)
+ ext.push_back("");
+ // alternative file extension for binary phrase table format:
+ ext.push_back(".binphr.idx");
+ ret = FilesExist("ttable-file", 3,ext);
+ }
// generation tables
if (ret)
ret = FilesExist("generation-file", 2);
@@ -109,7 +116,7 @@ bool Parameter::Validate()
return ret;
}
-bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex)
+bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex,std::vector<std::string> const& extensions)
{
using namespace boost::filesystem;
@@ -135,14 +142,21 @@ bool Parameter::FilesExist(const string &paramName, size_t tokenizeIndex)
return false;
}
const string &pathStr = vec[tokenizeIndex];
- path filePath(pathStr, native);
- if (!exists(filePath))
- {
- stringstream errorMsg("");
- errorMsg << "File " << pathStr << " does not exists";
- UserMessage::Add(errorMsg.str());
- return false;
- }
+
+ bool fileFound=0;
+ for(size_t i=0;i<extensions.size() && !fileFound;++i)
+ {
+ path filePath(pathStr+extensions[i], native);
+ fileFound|=exists(filePath);
+ }
+ if(!fileFound)
+ {
+ stringstream errorMsg("");
+ errorMsg << "File " << pathStr << " does not exists";
+ UserMessage::Add(errorMsg.str());
+ return false;
+ }
+
}
return true;
}
diff --git a/moses/src/Parameter.h b/moses/src/Parameter.h
index a8211b3bc..a43e52c03 100755
--- a/moses/src/Parameter.h
+++ b/moses/src/Parameter.h
@@ -41,7 +41,7 @@ protected:
std::string FindParam(const std::string &paramSwitch, int argc, char* argv[]);
void OverwriteParam(const std::string &paramSwitch, const std::string &paramName, int argc, char* argv[]);
bool ReadConfigFile( std::string filePath );
- bool FilesExist(const std::string &paramName, size_t tokenizeIndex);
+ bool FilesExist(const std::string &paramName, size_t tokenizeIndex,std::vector<std::string> const& fileExtension=std::vector<std::string>(1,""));
bool Validate();
@@ -59,5 +59,6 @@ public:
{
return m_setting[paramName];
}
+
};
diff --git a/moses/src/PhraseDictionaryTree.cpp b/moses/src/PhraseDictionaryTree.cpp
index c5b448867..5860db893 100644
--- a/moses/src/PhraseDictionaryTree.cpp
+++ b/moses/src/PhraseDictionaryTree.cpp
@@ -5,6 +5,7 @@
#include <sstream>
#include <iostream>
#include <fstream>
+#include <ext/hash_map>
#include "PrefixTree.h"
#include "File.h"
@@ -29,6 +30,12 @@ typedef std::vector<LabelId> IPhrase;
typedef std::vector<float> Scores;
typedef PrefixTreeF<LabelId,off_t> PTF;
+namespace __gnu_cxx {
+ template <> struct hash<std::string> {
+ size_t operator()(const std::string& s) const {return __gnu_cxx::__stl_hash_string(s.c_str());}
+ };
+}
+
template<typename A,typename B=std::map<A,LabelId> >
class LVoc {
typedef A Key;
@@ -132,7 +139,8 @@ struct PDTimp {
typedef PrefixTreeF<LabelId,off_t> PTF;
typedef FilePtr<PTF> CPT;
typedef std::vector<CPT> Data;
- typedef LVoc<std::string> WordVoc;
+ // typedef LVoc<std::string> WordVoc;
+ typedef LVoc<std::string,__gnu_cxx::hash_map<std::string,LabelId> > WordVoc;
Data data;
std::vector<off_t> srcOffsets;
@@ -215,7 +223,7 @@ struct PDTimp {
PPtr Extend(PPtr p,const std::string& w)
{
assert(p);
- if(w.empty()) return p;
+ if(w.empty() || w==EPSILON) return p;
LabelId wi=sv.index(w);
if(wi==InvalidLabelId) return PPtr();
else if(p.imp->isRoot())
@@ -291,6 +299,13 @@ PhraseDictionaryTree::PhraseDictionaryTree(size_t noScoreComponent,
: Dictionary(noScoreComponent),imp(new PDTimp),m_inFactorType(ift),m_outFactorType(oft)
{
imp->m_factorCollection=fc;
+ if(sizeof(off_t)!=8)
+ {
+ std::cerr<<"ERROR: size of type 'off_t' has to be 64 bit!\n"
+ "use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n"
+ " -> abort \n\n";
+ abort();
+ }
}
PhraseDictionaryTree::~PhraseDictionaryTree()
diff --git a/moses/src/PhraseDictionaryTreeAdaptor.cpp b/moses/src/PhraseDictionaryTreeAdaptor.cpp
index fd514db99..0a78cdfe6 100644
--- a/moses/src/PhraseDictionaryTreeAdaptor.cpp
+++ b/moses/src/PhraseDictionaryTreeAdaptor.cpp
@@ -35,6 +35,7 @@ struct PDTAimp {
: m_languageModels(0),m_weightWP(0.0),m_factorCollection(0),m_dict(0),
m_obj(p),useCache(1) {}
+ // convert FactorArray into string
void Factors2String(FactorArray const& w,std::string& s) const
{
for(size_t j=0;j<m_input.size();++j)
@@ -44,6 +45,7 @@ struct PDTAimp {
}
}
+ // free temporary memory
void CleanUp()
{
assert(m_dict);
@@ -54,6 +56,7 @@ struct PDTAimp {
m_rangeCache.clear();
}
+ // add phrase pair till next CleanUp, should be used only for unknowns
void AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)
{
assert(GetTargetPhraseCollection(source)==0);
@@ -70,12 +73,14 @@ struct PDTAimp {
else std::cerr<<"WARNING: you added an already existing phrase!\n";
}
+ // access with full source phrase
TargetPhraseCollection const*
GetTargetPhraseCollection(Phrase const &src) const
{
assert(m_dict);
if(src.GetSize()==0) return 0;
+ // look up cache
std::pair<MapSrc2Tgt::iterator,bool> piter;
if(useCache)
{
@@ -84,6 +89,8 @@ struct PDTAimp {
}
else if (m_cache.size())
{
+ // cache is also used for unknowns, so even if the cache is disabled
+ // there may be entries
MapSrc2Tgt::const_iterator i=m_cache.find(src);
return (i!=m_cache.end() ? i->second : 0);
}
diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp
index 7da13e37b..44323a3e4 100755
--- a/moses/src/StaticData.cpp
+++ b/moses/src/StaticData.cpp
@@ -418,30 +418,31 @@ void StaticData::LoadPhraseTables(bool filter
+ PROJECT_NAME + "--"
+ inputFileHash + "--"
+ phraseTableHash + ".txt";
- bool filterPhrase;
- if (filter)
- {
- boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native);
- if (boost::filesystem::exists(tempFile))
- { // load filtered file instead
- filterPhrase = false;
- filePath = hashFilePath;
- }
- else
- { // load original file & create has file
- filterPhrase = true;
- }
- }
- else
- { // load original file
- filterPhrase = false;
- }
- TRACE_ERR(filePath << endl);
timer.check("Start loading PhraseTable");
-
if (!boost::filesystem::exists(filePath+".binphr.idx"))
{
+ bool filterPhrase;
+ if (filter)
+ {
+ boost::filesystem::path tempFile(hashFilePath, boost::filesystem::native);
+ if (boost::filesystem::exists(tempFile))
+ { // load filtered file instead
+ filterPhrase = false;
+ filePath = hashFilePath;
+ }
+ else
+ { // load original file & create has file
+ filterPhrase = true;
+ }
+ }
+ else
+ { // load original file
+ filterPhrase = false;
+ }
+ TRACE_ERR(filePath << endl);
+
+
TRACE_ERR("using standard phrase tables");
PhraseDictionary *pd=new PhraseDictionary(noScoreComponent);
pd->Load(input
diff --git a/moses/src/TypeDef.h b/moses/src/TypeDef.h
index f1abf17e3..c109f7e12 100755
--- a/moses/src/TypeDef.h
+++ b/moses/src/TypeDef.h
@@ -29,6 +29,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#define SENTENCE_START "<s>"
#define SENTENCE_END "</s>"
#define UNKNOWN_FACTOR "UNK"
+#define EPSILON "*EPS*"
#define NOT_FOUND std::numeric_limits<size_t>::max()
#define MAX_NGRAM_SIZE 20