github.com/moses-smt/mosesdecoder.git

author    Oliver Wilson <oliver.wilson@ed.ac.uk>   2012-04-02 19:14:58 +0400
committer Oliver Wilson <oliver.wilson@ed.ac.uk>   2012-04-02 19:14:58 +0400
commit    430b720084e1a7de43b1b27b181a00ab7af04c7b (patch)
tree      0c52aaf12208d0b7558341c8b5bd8606cdd1f0d4 /moses
parent    9686e86c70b7284b55861b0e16b3d16c2f1b4408 (diff)
Import Lossy Distributed Hash Table Language Model implementation.
Diffstat (limited to 'moses')
-rw-r--r--  moses/src/LM/DMap.cpp      135
-rw-r--r--  moses/src/LM/DMap.h         57
-rw-r--r--  moses/src/LM/Factory.cpp    12
-rw-r--r--  moses/src/LM/Jamfile        10
-rw-r--r--  moses/src/LM/LDHT.cpp      282
-rw-r--r--  moses/src/LM/LDHT.h         21
-rw-r--r--  moses/src/TypeDef.h          2
7 files changed, 321 insertions, 198 deletions
diff --git a/moses/src/LM/DMap.cpp b/moses/src/LM/DMap.cpp
deleted file mode 100644
index 3bef47083..000000000
--- a/moses/src/LM/DMap.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-//
-// Oliver Wilson <oliver.wilson@ed.ac.uk>
-//
-
-#include <Config.h>
-
-#include "FactorCollection.h"
-#include "LM/DMapLM.h"
-
-namespace Moses
-{
-
-LanguageModelDMapLM::LanguageModelDMapLM() : m_lm(0) {
-}
-
-LanguageModelDMapLM::~LanguageModelDMapLM() {
- delete m_lm;
-}
-
-bool LanguageModelDMapLM::Load(const std::string& filePath,
- FactorType factorType,
- size_t nGramOrder)
-{
- //std::cerr << "LanguageModelDMapLM: loading..." << std::endl;
- m_filePath = filePath;
- m_factorType = factorType;
- m_nGramOrder = nGramOrder;
- m_sentenceStart = FactorCollection::Instance().AddFactor(Output, factorType, "<s>");
- m_sentenceStartArray[m_factorType] = m_sentenceStart;
- m_sentenceEnd = FactorCollection::Instance().AddFactor(Output, factorType, "</s>");
- m_sentenceEndArray[m_factorType] = m_sentenceEnd;
- std::ifstream configFile(filePath.c_str());
- char struct_name_buffer[1024];
- char run_local_buffer[1024];
- configFile.getline(struct_name_buffer, 1024);
- configFile.getline(run_local_buffer, 1024);
- bool run_local;
- //std::cerr << "table name: " << struct_name_buffer << std::endl;
- //std::cerr << "run local: " << run_local_buffer << std::endl;
- if (strncmp(run_local_buffer, "true", 1024) == 0)
- run_local = true;
- else
- run_local = false;
- m_lm = new StructLanguageModelBackoff(Config::getConfig(), struct_name_buffer);
- return m_lm->init(run_local);
-}
-
-void LanguageModelDMapLM::CreateFactor(FactorCollection& factorCollection)
-{
- // Don't know what this is for.
-}
-
-LMResult LanguageModelDMapLM::GetValueGivenState(
- const std::vector<const Word*>& contextFactor,
- FFState& state) const
-{
- DMapLMState& cast_state = static_cast<DMapLMState&>(state);
- LMResult result;
- size_t succeeding_order;
- size_t target_order = std::min((size_t)cast_state.m_last_succeeding_order + 1,
- GetNGramOrder());
- result.score = GetValue(contextFactor, target_order, &succeeding_order);
- cast_state.m_last_succeeding_order = succeeding_order;
- return result;
-}
-
-LMResult LanguageModelDMapLM::GetValueForgotState(
- const std::vector<const Word*>& contextFactor,
- FFState& outState) const
-{
- DMapLMState& cast_state = static_cast<DMapLMState&>(outState);
- LMResult result;
- size_t succeeding_order;
- size_t target_order = GetNGramOrder();
- result.score = GetValue(contextFactor, target_order, &succeeding_order);
- cast_state.m_last_succeeding_order = succeeding_order;
- return result;
-}
-
-float LanguageModelDMapLM::GetValue(
- const std::vector<const Word*>& contextFactor,
- size_t target_order,
- size_t* succeeding_order) const
-{
- FactorType factorType = GetFactorType();
- float score;
-
- std::string ngram_string("");
- ngram_string.append(((*contextFactor[0])[factorType])->GetString());
- for (size_t i = 1; i < contextFactor.size(); ++i) {
- ngram_string.append(" ");
- ngram_string.append(((*contextFactor[i])[factorType])->GetString());
- }
- //std::cout << "ngram: X" << ngram_string << "X" << std::endl;
- score = m_lm->calcScore(ngram_string.c_str(), target_order, succeeding_order);
- score = FloorScore(TransformLMScore(score));
- return score;
-}
-
-const FFState* LanguageModelDMapLM::GetNullContextState() const {
- DMapLMState* state = new DMapLMState();
- state->m_last_succeeding_order = GetNGramOrder();
- return state;
-}
-
-FFState* LanguageModelDMapLM::GetNewSentenceState() const {
- DMapLMState* state = new DMapLMState();
- state->m_last_succeeding_order = GetNGramOrder();
- return state;
-}
-
-const FFState* LanguageModelDMapLM::GetBeginSentenceState() const {
- DMapLMState* state = new DMapLMState();
- state->m_last_succeeding_order = GetNGramOrder();
- return state;
-}
-
-FFState* LanguageModelDMapLM::NewState(const FFState* state) const {
- DMapLMState* new_state = new DMapLMState();
- const DMapLMState* cast_state = static_cast<const DMapLMState*>(state);
- new_state->m_last_succeeding_order = cast_state->m_last_succeeding_order;
- return new_state;
-}
-
-void LanguageModelDMapLM::CleanUpAfterSentenceProcessing() {
- m_lm->printStats();
- m_lm->resetStats();
- m_lm->clearCaches();
-}
-
-void LanguageModelDMapLM::InitializeBeforeSentenceProcessing() {
-}
-
-} // namespace Moses
-
diff --git a/moses/src/LM/DMap.h b/moses/src/LM/DMap.h
deleted file mode 100644
index f9fbf56c2..000000000
--- a/moses/src/LM/DMap.h
+++ /dev/null
@@ -1,57 +0,0 @@
-//
-// Oliver Wilson <oliver.wilson@ed.ac.uk>
-//
-
-#ifndef moses_LanguageModelDMapLM_h
-#define moses_LanguageModelDMapLM_h
-
-#include <StructLanguageModelBackoff.h>
-
-#include "Factor.h"
-#include "FFState.h"
-#include "LM/SingleFactor.h"
-#include "Util.h"
-
-namespace Moses {
-
-class DMapLMState : public FFState {
-public:
- int Compare(const FFState &o) const {
- const DMapLMState& cast_other = static_cast<const DMapLMState&>(o);
- if (cast_other.m_last_succeeding_order < m_last_succeeding_order)
- return -1;
- else if (cast_other.m_last_succeeding_order > m_last_succeeding_order)
- return 1;
- else
- return 0;
- }
- uint8_t m_last_succeeding_order;
-};
-
-class LanguageModelDMapLM : public LanguageModelSingleFactor
-{
-public:
- LanguageModelDMapLM();
- ~LanguageModelDMapLM();
- bool Load(const std::string&, FactorType, size_t);
- LMResult GetValueGivenState(const std::vector<const Word*>&, FFState&) const;
- LMResult GetValueForgotState(const std::vector<const Word*>&, FFState&) const;
- float GetValue(const std::vector<const Word*>&, size_t, size_t*) const;
- const FFState* GetNullContextState() const;
- FFState* GetNewSentenceState() const;
- const FFState* GetBeginSentenceState() const;
- FFState* NewState(const FFState*) const;
- void CleanUpAfterSentenceProcessing();
- void InitializeBeforeSentenceProcessing();
-
-protected:
- StructLanguageModelBackoff* m_lm;
-
- void CreateFactor(FactorCollection&);
-
-};
-
-} // namespace Moses
-
-#endif // moses_LanguageModelDMapLM_h
-
diff --git a/moses/src/LM/Factory.cpp b/moses/src/LM/Factory.cpp
index 68dc06e4a..3498be55e 100644
--- a/moses/src/LM/Factory.cpp
+++ b/moses/src/LM/Factory.cpp
@@ -46,8 +46,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "LM/Ken.h"
-#ifdef LM_DMAP
-# include "LM/DMapLM.h"
+#ifdef LM_LDHT
+# include "LM/LDHT.h"
#endif
#include "LM/Base.h"
@@ -107,9 +107,11 @@ LanguageModel* CreateLanguageModel(LMImplementation lmImplementation
lm = NewParallelBackoff();
#endif
break;
- case DMapLM:
-#ifdef LM_DMAP
- lm = new LanguageModelDMapLM();
+ case LDHTLM:
+#ifdef LM_LDHT
+ return ConstructLDHTLM(languageModelFile,
+ scoreIndexManager,
+ factorTypes[0]);
#endif
break;
default:
diff --git a/moses/src/LM/Jamfile b/moses/src/LM/Jamfile
index 27cf65f47..74b6f9ee6 100644
--- a/moses/src/LM/Jamfile
+++ b/moses/src/LM/Jamfile
@@ -55,6 +55,16 @@ if $(with-randlm) {
dependencies += rand ;
}
+# LDHTLM
+local with-ldhtlm = [ option.get "with-ldhtlm" ] ;
+if $(with-ldhtlm) {
+ lib LDHT : : <search>$(with-ldhtlm)/lib ;
+ lib ticpp : LDHT : <search>$(with-ldhtlm)/lib ;
+ obj LDHT.o : LDHT.cpp LDHT ..//headers : <include>$(with-ldhtlm)/include <include>$(with-ldhtlm)/include/LDHT <include>/home/wilson/include ;
+ alias ldht : LDHT.o LDHT ticpp : : : <define>LM_LDHT ;
+ dependencies += ldht ;
+}
+
#ORLM is always compiled but needs special headers
obj ORLM.o : ORLM.cpp ..//headers ../DynSAInclude//dynsa : : : <include>../DynSAInclude ;
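
This stanza wires LDHT into the same optional-dependency pattern as the with-randlm block above. As a sketch of how such a build might be invoked — the install prefix is a placeholder for wherever LDHT's lib/ and include/ directories live:

    ./bjam --with-ldhtlm=/path/to/ldht/prefix

Passing the option defines LM_LDHT, which gates the new code paths in Factory.cpp above and LDHT.cpp below.
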
diff --git a/moses/src/LM/LDHT.cpp b/moses/src/LM/LDHT.cpp
new file mode 100644
index 000000000..0c61235b8
--- /dev/null
+++ b/moses/src/LM/LDHT.cpp
@@ -0,0 +1,282 @@
+//
+// Oliver Wilson <oliver.wilson@ed.ac.uk>
+//
+
+#include "LM/Base.h"
+#include "LM/LDHT.h"
+#include "../FFState.h"
+#include "../TypeDef.h"
+#include "../Hypothesis.h"
+
+#include <LDHT/Client.h>
+#include <LDHT/ClientLocal.h>
+#include <LDHT/NewNgram.h>
+#include <LDHT/FactoryCollection.h>
+
+#include <boost/thread/tss.hpp>
+
+namespace Moses {
+
+struct LDHTLMState : public FFState {
+ LDHT::NewNgram gram_fingerprints;
+
+ int Compare(const FFState& uncast_other) const {
+ const LDHTLMState &other = static_cast<const LDHTLMState&>(uncast_other);
+ return gram_fingerprints.compareMoses(other.gram_fingerprints);
+ }
+
+ void copyFrom(const LDHTLMState& other) {
+ gram_fingerprints.copyFrom(other.gram_fingerprints);
+ }
+};
+
+class LanguageModelLDHT : public LanguageModel {
+public:
+ LanguageModelLDHT();
+ LanguageModelLDHT(const std::string& path,
+ ScoreIndexManager& manager,
+ FactorType factorType);
+ LanguageModelLDHT(ScoreIndexManager& manager,
+ LanguageModelLDHT& copyFrom);
+ std::string GetScoreProducerDescription(unsigned) const {
+ std::ostringstream oss;
+ oss << "LM_" << LDHT::NewNgram::k_max_order << "gram";
+ return oss.str();
+ }
+ LDHT::Client* getClientUnsafe() const;
+ LDHT::Client* getClientSafe();
+ LDHT::Client* initTSSClient();
+ virtual ~LanguageModelLDHT();
+ virtual LanguageModel* Duplicate(
+ ScoreIndexManager& scoreIndexManager) const;
+ virtual void InitializeBeforeSentenceProcessing();
+ virtual void CleanUpAfterSentenceProcessing();
+ virtual const FFState* EmptyHypothesisState(const InputType& input) const;
+ virtual bool Useable(const Phrase& phrase) const;
+ virtual void CalcScore(const Phrase& phrase,
+ float& fullScore,
+ float& ngramScore,
+ std::size_t& oovCount) const;
+ FFState* Evaluate(const Hypothesis& hypo,
+ const FFState* input_state,
+ ScoreComponentCollection* score_output) const;
+ FFState* EvaluateChart(const ChartHypothesis& hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const;
+
+protected:
+ boost::thread_specific_ptr<LDHT::Client> m_client;
+ std::string m_configPath;
+ FactorType m_factorType;
+
+};
+
+LanguageModel* ConstructLDHTLM(const std::string& path,
+ ScoreIndexManager& manager,
+ FactorType factorType) {
+ return new LanguageModelLDHT(path, manager, factorType);
+}
+
+LanguageModelLDHT::LanguageModelLDHT() : LanguageModel(), m_client(NULL) {
+ m_enableOOVFeature = false;
+}
+
+LanguageModelLDHT::LanguageModelLDHT(ScoreIndexManager& manager,
+ LanguageModelLDHT& copyFrom) {
+ //m_client = copyFrom.m_client;
+ m_factorType = copyFrom.m_factorType;
+ m_configPath = copyFrom.m_configPath;
+ Init(manager);
+}
+
+LanguageModelLDHT::LanguageModelLDHT(const std::string& path,
+ ScoreIndexManager& manager,
+ FactorType factorType)
+ : m_factorType(factorType) {
+ m_configPath = path;
+ Init(manager);
+}
+
+LanguageModelLDHT::~LanguageModelLDHT() {
+ // TODO(wilson): should clean up for each individual thread.
+ delete getClientSafe();
+}
+
+LanguageModel* LanguageModelLDHT::Duplicate(
+ ScoreIndexManager& scoreIndexManager) const {
+ return NULL;
+}
+
+// Check that there is a TSS Client instance, and instantiate one if
+// there isn't.
+LDHT::Client* LanguageModelLDHT::getClientSafe() {
+ if (m_client.get() == NULL)
+ m_client.reset(initTSSClient());
+ return m_client.get();
+}
+
+// Do not check that there is a TSS Client instance.
+LDHT::Client* LanguageModelLDHT::getClientUnsafe() const {
+ return m_client.get();
+}
+
+LDHT::Client* LanguageModelLDHT::initTSSClient() {
+ std::ifstream config_file(m_configPath.c_str());
+ std::string ldht_config_path;
+ getline(config_file, ldht_config_path);
+ std::string ldhtlm_config_path;
+ getline(config_file, ldhtlm_config_path);
+
+ LDHT::FactoryCollection* factory_collection =
+ LDHT::FactoryCollection::createDefaultFactoryCollection();
+
+ LDHT::Client* client;
+ client = new LDHT::ClientLocal();
+ //client = new LDHT::Client();
+ client->fromXmlFiles(*factory_collection,
+ ldht_config_path,
+ ldhtlm_config_path);
+ return client;
+}
+
+void LanguageModelLDHT::InitializeBeforeSentenceProcessing() {
+ getClientSafe()->clearCache();
+}
+
+void LanguageModelLDHT::CleanUpAfterSentenceProcessing() {
+}
+
+const FFState* LanguageModelLDHT::EmptyHypothesisState(
+ const InputType& input) const {
+ return NULL;
+}
+
+bool LanguageModelLDHT::Useable(const Phrase& phrase) const {
+ return (phrase.GetSize() > 0 && phrase.GetFactor(0, m_factorType) != NULL);
+}
+
+void LanguageModelLDHT::CalcScore(const Phrase& phrase,
+ float& fullScore,
+ float& ngramScore,
+ std::size_t& oovCount) const {
+ // TODO(wilson): handle nonterminal words.
+ LDHT::Client* client = getClientUnsafe();
+ // Score the first (order - 1) words of the phrase.
+ int order = LDHT::NewNgram::k_max_order;
+ int prefix_start = 0;
+ int prefix_end = std::min(phrase.GetSize(), static_cast<size_t>(order - 1));
+ LDHT::NewNgram ngram;
+ std::deque<int> full_score_tags;
+ for (int word_idx = prefix_start; word_idx < prefix_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ full_score_tags.push_back(client->requestNgram(ngram));
+ }
+ // Now score all subsequent ngrams to end of phrase.
+ int internal_start = prefix_end;
+ int internal_end = phrase.GetSize();
+ std::deque<int> internal_score_tags;
+ for (int word_idx = internal_start; word_idx < internal_end; ++word_idx) {
+ ngram.appendGram(phrase.GetWord(word_idx)
+ .GetFactor(m_factorType)->GetString().c_str());
+ internal_score_tags.push_back(client->requestNgram(ngram));
+ }
+
+ // Wait for responses from the servers.
+ client->awaitResponses();
+
+ // Calculate the full phrase score, and the internal score.
+ fullScore = 0.0;
+ while (!full_score_tags.empty()) {
+ fullScore += client->getNgramScore(full_score_tags.front());
+ full_score_tags.pop_front();
+ }
+ ngramScore = 0.0;
+ while (!internal_score_tags.empty()) {
+ float score = client->getNgramScore(internal_score_tags.front());
+ internal_score_tags.pop_front();
+ fullScore += score;
+ ngramScore += score;
+ }
+ fullScore = TransformLMScore(fullScore);
+ ngramScore = TransformLMScore(ngramScore);
+ oovCount = 0;
+}
+
+FFState* LanguageModelLDHT::Evaluate(
+ const Hypothesis& hypo,
+ const FFState* input_state,
+ ScoreComponentCollection* score_output) const {
+ // TODO(wilson): handle nonterminal words.
+ LDHT::Client* client = getClientUnsafe();
+
+ // Create a new state and copy the contents of the input_state if
+ // supplied.
+ LDHTLMState* new_state = new LDHTLMState();
+ if (input_state == NULL) {
+ if (hypo.GetCurrTargetWordsRange().GetStartPos() != 0) {
+ V("got a null state but not at start of sentence");
+ abort();
+ }
+ new_state->gram_fingerprints.appendGram(BOS_);
+ }
+ else {
+ if (hypo.GetCurrTargetWordsRange().GetStartPos() == 0) {
+ V("got a non null state but at start of sentence");
+ abort();
+ }
+ new_state->copyFrom(static_cast<const LDHTLMState&>(*input_state));
+ }
+
+ // Score ngrams that overlap with the previous phrase.
+ int order = LDHT::NewNgram::k_max_order;
+ int phrase_start = hypo.GetCurrTargetWordsRange().GetStartPos();
+ int phrase_end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
+ int overlap_start = phrase_start;
+ int overlap_end = std::min(phrase_end, phrase_start + order - 1);
+ int word_idx = overlap_start;
+ LDHT::NewNgram& ngram = new_state->gram_fingerprints;
+ std::deque<int> request_tags;
+ for (; word_idx < overlap_end; ++word_idx) {
+ ngram.appendGram(
+ hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
+ request_tags.push_back(client->requestNgram(ngram));
+ }
+ // No need to score phrase internal ngrams, but keep track of them
+ // in the state (which in this case is the NewNgram containing the
+ // hashes of the individual grams).
+ for (; word_idx < phrase_end; ++word_idx) {
+ ngram.appendGram(
+ hypo.GetFactor(word_idx, m_factorType)->GetString().c_str());
+ }
+ // If this is the last phrase in the sentence, score the last ngram
+ // with the end of sentence marker on it.
+ if (hypo.IsSourceCompleted()) {
+ ngram.appendGram(EOS_);
+ request_tags.push_back(client->requestNgram(ngram));
+ }
+ // Await responses from the server.
+ client->awaitResponses();
+
+ // Calculate scores given the request tags.
+ float score = 0;
+ while (!request_tags.empty()) {
+ score += client->getNgramScore(request_tags.front());
+ request_tags.pop_front();
+ }
+
+ score = FloorScore(TransformLMScore(score));
+ score_output->PlusEquals(this, score);
+
+ return new_state;
+}
+
+FFState* LanguageModelLDHT::EvaluateChart(
+ const ChartHypothesis& hypo,
+ int featureID,
+ ScoreComponentCollection* accumulator) const {
+ return NULL;
+}
+
+} // namespace Moses.
+
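The client API used throughout this file is batched and asynchronous: requestNgram() queues a lookup and returns a tag, awaitResponses() blocks once for the whole batch, and getNgramScore() retrieves each result by tag. Below is a minimal standalone sketch of that flow, assuming LDHT is installed and its two XML config files exist — the file names and the example grams are placeholders:

    #include <LDHT/Client.h>
    #include <LDHT/ClientLocal.h>
    #include <LDHT/FactoryCollection.h>
    #include <LDHT/NewNgram.h>

    #include <deque>
    #include <iostream>
    #include <string>

    int main() {
        // Build an in-process client from the two XML configs (placeholder
        // paths), mirroring LanguageModelLDHT::initTSSClient() above.
        LDHT::FactoryCollection* factories =
            LDHT::FactoryCollection::createDefaultFactoryCollection();
        LDHT::Client* client = new LDHT::ClientLocal();
        client->fromXmlFiles(*factories, "ldht.xml", "ldhtlm.xml");

        // Queue one request per ngram; NewNgram keeps a rolling context,
        // so each appendGram() extends the history by one word.
        LDHT::NewNgram ngram;
        std::deque<int> tags;
        const char* words[] = { "<s>", "hello", "world" };
        for (int i = 0; i < 3; ++i) {
            ngram.appendGram(words[i]);
            tags.push_back(client->requestNgram(ngram));
        }

        // One round trip for the whole batch.
        client->awaitResponses();

        // Drain the tags in request order and sum the log scores.
        float score = 0;
        while (!tags.empty()) {
            score += client->getNgramScore(tags.front());
            tags.pop_front();
        }
        std::cout << "total score: " << score << std::endl;

        delete client;
        return 0;
    }

The same request/await/drain shape appears in both CalcScore() and Evaluate() above; the decoder-side difference is only where the grams come from and which scores the tags feed.
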
diff --git a/moses/src/LM/LDHT.h b/moses/src/LM/LDHT.h
new file mode 100644
index 000000000..dc60d0725
--- /dev/null
+++ b/moses/src/LM/LDHT.h
@@ -0,0 +1,21 @@
+//
+// Oliver Wilson <oliver.wilson@ed.ac.uk>
+//
+
+#ifndef moses_LanguageModelLDHT_h
+#define moses_LanguageModelLDHT_h
+
+#include "../TypeDef.h"
+
+namespace Moses {
+
+class ScoreIndexManager;
+class LanguageModel;
+
+LanguageModel* ConstructLDHTLM(const std::string& file,
+ ScoreIndexManager& manager,
+ FactorType factorType);
+} // namespace Moses.
+
+#endif // moses_LanguageModelLDHT_h
+
diff --git a/moses/src/TypeDef.h b/moses/src/TypeDef.h
index a7d6b6b20..038d5c944 100644
--- a/moses/src/TypeDef.h
+++ b/moses/src/TypeDef.h
@@ -120,7 +120,7 @@ enum LMImplementation {
,Ken = 8
,LazyKen = 9
,ORLM = 10
- ,DMapLM = 11
+ ,LDHTLM = 11
};
enum PhraseTableImplementation {
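
With LDHTLM taking slot 11 in the enum, the model would be selected from moses.ini like any other LM implementation. A hedged sketch, assuming the standard [lmodel-file] syntax of implementation, factor, order, then path (the path is a placeholder, and the order field is effectively unused here, since ConstructLDHTLM() takes no order and LDHT::NewNgram::k_max_order governs scoring):

    [lmodel-file]
    11 0 5 /path/to/ldht.moses.config

The file named there is the two-line config that initTSSClient() reads: the first line gives the LDHT client XML config, the second the LDHTLM XML config.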