Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lm/Jamfile4
-rw-r--r--lm/builder/Jamfile4
-rw-r--r--lm/builder/dump_counts_main.cc36
-rw-r--r--lm/builder/print.cc5
-rw-r--r--lm/ngram_query.hh2
-rw-r--r--moses/LM/NeuralLMWrapper.cpp6
6 files changed, 47 insertions, 10 deletions
diff --git a/lm/Jamfile b/lm/Jamfile
index 6ca37c99e..227b22014 100644
--- a/lm/Jamfile
+++ b/lm/Jamfile
@@ -17,7 +17,7 @@ wrappers = ;
local with-nplm = [ option.get "with-nplm" ] ;
if $(with-nplm) {
lib neuralLM : : <search>$(with-nplm)/src ;
- obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen <cxxflags>-fopenmp ;
+ obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ;
alias nplm : nplm.o neuralLM ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ;
wrappers += nplm ;
}
@@ -37,4 +37,4 @@ for local p in [ glob *_main.cc ] {
exes += $(name) ;
}
-alias programs : $(exes) filter//filter : <threading>multi:<source>builder//lmplz ;
+alias programs : $(exes) filter//filter builder//dump_counts : <threading>multi:<source>builder//lmplz ;
diff --git a/lm/builder/Jamfile b/lm/builder/Jamfile
index b596e086a..1e0e18b5f 100644
--- a/lm/builder/Jamfile
+++ b/lm/builder/Jamfile
@@ -4,6 +4,10 @@ fakelib builder : [ glob *.cc : *test.cc *main.cc ]
exe lmplz : lmplz_main.cc builder /top//boost_program_options ;
+exe dump_counts : dump_counts_main.cc builder ;
+
+alias programs : lmplz dump_counts ;
+
import testing ;
unit-test corpus_count_test : corpus_count_test.cc builder /top//boost_unit_test_framework ;
unit-test adjust_counts_test : adjust_counts_test.cc builder /top//boost_unit_test_framework ;
diff --git a/lm/builder/dump_counts_main.cc b/lm/builder/dump_counts_main.cc
new file mode 100644
index 000000000..fa0016792
--- /dev/null
+++ b/lm/builder/dump_counts_main.cc
@@ -0,0 +1,36 @@
+#include "lm/builder/print.hh"
+#include "lm/word_index.hh"
+#include "util/file.hh"
+#include "util/read_compressed.hh"
+
+#include <boost/lexical_cast.hpp>
+
+#include <iostream>
+#include <vector>
+
+int main(int argc, char *argv[]) {
+ if (argc != 4) {
+ std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n"
+ "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n"
+ "counts. Each record has order many vocabulary ids.\n"
+ "The vocabulary file contains the words delimited by NULL in order of id.\n"
+ "The vocabulary file may not be compressed because it is mmapped but the counts\n"
+ "file can be compressed.\n";
+ return 1;
+ }
+ util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
+ util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
+ lm::builder::VocabReconstitute vocab(vocab_file.get());
+ unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
+ std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
+ while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
+ UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size());
+ const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin());
+ for (const lm::WordIndex *i = words; i != words + order; ++i) {
+ UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
+ std::cout << vocab.Lookup(*i) << ' ';
+ }
+ // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FakeOFStream.
+ std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
+ }
+}
diff --git a/lm/builder/print.cc b/lm/builder/print.cc
index c70e62ed6..75f15f0a6 100644
--- a/lm/builder/print.cc
+++ b/lm/builder/print.cc
@@ -54,9 +54,8 @@ void PrintARPA::Run(const util::stream::ChainPositions &positions) {
for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
out << ' ' << vocab_.Lookup(*i);
}
- float backoff = stream->Value().complete.backoff;
- if (backoff != 0.0)
- out << '\t' << backoff;
+ if (order != positions.size())
+ out << '\t' << stream->Value().complete.backoff;
out << '\n';
}
diff --git a/lm/ngram_query.hh b/lm/ngram_query.hh
index 9e32d113a..5f330c5cc 100644
--- a/lm/ngram_query.hh
+++ b/lm/ngram_query.hh
@@ -36,7 +36,7 @@ struct FullPrint : public BasicPrint {
"Perplexity including OOVs:\t" << ppl_including_oov << "\n"
"Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n"
"OOVs:\t" << corpus_oov << "\n"
- "Tokenss:\t" << corpus_tokens << '\n'
+ "Tokens:\t" << corpus_tokens << '\n'
;
}
};
diff --git a/moses/LM/NeuralLMWrapper.cpp b/moses/LM/NeuralLMWrapper.cpp
index 9411bd2c4..ab7b5400b 100644
--- a/moses/LM/NeuralLMWrapper.cpp
+++ b/moses/LM/NeuralLMWrapper.cpp
@@ -4,7 +4,6 @@
#include <boost/functional/hash.hpp>
#include "NeuralLMWrapper.h"
#include "neuralLM.h"
-#include <model.h>
using namespace std;
@@ -34,7 +33,6 @@ void NeuralLMWrapper::Load()
m_sentenceEndWord[m_factorType] = m_sentenceEnd;
m_neuralLM_shared = new nplm::neuralLM(m_filePath, true);
- m_neuralLM_shared->set_log_base(10);
//TODO: config option?
m_neuralLM_shared->set_cache(1000000);
@@ -56,7 +54,7 @@ LMResult NeuralLMWrapper::GetValue(const vector<const Word*> &contextFactor, Sta
for (size_t i=0, n=contextFactor.size(); i<n; i++) {
const Word* word = contextFactor[i];
const Factor* factor = word->GetFactor(m_factorType);
- const std::string string= factor->GetString().as_string();
+ const std::string string = factor->GetString().as_string();
int neuralLM_wordID = m_neuralLM->lookup_word(string);
words[i] = neuralLM_wordID;
boost::hash_combine(hashCode, neuralLM_wordID);
@@ -66,7 +64,7 @@ LMResult NeuralLMWrapper::GetValue(const vector<const Word*> &contextFactor, Sta
// Create a new struct to hold the result
LMResult ret;
- ret.score = value;
+ ret.score = FloorScore(value);
ret.unknown = false;
(*finalState) = (State*) hashCode;