diff options
-rw-r--r-- | lm/Jamfile | 4 | ||||
-rw-r--r-- | lm/builder/Jamfile | 4 | ||||
-rw-r--r-- | lm/builder/dump_counts_main.cc | 36 | ||||
-rw-r--r-- | lm/builder/print.cc | 5 | ||||
-rw-r--r-- | lm/ngram_query.hh | 2 | ||||
-rw-r--r-- | moses/LM/NeuralLMWrapper.cpp | 6 |
6 files changed, 47 insertions, 10 deletions
diff --git a/lm/Jamfile b/lm/Jamfile index 6ca37c99e..227b22014 100644 --- a/lm/Jamfile +++ b/lm/Jamfile @@ -17,7 +17,7 @@ wrappers = ; local with-nplm = [ option.get "with-nplm" ] ; if $(with-nplm) { lib neuralLM : : <search>$(with-nplm)/src ; - obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <include>$(with-nplm)/3rdparty/eigen <cxxflags>-fopenmp ; + obj nplm.o : wrappers/nplm.cc : <include>.. <include>$(with-nplm)/src <cxxflags>-fopenmp ; alias nplm : nplm.o neuralLM ..//boost_thread : : : <cxxflags>-fopenmp <linkflags>-fopenmp <define>WITH_NPLM <library>..//boost_thread ; wrappers += nplm ; } @@ -37,4 +37,4 @@ for local p in [ glob *_main.cc ] { exes += $(name) ; } -alias programs : $(exes) filter//filter : <threading>multi:<source>builder//lmplz ; +alias programs : $(exes) filter//filter builder//dump_counts : <threading>multi:<source>builder//lmplz ; diff --git a/lm/builder/Jamfile b/lm/builder/Jamfile index b596e086a..1e0e18b5f 100644 --- a/lm/builder/Jamfile +++ b/lm/builder/Jamfile @@ -4,6 +4,10 @@ fakelib builder : [ glob *.cc : *test.cc *main.cc ] exe lmplz : lmplz_main.cc builder /top//boost_program_options ; +exe dump_counts : dump_counts_main.cc builder ; + +alias programs : lmplz dump_counts ; + import testing ; unit-test corpus_count_test : corpus_count_test.cc builder /top//boost_unit_test_framework ; unit-test adjust_counts_test : adjust_counts_test.cc builder /top//boost_unit_test_framework ; diff --git a/lm/builder/dump_counts_main.cc b/lm/builder/dump_counts_main.cc new file mode 100644 index 000000000..fa0016792 --- /dev/null +++ b/lm/builder/dump_counts_main.cc @@ -0,0 +1,36 @@ +#include "lm/builder/print.hh" +#include "lm/word_index.hh" +#include "util/file.hh" +#include "util/read_compressed.hh" + +#include <boost/lexical_cast.hpp> + +#include <iostream> +#include <vector> + +int main(int argc, char *argv[]) { + if (argc != 4) { + std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n" + "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n" + "counts. Each record has order many vocabulary ids.\n" + "The vocabulary file contains the words delimited by NULL in order of id.\n" + "The vocabulary file may not be compressed because it is mmapped but the counts\n" + "file can be compressed.\n"; + return 1; + } + util::ReadCompressed counts(util::OpenReadOrThrow(argv[1])); + util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2])); + lm::builder::VocabReconstitute vocab(vocab_file.get()); + unsigned int order = boost::lexical_cast<unsigned int>(argv[3]); + std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t)); + while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) { + UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size()); + const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin()); + for (const lm::WordIndex *i = words; i != words + order; ++i) { + UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?"); + std::cout << vocab.Lookup(*i) << ' '; + } + // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FakeOFStream. + std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n'; + } +} diff --git a/lm/builder/print.cc b/lm/builder/print.cc index c70e62ed6..75f15f0a6 100644 --- a/lm/builder/print.cc +++ b/lm/builder/print.cc @@ -54,9 +54,8 @@ void PrintARPA::Run(const util::stream::ChainPositions &positions) { for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) { out << ' ' << vocab_.Lookup(*i); } - float backoff = stream->Value().complete.backoff; - if (backoff != 0.0) - out << '\t' << backoff; + if (order != positions.size()) + out << '\t' << stream->Value().complete.backoff; out << '\n'; } diff --git a/lm/ngram_query.hh b/lm/ngram_query.hh index 9e32d113a..5f330c5cc 100644 --- a/lm/ngram_query.hh +++ b/lm/ngram_query.hh @@ -36,7 +36,7 @@ struct FullPrint : public BasicPrint { "Perplexity including OOVs:\t" << ppl_including_oov << "\n" "Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n" "OOVs:\t" << corpus_oov << "\n" - "Tokenss:\t" << corpus_tokens << '\n' + "Tokens:\t" << corpus_tokens << '\n' ; } }; diff --git a/moses/LM/NeuralLMWrapper.cpp b/moses/LM/NeuralLMWrapper.cpp index 9411bd2c4..ab7b5400b 100644 --- a/moses/LM/NeuralLMWrapper.cpp +++ b/moses/LM/NeuralLMWrapper.cpp @@ -4,7 +4,6 @@ #include <boost/functional/hash.hpp> #include "NeuralLMWrapper.h" #include "neuralLM.h" -#include <model.h> using namespace std; @@ -34,7 +33,6 @@ void NeuralLMWrapper::Load() m_sentenceEndWord[m_factorType] = m_sentenceEnd; m_neuralLM_shared = new nplm::neuralLM(m_filePath, true); - m_neuralLM_shared->set_log_base(10); //TODO: config option? m_neuralLM_shared->set_cache(1000000); @@ -56,7 +54,7 @@ LMResult NeuralLMWrapper::GetValue(const vector<const Word*> &contextFactor, Sta for (size_t i=0, n=contextFactor.size(); i<n; i++) { const Word* word = contextFactor[i]; const Factor* factor = word->GetFactor(m_factorType); - const std::string string= factor->GetString().as_string(); + const std::string string = factor->GetString().as_string(); int neuralLM_wordID = m_neuralLM->lookup_word(string); words[i] = neuralLM_wordID; boost::hash_combine(hashCode, neuralLM_wordID); @@ -66,7 +64,7 @@ LMResult NeuralLMWrapper::GetValue(const vector<const Word*> &contextFactor, Sta // Create a new struct to hold the result LMResult ret; - ret.score = value; + ret.score = FloorScore(value); ret.unknown = false; (*finalState) = (State*) hashCode; |