github.com/moses-smt/mosesdecoder.git
path: root/lm
author     Kenneth Heafield <github@kheafield.com>   2013-05-19 18:12:06 +0400
committer  Kenneth Heafield <github@kheafield.com>   2013-05-19 18:12:06 +0400
commit     50652382e9285740de73654a7f47a8f4a9d993a1 (patch)
tree       31f37b7f09559678c3f4661290287ce39d34da39 /lm
parent     41da56364565e0aa9d40cce018e5ef82f9766430 (diff)
KenLM 10ddf7d923355b35a7de9a5219673eca9e18be98 except Hieu's slow string_piece_hash
Diffstat (limited to 'lm')
-rw-r--r--  lm/Jamfile                   12
-rw-r--r--  lm/builder/corpus_count.cc   15
-rw-r--r--  lm/builder/pipeline.hh        2
-rw-r--r--  lm/builder/print.cc          74
-rw-r--r--  lm/builder/print.hh           3
-rw-r--r--  lm/filter/Jamfile             2
-rw-r--r--  lm/filter/filter_main.cc      4
-rw-r--r--  lm/kenlm_max_order_main.cc    6
-rw-r--r--  lm/query_main.cc              1
9 files changed, 24 insertions(+), 95 deletions(-)
diff --git a/lm/Jamfile b/lm/Jamfile
index 3f25d9ce4..eaf629fd0 100644
--- a/lm/Jamfile
+++ b/lm/Jamfile
@@ -21,9 +21,11 @@ run left_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;
run model_test.cc kenlm /top//boost_unit_test_framework : : test.arpa test_nounk.arpa ;
run partial_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ;
-exe query : query_main.cc kenlm ../util//kenutil ;
-exe build_binary : build_binary_main.cc kenlm ../util//kenutil ;
-exe kenlm_max_order : kenlm_max_order_main.cc : <include>.. $(max-order) ;
-exe fragment : fragment_main.cc kenlm ;
+exes = ;
+for local p in [ glob *_main.cc ] {
+ local name = [ MATCH "(.*)\_main.cc" : $(p) ] ;
+ exe $(name) : $(p) kenlm ;
+ exes += $(name) ;
+}
-alias programs : query build_binary kenlm_max_order fragment filter//filter : <threading>multi:<source>builder//lmplz ;
+alias programs : $(exes) filter//filter : <threading>multi:<source>builder//lmplz ;
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index 3714dddad..aea93ad10 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -3,6 +3,7 @@
#include "lm/builder/ngram.hh"
#include "lm/lm_exception.hh"
#include "lm/word_index.hh"
+#include "util/fake_ofstream.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
#include "util/murmur_hash.hh"
@@ -48,10 +49,8 @@ class VocabHandout {
explicit VocabHandout(int fd, WordIndex initial_guess) :
table_backing_(util::CallocOrThrow(MemUsage(initial_guess))),
table_(table_backing_.get(), MemUsage(initial_guess)),
- double_cutoff_(std::max<std::size_t>(initial_guess * 1.1, 1)) {
- util::scoped_fd duped(util::DupOrThrow(fd));
- word_list_.reset(util::FDOpenOrThrow(duped));
-
+ double_cutoff_(std::max<std::size_t>(initial_guess * 1.1, 1)),
+ word_list_(fd) {
Lookup("<unk>"); // Force 0
Lookup("<s>"); // Force 1
Lookup("</s>"); // Force 2
@@ -65,9 +64,7 @@ class VocabHandout {
Table::MutableIterator it;
if (table_.FindOrInsert(entry, it))
return it->value;
- char null_delimit = 0;
- util::WriteOrThrow(word_list_.get(), word.data(), word.size());
- util::WriteOrThrow(word_list_.get(), &null_delimit, 1);
+ word_list_ << word << '\0';
UTIL_THROW_IF(Size() >= std::numeric_limits<lm::WordIndex>::max(), VocabLoadException, "Too many vocabulary words. Change WordIndex to uint64_t in lm/word_index.hh.");
if (Size() >= double_cutoff_) {
table_backing_.call_realloc(table_.DoubleTo());
@@ -90,8 +87,8 @@ class VocabHandout {
Table table_;
std::size_t double_cutoff_;
-
- util::scoped_FILE word_list_;
+
+ util::FakeOFStream word_list_;
};
class DedupeHash : public std::unary_function<const WordIndex *, bool> {
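
The corpus_count.cc hunks above drop the dup()+fdopen() FILE* wrapper and its two WriteOrThrow() calls per vocabulary word (word bytes, then a NUL delimiter) in favor of KenLM's buffered util::FakeOFStream, constructed straight from the descriptor it is handed. For illustration only, a minimal buffered writer in the same spirit; every name below is a stand-in, not KenLM's actual util::FakeOFStream interface:

#include <unistd.h>    // write()
#include <cstddef>
#include <cstring>     // std::memcpy
#include <stdexcept>
#include <string>

// Sketch of a buffered writer over a raw file descriptor: bytes pile up in a
// fixed-size buffer and go out in one write() call per flush instead of one
// call per token.
class BufferedFDWriter {
 public:
  explicit BufferedFDWriter(int fd, std::size_t buffer_size = 1 << 20)
      : fd_(fd), buf_(buffer_size, '\0'), used_(0) {}

  ~BufferedFDWriter() {
    try { Flush(); } catch (...) {}  // never throw from a destructor
  }

  BufferedFDWriter &operator<<(const std::string &str) {
    Append(str.data(), str.size());
    return *this;
  }

  BufferedFDWriter &operator<<(char c) {
    Append(&c, 1);
    return *this;
  }

  void Flush() {
    WriteAll(buf_.data(), used_);
    used_ = 0;
  }

 private:
  void Append(const char *data, std::size_t size) {
    if (used_ + size > buf_.size()) Flush();
    if (size > buf_.size()) { WriteAll(data, size); return; }  // oversized payload: bypass the buffer
    std::memcpy(&buf_[used_], data, size);
    used_ += size;
  }

  void WriteAll(const char *data, std::size_t size) {
    for (std::size_t off = 0; off < size;) {
      ssize_t ret = write(fd_, data + off, size - off);
      if (ret < 0) throw std::runtime_error("write failed");
      off += static_cast<std::size_t>(ret);
    }
  }

  int fd_;
  std::string buf_;
  std::size_t used_;
};

With a writer shaped like this, the common-case cost of word_list_ << word << '\0' is an in-buffer copy, with one large write() at flush time.
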
diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh
index fc3314bf1..845e5481d 100644
--- a/lm/builder/pipeline.hh
+++ b/lm/builder/pipeline.hh
@@ -34,7 +34,7 @@ struct PipelineConfig {
std::size_t TotalMemory() const { return sort.total_memory; }
};
-// Takes ownership of text_file.
+// Takes ownership of text_file and out_arpa.
void Pipeline(PipelineConfig config, int text_file, int out_arpa);
}} // namespaces
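
The new comment on Pipeline() records that the callee now owns both descriptors, which lines up with the util::scoped_fd closer added to PrintARPA::Run in the next file. The ownership idiom is plain RAII over close(); a minimal stand-in version (not KenLM's util::scoped_fd, whose exact interface this diff does not show) looks like:

#include <unistd.h>  // close()

// Stand-in RAII holder for a POSIX file descriptor: whoever holds the object
// owns the descriptor and closes it exactly once, even on early return or throw.
class ScopedFd {
 public:
  explicit ScopedFd(int fd) : fd_(fd) {}
  ~ScopedFd() { if (fd_ >= 0) close(fd_); }
  ScopedFd(const ScopedFd &) = delete;
  ScopedFd &operator=(const ScopedFd &) = delete;
  int get() const { return fd_; }
 private:
  int fd_;
};

Documenting "takes ownership" matters here because a caller that also closes out_arpa after Pipeline() returns would now be double-closing it.
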
diff --git a/lm/builder/print.cc b/lm/builder/print.cc
index b0323221a..84bd81cad 100644
--- a/lm/builder/print.cc
+++ b/lm/builder/print.cc
@@ -1,15 +1,11 @@
#include "lm/builder/print.hh"
-#include "util/double-conversion/double-conversion.h"
-#include "util/double-conversion/utils.h"
+#include "util/fake_ofstream.hh"
#include "util/file.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"
#include "util/stream/timer.hh"
-#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
-#include <boost/lexical_cast.hpp>
-
#include <sstream>
#include <string.h>
@@ -28,71 +24,6 @@ VocabReconstitute::VocabReconstitute(int fd) {
map_.push_back(i);
}
-namespace {
-class OutputManager {
- public:
- static const std::size_t kOutBuf = 1048576;
-
- // Does not take ownership of out.
- explicit OutputManager(int out)
- : buf_(util::MallocOrThrow(kOutBuf)),
- builder_(static_cast<char*>(buf_.get()), kOutBuf),
- // Mostly the default but with inf instead. And no flags.
- convert_(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0),
- fd_(out) {}
-
- ~OutputManager() {
- Flush();
- }
-
- OutputManager &operator<<(float value) {
- // Odd, but this is the largest number found in the comments.
- EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8);
- convert_.ToShortestSingle(value, &builder_);
- return *this;
- }
-
- OutputManager &operator<<(StringPiece str) {
- if (str.size() > kOutBuf) {
- Flush();
- util::WriteOrThrow(fd_, str.data(), str.size());
- } else {
- EnsureRemaining(str.size());
- builder_.AddSubstring(str.data(), str.size());
- }
- return *this;
- }
-
- // Inefficient!
- OutputManager &operator<<(unsigned val) {
- return *this << boost::lexical_cast<std::string>(val);
- }
-
- OutputManager &operator<<(char c) {
- EnsureRemaining(1);
- builder_.AddCharacter(c);
- return *this;
- }
-
- void Flush() {
- util::WriteOrThrow(fd_, buf_.get(), builder_.position());
- builder_.Reset();
- }
-
- private:
- void EnsureRemaining(std::size_t amount) {
- if (static_cast<std::size_t>(builder_.size() - builder_.position()) < amount) {
- Flush();
- }
- }
-
- util::scoped_malloc buf_;
- double_conversion::StringBuilder builder_;
- double_conversion::DoubleToStringConverter convert_;
- int fd_;
-};
-} // namespace
-
PrintARPA::PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd)
: vocab_(vocab), out_fd_(out_fd) {
std::stringstream stream;
@@ -112,8 +43,9 @@ PrintARPA::PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t>
}
void PrintARPA::Run(const ChainPositions &positions) {
+ util::scoped_fd closer(out_fd_);
UTIL_TIMER("(%w s) Wrote ARPA file\n");
- OutputManager out(out_fd_);
+ util::FakeOFStream out(out_fd_);
for (unsigned order = 1; order <= positions.size(); ++order) {
out << "\\" << order << "-grams:" << '\n';
for (NGramStream stream(positions[order - 1]); stream; ++stream) {
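
The print.cc change deletes the file-local OutputManager, whose main job was fast float-to-text conversion through double_conversion::DoubleToStringConverter::ToShortestSingle plus its own flush buffer, and routes the ARPA output through util::FakeOFStream instead, which presumably takes over that formatting. As an aside, shortest round-trip float formatting of the kind OutputManager provided can be sketched in standard C++17; this is illustrative only, not the code KenLM uses:

#include <charconv>   // std::to_chars (C++17): shortest round-trip formatting
#include <cstdio>
#include <string>

// Format a float with the fewest digits that still parse back to the same
// value, roughly what double_conversion's ToShortestSingle offered.
std::string ShortestFloat(float value) {
  char buf[64];
  std::to_chars_result res = std::to_chars(buf, buf + sizeof(buf), value);
  return std::string(buf, res.ptr);
}

int main() {
  std::printf("%s\n", ShortestFloat(-0.30103f).c_str());  // e.g. prints -0.30103
  return 0;
}
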
diff --git a/lm/builder/print.hh b/lm/builder/print.hh
index aa932e757..adbbb94a9 100644
--- a/lm/builder/print.hh
+++ b/lm/builder/print.hh
@@ -88,7 +88,8 @@ template <class V> class Print {
class PrintARPA {
public:
- // header_info may be NULL to disable the header
+ // header_info may be NULL to disable the header.
+ // Takes ownership of out_fd upon Run().
explicit PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd);
void Run(const ChainPositions &positions);
diff --git a/lm/filter/Jamfile b/lm/filter/Jamfile
index adee3d233..bcf62da78 100644
--- a/lm/filter/Jamfile
+++ b/lm/filter/Jamfile
@@ -3,3 +3,5 @@ fakelib lm_filter : phrase.cc vocab.cc arpa_io.cc ../../util//kenutil : <threadi
obj main : filter_main.cc : <threading>single:<define>NTHREAD <include>../.. ;
exe filter : main lm_filter ../../util//kenutil ..//kenlm : <threading>multi:<library>/top//boost_thread ;
+
+exe phrase_table_vocab : phrase_table_vocab_main.cc ../../util//kenutil ;
diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc
index 1a4ba84fe..1736bc405 100644
--- a/lm/filter/filter_main.cc
+++ b/lm/filter/filter_main.cc
@@ -25,8 +25,8 @@ void DisplayHelp(const char *name) {
" parser.\n"
"single mode treats the entire input as a single sentence.\n"
"multiple mode filters to multiple sentences in parallel. Each sentence is on\n"
- " a separate line. A separate file is created for each file by appending the\n"
- " 0-indexed line number to the output file name.\n"
+ " a separate line. A separate file is created for each sentence by appending\n"
+ " the 0-indexed line number to the output file name.\n"
"union mode produces one filtered model that is the union of models created by\n"
" multiple mode.\n\n"
"context means only the context (all but last word) has to pass the filter, but\n"
diff --git a/lm/kenlm_max_order_main.cc b/lm/kenlm_max_order_main.cc
deleted file mode 100644
index 94221201c..000000000
--- a/lm/kenlm_max_order_main.cc
+++ /dev/null
@@ -1,6 +0,0 @@
-#include "lm/max_order.hh"
-#include <iostream>
-
-int main(int argc, char *argv[]) {
- std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl;
-}
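
kenlm_max_order_main.cc is removed outright: its only purpose was to echo the compile-time KENLM_MAX_ORDER constant, and the query_main.cc hunk below folds that line into the query tool's usage message instead. The pattern is simply streaming a preprocessor constant into a diagnostic; a self-contained sketch (the fallback value is a stand-in so the sketch compiles alone, not necessarily KenLM's build setting):

#include <iostream>

// KENLM_MAX_ORDER normally comes from lm/max_order.hh / the build
// configuration; the fallback below exists only for this standalone sketch.
#ifndef KENLM_MAX_ORDER
#define KENLM_MAX_ORDER 6
#endif

int main() {
  std::cerr << "KenLM was compiled with maximum order " << KENLM_MAX_ORDER
            << "." << std::endl;
  return 0;
}
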
diff --git a/lm/query_main.cc b/lm/query_main.cc
index 49757d9aa..27d3a1a56 100644
--- a/lm/query_main.cc
+++ b/lm/query_main.cc
@@ -2,6 +2,7 @@
int main(int argc, char *argv[]) {
if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) {
+ std::cerr << "KenLM was compiled with maximum order " << KENLM_MAX_ORDER << "." << std::endl;
std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl;
std::cerr << "Input is wrapped in <s> and </s> unless null is passed." << std::endl;
return 1;