diff options
author | Kenneth Heafield <github@kheafield.com> | 2013-05-19 18:12:06 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2013-05-19 18:12:06 +0400 |
commit | 50652382e9285740de73654a7f47a8f4a9d993a1 (patch) | |
tree | 31f37b7f09559678c3f4661290287ce39d34da39 /lm | |
parent | 41da56364565e0aa9d40cce018e5ef82f9766430 (diff) |
KenLM 10ddf7d923355b35a7de9a5219673eca9e18be98 except Hieu's slow string_piece_hash
Diffstat (limited to 'lm')
-rw-r--r-- | lm/Jamfile | 12 | ||||
-rw-r--r-- | lm/builder/corpus_count.cc | 15 | ||||
-rw-r--r-- | lm/builder/pipeline.hh | 2 | ||||
-rw-r--r-- | lm/builder/print.cc | 74 | ||||
-rw-r--r-- | lm/builder/print.hh | 3 | ||||
-rw-r--r-- | lm/filter/Jamfile | 2 | ||||
-rw-r--r-- | lm/filter/filter_main.cc | 4 | ||||
-rw-r--r-- | lm/kenlm_max_order_main.cc | 6 | ||||
-rw-r--r-- | lm/query_main.cc | 1 |
9 files changed, 24 insertions, 95 deletions
diff --git a/lm/Jamfile b/lm/Jamfile index 3f25d9ce4..eaf629fd0 100644 --- a/lm/Jamfile +++ b/lm/Jamfile @@ -21,9 +21,11 @@ run left_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ; run model_test.cc kenlm /top//boost_unit_test_framework : : test.arpa test_nounk.arpa ; run partial_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ; -exe query : query_main.cc kenlm ../util//kenutil ; -exe build_binary : build_binary_main.cc kenlm ../util//kenutil ; -exe kenlm_max_order : kenlm_max_order_main.cc : <include>.. $(max-order) ; -exe fragment : fragment_main.cc kenlm ; +exes = ; +for local p in [ glob *_main.cc ] { + local name = [ MATCH "(.*)\_main.cc" : $(p) ] ; + exe $(name) : $(p) kenlm ; + exes += $(name) ; +} -alias programs : query build_binary kenlm_max_order fragment filter//filter : <threading>multi:<source>builder//lmplz ; +alias programs : $(exes) filter//filter : <threading>multi:<source>builder//lmplz ; diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc index 3714dddad..aea93ad10 100644 --- a/lm/builder/corpus_count.cc +++ b/lm/builder/corpus_count.cc @@ -3,6 +3,7 @@ #include "lm/builder/ngram.hh" #include "lm/lm_exception.hh" #include "lm/word_index.hh" +#include "util/fake_ofstream.hh" #include "util/file.hh" #include "util/file_piece.hh" #include "util/murmur_hash.hh" @@ -48,10 +49,8 @@ class VocabHandout { explicit VocabHandout(int fd, WordIndex initial_guess) : table_backing_(util::CallocOrThrow(MemUsage(initial_guess))), table_(table_backing_.get(), MemUsage(initial_guess)), - double_cutoff_(std::max<std::size_t>(initial_guess * 1.1, 1)) { - util::scoped_fd duped(util::DupOrThrow(fd)); - word_list_.reset(util::FDOpenOrThrow(duped)); - + double_cutoff_(std::max<std::size_t>(initial_guess * 1.1, 1)), + word_list_(fd) { Lookup("<unk>"); // Force 0 Lookup("<s>"); // Force 1 Lookup("</s>"); // Force 2 @@ -65,9 +64,7 @@ class VocabHandout { Table::MutableIterator it; if (table_.FindOrInsert(entry, it)) return it->value; - char null_delimit = 0; - util::WriteOrThrow(word_list_.get(), word.data(), word.size()); - util::WriteOrThrow(word_list_.get(), &null_delimit, 1); + word_list_ << word << '\0'; UTIL_THROW_IF(Size() >= std::numeric_limits<lm::WordIndex>::max(), VocabLoadException, "Too many vocabulary words. Change WordIndex to uint64_t in lm/word_index.hh."); if (Size() >= double_cutoff_) { table_backing_.call_realloc(table_.DoubleTo()); @@ -90,8 +87,8 @@ class VocabHandout { Table table_; std::size_t double_cutoff_; - - util::scoped_FILE word_list_; + + util::FakeOFStream word_list_; }; class DedupeHash : public std::unary_function<const WordIndex *, bool> { diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh index fc3314bf1..845e5481d 100644 --- a/lm/builder/pipeline.hh +++ b/lm/builder/pipeline.hh @@ -34,7 +34,7 @@ struct PipelineConfig { std::size_t TotalMemory() const { return sort.total_memory; } }; -// Takes ownership of text_file. +// Takes ownership of text_file and out_arpa. void Pipeline(PipelineConfig config, int text_file, int out_arpa); }} // namespaces diff --git a/lm/builder/print.cc b/lm/builder/print.cc index b0323221a..84bd81cad 100644 --- a/lm/builder/print.cc +++ b/lm/builder/print.cc @@ -1,15 +1,11 @@ #include "lm/builder/print.hh" -#include "util/double-conversion/double-conversion.h" -#include "util/double-conversion/utils.h" +#include "util/fake_ofstream.hh" #include "util/file.hh" #include "util/mmap.hh" #include "util/scoped.hh" #include "util/stream/timer.hh" -#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE -#include <boost/lexical_cast.hpp> - #include <sstream> #include <string.h> @@ -28,71 +24,6 @@ VocabReconstitute::VocabReconstitute(int fd) { map_.push_back(i); } -namespace { -class OutputManager { - public: - static const std::size_t kOutBuf = 1048576; - - // Does not take ownership of out. - explicit OutputManager(int out) - : buf_(util::MallocOrThrow(kOutBuf)), - builder_(static_cast<char*>(buf_.get()), kOutBuf), - // Mostly the default but with inf instead. And no flags. - convert_(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0), - fd_(out) {} - - ~OutputManager() { - Flush(); - } - - OutputManager &operator<<(float value) { - // Odd, but this is the largest number found in the comments. - EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8); - convert_.ToShortestSingle(value, &builder_); - return *this; - } - - OutputManager &operator<<(StringPiece str) { - if (str.size() > kOutBuf) { - Flush(); - util::WriteOrThrow(fd_, str.data(), str.size()); - } else { - EnsureRemaining(str.size()); - builder_.AddSubstring(str.data(), str.size()); - } - return *this; - } - - // Inefficient! - OutputManager &operator<<(unsigned val) { - return *this << boost::lexical_cast<std::string>(val); - } - - OutputManager &operator<<(char c) { - EnsureRemaining(1); - builder_.AddCharacter(c); - return *this; - } - - void Flush() { - util::WriteOrThrow(fd_, buf_.get(), builder_.position()); - builder_.Reset(); - } - - private: - void EnsureRemaining(std::size_t amount) { - if (static_cast<std::size_t>(builder_.size() - builder_.position()) < amount) { - Flush(); - } - } - - util::scoped_malloc buf_; - double_conversion::StringBuilder builder_; - double_conversion::DoubleToStringConverter convert_; - int fd_; -}; -} // namespace - PrintARPA::PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd) : vocab_(vocab), out_fd_(out_fd) { std::stringstream stream; @@ -112,8 +43,9 @@ PrintARPA::PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> } void PrintARPA::Run(const ChainPositions &positions) { + util::scoped_fd closer(out_fd_); UTIL_TIMER("(%w s) Wrote ARPA file\n"); - OutputManager out(out_fd_); + util::FakeOFStream out(out_fd_); for (unsigned order = 1; order <= positions.size(); ++order) { out << "\\" << order << "-grams:" << '\n'; for (NGramStream stream(positions[order - 1]); stream; ++stream) { diff --git a/lm/builder/print.hh b/lm/builder/print.hh index aa932e757..adbbb94a9 100644 --- a/lm/builder/print.hh +++ b/lm/builder/print.hh @@ -88,7 +88,8 @@ template <class V> class Print { class PrintARPA { public: - // header_info may be NULL to disable the header + // header_info may be NULL to disable the header. + // Takes ownership of out_fd upon Run(). explicit PrintARPA(const VocabReconstitute &vocab, const std::vector<uint64_t> &counts, const HeaderInfo* header_info, int out_fd); void Run(const ChainPositions &positions); diff --git a/lm/filter/Jamfile b/lm/filter/Jamfile index adee3d233..bcf62da78 100644 --- a/lm/filter/Jamfile +++ b/lm/filter/Jamfile @@ -3,3 +3,5 @@ fakelib lm_filter : phrase.cc vocab.cc arpa_io.cc ../../util//kenutil : <threadi obj main : filter_main.cc : <threading>single:<define>NTHREAD <include>../.. ; exe filter : main lm_filter ../../util//kenutil ..//kenlm : <threading>multi:<library>/top//boost_thread ; + +exe phrase_table_vocab : phrase_table_vocab_main.cc ../../util//kenutil ; diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc index 1a4ba84fe..1736bc405 100644 --- a/lm/filter/filter_main.cc +++ b/lm/filter/filter_main.cc @@ -25,8 +25,8 @@ void DisplayHelp(const char *name) { " parser.\n" "single mode treats the entire input as a single sentence.\n" "multiple mode filters to multiple sentences in parallel. Each sentence is on\n" - " a separate line. A separate file is created for each file by appending the\n" - " 0-indexed line number to the output file name.\n" + " a separate line. A separate file is created for each sentence by appending\n" + " the 0-indexed line number to the output file name.\n" "union mode produces one filtered model that is the union of models created by\n" " multiple mode.\n\n" "context means only the context (all but last word) has to pass the filter, but\n" diff --git a/lm/kenlm_max_order_main.cc b/lm/kenlm_max_order_main.cc deleted file mode 100644 index 94221201c..000000000 --- a/lm/kenlm_max_order_main.cc +++ /dev/null @@ -1,6 +0,0 @@ -#include "lm/max_order.hh" -#include <iostream> - -int main(int argc, char *argv[]) { - std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl; -} diff --git a/lm/query_main.cc b/lm/query_main.cc index 49757d9aa..27d3a1a56 100644 --- a/lm/query_main.cc +++ b/lm/query_main.cc @@ -2,6 +2,7 @@ int main(int argc, char *argv[]) { if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) { + std::cerr << "KenLM was compiled with maximum order " << KENLM_MAX_ORDER << "." << std::endl; std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl; std::cerr << "Input is wrapped in <s> and </s> unless null is passed." << std::endl; return 1; |