Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/kpu/kenlm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2014-02-18 05:36:42 +0400
committer Kenneth Heafield <github@kheafield.com> 2014-02-18 05:36:42 +0400
commit d313cae3dd3de0bf51c122a1beaa3d93dd511373 (patch)
tree cd03f1fad306b81aae944e0b650217101ceebd61
parent d18074aa0576b827e7b1578ef5027daed5b5074c (diff)
Untested just count assuming more than one block to merge
-rw-r--r-- lm/builder/Jamfile 1
-rw-r--r-- lm/builder/just_count_main.cc 74
2 files changed, 75 insertions, 0 deletions
diff --git a/lm/builder/Jamfile b/lm/builder/Jamfile
index a6c56fe..d627653 100644
--- a/lm/builder/Jamfile
+++ b/lm/builder/Jamfile
@@ -4,6 +4,7 @@ fakelib builder : [ glob *.cc : *test.cc *main.cc ]
exe lmplz : lmplz_main.cc builder /top//boost_program_options ;
#exe make_trie : make_trie_main.cc builder ;
+exe just_count : just_count_main.cc builder /top//boost_program_options ;
import testing ;
unit-test corpus_count_test : corpus_count_test.cc builder /top//boost_unit_test_framework ;
diff --git a/lm/builder/just_count_main.cc b/lm/builder/just_count_main.cc
new file mode 100644
index 0000000..55d2143
--- /dev/null
+++ b/lm/builder/just_count_main.cc
@@ -0,0 +1,74 @@
+#include "lm/builder/corpus_count.hh"
+#include "lm/builder/sort.hh"
+#include "util/stream/chain.hh"
+#include "util/stream/io.hh"
+#include "util/stream/sort.hh"
+#include "util/file.hh"
+#include "util/file_piece.hh"
+#include "util/usage.hh"
+
+#include <boost/program_options.hpp>
+
+#include <exception>
+#include <iostream>
+#include <string>
+
+namespace {
+// Functor used as an option notifier: every call hands the option's string
+// value to util::ParseSize and stores the result in the std::size_t that was
+// supplied at construction.
+class SizeNotify {
+  public:
+    // Binds the functor to caller-owned storage; only a reference is kept.
+    SizeNotify(std::size_t &out) : target_(out) {}
+
+    // Overwrites the bound std::size_t with util::ParseSize(text).
+    void operator()(const std::string &text) {
+      target_ = util::ParseSize(text);
+    }
+
+  private:
+    // Destination written by operator(); must remain valid while the functor is in use.
+    std::size_t &target_;
+};
+
+// Creates a string-valued program option whose notifier is a SizeNotify bound
+// to `to` and whose default is `default_value`.
+// Returns the typed_value pointer obtained from boost::program_options::value.
+boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
+  namespace po = boost::program_options;
+  po::typed_value<std::string> *option = po::value<std::string>();
+  // Same calls, in the same order, as the chained form notifier(...)->default_value(...).
+  option->notifier(SizeNotify(to));
+  option->default_value(default_value);
+  return option;
+}
+
+} // namespace
+
+// Entry point for just_count.
+//
+// Parses the command line (order, temporary file prefix, memory limit, vocab
+// file), feeds input read from file descriptor 0 through
+// lm::builder::CorpusCount, sorts the resulting records in suffix order while
+// combining them with AddCombiner, and writes the result to file descriptor 1.
+//
+// Returns 0 on success.  Returns 1 after printing a message to std::cerr when
+// the memory limit does not exceed the vocab file size or when any
+// std::exception (bad options, I/O failure, ...) is thrown.
+int main(int argc, char *argv[]) {
+  try {
+    namespace po = boost::program_options;
+    unsigned order;
+    std::size_t ram;
+    std::string temp_prefix, vocab;
+    po::options_description options("corpus count");
+    options.add_options()
+      ("order,o", po::value<unsigned>(&order)->required(), "Order")
+      ("temp_prefix,T", po::value<std::string>(&temp_prefix)->default_value("/tmp"), "Temporary file prefix")
+      ("memory,S", SizeOption(ram, "80%"), "RAM")
+      ("vocab", po::value<std::string>(&vocab)->required(), "Vocab mapping to use");
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, options), vm);
+    // notify() runs the notifiers (filling in ram) and enforces required options.
+    po::notify(vm);
+
+    util::NormalizeTempPrefix(temp_prefix);
+
+    util::scoped_fd vocab_file(util::OpenReadOrThrow(vocab.c_str()));
+
+    // The subtraction below is unsigned: without this check a vocab file at
+    // least as large as the memory limit would wrap around to a huge value.
+    const uint64_t vocab_size = util::SizeOrThrow(vocab_file.get());
+    if (vocab_size >= ram) {
+      std::cerr << "Memory limit of " << ram << " bytes must be larger than the vocab file size of "
+                << vocab_size << " bytes." << std::endl;
+      return 1;
+    }
+
+    std::size_t memory_for_chain =
+      // This much memory to work with after vocab hash table.
+      static_cast<float>(ram - vocab_size) /
+      // Solve for block size including the dedupe multiplier for one block.
+      (static_cast<float>(2) + lm::builder::CorpusCount::DedupeMultiplier(order)) *
+      // Chain likes memory expressed in terms of total memory.
+      static_cast<float>(2);
+
+    util::stream::Chain chain(util::stream::ChainConfig(lm::builder::NGram::TotalSize(order), 2, memory_for_chain));
+    // Input comes from file descriptor 0; FilePiece is handed std::cerr as its stream argument.
+    util::FilePiece f(0, NULL, &std::cerr);
+    uint64_t token_count = 0;
+    lm::WordIndex type_count = 0;
+    lm::builder::CorpusCount counter(f, vocab_file.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize(), lm::THROW_UP, false);
+    chain >> boost::ref(counter);
+
+    util::stream::SortConfig sort_config;
+    sort_config.temp_prefix = temp_prefix;
+    sort_config.buffer_size = 64 * 1024 * 1024;
+    sort_config.total_memory = ram;
+    // Inefficiently copies if there's only one block.
+    util::stream::BlockingSort(chain, sort_config, lm::builder::SuffixOrder(order), lm::builder::AddCombiner());
+    // Sorted output goes to file descriptor 1.
+    chain >> util::stream::WriteAndRecycle(1);
+  } catch (const std::exception &e) {
+    // Report the failure instead of letting it reach std::terminate.
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+  return 0;
+}