diff options
author | Kenneth Heafield <github@kheafield.com> | 2014-02-18 05:36:42 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2014-02-18 05:36:42 +0400 |
commit | d313cae3dd3de0bf51c122a1beaa3d93dd511373 (patch) | |
tree | cd03f1fad306b81aae944e0b650217101ceebd61 | |
parent | d18074aa0576b827e7b1578ef5027daed5b5074c (diff) |
Untested just count assuming more than one block to merge
-rw-r--r-- | lm/builder/Jamfile | 1 | ||||
-rw-r--r-- | lm/builder/just_count_main.cc | 103 |
2 files changed, 104 insertions, 0 deletions
diff --git a/lm/builder/Jamfile b/lm/builder/Jamfile
index a6c56fe..d627653 100644
--- a/lm/builder/Jamfile
+++ b/lm/builder/Jamfile
@@ -4,6 +4,7 @@ fakelib builder : [ glob *.cc : *test.cc *main.cc ]
 
 exe lmplz : lmplz_main.cc builder /top//boost_program_options ;
 #exe make_trie : make_trie_main.cc builder ;
+exe just_count : just_count_main.cc builder /top//boost_program_options ;
 
 import testing ;
 unit-test corpus_count_test : corpus_count_test.cc builder /top//boost_unit_test_framework ;
diff --git a/lm/builder/just_count_main.cc b/lm/builder/just_count_main.cc
new file mode 100644
index 0000000..55d2143
--- /dev/null
+++ b/lm/builder/just_count_main.cc
@@ -0,0 +1,103 @@
+#include "lm/builder/corpus_count.hh"
+#include "lm/builder/sort.hh"
+#include "util/stream/chain.hh"
+#include "util/stream/io.hh"
+#include "util/stream/sort.hh"
+#include "util/file.hh"
+#include "util/file_piece.hh"
+#include "util/usage.hh"
+
+#include <boost/program_options.hpp>
+
+#include <exception>
+#include <iostream>
+#include <string>
+
+namespace {
+// Notifier functor for boost::program_options: converts the option's string
+// with util::ParseSize and stores the result through the reference it holds.
+class SizeNotify {
+  public:
+    explicit SizeNotify(std::size_t &out) : behind_(out) {}
+
+    void operator()(const std::string &from) {
+      behind_ = util::ParseSize(from);
+    }
+
+  private:
+    std::size_t &behind_;
+};
+
+// Creates a string-typed option whose notifier writes the parsed size into
+// `to`; default_value is handed to program_options as the default text.
+boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
+  return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
+}
+
+} // namespace
+
+// Parses the command line, feeds a CorpusCount reading through FilePiece(0, ...)
+// into a stream chain, sorts it in SuffixOrder with AddCombiner, and hands the
+// result to WriteAndRecycle(1) (0 and 1 are presumably stdin/stdout fds).
+// Returns 1 on a bad command line or a vocab file that exceeds --memory.
+int main(int argc, char *argv[]) {
+  namespace po = boost::program_options;
+  unsigned order;
+  std::size_t ram;
+  std::string temp_prefix, vocab;
+  po::options_description options("corpus count");
+  options.add_options()
+    ("order,o", po::value<unsigned>(&order)->required(), "Order")
+    ("temp_prefix,T", po::value<std::string>(&temp_prefix)->default_value("/tmp"), "Temporary file prefix")
+    ("memory,S", SizeOption(ram, "80%"), "RAM")
+    ("vocab", po::value<std::string>(&vocab)->required(), "Vocab mapping to use");
+  // store/notify throw on unknown or missing required options; report that
+  // with the usage text instead of dying on an uncaught exception.
+  try {
+    po::variables_map vm;
+    po::store(po::parse_command_line(argc, argv, options), vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::cerr << e.what() << '\n' << options << std::endl;
+    return 1;
+  }
+
+  util::NormalizeTempPrefix(temp_prefix);
+
+  util::scoped_fd vocab_file(util::OpenReadOrThrow(vocab.c_str()));
+
+  // ram is unsigned, so subtracting a larger vocab size would wrap around to a
+  // huge memory budget rather than fail; reject that case explicitly.
+  const uint64_t vocab_size = util::SizeOrThrow(vocab_file.get());
+  if (vocab_size >= ram) {
+    std::cerr << "Vocab file " << vocab << " (" << vocab_size << " bytes) does not fit in the "
+              << ram << " bytes allowed by --memory" << std::endl;
+    return 1;
+  }
+
+  // Computed in double: float's 24-bit mantissa cannot hold multi-gigabyte
+  // byte counts exactly.
+  std::size_t memory_for_chain = static_cast<std::size_t>(
+      // This much memory to work with after vocab hash table.
+      static_cast<double>(ram - vocab_size) /
+      // Solve for block size including the dedupe multiplier for one block.
+      (2.0 + lm::builder::CorpusCount::DedupeMultiplier(order)) *
+      // Chain likes memory expressed in terms of total memory.
+      2.0);
+
+  util::stream::Chain chain(util::stream::ChainConfig(lm::builder::NGram::TotalSize(order), 2, memory_for_chain));
+  util::FilePiece f(0, NULL, &std::cerr);
+  uint64_t token_count = 0;
+  lm::WordIndex type_count = 0;
+  lm::builder::CorpusCount counter(f, vocab_file.get(), token_count, type_count, chain.BlockSize() / chain.EntrySize(), lm::THROW_UP, false);
+  chain >> boost::ref(counter);
+
+  util::stream::SortConfig sort_config;
+  sort_config.temp_prefix = temp_prefix;
+  sort_config.buffer_size = 64 * 1024 * 1024;
+  sort_config.total_memory = ram;
+  // Inefficiently copies if there's only one block.
+  util::stream::BlockingSort(chain, sort_config, lm::builder::SuffixOrder(order), lm::builder::AddCombiner());
+  chain >> util::stream::WriteAndRecycle(1);
+  return 0;
+}