github.com/moses-smt/mosesdecoder.git
path: root/lm
author    Kenneth Heafield <github@kheafield.com>  2015-08-27 12:55:52 +0300
committer Kenneth Heafield <github@kheafield.com>  2015-08-27 12:55:52 +0300
commit    09ecd071f9571a3808fc43700186fd777564785b (patch)
tree      98236edb11f43a01980fe2ebaf397bf3ac344b53 /lm
parent    380b5a5dfd99a7df4ea8f270abf72debe5e9cb6e (diff)
KenLM 2a3e8fae3633c890cb3b342d461f9130c8e343fa excluding unfinished interpolation directory
Diffstat (limited to 'lm')
-rw-r--r-- lm/CMakeLists.txt | 183
-rw-r--r-- lm/builder/CMakeLists.txt | 87
-rw-r--r-- lm/builder/corpus_count.cc | 3
-rw-r--r-- lm/builder/corpus_count_test.cc | 5
-rw-r--r-- lm/builder/debug_print.hh (renamed from lm/builder/print.hh) | 61
-rw-r--r-- lm/builder/dump_counts_main.cc | 4
-rw-r--r-- lm/builder/header_info.hh | 4
-rw-r--r-- lm/builder/initial_probabilities.cc | 2
-rw-r--r-- lm/builder/initial_probabilities.hh | 3
-rw-r--r-- lm/builder/interpolate.cc | 11
-rw-r--r-- lm/builder/interpolate.hh | 2
-rw-r--r-- lm/builder/lmplz_main.cc | 35
-rw-r--r-- lm/builder/output.cc | 27
-rw-r--r-- lm/builder/output.hh | 48
-rw-r--r-- lm/builder/pipeline.cc | 35
-rw-r--r-- lm/builder/pipeline.hh | 1
-rw-r--r-- lm/builder/print.cc | 64
-rw-r--r-- lm/common/CMakeLists.txt | 40
-rw-r--r-- lm/common/Jamfile | 2
-rw-r--r-- lm/common/joint_order.hh (renamed from lm/builder/joint_order.hh) | 29
-rw-r--r-- lm/common/model_buffer.cc | 49
-rw-r--r-- lm/common/model_buffer.hh | 46
-rw-r--r-- lm/common/ngram.hh | 5
-rw-r--r-- lm/common/ngram_stream.hh | 41
-rw-r--r-- lm/common/print.cc | 62
-rw-r--r-- lm/common/print.hh | 58
-rw-r--r-- lm/common/size_option.cc | 24
-rw-r--r-- lm/common/size_option.hh | 11
-rw-r--r-- lm/common/special.hh (renamed from lm/builder/special.hh) | 10
-rw-r--r-- lm/filter/CMakeLists.txt | 62
30 files changed, 677 insertions, 337 deletions
diff --git a/lm/CMakeLists.txt b/lm/CMakeLists.txt
index 62de6f0b5..195fc730c 100644
--- a/lm/CMakeLists.txt
+++ b/lm/CMakeLists.txt
@@ -1,46 +1,139 @@
+cmake_minimum_required(VERSION 2.8.8)
+#
+# The KenLM cmake files make use of add_library(... OBJECTS ...)
+#
+# This syntax allows grouping of source files when compiling
+# (effectively creating "fake" libraries based on source subdirs).
+#
+# This syntax was only added in cmake version 2.8.8
+#
+# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
+
+
+# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
+
+
+set(KENLM_MAX_ORDER 6)
+
+add_definitions(-DKENLM_MAX_ORDER=${KENLM_MAX_ORDER})
+
+
+# Explicitly list the source files for this subdirectory
+#
+# If you add any source files to this subdirectory
+# that should be included in the kenlm library,
+# (this excludes any unit test files)
+# you should add them to the following list:
+set(KENLM_SOURCE
+ bhiksha.cc
+ binary_format.cc
+ config.cc
+ lm_exception.cc
+ model.cc
+ quantize.cc
+ read_arpa.cc
+ search_hashed.cc
+ search_trie.cc
+ sizes.cc
+ trie.cc
+ trie_sort.cc
+ value_build.cc
+ virtual_interface.cc
+ vocab.cc
+)
+
+
+# Group these objects together for later use.
+#
+# Given add_library(foo OBJECT ${my_foo_sources}),
+# refer to these objects as $<TARGET_OBJECTS:foo>
+#
+add_library(kenlm OBJECT ${KENLM_SOURCE})
+
+# This directory has children that need to be processed
+add_subdirectory(builder)
+add_subdirectory(common)
+add_subdirectory(filter)
+
+
+
+# Explicitly list the executable files to be compiled
+set(EXE_LIST
+ query
+ fragment
+ build_binary
+)
+
+# Iterate through the executable list
+foreach(exe ${EXE_LIST})
+
+ # Compile the executable, linking against the requisite dependent object files
+ add_executable(${exe} ${exe}_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>)
+
+ # Link the executable against boost
+ target_link_libraries(${exe} ${Boost_LIBRARIES})
+
+ # Group executables together
+ set_target_properties(${exe} PROPERTIES FOLDER executables)
+
+# End for loop
+endforeach(exe)
+
+
+# Install the executable files
+install(TARGETS ${EXE_LIST} DESTINATION bin)
+
+
+
+if(BUILD_TESTING)
+
+ # Explicitly list the Boost test files to be compiled
+ set(KENLM_BOOST_TESTS_LIST
+ left_test
+ model_test
+ partial_test
+ )
+
+ # Iterate through the Boost tests list
+ foreach(test ${KENLM_BOOST_TESTS_LIST})
+
+ # Compile the executable, linking against the requisite dependent object files
+ add_executable(${test} ${test}.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_util>)
+
+ # Require the following compile flag
+ set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
+
+ # Link the executable against boost
+ target_link_libraries(${test} ${Boost_LIBRARIES})
+
+ # model_test requires an extra command line parameter
+ if ("${test}" STREQUAL "model_test")
+ set(test_params
+ ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
+ ${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa
+ )
+ else()
+ set(test_params
+ ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa
+ )
+ endif()
+
+ # Specify command arguments for how to run each unit test
+ #
+ # Assuming that foo was defined via add_executable(foo ...),
+ # the syntax $<TARGET_FILE:foo> gives the full path to the executable.
+ #
+ add_test(NAME ${test}_test
+ COMMAND $<TARGET_FILE:${test}> ${test_params})
+
+ # Group unit tests together
+ set_target_properties(${test} PROPERTIES FOLDER "unit_tests")
+
+ # End for loop
+ endforeach(test)
+
+endif()
+
+
+
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/bhiksha.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/bhiksha.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/binary_format.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/binary_format.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/blank.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/config.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/config.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/enumerate_vocab.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/facade.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/left.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/lm_exception.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/lm_exception.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/max_order.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/model.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/model.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/model_type.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/ngram_query.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/partial.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/quantize.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/quantize.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/read_arpa.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/read_arpa.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/return.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/search_hashed.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/search_hashed.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/search_trie.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/search_trie.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/sizes.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/sizes.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/state.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/trie.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/trie.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/trie_sort.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/trie_sort.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/value.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/value_build.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/value_build.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/virtual_interface.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/virtual_interface.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/vocab.cc")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/vocab.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/weights.hh")
-list(APPEND SOURCE_KENLM "${CMAKE_CURRENT_SOURCE_DIR}/word_index.hh")
-
-add_library(kenlm OBJECT ${SOURCE_KENLM})
\ No newline at end of file
diff --git a/lm/builder/CMakeLists.txt b/lm/builder/CMakeLists.txt
new file mode 100644
index 000000000..d84a7f7da
--- /dev/null
+++ b/lm/builder/CMakeLists.txt
@@ -0,0 +1,87 @@
+cmake_minimum_required(VERSION 2.8.8)
+#
+# The KenLM cmake files make use of add_library(... OBJECTS ...)
+#
+# This syntax allows grouping of source files when compiling
+# (effectively creating "fake" libraries based on source subdirs).
+#
+# This syntax was only added in cmake version 2.8.8
+#
+# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
+
+
+# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
+
+# Explicitly list the source files for this subdirectory
+#
+# If you add any source files to this subdirectory
+# that should be included in the kenlm library,
+# (this excludes any unit test files)
+# you should add them to the following list:
+#
+# In order to set correct paths to these files
+# in case this variable is referenced by CMake files in the parent directory,
+# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
+#
+set(KENLM_BUILDER_SOURCE
+ ${CMAKE_CURRENT_SOURCE_DIR}/adjust_counts.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/corpus_count.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/initial_probabilities.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/interpolate.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/output.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cc
+ )
+
+
+# Group these objects together for later use.
+#
+# Given add_library(foo OBJECT ${my_foo_sources}),
+# refer to these objects as $<TARGET_OBJECTS:foo>
+#
+add_library(kenlm_builder OBJECT ${KENLM_BUILDER_SOURCE})
+
+
+# Compile the executable, linking against the requisite dependent object files
+add_executable(lmplz lmplz_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_common> $<TARGET_OBJECTS:kenlm_builder> $<TARGET_OBJECTS:kenlm_util>)
+
+# Link the executable against boost
+target_link_libraries(lmplz ${Boost_LIBRARIES})
+
+# Group executables together
+set_target_properties(lmplz PROPERTIES FOLDER executables)
+
+if(BUILD_TESTING)
+
+ # Explicitly list the Boost test files to be compiled
+ set(KENLM_BOOST_TESTS_LIST
+ adjust_counts_test
+ corpus_count_test
+ )
+
+ # Iterate through the Boost tests list
+ foreach(test ${KENLM_BOOST_TESTS_LIST})
+
+ # Compile the executable, linking against the requisite dependent object files
+ add_executable(${test} ${test}.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_common> $<TARGET_OBJECTS:kenlm_builder> $<TARGET_OBJECTS:kenlm_util>)
+
+ # Require the following compile flag
+ set_target_properties(${test} PROPERTIES COMPILE_FLAGS "-DBOOST_TEST_DYN_LINK -DBOOST_PROGRAM_OPTIONS_DYN_LINK")
+
+ # Link the executable against boost
+ target_link_libraries(${test} ${Boost_LIBRARIES})
+
+ # Specify command arguments for how to run each unit test
+ #
+ # Assuming that foo was defined via add_executable(foo ...),
+ # the syntax $<TARGET_FILE:foo> gives the full path to the executable.
+ #
+ add_test(NAME ${test}_test
+ COMMAND $<TARGET_FILE:${test}>)
+
+ # Group unit tests together
+ set_target_properties(${test} PROPERTIES FOLDER "unit_tests")
+
+ # End for loop
+ endforeach(test)
+
+endif()
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index 9f23b28a8..04815d805 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -15,9 +15,6 @@
#include "util/stream/timer.hh"
#include "util/tokenize_piece.hh"
-#include <boost/unordered_set.hpp>
-#include <boost/unordered_map.hpp>
-
#include <functional>
#include <stdint.h>
diff --git a/lm/builder/corpus_count_test.cc b/lm/builder/corpus_count_test.cc
index 82f859690..88bcf9657 100644
--- a/lm/builder/corpus_count_test.cc
+++ b/lm/builder/corpus_count_test.cc
@@ -43,12 +43,13 @@ BOOST_AUTO_TEST_CASE(Short) {
util::scoped_fd vocab(util::MakeTemp("corpus_count_test_vocab"));
util::stream::Chain chain(config);
- NGramStream<BuildingPayload> stream;
uint64_t token_count;
WordIndex type_count = 10;
std::vector<bool> prune_words;
CorpusCount counter(input_piece, vocab.get(), token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT);
- chain >> boost::ref(counter) >> stream >> util::stream::kRecycle;
+ chain >> boost::ref(counter);
+ NGramStream<BuildingPayload> stream(chain.Add());
+ chain >> util::stream::kRecycle;
const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"};
diff --git a/lm/builder/print.hh b/lm/builder/debug_print.hh
index 5f293de85..193a6892c 100644
--- a/lm/builder/print.hh
+++ b/lm/builder/debug_print.hh
@@ -1,54 +1,18 @@
-#ifndef LM_BUILDER_PRINT_H
-#define LM_BUILDER_PRINT_H
+#ifndef LM_BUILDER_DEBUG_PRINT_H
+#define LM_BUILDER_DEBUG_PRINT_H
-#include "lm/common/ngram_stream.hh"
-#include "lm/builder/output.hh"
#include "lm/builder/payload.hh"
-#include "lm/common/ngram.hh"
+#include "lm/common/print.hh"
+#include "lm/common/ngram_stream.hh"
#include "util/fake_ofstream.hh"
#include "util/file.hh"
-#include "util/mmap.hh"
-#include "util/string_piece.hh"
#include <boost/lexical_cast.hpp>
-#include <ostream>
-#include <cassert>
-
-// Warning: print routines read all unigrams before all bigrams before all
-// trigrams etc. So if other parts of the chain move jointly, you'll have to
-// buffer.
-
namespace lm { namespace builder {
-
-class VocabReconstitute {
- public:
- // fd must be alive for life of this object; does not take ownership.
- explicit VocabReconstitute(int fd);
-
- const char *Lookup(WordIndex index) const {
- assert(index < map_.size() - 1);
- return map_[index];
- }
-
- StringPiece LookupPiece(WordIndex index) const {
- return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]);
- }
-
- std::size_t Size() const {
- // There's an extra entry to support StringPiece lengths.
- return map_.size() - 1;
- }
-
- private:
- util::scoped_memory memory_;
- std::vector<const char*> map_;
-};
-
// Not defined, only specialized.
template <class T> void PrintPayload(util::FakeOFStream &to, const BuildingPayload &payload);
template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const BuildingPayload &payload) {
- // TODO slow
to << payload.count;
}
template <> inline void PrintPayload<Uninterpolated>(util::FakeOFStream &to, const BuildingPayload &payload) {
@@ -101,19 +65,6 @@ template <class V> class Print {
int to_;
};
-class PrintARPA : public OutputHook {
- public:
- explicit PrintARPA(int fd, bool verbose_header)
- : OutputHook(PROB_SEQUENTIAL_HOOK), out_fd_(fd), verbose_header_(verbose_header) {}
-
- void Sink(util::stream::Chains &chains);
-
- void Run(const util::stream::ChainPositions &positions);
-
- private:
- util::scoped_fd out_fd_;
- bool verbose_header_;
-};
-
}} // namespaces
-#endif // LM_BUILDER_PRINT_H
+
+#endif // LM_BUILDER_DEBUG_PRINT_H
diff --git a/lm/builder/dump_counts_main.cc b/lm/builder/dump_counts_main.cc
index fa0016792..a4c9478b6 100644
--- a/lm/builder/dump_counts_main.cc
+++ b/lm/builder/dump_counts_main.cc
@@ -1,4 +1,4 @@
-#include "lm/builder/print.hh"
+#include "lm/common/print.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/read_compressed.hh"
@@ -20,7 +20,7 @@ int main(int argc, char *argv[]) {
}
util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
- lm::builder::VocabReconstitute vocab(vocab_file.get());
+ lm::VocabReconstitute vocab(vocab_file.get());
unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
diff --git a/lm/builder/header_info.hh b/lm/builder/header_info.hh
index 146195233..d01d0496b 100644
--- a/lm/builder/header_info.hh
+++ b/lm/builder/header_info.hh
@@ -5,6 +5,8 @@
#include <vector>
#include <stdint.h>
+namespace lm { namespace builder {
+
// Some configuration info that is used to add
// comments to the beginning of an ARPA file
struct HeaderInfo {
@@ -21,4 +23,6 @@ struct HeaderInfo {
// TODO: More info if multiple models were interpolated
};
+}} // namespaces
+
#endif
diff --git a/lm/builder/initial_probabilities.cc b/lm/builder/initial_probabilities.cc
index ef8a8ecfd..5b8d86d33 100644
--- a/lm/builder/initial_probabilities.cc
+++ b/lm/builder/initial_probabilities.cc
@@ -1,9 +1,9 @@
#include "lm/builder/initial_probabilities.hh"
#include "lm/builder/discount.hh"
-#include "lm/builder/special.hh"
#include "lm/builder/hash_gamma.hh"
#include "lm/builder/payload.hh"
+#include "lm/common/special.hh"
#include "lm/common/ngram_stream.hh"
#include "util/murmur_hash.hh"
#include "util/file.hh"
diff --git a/lm/builder/initial_probabilities.hh b/lm/builder/initial_probabilities.hh
index dddbbb913..caeea58c5 100644
--- a/lm/builder/initial_probabilities.hh
+++ b/lm/builder/initial_probabilities.hh
@@ -10,9 +10,8 @@
namespace util { namespace stream { class Chains; } }
namespace lm {
-namespace builder {
-
class SpecialVocab;
+namespace builder {
struct InitialProbabilitiesConfig {
// These should be small buffers to keep the adder from getting too far ahead
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 5f0a339bc..6374bcf04 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -1,16 +1,16 @@
#include "lm/builder/interpolate.hh"
#include "lm/builder/hash_gamma.hh"
-#include "lm/builder/joint_order.hh"
-#include "lm/common/ngram_stream.hh"
+#include "lm/builder/payload.hh"
#include "lm/common/compare.hh"
+#include "lm/common/joint_order.hh"
+#include "lm/common/ngram_stream.hh"
#include "lm/lm_exception.hh"
#include "util/fixed_array.hh"
#include "util/murmur_hash.hh"
#include <cassert>
#include <cmath>
-#include <iostream>
namespace lm { namespace builder {
namespace {
@@ -91,7 +91,8 @@ template <class Output> class Callback {
}
}
- void Enter(unsigned order_minus_1, NGram<BuildingPayload> &gram) {
+ void Enter(unsigned order_minus_1, void *data) {
+ NGram<BuildingPayload> gram(data, order_minus_1 + 1);
BuildingPayload &pay = gram.Value();
pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
probs_[order_minus_1 + 1] = pay.complete.prob;
@@ -125,7 +126,7 @@ template <class Output> class Callback {
output_.Gram(order_minus_1, out_backoff, pay.complete);
}
- void Exit(unsigned, const NGram<BuildingPayload> &) const {}
+ void Exit(unsigned, void *) const {}
private:
util::FixedArray<util::stream::Stream> backoffs_;
diff --git a/lm/builder/interpolate.hh b/lm/builder/interpolate.hh
index dcee75adb..d20cd545c 100644
--- a/lm/builder/interpolate.hh
+++ b/lm/builder/interpolate.hh
@@ -1,7 +1,7 @@
#ifndef LM_BUILDER_INTERPOLATE_H
#define LM_BUILDER_INTERPOLATE_H
-#include "lm/builder/special.hh"
+#include "lm/common/special.hh"
#include "lm/word_index.hh"
#include "util/stream/multi_stream.hh"
diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc
index c27490665..cc3f381ca 100644
--- a/lm/builder/lmplz_main.cc
+++ b/lm/builder/lmplz_main.cc
@@ -1,6 +1,6 @@
#include "lm/builder/output.hh"
#include "lm/builder/pipeline.hh"
-#include "lm/builder/print.hh"
+#include "lm/common/size_option.hh"
#include "lm/lm_exception.hh"
#include "util/file.hh"
#include "util/file_piece.hh"
@@ -13,21 +13,6 @@
#include <vector>
namespace {
-class SizeNotify {
- public:
- SizeNotify(std::size_t &out) : behind_(out) {}
-
- void operator()(const std::string &from) {
- behind_ = util::ParseSize(from);
- }
-
- private:
- std::size_t &behind_;
-};
-
-boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
- return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
-}
// Parse and validate pruning thresholds then return vector of threshold counts
// for each n-grams order.
@@ -106,17 +91,16 @@ int main(int argc, char *argv[]) {
("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI. If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
- ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
- ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
- ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
+ ("memory,S", lm:: SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
+ ("minimum_block", lm::SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
+ ("sort_block", lm::SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
- ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write a file containing the unique vocabulary strings delimited by null bytes")
("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
- ("intermediate", po::value<std::string>(&intermediate), "Write ngrams to an intermediate file. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on. Implicitly makes --vocab_file be the provided name + .vocab.")
+ ("intermediate", po::value<std::string>(&intermediate), "Write ngrams to intermediate files. Turns off ARPA output (which can be reactivated by --arpa file). Forces --renumber on.")
("renumber", po::bool_switch(&pipeline.renumber_vocabulary), "Rrenumber the vocabulary identifiers so that they are monotone with the hash of each string. This is consistent with the ordering used by the trie data structure.")
("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities. See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold. Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above. The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
@@ -217,15 +201,10 @@ int main(int argc, char *argv[]) {
bool writing_intermediate = vm.count("intermediate");
if (writing_intermediate) {
pipeline.renumber_vocabulary = true;
- if (!pipeline.vocab_file.empty()) {
- std::cerr << "--intermediate and --vocab_file are incompatible because --intermediate already makes a vocab file." << std::endl;
- return 1;
- }
- pipeline.vocab_file = intermediate + ".vocab";
}
- lm::builder::Output output(writing_intermediate ? intermediate : pipeline.sort.temp_prefix, writing_intermediate);
+ lm::builder::Output output(writing_intermediate ? intermediate : pipeline.sort.temp_prefix, writing_intermediate, pipeline.output_q);
if (!writing_intermediate || vm.count("arpa")) {
- output.Add(new lm::builder::PrintARPA(out.release(), verbose_header));
+ output.Add(new lm::builder::PrintHook(out.release(), verbose_header));
}
lm::builder::Pipeline(pipeline, in.release(), output);
} catch (const util::MallocException &e) {
diff --git a/lm/builder/output.cc b/lm/builder/output.cc
index 76478ad06..c92283ac6 100644
--- a/lm/builder/output.cc
+++ b/lm/builder/output.cc
@@ -1,6 +1,8 @@
#include "lm/builder/output.hh"
#include "lm/common/model_buffer.hh"
+#include "lm/common/print.hh"
+#include "util/fake_ofstream.hh"
#include "util/stream/multi_stream.hh"
#include <iostream>
@@ -9,23 +11,22 @@ namespace lm { namespace builder {
OutputHook::~OutputHook() {}
-Output::Output(StringPiece file_base, bool keep_buffer)
- : file_base_(file_base.data(), file_base.size()), keep_buffer_(keep_buffer) {}
+Output::Output(StringPiece file_base, bool keep_buffer, bool output_q)
+ : buffer_(file_base, keep_buffer, output_q) {}
-void Output::SinkProbs(util::stream::Chains &chains, bool output_q) {
+void Output::SinkProbs(util::stream::Chains &chains) {
Apply(PROB_PARALLEL_HOOK, chains);
- if (!keep_buffer_ && !Have(PROB_SEQUENTIAL_HOOK)) {
+ if (!buffer_.Keep() && !Have(PROB_SEQUENTIAL_HOOK)) {
chains >> util::stream::kRecycle;
chains.Wait(true);
return;
}
- lm::common::ModelBuffer buf(file_base_, keep_buffer_, output_q);
- buf.Sink(chains);
+ buffer_.Sink(chains, header_.counts_pruned);
chains >> util::stream::kRecycle;
chains.Wait(false);
if (Have(PROB_SEQUENTIAL_HOOK)) {
std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl;
- buf.Source(chains);
+ buffer_.Source(chains);
Apply(PROB_SEQUENTIAL_HOOK, chains);
chains >> util::stream::kRecycle;
chains.Wait(true);
@@ -34,8 +35,18 @@ void Output::SinkProbs(util::stream::Chains &chains, bool output_q) {
void Output::Apply(HookType hook_type, util::stream::Chains &chains) {
for (boost::ptr_vector<OutputHook>::iterator entry = outputs_[hook_type].begin(); entry != outputs_[hook_type].end(); ++entry) {
- entry->Sink(chains);
+ entry->Sink(header_, VocabFile(), chains);
}
}
+void PrintHook::Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) {
+ if (verbose_header_) {
+ util::FakeOFStream out(file_.get(), 50);
+ out << "# Input file: " << info.input_file << '\n';
+ out << "# Token count: " << info.token_count << '\n';
+ out << "# Smoothing: Modified Kneser-Ney" << '\n';
+ }
+ chains >> PrintARPA(vocab_file, file_.get(), info.counts_pruned);
+}
+
}} // namespaces
diff --git a/lm/builder/output.hh b/lm/builder/output.hh
index c1e0d1469..69d6c6dac 100644
--- a/lm/builder/output.hh
+++ b/lm/builder/output.hh
@@ -2,6 +2,7 @@
#define LM_BUILDER_OUTPUT_H
#include "lm/builder/header_info.hh"
+#include "lm/common/model_buffer.hh"
#include "util/file.hh"
#include <boost/ptr_container/ptr_vector.hpp>
@@ -20,69 +21,64 @@ enum HookType {
NUMBER_OF_HOOKS // Keep this last so we know how many values there are.
};
-class Output;
-
class OutputHook {
public:
- explicit OutputHook(HookType hook_type) : type_(hook_type), master_(NULL) {}
+ explicit OutputHook(HookType hook_type) : type_(hook_type) {}
virtual ~OutputHook();
- virtual void Sink(util::stream::Chains &chains) = 0;
+ virtual void Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) = 0;
- protected:
- const HeaderInfo &GetHeader() const;
- int GetVocabFD() const;
+ HookType Type() const { return type_; }
private:
- friend class Output;
- const HookType type_;
- const Output *master_;
+ HookType type_;
};
class Output : boost::noncopyable {
public:
- Output(StringPiece file_base, bool keep_buffer);
+ Output(StringPiece file_base, bool keep_buffer, bool output_q);
// Takes ownership.
void Add(OutputHook *hook) {
- hook->master_ = this;
- outputs_[hook->type_].push_back(hook);
+ outputs_[hook->Type()].push_back(hook);
}
bool Have(HookType hook_type) const {
return !outputs_[hook_type].empty();
}
- void SetVocabFD(int to) { vocab_fd_ = to; }
- int GetVocabFD() const { return vocab_fd_; }
+ int VocabFile() const { return buffer_.VocabFile(); }
void SetHeader(const HeaderInfo &header) { header_ = header; }
const HeaderInfo &GetHeader() const { return header_; }
// This is called by the pipeline.
- void SinkProbs(util::stream::Chains &chains, bool output_q);
+ void SinkProbs(util::stream::Chains &chains);
unsigned int Steps() const { return Have(PROB_SEQUENTIAL_HOOK); }
private:
void Apply(HookType hook_type, util::stream::Chains &chains);
+ ModelBuffer buffer_;
+
boost::ptr_vector<OutputHook> outputs_[NUMBER_OF_HOOKS];
- int vocab_fd_;
HeaderInfo header_;
-
- std::string file_base_;
- bool keep_buffer_;
};
-inline const HeaderInfo &OutputHook::GetHeader() const {
- return master_->GetHeader();
-}
+class PrintHook : public OutputHook {
+ public:
+ // Takes ownership
+ PrintHook(int write_fd, bool verbose_header)
+ : OutputHook(PROB_SEQUENTIAL_HOOK), file_(write_fd), verbose_header_(verbose_header) {}
-inline int OutputHook::GetVocabFD() const {
- return master_->GetVocabFD();
-}
+ void Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains);
+
+ private:
+ util::scoped_fd file_;
+ bool verbose_header_;
+};
}} // namespaces
diff --git a/lm/builder/pipeline.cc b/lm/builder/pipeline.cc
index d588beedf..69972e278 100644
--- a/lm/builder/pipeline.cc
+++ b/lm/builder/pipeline.cc
@@ -277,27 +277,27 @@ void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &maste
}
master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.prune_vocab, config.output_q, specials);
gamma_chains >> util::stream::kRecycle;
- output.SinkProbs(master.MutableChains(), config.output_q);
+ output.SinkProbs(master.MutableChains());
}
class VocabNumbering {
public:
- VocabNumbering(StringPiece vocab_file, StringPiece temp_prefix, bool renumber)
- : vocab_file_(vocab_file.data(), vocab_file.size()),
- temp_prefix_(temp_prefix.data(), temp_prefix.size()),
+ VocabNumbering(int final_vocab, StringPiece temp_prefix, bool renumber)
+ : final_vocab_(final_vocab),
renumber_(renumber),
specials_(kBOS, kEOS) {
- InitFile(renumber || vocab_file.empty());
+ if (renumber) {
+ temporary_.reset(util::MakeTemp(temp_prefix));
+ }
}
- int File() const { return null_delimited_.get(); }
+ int WriteOnTheFly() const { return renumber_ ? temporary_.get() : final_vocab_; }
// Compute the vocabulary mapping and return the memory used.
std::size_t ComputeMapping(WordIndex type_count) {
if (!renumber_) return 0;
- util::scoped_fd previous(null_delimited_.release());
- InitFile(vocab_file_.empty());
- ngram::SortedVocabulary::ComputeRenumbering(type_count, previous.get(), null_delimited_.get(), vocab_mapping_);
+ ngram::SortedVocabulary::ComputeRenumbering(type_count, temporary_.get(), final_vocab_, vocab_mapping_);
+ temporary_.reset();
return sizeof(WordIndex) * vocab_mapping_.size();
}
@@ -312,15 +312,9 @@ class VocabNumbering {
const SpecialVocab &Specials() const { return specials_; }
private:
- void InitFile(bool temp) {
- null_delimited_.reset(temp ?
- util::MakeTemp(temp_prefix_) :
- util::CreateOrThrow(vocab_file_.c_str()));
- }
-
- std::string vocab_file_, temp_prefix_;
-
- util::scoped_fd null_delimited_;
+ int final_vocab_;
+ // Out of order vocab file created on the fly.
+ util::scoped_fd temporary_;
bool renumber_;
@@ -349,18 +343,17 @@ void Pipeline(PipelineConfig &config, int text_file, Output &output) {
// master's destructor will wait for chains. But they might be deadlocked if
// this thread dies because e.g. it ran out of memory.
try {
- VocabNumbering numbering(config.vocab_file, config.TempPrefix(), config.renumber_vocabulary);
+ VocabNumbering numbering(output.VocabFile(), config.TempPrefix(), config.renumber_vocabulary);
uint64_t token_count;
WordIndex type_count;
std::string text_file_name;
std::vector<bool> prune_words;
util::scoped_ptr<util::stream::Sort<SuffixOrder, CombineCounts> > sorted_counts(
- CountText(text_file, numbering.File(), master, token_count, type_count, text_file_name, prune_words));
+ CountText(text_file, numbering.WriteOnTheFly(), master, token_count, type_count, text_file_name, prune_words));
std::cerr << "Unigram tokens " << token_count << " types " << type_count << std::endl;
// Create vocab mapping, which uses temporary memory, while nothing else is happening.
std::size_t subtract_for_numbering = numbering.ComputeMapping(type_count);
- output.SetVocabFD(numbering.File());
std::cerr << "=== 2/" << master.Steps() << " Calculating and sorting adjusted counts ===" << std::endl;
master.InitForAdjust(*sorted_counts, type_count, subtract_for_numbering);
diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh
index 695ecf7bd..66f1fd9a8 100644
--- a/lm/builder/pipeline.hh
+++ b/lm/builder/pipeline.hh
@@ -18,7 +18,6 @@ class Output;
struct PipelineConfig {
std::size_t order;
- std::string vocab_file;
util::stream::SortConfig sort;
InitialProbabilitiesConfig initial_probs;
util::stream::ChainConfig read_backoffs;
diff --git a/lm/builder/print.cc b/lm/builder/print.cc
deleted file mode 100644
index 178e54a21..000000000
--- a/lm/builder/print.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-#include "lm/builder/print.hh"
-
-#include "util/fake_ofstream.hh"
-#include "util/file.hh"
-#include "util/mmap.hh"
-#include "util/scoped.hh"
-#include "util/stream/timer.hh"
-
-#include <sstream>
-#include <cstring>
-
-namespace lm { namespace builder {
-
-VocabReconstitute::VocabReconstitute(int fd) {
- uint64_t size = util::SizeOrThrow(fd);
- util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
- const char *const start = static_cast<const char*>(memory_.get());
- const char *i;
- for (i = start; i != start + size; i += strlen(i) + 1) {
- map_.push_back(i);
- }
- // Last one for LookupPiece.
- map_.push_back(i);
-}
-
-void PrintARPA::Sink(util::stream::Chains &chains) {
- chains >> boost::ref(*this);
-}
-
-void PrintARPA::Run(const util::stream::ChainPositions &positions) {
- VocabReconstitute vocab(GetVocabFD());
- util::FakeOFStream out(out_fd_.get());
-
- // Write header.
- if (verbose_header_) {
- out << "# Input file: " << GetHeader().input_file << '\n';
- out << "# Token count: " << GetHeader().token_count << '\n';
- out << "# Smoothing: Modified Kneser-Ney" << '\n';
- }
- out << "\\data\\\n";
- for (size_t i = 0; i < positions.size(); ++i) {
- out << "ngram " << (i+1) << '=' << GetHeader().counts_pruned[i] << '\n';
- }
- out << '\n';
-
- for (unsigned order = 1; order <= positions.size(); ++order) {
- out << "\\" << order << "-grams:" << '\n';
- for (NGramStream<BuildingPayload> stream(positions[order - 1]); stream; ++stream) {
- // Correcting for numerical precision issues. Take that IRST.
- out << stream->Value().complete.prob << '\t' << vocab.Lookup(*stream->begin());
- for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
- out << ' ' << vocab.Lookup(*i);
- }
- if (order != positions.size())
- out << '\t' << stream->Value().complete.backoff;
- out << '\n';
-
- }
- out << '\n';
- }
- out << "\\end\\\n";
-}
-
-}} // namespaces
diff --git a/lm/common/CMakeLists.txt b/lm/common/CMakeLists.txt
new file mode 100644
index 000000000..942e24bdc
--- /dev/null
+++ b/lm/common/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 2.8.8)
+#
+# The KenLM cmake files make use of add_library(... OBJECTS ...)
+#
+# This syntax allows grouping of source files when compiling
+# (effectively creating "fake" libraries based on source subdirs).
+#
+# This syntax was only added in cmake version 2.8.8
+#
+# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
+
+
+# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
+
+# Explicitly list the source files for this subdirectory
+#
+# If you add any source files to this subdirectory
+# that should be included in the kenlm library,
+# (this excludes any unit test files)
+# you should add them to the following list:
+#
+# In order to set correct paths to these files
+# in case this variable is referenced by CMake files in the parent directory,
+# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
+#
+set(KENLM_COMMON_SOURCE
+ ${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/print.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc
+ )
+
+
+# Group these objects together for later use.
+#
+# Given add_library(foo OBJECT ${my_foo_sources}),
+# refer to these objects as $<TARGET_OBJECTS:foo>
+#
+add_library(kenlm_common OBJECT ${KENLM_COMMON_SOURCE})
+
diff --git a/lm/common/Jamfile b/lm/common/Jamfile
index 1c9c37210..c9bdfd0df 100644
--- a/lm/common/Jamfile
+++ b/lm/common/Jamfile
@@ -1,2 +1,2 @@
fakelib common : [ glob *.cc : *test.cc *main.cc ]
- ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm ;
+ ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm /top//boost_program_options ;
diff --git a/lm/builder/joint_order.hh b/lm/common/joint_order.hh
index 5f62a4578..6113bb8f1 100644
--- a/lm/builder/joint_order.hh
+++ b/lm/common/joint_order.hh
@@ -1,8 +1,7 @@
-#ifndef LM_BUILDER_JOINT_ORDER_H
-#define LM_BUILDER_JOINT_ORDER_H
+#ifndef LM_COMMON_JOINT_ORDER_H
+#define LM_COMMON_JOINT_ORDER_H
#include "lm/common/ngram_stream.hh"
-#include "lm/builder/payload.hh"
#include "lm/lm_exception.hh"
#ifdef DEBUG
@@ -12,15 +11,19 @@
#include <cstring>
-namespace lm { namespace builder {
+namespace lm {
template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) {
// Allow matching to reference streams[-1].
- NGramStreams<BuildingPayload> streams_with_dummy;
- streams_with_dummy.InitWithDummy(positions);
- NGramStream<BuildingPayload> *streams = streams_with_dummy.begin() + 1;
+ util::FixedArray<ProxyStream<NGramHeader> > streams_with_dummy(positions.size() + 1);
+ // A bogus stream for [-1].
+ streams_with_dummy.push_back();
+ for (std::size_t i = 0; i < positions.size(); ++i) {
+ streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1));
+ }
+ ProxyStream<NGramHeader> *streams = streams_with_dummy.begin() + 1;
- unsigned int order;
+ std::size_t order;
for (order = 0; order < positions.size() && streams[order]; ++order) {}
assert(order); // should always have <unk>.
@@ -31,11 +34,11 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
less_compare.push_back(i + 1);
#endif // DEBUG
- unsigned int current = 0;
+ std::size_t current = 0;
while (true) {
// Does the context match the lower one?
if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
- callback.Enter(current, *streams[current]);
+ callback.Enter(current, streams[current].Get());
// Transition to looking for extensions.
if (++current < order) continue;
}
@@ -51,7 +54,7 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
while(true) {
assert(current > 0);
--current;
- callback.Exit(current, *streams[current]);
+ callback.Exit(current, streams[current].Get());
if (++streams[current]) break;
@@ -63,6 +66,6 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
}
}
-}} // namespaces
+} // namespaces
-#endif // LM_BUILDER_JOINT_ORDER_H
+#endif // LM_COMMON_JOINT_ORDER_H
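
For orientation, a minimal sketch (not part of this commit; names are illustrative, and SuffixOrder is assumed to live in lm/common/compare.hh after this move) of the callback shape the relocated JointOrder now expects: Enter and Exit receive the raw record pointer instead of an NGram<BuildingPayload> reference, so a callback that needs typed access wraps the pointer itself, as Interpolate does above.

    // Hypothetical callback for lm::JointOrder under the new void* interface.
    #include "lm/common/joint_order.hh"
    #include "lm/common/ngram.hh"
    #include "lm/common/compare.hh"

    #include <cstddef>

    class ExampleCallback {
      public:
        ExampleCallback() : entered_(0) {}

        // order_minus_1 selects the stream; data points at that stream's record.
        void Enter(unsigned order_minus_1, void *data) {
          lm::NGramHeader gram(data, order_minus_1 + 1); // wrap in place, no copy
          ++entered_;
          (void)gram;
        }
        void Exit(unsigned /*order_minus_1*/, void * /*data*/) const {}

      private:
        std::size_t entered_;
    };

    // Usage, given one ChainPosition per order in `positions`:
    //   ExampleCallback callback;
    //   lm::JointOrder<ExampleCallback, lm::SuffixOrder>(positions, callback);
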
diff --git a/lm/common/model_buffer.cc b/lm/common/model_buffer.cc
index d4635da51..431d4ae4c 100644
--- a/lm/common/model_buffer.cc
+++ b/lm/common/model_buffer.cc
@@ -8,25 +8,30 @@
#include <boost/lexical_cast.hpp>
-namespace lm { namespace common {
+namespace lm {
namespace {
const char kMetadataHeader[] = "KenLM intermediate binary file";
} // namespace
-ModelBuffer::ModelBuffer(const std::string &file_base, bool keep_buffer, bool output_q)
- : file_base_(file_base), keep_buffer_(keep_buffer), output_q_(output_q) {}
-
-ModelBuffer::ModelBuffer(const std::string &file_base)
- : file_base_(file_base), keep_buffer_(false) {
+ModelBuffer::ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q)
+ : file_base_(file_base.data(), file_base.size()), keep_buffer_(keep_buffer), output_q_(output_q),
+ vocab_file_(keep_buffer ? util::CreateOrThrow((file_base_ + ".vocab").c_str()) : util::MakeTemp(file_base_)) {}
+
+ModelBuffer::ModelBuffer(StringPiece file_base)
+ : file_base_(file_base.data(), file_base.size()), keep_buffer_(false) {
const std::string full_name = file_base_ + ".kenlm_intermediate";
util::FilePiece in(full_name.c_str());
StringPiece token = in.ReadLine();
UTIL_THROW_IF2(token != kMetadataHeader, "File " << full_name << " begins with \"" << token << "\" not " << kMetadataHeader);
token = in.ReadDelimited();
- UTIL_THROW_IF2(token != "Order", "Expected Order, got \"" << token << "\" in " << full_name);
- unsigned long order = in.ReadULong();
+ UTIL_THROW_IF2(token != "Counts", "Expected Counts, got \"" << token << "\" in " << full_name);
+ char got;
+ while ((got = in.get()) == ' ') {
+ counts_.push_back(in.ReadULong());
+ }
+ UTIL_THROW_IF2(got != '\n', "Expected newline at end of counts.");
token = in.ReadDelimited();
UTIL_THROW_IF2(token != "Payload", "Expected Payload, got \"" << token << "\" in " << full_name);
@@ -39,16 +44,16 @@ ModelBuffer::ModelBuffer(const std::string &file_base)
UTIL_THROW(util::Exception, "Unknown payload " << token);
}
- files_.Init(order);
- for (unsigned long i = 0; i < order; ++i) {
+ vocab_file_.reset(util::OpenReadOrThrow((file_base_ + ".vocab").c_str()));
+
+ files_.Init(counts_.size());
+ for (unsigned long i = 0; i < counts_.size(); ++i) {
files_.push_back(util::OpenReadOrThrow((file_base_ + '.' + boost::lexical_cast<std::string>(i + 1)).c_str()));
}
}
-// virtual destructor
-ModelBuffer::~ModelBuffer() {}
-
-void ModelBuffer::Sink(util::stream::Chains &chains) {
+void ModelBuffer::Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts) {
+ counts_ = counts;
// Open files.
files_.Init(chains.size());
for (std::size_t i = 0; i < chains.size(); ++i) {
@@ -64,19 +69,23 @@ void ModelBuffer::Sink(util::stream::Chains &chains) {
if (keep_buffer_) {
util::scoped_fd metadata(util::CreateOrThrow((file_base_ + ".kenlm_intermediate").c_str()));
util::FakeOFStream meta(metadata.get(), 200);
- meta << kMetadataHeader << "\nOrder " << chains.size() << "\nPayload " << (output_q_ ? "q" : "pb") << '\n';
+ meta << kMetadataHeader << "\nCounts";
+ for (std::vector<uint64_t>::const_iterator i = counts_.begin(); i != counts_.end(); ++i) {
+ meta << ' ' << *i;
+ }
+ meta << "\nPayload " << (output_q_ ? "q" : "pb") << '\n';
}
}
void ModelBuffer::Source(util::stream::Chains &chains) {
- assert(chains.size() == files_.size());
- for (unsigned int i = 0; i < files_.size(); ++i) {
+ assert(chains.size() <= files_.size());
+ for (unsigned int i = 0; i < chains.size(); ++i) {
chains[i] >> util::stream::PRead(files_[i].get());
}
}
-std::size_t ModelBuffer::Order() const {
- return files_.size();
+void ModelBuffer::Source(std::size_t order_minus_1, util::stream::Chain &chain) {
+ chain >> util::stream::PRead(files_[order_minus_1].get());
}
-}} // namespaces
+} // namespace
diff --git a/lm/common/model_buffer.hh b/lm/common/model_buffer.hh
index 6a5c7bf49..92662bbf8 100644
--- a/lm/common/model_buffer.hh
+++ b/lm/common/model_buffer.hh
@@ -1,5 +1,5 @@
-#ifndef LM_BUILDER_MODEL_BUFFER_H
-#define LM_BUILDER_MODEL_BUFFER_H
+#ifndef LM_COMMON_MODEL_BUFFER_H
+#define LM_COMMON_MODEL_BUFFER_H
/* Format with separate files in suffix order. Each file contains
* n-grams of the same order.
@@ -9,37 +9,55 @@
#include "util/fixed_array.hh"
#include <string>
+#include <vector>
-namespace util { namespace stream { class Chains; } }
+namespace util { namespace stream {
+class Chains;
+class Chain;
+}} // namespaces
-namespace lm { namespace common {
+namespace lm {
class ModelBuffer {
public:
- // Construct for writing.
- ModelBuffer(const std::string &file_base, bool keep_buffer, bool output_q);
+ // Construct for writing. Must call VocabFile() and fill it with null-delimited vocab words.
+ ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q);
// Load from file.
- explicit ModelBuffer(const std::string &file_base);
-
- // explicit for virtual destructor.
- ~ModelBuffer();
+ explicit ModelBuffer(StringPiece file_base);
- void Sink(util::stream::Chains &chains);
+ // Must call VocabFile and populate before calling this function.
+ void Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts);
+ // Read files and write to the given chains. If fewer chains are provided,
+ // only do the lower orders.
void Source(util::stream::Chains &chains);
+ void Source(std::size_t order_minus_1, util::stream::Chain &chain);
+
// The order of the n-gram model that is associated with the model buffer.
- std::size_t Order() const;
+ std::size_t Order() const { return counts_.size(); }
+ // Requires Sink or load from file.
+ const std::vector<uint64_t> &Counts() const {
+ assert(!counts_.empty());
+ return counts_;
+ }
+
+ int VocabFile() const { return vocab_file_.get(); }
+ int StealVocabFile() { return vocab_file_.release(); }
+
+ bool Keep() const { return keep_buffer_; }
private:
const std::string file_base_;
const bool keep_buffer_;
bool output_q_;
+ std::vector<uint64_t> counts_;
+ util::scoped_fd vocab_file_;
util::FixedArray<util::scoped_fd> files_;
};
-}} // namespaces
+} // namespace lm
-#endif // LM_BUILDER_MODEL_BUFFER_H
+#endif // LM_COMMON_MODEL_BUFFER_H
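
For reference, a hedged sketch of the write/read cycle the ModelBuffer comments above describe: construct for writing, populate VocabFile() with null-delimited words, Sink the chains together with the per-order counts, and later reload by file base (the metadata file now records Counts rather than Order). The file base below is illustrative.

    #include "lm/common/model_buffer.hh"
    #include "util/stream/multi_stream.hh"

    #include <vector>

    void SketchRoundTrip(util::stream::Chains &chains,
                         const std::vector<uint64_t> &counts_pruned) {
      // Writing: keep_buffer = true persists base.vocab, base.1 .. base.N and a
      // base.kenlm_intermediate metadata file ("Counts c1 c2 ...\nPayload pb").
      lm::ModelBuffer writer("/tmp/example_base", true /*keep_buffer*/, false /*output_q*/);
      // ... write null-delimited vocabulary strings to writer.VocabFile() here ...
      writer.Sink(chains, counts_pruned);

      // Reading (only valid once the files above exist): order and counts come
      // from the metadata, and fewer chains may be passed to read lower orders.
      // lm::ModelBuffer reader("/tmp/example_base");
      // reader.Source(chains);
    }
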
diff --git a/lm/common/ngram.hh b/lm/common/ngram.hh
index 813017640..7a6d1c358 100644
--- a/lm/common/ngram.hh
+++ b/lm/common/ngram.hh
@@ -16,6 +16,8 @@ class NGramHeader {
NGramHeader(void *begin, std::size_t order)
: begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {}
+ NGramHeader() : begin_(NULL), end_(NULL) {}
+
const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); }
uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); }
@@ -32,6 +34,7 @@ class NGramHeader {
const WordIndex *end() const { return end_; }
WordIndex *end() { return end_; }
+ std::size_t size() const { return end_ - begin_; }
std::size_t Order() const { return end_ - begin_; }
private:
@@ -42,6 +45,8 @@ template <class PayloadT> class NGram : public NGramHeader {
public:
typedef PayloadT Payload;
+ NGram() : NGramHeader(NULL, 0) {}
+
NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {}
// Would do operator++ but that can get confusing for a stream.
diff --git a/lm/common/ngram_stream.hh b/lm/common/ngram_stream.hh
index 53c4ffcb8..8bdf36e3c 100644
--- a/lm/common/ngram_stream.hh
+++ b/lm/common/ngram_stream.hh
@@ -10,24 +10,21 @@
namespace lm {
-template <class Payload> class NGramStream {
+template <class Proxy> class ProxyStream {
public:
- NGramStream() : gram_(NULL, 0) {}
+ // Make an invalid stream.
+ ProxyStream() {}
- NGramStream(const util::stream::ChainPosition &position) : gram_(NULL, 0) {
- Init(position);
+ explicit ProxyStream(const util::stream::ChainPosition &position, const Proxy &proxy = Proxy())
+ : proxy_(proxy), stream_(position) {
+ proxy_.ReBase(stream_.Get());
}
- void Init(const util::stream::ChainPosition &position) {
- stream_.Init(position);
- gram_ = NGram<Payload>(stream_.Get(), NGram<Payload>::OrderFromSize(position.GetChain().EntrySize()));
- }
-
- NGram<Payload> &operator*() { return gram_; }
- const NGram<Payload> &operator*() const { return gram_; }
+ Proxy &operator*() { return proxy_; }
+ const Proxy &operator*() const { return proxy_; }
- NGram<Payload> *operator->() { return &gram_; }
- const NGram<Payload> *operator->() const { return &gram_; }
+ Proxy *operator->() { return &proxy_; }
+ const Proxy *operator->() const { return &proxy_; }
void *Get() { return stream_.Get(); }
const void *Get() const { return stream_.Get(); }
@@ -36,21 +33,25 @@ template <class Payload> class NGramStream {
bool operator!() const { return !stream_; }
void Poison() { stream_.Poison(); }
- NGramStream &operator++() {
+ ProxyStream<Proxy> &operator++() {
++stream_;
- gram_.ReBase(stream_.Get());
+ proxy_.ReBase(stream_.Get());
return *this;
}
private:
- NGram<Payload> gram_;
+ Proxy proxy_;
util::stream::Stream stream_;
};
-template <class Payload> inline util::stream::Chain &operator>>(util::stream::Chain &chain, NGramStream<Payload> &str) {
- str.Init(chain.Add());
- return chain;
-}
+template <class Payload> class NGramStream : public ProxyStream<NGram<Payload> > {
+ public:
+ // Make an invalid stream.
+ NGramStream() {}
+
+ explicit NGramStream(const util::stream::ChainPosition &position) :
+ ProxyStream<NGram<Payload> >(position, NGram<Payload>(NULL, NGram<Payload>::OrderFromSize(position.GetChain().EntrySize()))) {}
+};
template <class Payload> class NGramStreams : public util::stream::GenericStreams<NGramStream<Payload> > {
private:
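
A small usage sketch (illustrative, not part of the commit) of the new ProxyStream: construct it from chain.Add() with a proxy sized for the record, then iterate; NGramStream is now just ProxyStream<NGram<Payload> > with the order recovered from the chain's entry size.

    #include "lm/common/ngram_stream.hh"
    #include "lm/word_index.hh"
    #include "util/stream/chain.hh"

    // Hypothetical consumer: visit every word of every record in an order-3 chain.
    void VisitTrigrams(util::stream::Chain &chain) {
      // The proxy carries the order; the stream rebases it on each increment.
      for (lm::ProxyStream<lm::NGramHeader> stream(chain.Add(), lm::NGramHeader(NULL, 3));
           stream; ++stream) {
        for (const lm::WordIndex *w = stream->begin(); w != stream->end(); ++w) {
          // ... consume *w ...
        }
      }
    }
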
diff --git a/lm/common/print.cc b/lm/common/print.cc
new file mode 100644
index 000000000..cd2a80260
--- /dev/null
+++ b/lm/common/print.cc
@@ -0,0 +1,62 @@
+#include "lm/common/print.hh"
+
+#include "lm/common/ngram_stream.hh"
+#include "util/fake_ofstream.hh"
+#include "util/file.hh"
+#include "util/mmap.hh"
+#include "util/scoped.hh"
+
+#include <sstream>
+#include <cstring>
+
+namespace lm {
+
+VocabReconstitute::VocabReconstitute(int fd) {
+ uint64_t size = util::SizeOrThrow(fd);
+ util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_);
+ const char *const start = static_cast<const char*>(memory_.get());
+ const char *i;
+ for (i = start; i != start + size; i += strlen(i) + 1) {
+ map_.push_back(i);
+ }
+ // Last one for LookupPiece.
+ map_.push_back(i);
+}
+
+namespace {
+template <class Payload> void PrintLead(const VocabReconstitute &vocab, ProxyStream<Payload> &stream, util::FakeOFStream &out) {
+ out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin());
+ for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) {
+ out << ' ' << vocab.Lookup(*i);
+ }
+}
+} // namespace
+
+void PrintARPA::Run(const util::stream::ChainPositions &positions) {
+ VocabReconstitute vocab(vocab_fd_);
+ util::FakeOFStream out(out_fd_);
+ out << "\\data\\\n";
+ for (size_t i = 0; i < positions.size(); ++i) {
+ out << "ngram " << (i+1) << '=' << counts_[i] << '\n';
+ }
+ out << '\n';
+
+ for (unsigned order = 1; order < positions.size(); ++order) {
+ out << "\\" << order << "-grams:" << '\n';
+ for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) {
+ PrintLead(vocab, stream, out);
+ out << '\t' << stream->Value().backoff << '\n';
+ }
+ out << '\n';
+ }
+
+ out << "\\" << positions.size() << "-grams:" << '\n';
+ for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) {
+ PrintLead(vocab, stream, out);
+ out << '\n';
+ }
+ out << '\n';
+ out << "\\end\\\n";
+}
+
+} // namespace lm
diff --git a/lm/common/print.hh b/lm/common/print.hh
new file mode 100644
index 000000000..6aa08b32a
--- /dev/null
+++ b/lm/common/print.hh
@@ -0,0 +1,58 @@
+#ifndef LM_COMMON_PRINT_H
+#define LM_COMMON_PRINT_H
+
+#include "lm/word_index.hh"
+#include "util/mmap.hh"
+#include "util/string_piece.hh"
+
+#include <cassert>
+#include <vector>
+
+namespace util { namespace stream { class ChainPositions; }}
+
+// Warning: PrintARPA routines read all unigrams before all bigrams before all
+// trigrams etc. So if other parts of the chain move jointly, you'll have to
+// buffer.
+
+namespace lm {
+
+class VocabReconstitute {
+ public:
+ // fd must be alive for life of this object; does not take ownership.
+ explicit VocabReconstitute(int fd);
+
+ const char *Lookup(WordIndex index) const {
+ assert(index < map_.size() - 1);
+ return map_[index];
+ }
+
+ StringPiece LookupPiece(WordIndex index) const {
+ return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]);
+ }
+
+ std::size_t Size() const {
+ // There's an extra entry to support StringPiece lengths.
+ return map_.size() - 1;
+ }
+
+ private:
+ util::scoped_memory memory_;
+ std::vector<const char*> map_;
+};
+
+class PrintARPA {
+ public:
+ // Does not take ownership of vocab_fd or out_fd.
+ explicit PrintARPA(int vocab_fd, int out_fd, const std::vector<uint64_t> &counts)
+ : vocab_fd_(vocab_fd), out_fd_(out_fd), counts_(counts) {}
+
+ void Run(const util::stream::ChainPositions &positions);
+
+ private:
+ int vocab_fd_;
+ int out_fd_;
+ std::vector<uint64_t> counts_;
+};
+
+} // namespace lm
+#endif // LM_COMMON_PRINT_H
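
For context, an illustrative wiring of the relocated PrintARPA, modelled on PrintHook::Sink in lm/builder/output.cc above: the caller keeps ownership of both descriptors and supplies the pruned counts explicitly.

    #include "lm/common/print.hh"
    #include "util/stream/multi_stream.hh"

    #include <vector>

    // vocab_fd and arpa_fd must stay open for the duration of the run.
    void WriteArpa(int vocab_fd, int arpa_fd,
                   const std::vector<uint64_t> &counts_pruned,
                   util::stream::Chains &chains) {
      chains >> lm::PrintARPA(vocab_fd, arpa_fd, counts_pruned);
    }
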
diff --git a/lm/common/size_option.cc b/lm/common/size_option.cc
new file mode 100644
index 000000000..46a920e69
--- /dev/null
+++ b/lm/common/size_option.cc
@@ -0,0 +1,24 @@
+#include <boost/program_options.hpp>
+#include "util/usage.hh"
+
+namespace lm {
+
+namespace {
+class SizeNotify {
+ public:
+ explicit SizeNotify(std::size_t &out) : behind_(out) {}
+
+ void operator()(const std::string &from) {
+ behind_ = util::ParseSize(from);
+ }
+
+ private:
+ std::size_t &behind_;
+};
+}
+
+boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) {
+ return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value);
+}
+
+} // namespace lm
diff --git a/lm/common/size_option.hh b/lm/common/size_option.hh
new file mode 100644
index 000000000..d3b8e33cb
--- /dev/null
+++ b/lm/common/size_option.hh
@@ -0,0 +1,11 @@
+#include <boost/program_options.hpp>
+
+#include <cstddef>
+#include <string>
+
+namespace lm {
+
+// Create a boost program option for data sizes. This parses sizes like 1T and 10k.
+boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value);
+
+} // namespace lm
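
A minimal, self-contained sketch of using the new lm::SizeOption helper with boost::program_options, mirroring the lmplz_main.cc change above; the option name and default value are illustrative.

    #include "lm/common/size_option.hh"

    #include <boost/program_options.hpp>
    #include <cstddef>

    int main(int argc, char *argv[]) {
      namespace po = boost::program_options;
      std::size_t sort_memory = 0;
      po::options_description options("Example");
      options.add_options()
        ("memory,S", lm::SizeOption(sort_memory, "1G"), "Sorting memory, e.g. 10k, 64M, 1G, or 80%");
      po::variables_map vm;
      po::store(po::parse_command_line(argc, argv, options), vm);
      po::notify(vm); // the notifier installed by SizeOption parses the size here
      return 0;
    }
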
diff --git a/lm/builder/special.hh b/lm/common/special.hh
index c70865ce1..0677cd71b 100644
--- a/lm/builder/special.hh
+++ b/lm/common/special.hh
@@ -1,9 +1,9 @@
-#ifndef LM_BUILDER_SPECIAL_H
-#define LM_BUILDER_SPECIAL_H
+#ifndef LM_COMMON_SPECIAL_H
+#define LM_COMMON_SPECIAL_H
#include "lm/word_index.hh"
-namespace lm { namespace builder {
+namespace lm {
class SpecialVocab {
public:
@@ -22,6 +22,6 @@ class SpecialVocab {
WordIndex eos_;
};
-}} // namespaces
+} // namespace lm
-#endif // LM_BUILDER_SPECIAL_H
+#endif // LM_COMMON_SPECIAL_H
diff --git a/lm/filter/CMakeLists.txt b/lm/filter/CMakeLists.txt
new file mode 100644
index 000000000..4e791cef8
--- /dev/null
+++ b/lm/filter/CMakeLists.txt
@@ -0,0 +1,62 @@
+cmake_minimum_required(VERSION 2.8.8)
+#
+# The KenLM cmake files make use of add_library(... OBJECTS ...)
+#
+# This syntax allows grouping of source files when compiling
+# (effectively creating "fake" libraries based on source subdirs).
+#
+# This syntax was only added in cmake version 2.8.8
+#
+# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
+
+
+# This CMake file was created by Lane Schwartz <dowobeha@gmail.com>
+
+# Explicitly list the source files for this subdirectory
+#
+# If you add any source files to this subdirectory
+# that should be included in the kenlm library,
+# (this excludes any unit test files)
+# you should add them to the following list:
+#
+# In order to set correct paths to these files
+# in case this variable is referenced by CMake files in the parent directory,
+# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
+#
+set(KENLM_FILTER_SOURCE
+ ${CMAKE_CURRENT_SOURCE_DIR}/arpa_io.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/phrase.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/vocab.cc
+ )
+
+
+# Group these objects together for later use.
+#
+# Given add_library(foo OBJECT ${my_foo_sources}),
+# refer to these objects as $<TARGET_OBJECTS:foo>
+#
+add_library(kenlm_filter OBJECT ${KENLM_FILTER_SOURCE})
+
+
+# Explicitly list the executable files to be compiled
+set(EXE_LIST
+ filter
+ phrase_table_vocab
+)
+
+
+# Iterate through the executable list
+foreach(exe ${EXE_LIST})
+
+ # Compile the executable, linking against the requisite dependent object files
+ add_executable(${exe} ${exe}_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_filter> $<TARGET_OBJECTS:kenlm_util>)
+
+ # Link the executable against boost
+ target_link_libraries(${exe} ${Boost_LIBRARIES})
+
+ # Group executables together
+ set_target_properties(${exe} PROPERTIES FOLDER executables)
+
+# End for loop
+endforeach(exe)
+