From a3dd9155bab174643fe358c1041f881fd98474e8 Mon Sep 17 00:00:00 2001 From: graehl Date: Thu, 25 Jun 2015 00:45:09 -0700 Subject: vocabulary no-copy find_string pair-char-ptrs --- .gitignore | 1 + src/find_string.hpp | 77 +++++++++++++++++ src/neuralLM.h | 36 +++++--- src/neuralTM.h | 220 ++++++++++++++++++++++++++----------------------- src/replace_digits.hpp | 62 ++++++++++++++ src/types.hpp | 41 --------- src/vocabulary.h | 32 ++++--- 7 files changed, 299 insertions(+), 170 deletions(-) create mode 100644 src/find_string.hpp create mode 100644 src/replace_digits.hpp delete mode 100644 src/types.hpp diff --git a/.gitignore b/.gitignore index 23c4020..12fab12 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ src/prepareNeuralTM src/testNeuralLM src/testNeuralNetwork src/trainNeuralNetwork +.history diff --git a/src/find_string.hpp b/src/find_string.hpp new file mode 100644 index 0000000..fd612fd --- /dev/null +++ b/src/find_string.hpp @@ -0,0 +1,77 @@ +/** \file \author Jonathan Graehl + + find_string(boost::unordered_map, pair) pair is [begin, end), a key: map.find(std:string(key.first, + key.second)) read-only since unordered_map doesn't support lazy construction + of string from a pair key. + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. +*/ + +#ifndef FIND_STRING_GRAEHL_2015_06_24_HPP +#define FIND_STRING_GRAEHL_2015_06_24_HPP +#pragma once + +#include +#include +#include +#include + +namespace std { +/// we do not change standard semantics of any supported comparison e.g. pair vs +/// pair, but simply allow string to be compared against pair of char pointers. +inline bool operator==(std::string const& str, std::pair slice) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +inline bool operator==(std::pair slice, std::string const& str) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +inline bool operator==(std::string const& str, std::pair slice) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +inline bool operator==(std::pair slice, std::string const& str) { + return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin()); +} +/// techinically not allowed but easiest route to ADL. we could rename these instead. +inline std::size_t hash_value(std::pair slice) { + return boost::hash_range(slice.first, slice.second); +} +inline std::size_t hash_value(std::pair slice) { + return boost::hash_range(slice.first, slice.second); +} +inline std::size_t hash_value(std::string const& str) { + return boost::hash_range(str.begin(), str.end()); +} +} + +struct slice_or_string_eq { + typedef bool result_type; + template + bool operator()(A const& a, B const& b) const { + return a == b; + } +}; + +struct slice_or_string_hash { + typedef std::size_t result_type; + template + std::size_t operator()(Slice const& slice) const { + return hash_value(slice); + } +}; + +/// \return map.find(std:string(key.first, key.second)) but faster +template +typename UnorderedMap::const_iterator find_string(UnorderedMap const& map, Slice const& key) { + return map.find(key, slice_or_string_hash(), slice_or_string_eq()); +} + +/// \return map.find(std:string(key.first, key.second)) but faster +template +typename UnorderedMap::iterator find_string(UnorderedMap& map, Slice const& key) { + return map.find(key, slice_or_string_hash(), slice_or_string_eq()); +} + +#endif diff --git a/src/neuralLM.h b/src/neuralLM.h index c18485f..5febaeb 100644 --- a/src/neuralLM.h +++ b/src/neuralLM.h @@ -11,6 +11,7 @@ #include "util.h" #include "vocabulary.h" #include "neuralNetwork.h" +#include "replace_digits.hpp" /* To do: @@ -20,17 +21,16 @@ namespace nplm { -class neuralLM : public neuralNetwork +class neuralLM : public neuralNetwork, graehl::replace_digits { - char map_digits; boost::shared_ptr vocab; int start, null; public: neuralLM() : neuralNetwork(), - vocab(new vocabulary()), - map_digits(0) + graehl::replace_digits(0), + vocab(new vocabulary()) { } @@ -45,25 +45,35 @@ class neuralLM : public neuralNetwork const vocabulary &get_vocabulary() const { return *(this->vocab); } + int lookup_word(const std::string &word) const { if (map_digits) - for (int i=0; ilookup_word(mapped_word); } return vocab->lookup_word(word); } + int lookup_word(std::pair slice) const + { + if (map_digits) + for (char const* i = slice.first; i != slice.second; ++i) + if (graehl::ascii_digit(*i)) { + std::string mapped_word(slice.first, slice.second); + replace(mapped_word, i - slice.first); + return vocab->lookup_word(mapped_word); + } + return vocab->lookup_word(slice); + } + double lookup_ngram(const int *ngram_a, int n) { Eigen::Matrix ngram(m->ngram_size); - for (int i=0; ingram_size; i++) + for (int i=0; ingram_size; ++i) { if (i-m->ngram_size+n < 0) { @@ -114,7 +124,7 @@ void addStartStop(std::vector &input, std::vector &output, int ngram_size, { output.clear(); output.resize(input.size()+ngram_size); - for (int i=0; i &words, else { if (nums.size() != ngram_size) { - std::cerr << "error: wrong number of fields in line" << std::endl; + std::cerr << "error: wrong number of fields in line\n"; std::exit(1); } ngrams.push_back(nums); diff --git a/src/neuralTM.h b/src/neuralTM.h index 4ad6752..4c3db51 100644 --- a/src/neuralTM.h +++ b/src/neuralTM.h @@ -11,120 +11,134 @@ #include "util.h" #include "vocabulary.h" #include "neuralNetwork.h" +#include "replace_digits.hpp" namespace nplm { -class neuralTM : public neuralNetwork +class neuralTM : public neuralNetwork, graehl::replace_digits { - char map_digits; - boost::shared_ptr input_vocab, output_vocab; - int start, null; + boost::shared_ptr input_vocab, output_vocab; + int start, null; -public: - neuralTM() + public: + neuralTM() : neuralNetwork(), - map_digits(0), + graehl::replace_digits(0), input_vocab(new vocabulary()), output_vocab(new vocabulary()) - { - } - - void set_map_digits(char value) { map_digits = value; } - - void set_input_vocabulary(const vocabulary &vocab) - { - *(this->input_vocab) = vocab; - start = vocab.lookup_word(""); - null = vocab.lookup_word(""); - } - - void set_output_vocabulary(const vocabulary &vocab) - { - *(this->output_vocab) = vocab; - } - - const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); } - const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); } - - int lookup_input_word(const std::string &word) const - { - if (map_digits) - for (int i=0; ilookup_word(mapped_word); - } - return input_vocab->lookup_word(word); - } - - int lookup_output_word(const std::string &word) const - { - if (map_digits) - for (int i=0; ilookup_word(mapped_word); - } - return output_vocab->lookup_word(word); - } - - double lookup_ngram(const int *ngram_a, int n) - { - Eigen::Matrix ngram(m->ngram_size); - for (int i=0; ingram_size; i++) - { - if (i-m->ngram_size+n < 0) - { - if (ngram_a[0] == start) - ngram(i) = start; - else - ngram(i) = null; - } - else - { - ngram(i) = ngram_a[i-m->ngram_size+n]; - } - } - return neuralNetwork::lookup_ngram(ngram); - } - - double lookup_ngram(const std::vector &ngram_v) - { - return lookup_ngram(ngram_v.data(), ngram_v.size()); - } - - template - double lookup_ngram(const Eigen::MatrixBase &ngram) - { - return neuralNetwork::lookup_ngram(ngram); - } - - template - void lookup_ngram(const Eigen::MatrixBase &ngram, const Eigen::MatrixBase &log_probs_const) - { - return neuralNetwork::lookup_ngram(ngram, log_probs_const); - } - - void read(const std::string &filename) + { + } + + void set_map_digits(char value) { map_digits = value; } + + void set_input_vocabulary(const vocabulary &vocab) + { + *(this->input_vocab) = vocab; + start = vocab.lookup_word(""); + null = vocab.lookup_word(""); + } + + void set_output_vocabulary(const vocabulary &vocab) + { + *(this->output_vocab) = vocab; + } + + const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); } + const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); } + + int lookup_word(const std::string &word, vocabulary const& vocab) const + { + if (map_digits) + for (int i=0, n=word.size(); ilookup_word(mapped_word); + } + return vocab->lookup_word(word); + } + + int lookup_word(std::pair slice, vocabulary const& vocab) const + { + if (map_digits) + for (char const* i = slice.first; i != slice.second; ++i) + if (graehl::ascii_digit(*i)) { + std::string mapped_word(slice.first, slice.second); + replace(mapped_word, i - slice.first); + return vocab->lookup_word(mapped_word); + } + return vocab->lookup_word(slice); + } + + int lookup_input_word(const std::string &word) const + { + return lookup_word(word, *input_vocab); + } + + int lookup_output_word(const std::string &word) const + { + return lookup_word(word, *output_vocab); + } + + int lookup_input_word(std::pair word) const + { + return lookup_word(word, *input_vocab); + } + + int lookup_output_word(std::pair word) const + { + return lookup_word(word, *output_vocab); + } + + double lookup_ngram(const int *ngram_a, int n) + { + Eigen::Matrix ngram(m->ngram_size); + for (int i=0; ingram_size; i++) { - std::vector input_words; - std::vector output_words; - m->read(filename, input_words, output_words); - set_input_vocabulary(vocabulary(input_words)); - set_output_vocabulary(vocabulary(output_words)); - resize(); - // this is faster but takes more memory - //m->premultiply(); + if (i-m->ngram_size+n < 0) + { + if (ngram_a[0] == start) + ngram(i) = start; + else + ngram(i) = null; + } + else + { + ngram(i) = ngram_a[i-m->ngram_size+n]; + } } + return neuralNetwork::lookup_ngram(ngram); + } + + double lookup_ngram(const std::vector &ngram_v) + { + return lookup_ngram(ngram_v.data(), ngram_v.size()); + } + + template + double lookup_ngram(const Eigen::MatrixBase &ngram) + { + return neuralNetwork::lookup_ngram(ngram); + } + + template + void lookup_ngram(const Eigen::MatrixBase &ngram, const Eigen::MatrixBase &log_probs_const) + { + return neuralNetwork::lookup_ngram(ngram, log_probs_const); + } + + void read(const std::string &filename) + { + std::vector input_words; + std::vector output_words; + m->read(filename, input_words, output_words); + set_input_vocabulary(vocabulary(input_words)); + set_output_vocabulary(vocabulary(output_words)); + resize(); + // this is faster but takes more memory + //m->premultiply(); + } }; diff --git a/src/replace_digits.hpp b/src/replace_digits.hpp new file mode 100644 index 0000000..e8ac957 --- /dev/null +++ b/src/replace_digits.hpp @@ -0,0 +1,62 @@ +/** \file \author Jonathan Graehl + + replace 0-9 ascii chars with another ascii replacement + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. +*/ + +#ifndef REPLACEDIGITS_GRAEHL_2015_06_25_H +#define REPLACEDIGITS_GRAEHL_2015_06_25_H +#pragma once + +#include +#include + +namespace graehl { + +inline bool ascii_digit(char c) { + return c >= '0' && c <= '9'; +} + +struct replace_digits { + char map_digits; + replace_digits(char map_digits = '@') : map_digits(map_digits) {} + + /// \return whether anything was replaced + bool replaced(char* i, char* end) const { + for (; i != end; ++i) + if (ascii_digit(*i)) { + *i = map_digits; + while (++i != end) + if (ascii_digit(*i)) *i = map_digits; + return true; + } + return false; + } + /// maybe: only if non-0 map_digits, do the thing + bool maybe_replaced(char* i, char* end) const { return map_digits && replaced(i, end); } + + void replace(char* i, char* end) const { + for (; i != end; ++i) + if (ascii_digit(*i)) *i = map_digits; + } + void maybe_replace(char* i, char* end) const { + if (map_digits) replace(i, end); + } + + void replace(std::string& str, std::string::size_type i = 0) const { + std::string::size_type n = str.size(); + char* d = (char *)str.data(); // although only C++11 officially allows this, in reality everyone does + replace(d + i, d + n); + } + void maybe_replace(std::string& str, std::string::size_type i = 0) const { + if (map_digits) replace(str, i); + } +}; + + +} + +#endif diff --git a/src/types.hpp b/src/types.hpp deleted file mode 100644 index 08b010f..0000000 --- a/src/types.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef TYPES_HPP -#define TYPES_HPP - -#include -#include -#include -#include -#include - -namespace biglm{ - -typedef double weight_type; -const weight_type IMPOSSIBLE = -HUGE_VAL; - -typedef unsigned long block_type; -const size_t bits_per_block = (std::numeric_limits::digits); - //typedef std::size_t size_type; -typedef boost::uint64_t size_type; -typedef unsigned char byte_type; - -template -struct bytes { - static const byte_type *data(const T& key) { return reinterpret_cast(&key); } - static size_type size(const T& key) { return sizeof(T); } -}; - -template<> -struct bytes { - static const byte_type *data(const std::string& key) { return reinterpret_cast(key.data()); } - static size_type size(const std::string& key) { return key.size(); } -}; - -template -struct bytes > { - static const byte_type *data(const std::vector& key) { return reinterpret_cast(&key[0]); } - static size_type size(const std::vector& key) { return key.size() * sizeof(U); } -}; - -} //namespace nplm - -#endif diff --git a/src/vocabulary.h b/src/vocabulary.h index fe08d86..c8cd518 100644 --- a/src/vocabulary.h +++ b/src/vocabulary.h @@ -5,6 +5,9 @@ #include #include #include +#include "find_string.hpp" + +#define NPLM_HAVE_FIND_STRING_PIECE 1 namespace nplm { @@ -17,7 +20,8 @@ struct compare_second class vocabulary { std::vector m_words; - boost::unordered_map m_index; + typedef boost::unordered_map WordId; + WordId m_index; int unk; public: @@ -37,21 +41,24 @@ class vocabulary { int lookup_word(const std::string &word) const { - boost::unordered_map::const_iterator pos = m_index.find(word); - if (pos != m_index.end()) - return pos->second; - else - return unk; + return lookup_word(word, unk); } // lookup word using custom unknown-word id - int lookup_word(const std::string &word, int unk) const + int lookup_word(const std::string &word, int unkid) const + { + WordId::const_iterator pos = m_index.find(word); + return pos == m_index.end() ? unkid : pos->second; + } + + int lookup_word(std::pair slice) const { + return lookup_word(slice, unk); + } + + int lookup_word(std::pair slice, int unkid) const { - boost::unordered_map::const_iterator pos = m_index.find(word); - if (pos != m_index.end()) - return pos->second; - else - return unk; + WordId::const_iterator pos = find_string(m_index, slice); + return pos == m_index.end() ? unkid : pos->second; } int insert_word(const std::string &word) @@ -89,7 +96,6 @@ class vocabulary { const std::vector &words() const { return m_words; } - const boost::unordered_map& get_idmap() const { return m_index; } }; } // namespace nplm -- cgit v1.2.3