Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/nplm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgraehl <graehl@gmail.com>2015-06-25 10:45:09 +0300
committergraehl <graehl@gmail.com>2015-06-25 11:31:16 +0300
commita3dd9155bab174643fe358c1041f881fd98474e8 (patch)
treef8781d8cb1191b03ef7f0bee0c98bd376ce08005
parent589a79fa094a1d36dfa110d2bbb58be078c7a05b (diff)
vocabulary no-copy find_string pair-char-ptrs
-rw-r--r--.gitignore1
-rw-r--r--src/find_string.hpp77
-rw-r--r--src/neuralLM.h36
-rw-r--r--src/neuralTM.h220
-rw-r--r--src/replace_digits.hpp62
-rw-r--r--src/types.hpp41
-rw-r--r--src/vocabulary.h32
7 files changed, 299 insertions, 170 deletions
diff --git a/.gitignore b/.gitignore
index 23c4020..12fab12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ src/prepareNeuralTM
src/testNeuralLM
src/testNeuralNetwork
src/trainNeuralNetwork
+.history
diff --git a/src/find_string.hpp b/src/find_string.hpp
new file mode 100644
index 0000000..fd612fd
--- /dev/null
+++ b/src/find_string.hpp
@@ -0,0 +1,77 @@
+/** \file \author Jonathan Graehl <graehl@gmail.com>
+
+ find_string(boost::unordered_map<std::string, ...>, pair<char const*, char
+ const*>): the pair is a [begin, end) key. Equivalent to map.find(std::string(key.first,
+ key.second)), but read-only, since unordered_map doesn't support lazy construction
+ of a string from a pair key.
+
+ To the extent possible under law, the author(s) have dedicated all copyright
+ and related and neighboring rights to this software to the public domain
+ worldwide. This software is distributed without any warranty.
+*/
+
+#ifndef FIND_STRING_GRAEHL_2015_06_24_HPP
+#define FIND_STRING_GRAEHL_2015_06_24_HPP
+#pragma once
+
+#include <utility>
+#include <algorithm>
+#include <cstddef>
+#include <boost/functional/hash.hpp>
+
+namespace std {
+/// we do not change standard semantics of any supported comparison e.g. pair vs
+/// pair, but simply allow string to be compared against pair of char pointers.
+inline bool operator==(std::string const& str, std::pair<char const*, char const*> slice) {
+ return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+inline bool operator==(std::pair<char const*, char const*> slice, std::string const& str) {
+ return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+inline bool operator==(std::string const& str, std::pair<char*, char*> slice) {
+ return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+inline bool operator==(std::pair<char*, char*> slice, std::string const& str) {
+ return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+/// technically not allowed, but the easiest route to ADL. we could rename these instead.
+inline std::size_t hash_value(std::pair<char const*, char const*> slice) {
+ return boost::hash_range(slice.first, slice.second);
+}
+inline std::size_t hash_value(std::pair<char*, char*> slice) {
+ return boost::hash_range(slice.first, slice.second);
+}
+inline std::size_t hash_value(std::string const& str) {
+ return boost::hash_range(str.begin(), str.end());
+}
+}
+
+struct slice_or_string_eq {
+ typedef bool result_type;
+ template <class A, class B>
+ bool operator()(A const& a, B const& b) const {
+ return a == b;
+ }
+};
+
+struct slice_or_string_hash {
+ typedef std::size_t result_type;
+ template <class Slice>
+ std::size_t operator()(Slice const& slice) const {
+ return hash_value(slice);
+ }
+};
+
+/// \return map.find(std::string(key.first, key.second)) but faster
+template <class UnorderedMap, class Slice>
+typename UnorderedMap::const_iterator find_string(UnorderedMap const& map, Slice const& key) {
+ return map.find(key, slice_or_string_hash(), slice_or_string_eq());
+}
+
+/// \return map.find(std::string(key.first, key.second)) but faster
+template <class UnorderedMap, class Slice>
+typename UnorderedMap::iterator find_string(UnorderedMap& map, Slice const& key) {
+ return map.find(key, slice_or_string_hash(), slice_or_string_eq());
+}
+
+#endif
diff --git a/src/neuralLM.h b/src/neuralLM.h
index c18485f..5febaeb 100644
--- a/src/neuralLM.h
+++ b/src/neuralLM.h
@@ -11,6 +11,7 @@
#include "util.h"
#include "vocabulary.h"
#include "neuralNetwork.h"
+#include "replace_digits.hpp"
/*
To do:
@@ -20,17 +21,16 @@
namespace nplm
{
-class neuralLM : public neuralNetwork
+class neuralLM : public neuralNetwork, graehl::replace_digits
{
- char map_digits;
boost::shared_ptr<vocabulary> vocab;
int start, null;
public:
neuralLM()
: neuralNetwork(),
- vocab(new vocabulary()),
- map_digits(0)
+ graehl::replace_digits(0),
+ vocab(new vocabulary())
{
}
@@ -45,25 +45,35 @@ class neuralLM : public neuralNetwork
const vocabulary &get_vocabulary() const { return *(this->vocab); }
+
int lookup_word(const std::string &word) const
{
if (map_digits)
- for (int i=0; i<word.length(); i++)
- if (isdigit(word[i]))
- {
+ for (int i=0, n=word.size(); i<n; ++i)
+ if (graehl::ascii_digit(word[i])) {
std::string mapped_word(word);
- for (; i<word.length(); i++)
- if (isdigit(word[i]))
- mapped_word[i] = map_digits;
+ replace(mapped_word, i);
return vocab->lookup_word(mapped_word);
}
return vocab->lookup_word(word);
}
+ int lookup_word(std::pair<char const*, char const*> slice) const
+ {
+ if (map_digits)
+ for (char const* i = slice.first; i != slice.second; ++i)
+ if (graehl::ascii_digit(*i)) {
+ std::string mapped_word(slice.first, slice.second);
+ replace(mapped_word, i - slice.first);
+ return vocab->lookup_word(mapped_word);
+ }
+ return vocab->lookup_word(slice);
+ }
+
double lookup_ngram(const int *ngram_a, int n)
{
Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
- for (int i=0; i<m->ngram_size; i++)
+ for (int i=0; i<m->ngram_size; ++i)
{
if (i-m->ngram_size+n < 0)
{
@@ -114,7 +124,7 @@ void addStartStop(std::vector<T> &input, std::vector<T> &output, int ngram_size,
{
output.clear();
output.resize(input.size()+ngram_size);
- for (int i=0; i<ngram_size-1; i++)
+ for (int i=0; i<ngram_size-1; ++i)
output[i] = start;
std::copy(input.begin(), input.end(), output.begin()+ngram_size-1);
output[output.size()-1] = stop;
@@ -168,7 +178,7 @@ inline void preprocessWords(const std::vector<std::string> &words,
else {
if (nums.size() != ngram_size)
{
- std::cerr << "error: wrong number of fields in line" << std::endl;
+ std::cerr << "error: wrong number of fields in line\n";
std::exit(1);
}
ngrams.push_back(nums);
diff --git a/src/neuralTM.h b/src/neuralTM.h
index 4ad6752..4c3db51 100644
--- a/src/neuralTM.h
+++ b/src/neuralTM.h
@@ -11,120 +11,134 @@
#include "util.h"
#include "vocabulary.h"
#include "neuralNetwork.h"
+#include "replace_digits.hpp"
namespace nplm
{
-class neuralTM : public neuralNetwork
+class neuralTM : public neuralNetwork, graehl::replace_digits
{
- char map_digits;
- boost::shared_ptr<vocabulary> input_vocab, output_vocab;
- int start, null;
+ boost::shared_ptr<vocabulary> input_vocab, output_vocab;
+ int start, null;
-public:
- neuralTM()
+ public:
+ neuralTM()
: neuralNetwork(),
- map_digits(0),
+ graehl::replace_digits(0),
input_vocab(new vocabulary()),
output_vocab(new vocabulary())
- {
- }
-
- void set_map_digits(char value) { map_digits = value; }
-
- void set_input_vocabulary(const vocabulary &vocab)
- {
- *(this->input_vocab) = vocab;
- start = vocab.lookup_word("<s>");
- null = vocab.lookup_word("<null>");
- }
-
- void set_output_vocabulary(const vocabulary &vocab)
- {
- *(this->output_vocab) = vocab;
- }
-
- const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); }
- const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); }
-
- int lookup_input_word(const std::string &word) const
- {
- if (map_digits)
- for (int i=0; i<word.length(); i++)
- if (isdigit(word[i]))
- {
- std::string mapped_word(word);
- for (; i<word.length(); i++)
- if (isdigit(word[i]))
- mapped_word[i] = map_digits;
- return input_vocab->lookup_word(mapped_word);
- }
- return input_vocab->lookup_word(word);
- }
-
- int lookup_output_word(const std::string &word) const
- {
- if (map_digits)
- for (int i=0; i<word.length(); i++)
- if (isdigit(word[i]))
- {
- std::string mapped_word(word);
- for (; i<word.length(); i++)
- if (isdigit(word[i]))
- mapped_word[i] = map_digits;
- return output_vocab->lookup_word(mapped_word);
- }
- return output_vocab->lookup_word(word);
- }
-
- double lookup_ngram(const int *ngram_a, int n)
- {
- Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
- for (int i=0; i<m->ngram_size; i++)
- {
- if (i-m->ngram_size+n < 0)
- {
- if (ngram_a[0] == start)
- ngram(i) = start;
- else
- ngram(i) = null;
- }
- else
- {
- ngram(i) = ngram_a[i-m->ngram_size+n];
- }
- }
- return neuralNetwork::lookup_ngram(ngram);
- }
-
- double lookup_ngram(const std::vector<int> &ngram_v)
- {
- return lookup_ngram(ngram_v.data(), ngram_v.size());
- }
-
- template <typename Derived>
- double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
- {
- return neuralNetwork::lookup_ngram(ngram);
- }
-
- template <typename DerivedA, typename DerivedB>
- void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
- {
- return neuralNetwork::lookup_ngram(ngram, log_probs_const);
- }
-
- void read(const std::string &filename)
+ {
+ }
+
+ void set_map_digits(char value) { map_digits = value; }
+
+ void set_input_vocabulary(const vocabulary &vocab)
+ {
+ *(this->input_vocab) = vocab;
+ start = vocab.lookup_word("<s>");
+ null = vocab.lookup_word("<null>");
+ }
+
+ void set_output_vocabulary(const vocabulary &vocab)
+ {
+ *(this->output_vocab) = vocab;
+ }
+
+ const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); }
+ const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); }
+
+ /// Look up \a word in \a vocab and return the result of vocab.lookup_word.
+ /// If map_digits is non-zero and the word contains an ASCII digit, the lookup
+ /// is done on a copy of the word in which every ASCII digit from the first
+ /// one onward has been replaced by map_digits.
+ int lookup_word(const std::string &word, vocabulary const& vocab) const
+ {
+   if (map_digits)
+     for (int i=0, n=word.size(); i<n; ++i)
+       if (graehl::ascii_digit(word[i])) {
+         std::string mapped_word(word);
+         replace(mapped_word, i);
+         // vocab is a reference, not a pointer: member access is '.', not '->'
+         return vocab.lookup_word(mapped_word);
+       }
+   return vocab.lookup_word(word);
+ }
+
+ /// Look up the character range [slice.first, slice.second) in \a vocab and
+ /// return the result of vocab.lookup_word. If map_digits is non-zero and the
+ /// range contains an ASCII digit, a std::string copy is made, every ASCII
+ /// digit from the first one onward is replaced by map_digits, and the copy
+ /// is looked up instead. Otherwise the slice is looked up directly.
+ int lookup_word(std::pair<char const*, char const*> slice, vocabulary const& vocab) const
+ {
+   if (map_digits)
+     for (char const* i = slice.first; i != slice.second; ++i)
+       if (graehl::ascii_digit(*i)) {
+         std::string mapped_word(slice.first, slice.second);
+         replace(mapped_word, i - slice.first);
+         // vocab is a reference, not a pointer: member access is '.', not '->'
+         return vocab.lookup_word(mapped_word);
+       }
+   return vocab.lookup_word(slice);
+ }
+
+ int lookup_input_word(const std::string &word) const
+ {
+ return lookup_word(word, *input_vocab);
+ }
+
+ int lookup_output_word(const std::string &word) const
+ {
+ return lookup_word(word, *output_vocab);
+ }
+
+ int lookup_input_word(std::pair<char const*, char const*> word) const
+ {
+ return lookup_word(word, *input_vocab);
+ }
+
+ int lookup_output_word(std::pair<char const*, char const*> word) const
+ {
+ return lookup_word(word, *output_vocab);
+ }
+
+ double lookup_ngram(const int *ngram_a, int n)
+ {
+ Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
+ for (int i=0; i<m->ngram_size; i++)
{
- std::vector<std::string> input_words;
- std::vector<std::string> output_words;
- m->read(filename, input_words, output_words);
- set_input_vocabulary(vocabulary(input_words));
- set_output_vocabulary(vocabulary(output_words));
- resize();
- // this is faster but takes more memory
- //m->premultiply();
+ if (i-m->ngram_size+n < 0)
+ {
+ if (ngram_a[0] == start)
+ ngram(i) = start;
+ else
+ ngram(i) = null;
+ }
+ else
+ {
+ ngram(i) = ngram_a[i-m->ngram_size+n];
+ }
}
+ return neuralNetwork::lookup_ngram(ngram);
+ }
+
+ double lookup_ngram(const std::vector<int> &ngram_v)
+ {
+ return lookup_ngram(ngram_v.data(), ngram_v.size());
+ }
+
+ template <typename Derived>
+ double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+ {
+ return neuralNetwork::lookup_ngram(ngram);
+ }
+
+ template <typename DerivedA, typename DerivedB>
+ void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+ {
+ return neuralNetwork::lookup_ngram(ngram, log_probs_const);
+ }
+
+ void read(const std::string &filename)
+ {
+ std::vector<std::string> input_words;
+ std::vector<std::string> output_words;
+ m->read(filename, input_words, output_words);
+ set_input_vocabulary(vocabulary(input_words));
+ set_output_vocabulary(vocabulary(output_words));
+ resize();
+ // this is faster but takes more memory
+ //m->premultiply();
+ }
};
diff --git a/src/replace_digits.hpp b/src/replace_digits.hpp
new file mode 100644
index 0000000..e8ac957
--- /dev/null
+++ b/src/replace_digits.hpp
@@ -0,0 +1,62 @@
+/** \file \author Jonathan Graehl <graehl@gmail.com>
+
+ replace 0-9 ascii chars with another ascii replacement
+
+ To the extent possible under law, the author(s) have dedicated all copyright
+ and related and neighboring rights to this software to the public domain
+ worldwide. This software is distributed without any warranty.
+*/
+
+#ifndef REPLACEDIGITS_GRAEHL_2015_06_25_H
+#define REPLACEDIGITS_GRAEHL_2015_06_25_H
+#pragma once
+
+#include <string>
+#include <utility>
+
+namespace graehl {
+
+inline bool ascii_digit(char c) {
+ return c >= '0' && c <= '9';
+}
+
+struct replace_digits {
+ char map_digits;
+ replace_digits(char map_digits = '@') : map_digits(map_digits) {}
+
+ /// \return whether anything was replaced
+ bool replaced(char* i, char* end) const {
+ for (; i != end; ++i)
+ if (ascii_digit(*i)) {
+ *i = map_digits;
+ while (++i != end)
+ if (ascii_digit(*i)) *i = map_digits;
+ return true;
+ }
+ return false;
+ }
+ /// maybe: only if non-0 map_digits, do the thing
+ bool maybe_replaced(char* i, char* end) const { return map_digits && replaced(i, end); }
+
+ void replace(char* i, char* end) const {
+ for (; i != end; ++i)
+ if (ascii_digit(*i)) *i = map_digits;
+ }
+ void maybe_replace(char* i, char* end) const {
+ if (map_digits) replace(i, end);
+ }
+
+ void replace(std::string& str, std::string::size_type i = 0) const {
+ std::string::size_type n = str.size();
+ char* d = (char *)str.data(); // although only C++11 officially allows this, in reality everyone does
+ replace(d + i, d + n);
+ }
+ void maybe_replace(std::string& str, std::string::size_type i = 0) const {
+ if (map_digits) replace(str, i);
+ }
+};
+
+
+}
+
+#endif
diff --git a/src/types.hpp b/src/types.hpp
deleted file mode 100644
index 08b010f..0000000
--- a/src/types.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef TYPES_HPP
-#define TYPES_HPP
-
-#include <cmath>
-#include <string>
-#include <vector>
-#include <boost/cstdint.hpp>
-#include <limits>
-
-namespace biglm{
-
-typedef double weight_type;
-const weight_type IMPOSSIBLE = -HUGE_VAL;
-
-typedef unsigned long block_type;
-const size_t bits_per_block = (std::numeric_limits<block_type>::digits);
- //typedef std::size_t size_type;
-typedef boost::uint64_t size_type;
-typedef unsigned char byte_type;
-
-template<typename T>
-struct bytes {
- static const byte_type *data(const T& key) { return reinterpret_cast<const byte_type *>(&key); }
- static size_type size(const T& key) { return sizeof(T); }
-};
-
-template<>
-struct bytes<std::string> {
- static const byte_type *data(const std::string& key) { return reinterpret_cast<const byte_type *>(key.data()); }
- static size_type size(const std::string& key) { return key.size(); }
-};
-
-template<typename U>
-struct bytes<std::vector<U> > {
- static const byte_type *data(const std::vector<U>& key) { return reinterpret_cast<const byte_type *>(&key[0]); }
- static size_type size(const std::vector<U>& key) { return key.size() * sizeof(U); }
-};
-
-} //namespace nplm
-
-#endif
diff --git a/src/vocabulary.h b/src/vocabulary.h
index fe08d86..c8cd518 100644
--- a/src/vocabulary.h
+++ b/src/vocabulary.h
@@ -5,6 +5,9 @@
#include <string>
#include <queue>
#include <boost/unordered_map.hpp>
+#include "find_string.hpp"
+
+#define NPLM_HAVE_FIND_STRING_PIECE 1
namespace nplm
{
@@ -17,7 +20,8 @@ struct compare_second
class vocabulary {
std::vector<std::string> m_words;
- boost::unordered_map<std::string, int> m_index;
+ typedef boost::unordered_map<std::string, int> WordId;
+ WordId m_index;
int unk;
public:
@@ -37,21 +41,24 @@ class vocabulary {
int lookup_word(const std::string &word) const
{
- boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word);
- if (pos != m_index.end())
- return pos->second;
- else
- return unk;
+ return lookup_word(word, unk);
}
// lookup word using custom unknown-word id
- int lookup_word(const std::string &word, int unk) const
+ int lookup_word(const std::string &word, int unkid) const
+ {
+ WordId::const_iterator pos = m_index.find(word);
+ return pos == m_index.end() ? unkid : pos->second;
+ }
+
+ int lookup_word(std::pair<char const*, char const*> slice) const {
+ return lookup_word(slice, unk);
+ }
+
+ int lookup_word(std::pair<char const*, char const*> slice, int unkid) const
{
- boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word);
- if (pos != m_index.end())
- return pos->second;
- else
- return unk;
+ WordId::const_iterator pos = find_string(m_index, slice);
+ return pos == m_index.end() ? unkid : pos->second;
}
int insert_word(const std::string &word)
@@ -89,7 +96,6 @@ class vocabulary {
const std::vector<std::string> &words() const { return m_words; }
- const boost::unordered_map<std::string, int>& get_idmap() const { return m_index; }
};
} // namespace nplm