diff options
author | Kenneth Heafield <github@kheafield.com> | 2013-10-29 22:00:37 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2013-10-29 22:00:37 +0400 |
commit | 78eecfdd7ef4cc0aef575c828c6fef747c63da19 (patch) | |
tree | cbd1e84c871306a35e1352286f7749ccac4f60bc /src/multinomial.h | |
parent | e4138ba17732e70bfe9ad8e806173c083a9ddd0e (diff) |
Copy nplm-0.1 after removing some executable bits
Diffstat (limited to 'src/multinomial.h')
-rw-r--r-- | src/multinomial.h | 135 |
1 files changed, 135 insertions, 0 deletions
diff --git a/src/multinomial.h b/src/multinomial.h new file mode 100644 index 0000000..1314fcb --- /dev/null +++ b/src/multinomial.h @@ -0,0 +1,135 @@ +#ifndef MULTINOMIAL_H +#define MULTINOMIAL_H + +#include <vector> +#include <set> +#include <cassert> +#include <cmath> + +#include <boost/random/uniform_int_distribution.hpp> +#include <boost/random/uniform_real_distribution.hpp> + +namespace nplm +{ + +template <typename Count> +class multinomial { + std::vector<int> J; + std::vector<double> q; + boost::random::uniform_int_distribution<Count> unif_int; + boost::random::uniform_real_distribution<> unif_real; + std::vector<double> m_prob, m_logprob; + +public: + multinomial() : unif_real(0.0, 1.0) { } + multinomial(const std::vector<Count> &counts) : unif_real(0.0, 1.0) { estimate(counts); } + + void estimate(const std::vector<Count>& counts) + { + int k = counts.size(); + Count n = 0; + m_prob.clear(); + m_prob.resize(k, 0.0); + m_logprob.clear(); + m_logprob.resize(k, 0.0); + for (int i=0; i<k; i++) + n += counts[i]; + for (int i=0; i<k; i++) + { + m_prob[i] = static_cast<double>(counts[i]) / n; + m_logprob[i] = std::log(m_prob[i]); + } + setup(m_prob); + } + + double prob(int i) const { return m_prob[i]; } + double logprob(int i) const { return m_logprob[i]; } + + template <typename Engine> + int sample(Engine &eng) const + { + int m = unif_int(eng); + double p = unif_real(eng); + int s; + if (q[m] > p) + s = m; + else + s = J[m]; + assert (s >= 0); + return s; + } + +private: + void setup(const std::vector<double>& probs) + { + int k = probs.size(); + + unif_int = boost::random::uniform_int_distribution<Count>(0, k-1); + J.resize(k, -1); + q.resize(k, 0); + + // "small" outcomes (prob < 1/k) + std::set<int> S; + std::set<int>::iterator s_it; + // "large" outcomes (prob >= 1/k) + std::set<int> L; + std::set<int>::iterator l_it; + const double tol = 1e-3; + + for (int i=0; i<k; i++) + { + q[i] = k*probs[i]; + if (q[i] < 1.0) + { + S.insert(i); + } + else + { + L.insert(i); + } + } + + while (S.size() > 0 && L.size() > 0) + { + // choose an arbitrary element s from S and l from L + s_it = S.begin(); + int s = *s_it; + l_it = L.begin(); + int l = *l_it; + + // pair up s and (part of) l as its alias + J[s] = l; + S.erase(s_it); + //q[l] = q[l] - (1.0 - q[s]); + q[l] = q[l] + q[s] - 1.0; // more stable? + + // move l from L to S if necessary + if (q[l] < 1.0) + { + S.insert(l); + L.erase(l_it); + } + } + + // any remaining elements must have q/n close to 1, so we leave them alone + for (s_it = S.begin(); s_it != S.end(); ++s_it) { + //assert (fabs(q[*s_it] - 1) < tol); + if (std::fabs(q[*s_it] - 1) > tol) + { + std::cerr << "warning: multinomial: probability differs from one by " << std::fabs(q[*s_it]-1) << std::endl; + } + q[*s_it] = 1.0; + } + for (l_it = L.begin(); l_it != L.end(); ++l_it) { + if (std::fabs(q[*l_it] - 1) > tol) + { + std::cerr << "warning: multinomial: probability differs from one by " << std::fabs(q[*l_it]-1) << std::endl; + } + } + } + +}; + +} // namespace nplm + +#endif |