#pragma once #include #include #include #include #include #include #include #include #include #include #include #ifdef USE_CHRONO #include #endif #include #include "maybe_omp.h" #if NPLM_DOUBLE_PRECISION == 1 typedef double user_data_t; #else typedef float user_data_t; #endif // Make matrices hashable namespace Eigen { template size_t hash_value(const DenseBase &m) { size_t h=0; for (int i=0; i &items); void readWordsFile(std::ifstream &TRAININ, std::vector &word_list); void readWordsFile(const std::string &file, std::vector &word_list); void writeWordsFile(const std::vector &words, std::ofstream &file); void writeWordsFile(const std::vector &words, const std::string &filename); void readDataFile(const std::string &filename, int &ngram_size, std::vector &data, int minibatch_size=0); void readUnigramProbs(const std::string &unigram_probs_file, std::vector &unigram_probs); void readWeightsFile(std::ifstream &TRAININ, std::vector &weights); //template readSentFile(const std::string &file, T &sentences); template void readSentFile(const std::string &file, T &sentences) { std::cerr << "Reading sentences from: " << file << std::endl; std::ifstream TRAININ; TRAININ.open(file.c_str()); if (! TRAININ) { std::cerr << "Error: can't read from file " << file<< std::endl; exit(-1); } std::string line; while (getline(TRAININ, line)) { std::vector words; splitBySpace(line, words); sentences.push_back(words); } TRAININ.close(); } inline void intgerize(std::vector &ngram,std::vector &int_ngram){ int ngram_size = ngram.size(); for (int i=0;i(ngram[i])); } // Functions that take non-const matrices as arguments // are supposed to declare them const and then use this // to cast away constness. #define UNCONST(t,c,uc) Eigen::MatrixBase &uc = const_cast&>(c); template void initMatrix(boost::random::mt19937 &engine, const Eigen::MatrixBase &p_const, bool init_normal, user_data_t range) { UNCONST(Derived, p_const, p); if (init_normal == 0) // initialize with uniform distribution in [-range, range] { boost::random::uniform_real_distribution<> unif_real(-range, range); for (int i = 0; i < p.rows(); i++) { for (int j = 0; j< p.cols(); j++) { p(i,j) = unif_real(engine); } } } else // initialize with gaussian distribution with mean 0 and stdev range { boost::random::normal_distribution unif_normal(0., range); for (int i = 0; i < p.rows(); i++) { for (int j = 0; j < p.cols(); j++) { p(i,j) = unif_normal(engine); } } } } template void initBias(boost::random::mt19937 &engine, const Eigen::MatrixBase &p_const, bool init_normal, user_data_t range) { UNCONST(Derived, p_const, p); if (init_normal == 0) // initialize with uniform distribution in [-range, range] { boost::random::uniform_real_distribution<> unif_real(-range, range); for (int i = 0; i < p.size(); i++) { p(i) = unif_real(engine); } } else // initialize with gaussian distribution with mean 0 and stdev range { boost::random::normal_distribution unif_normal(0., range); for (int i = 0; i < p.size(); i++) { p(i) = unif_normal(engine); } } } template void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase ¶m_const) { UNCONST(Derived, param_const, param); int i = 0; std::string line; std::vector fields; while (std::getline(TRAININ, line) && line != "") { splitBySpace(line, fields); if (fields.size() != param.cols()) { std::ostringstream err; err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")"; throw std::runtime_error(err.str()); } if (i >= param.rows()) { std::ostringstream err; err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")"; throw std::runtime_error(err.str()); } for (int j=0; j(fields[j]); } i++; } if (i != param.rows()) { std::ostringstream err; err << "error: wrong number of rows (expected " << param.rows() << ", found more)"; throw std::runtime_error(err.str()); } } template void readMatrix(const std::string ¶m_file, const Eigen::MatrixBase ¶m_const) { UNCONST(Derived, param_const, param); std::cerr << "Reading data from file: " << param_file << std::endl; std::ifstream TRAININ(param_file.c_str()); if (!TRAININ) { std::cerr << "Error: can't read training data from file " << param_file << std::endl; exit(-1); } readMatrix(TRAININ, param); TRAININ.close(); } template void writeMatrix(const Eigen::MatrixBase ¶m, const std::string &filename) { std::cerr << "Writing parameters to " << filename << std::endl; std::ofstream OUT; OUT.precision(16); OUT.open(filename.c_str()); if (! OUT) { std::cerr << "Error: can't write to file " << filename<< std::endl; exit(-1); } writeMatrix(param, OUT); OUT.close(); } template void writeMatrix(const Eigen::MatrixBase ¶m, std::ofstream &OUT) { for (int row = 0;row < param.rows();row++) { int col; for (col = 0;col < param.cols()-1;col++) { OUT< user_data_t logsum(const Eigen::MatrixBase &v) { int mi; user_data_t m = v.maxCoeff(&mi); user_data_t logz = 0.0; for (int i=0; i m_start; std::vector m_total; public: Timer() { } Timer(int n) { resize(n); } void resize(int n) { m_start.resize(n); m_total.resize(n); } int size() const { return m_start.size(); } void start(int i); void stop(int i); void reset(int i); double get(int i) const; }; extern Timer timer; #define start_timer(x) timer.start(x) #define stop_timer(x) timer.stop(x) #else #define start_timer(x) (void)0 #define stop_timer(x) (void)0 #endif // replace input word with default value () with probability of 1-input_dropout struct bernoulli_replace { mutable boost::random::mt19937 engine; boost::random::bernoulli_distribution bernoulli_dist; int default_value; bernoulli_replace(boost::random::mt19937 &rng, double input_dropout, int null_index) : engine(rng), bernoulli_dist(input_dropout), default_value(null_index) {} int operator() (int x) const { if (bernoulli_dist(engine)) return x; else return default_value; } }; int setup_threads(int n_threads); } // namespace nplm