github.com/moses-smt/nplm.git
author     Rico Sennrich <rico.sennrich@gmx.ch>   2015-07-17 21:39:42 +0300
committer  Rico Sennrich <rico.sennrich@gmx.ch>   2015-07-17 21:39:42 +0300
commit     a7da1b618082964152054b00c142e5962e4ca692 (patch)
tree       45872fc848d3729e8632af0ffdc431726e39e7a2
parent     28bdadf328c63ee086e8aa5de23cfe0c11728c5b (diff)
parent     c461c4ad7232274dab8405b736bb1ac55cc7874d (diff)
Merge pull request #5 from graehl/master
c++11
-rw-r--r--  .gitignore                     2
-rw-r--r--  src/Activation_function.h    129
-rw-r--r--  src/Makefile                   4
-rw-r--r--  src/SoftmaxLoss.h            159
-rw-r--r--  src/USCMatrix.h              227
-rw-r--r--  src/find_string.hpp           89
-rw-r--r--  src/graphClasses.h            89
-rw-r--r--  src/model.cpp                482
-rw-r--r--  src/neuralClasses.h         1794
-rw-r--r--  src/neuralLM.h               213
-rw-r--r--  src/neuralNetwork.h          319
-rw-r--r--  src/neuralTM.h               222
-rw-r--r--  src/prepareNeuralLM.cpp     1057
-rw-r--r--  src/propagator.h             641
-rw-r--r--  src/replace_digits.hpp        62
-rw-r--r--  src/testNeuralLM.cpp         279
-rw-r--r--  src/trainNeuralNetwork.cpp   229
-rw-r--r--  src/types.hpp                 41
-rw-r--r--  src/util.h                   281
-rw-r--r--  src/vocabulary.h             130
20 files changed, 3304 insertions(+), 3145 deletions(-)
diff --git a/.gitignore b/.gitignore
index 23c4020..2843613 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,5 @@ src/prepareNeuralTM
src/testNeuralLM
src/testNeuralNetwork
src/trainNeuralNetwork
+.history
+src/make.sh
diff --git a/src/Activation_function.h b/src/Activation_function.h
index 66342bb..742c2fc 100644
--- a/src/Activation_function.h
+++ b/src/Activation_function.h
@@ -3,7 +3,6 @@
#include <cmath>
#include <string>
-//#include <../3rdparty/Eigen/Dense>
#include <Eigen/Dense>
#include "util.h"
@@ -19,28 +18,28 @@ enum activation_function_type { Tanh, HardTanh, Rectifier, Identity, InvalidFunc
inline activation_function_type string_to_activation_function (const std::string &s)
{
- if (s == "identity")
- return Identity;
- else if (s == "rectifier")
- return Rectifier;
- else if (s == "tanh")
- return Tanh;
- else if (s == "hardtanh")
- return HardTanh;
- else
- return InvalidFunction;
+ if (s == "identity")
+ return Identity;
+ else if (s == "rectifier")
+ return Rectifier;
+ else if (s == "tanh")
+ return Tanh;
+ else if (s == "hardtanh")
+ return HardTanh;
+ else
+ return InvalidFunction;
}
inline std::string activation_function_to_string (activation_function_type f)
{
- if (f == Identity)
- return "identity";
- else if (f == Rectifier)
- return "rectifier";
- else if (f == Tanh)
- return "tanh";
- else if (f == HardTanh)
- return "hardtanh";
+ if (f == Identity)
+ return "identity";
+ else if (f == Rectifier)
+ return "rectifier";
+ else if (f == Tanh)
+ return "tanh";
+ else if (f == HardTanh)
+ return "hardtanh";
}
struct hardtanh_functor {
@@ -69,51 +68,53 @@ struct drectifier_functor {
class Activation_function
{
- int size;
- activation_function_type f;
-
- public:
- Activation_function() : size(0), f(Rectifier) { }
-
- void resize(int size) { this->size = size; }
- void set_activation_function(activation_function_type f) { this->f = f; }
-
- template <typename Engine>
- void initialize(Engine &engine, bool init_normal, double init_range) { }
-
- int n_inputs () const { return size; }
- int n_outputs () const { return size; }
-
- template <typename DerivedIn, typename DerivedOut>
- void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const
- {
- UNCONST(DerivedOut, output, my_output);
-
- switch (f)
- {
- case Identity: my_output = input; break;
- case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break;
- case Tanh: my_output = input.unaryExpr(tanh_functor()); break;
- case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break;
- }
- }
-
- template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut>
- void bProp(const MatrixBase<DerivedGOut> &input,
- MatrixBase<DerivedGIn> &output,
- const MatrixBase<DerivedIn> &finput,
- const MatrixBase<DerivedOut> &foutput) const
- {
- UNCONST(DerivedGIn, output, my_output);
-
- switch (f)
- {
- case Identity: my_output = input; break;
- case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break;
- case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break;
- case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break;
- }
- }
+ int size;
+ activation_function_type f;
+
+ public:
+ Activation_function() : size(0), f(Rectifier) { }
+
+ void resize(int size) { this->size = size; }
+ void set_activation_function(activation_function_type f) { this->f = f; }
+
+ template <typename Engine>
+ void initialize(Engine &engine, bool init_normal, double init_range) { }
+
+ int n_inputs () const { return size; }
+ int n_outputs () const { return size; }
+
+ template <typename DerivedIn, typename DerivedOut>
+ void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const
+ {
+ UNCONST(DerivedOut, output, my_output);
+
+ switch (f)
+ {
+ case Identity: my_output = input; break;
+ case Rectifier: my_output = input.unaryExpr(rectifier_functor()); break;
+ case Tanh: my_output = input.unaryExpr(tanh_functor()); break;
+ case HardTanh: my_output = input.unaryExpr(hardtanh_functor()); break;
+ case InvalidFunction: std::abort();
+ }
+ }
+
+ template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut>
+ void bProp(const MatrixBase<DerivedGOut> &input,
+ MatrixBase<DerivedGIn> &output,
+ const MatrixBase<DerivedIn> &finput,
+ const MatrixBase<DerivedOut> &foutput) const
+ {
+ UNCONST(DerivedGIn, output, my_output);
+
+ switch (f)
+ {
+ case Identity: my_output = input; break;
+ case Rectifier: my_output = finput.array().unaryExpr(drectifier_functor()) * input.array(); break;
+ case Tanh: my_output = foutput.array().unaryExpr(tanh_functor()) * input.array(); break;
+ case HardTanh: my_output = finput.array().unaryExpr(hardtanh_functor()) * input.array(); break;
+ case InvalidFunction: std::abort();
+ }
+ }
};
} // namespace nplm
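
A minimal usage sketch of the Activation_function interface reindented above (not part of the diff; the main() wrapper, matrix sizes, and include path are illustrative assumptions):

#include <Eigen/Dense>
#include "Activation_function.h"   // assumes src/ is on the include path

int main() {
  nplm::Activation_function act;
  act.resize(4);                    // layer width
  act.set_activation_function(nplm::string_to_activation_function("rectifier"));

  Eigen::MatrixXd in  = Eigen::MatrixXd::Random(4, 2);   // 4 units, 2 examples
  Eigen::MatrixXd out(4, 2);
  act.fProp(in, out);               // presumably elementwise max(0, x) for "rectifier"
  // With this patch, an InvalidFunction value makes fProp/bProp call std::abort()
  // instead of silently leaving the output unset.
  return 0;
}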
diff --git a/src/Makefile b/src/Makefile
index 1611ccb..2a27405 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,12 +1,12 @@
### Compilation options.
# C++ compiler. Tested with g++ and Intel icpc.
-CXX=/usr/bin/g++
+CXX=g++
#CXX=icpc
# Compiler options. Note that -DEIGEN_NO_DEBUG is essential for good performance!
#CFLAGS=-g
-CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG
+CFLAGS=-O3 -DEIGEN_NO_DEBUG -DNDEBUG $(CXXFLAGS)
# Architecture. Set to x86_64 or i686 to override.
ARCH:=$(shell uname -m)
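
Appending $(CXXFLAGS) to CFLAGS lets extra compiler flags be supplied at build time without editing the Makefile; presumably this is how the C++11 mode in the merge title is meant to be enabled, e.g. by running make -C src CXXFLAGS=-std=c++11 (an assumed invocation, not taken from the patch), which adds the flag on top of the existing -O3 -DEIGEN_NO_DEBUG -DNDEBUG options.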
diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h
index bc55762..d89cde6 100644
--- a/src/SoftmaxLoss.h
+++ b/src/SoftmaxLoss.h
@@ -1,7 +1,6 @@
- #ifndef SOFTMAXLOSS_H
+#ifndef SOFTMAXLOSS_H
#define SOFTMAXLOSS_H
-//#include <../3rdparty/Eigen/Dense>
#include <Eigen/Dense>
#include "multinomial.h"
#include "util.h"
@@ -20,55 +19,55 @@ enum loss_function_type { LogLoss, NCELoss, InvalidLoss };
inline loss_function_type string_to_loss_function (const std::string &s)
{
- if (s == "log")
- return LogLoss;
- else if (s == "nce")
- return NCELoss;
- else
- return InvalidLoss;
+ if (s == "log")
+ return LogLoss;
+ else if (s == "nce")
+ return NCELoss;
+ else
+ return InvalidLoss;
}
inline std::string loss_function_to_string (loss_function_type f)
{
- if (f == LogLoss)
- return "log";
- else if (f == NCELoss)
- return "nce";
+ if (f == LogLoss)
+ return "log";
+ else if (f == NCELoss)
+ return "nce";
}
/// Note: Outputs log-probabilities.
struct SoftmaxLogLoss
{
- template <typename DerivedI, typename DerivedW, typename DerivedO>
- void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss)
+ template <typename DerivedI, typename DerivedW, typename DerivedO>
+ void fProp(const MatrixBase<DerivedI> &input, const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output_const, double &loss)
+ {
+ UNCONST(DerivedO, output_const, output);
+
+ double log_likelihood = 0.0;
+
+#pragma omp parallel for reduction(+:log_likelihood)
+ for (int train_id = 0; train_id < input.cols(); train_id++)
{
- UNCONST(DerivedO, output_const, output);
-
- double log_likelihood = 0.0;
-
- #pragma omp parallel for reduction(+:log_likelihood)
- for (int train_id = 0; train_id < input.cols(); train_id++)
- {
- double normalization = logsum(input.col(train_id));
- output.col(train_id).array() = input.col(train_id).array() - normalization;
- log_likelihood += output(output_words(train_id), train_id);
- }
- loss = log_likelihood;
+ double normalization = logsum(input.col(train_id));
+ output.col(train_id).array() = input.col(train_id).array() - normalization;
+ log_likelihood += output(output_words(train_id), train_id);
}
-
- template <typename DerivedW, typename DerivedO, typename DerivedI>
- void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const)
+ loss = log_likelihood;
+ }
+
+ template <typename DerivedW, typename DerivedO, typename DerivedI>
+ void bProp(const MatrixBase<DerivedW> &output_words, const MatrixBase<DerivedO> &output, const MatrixBase<DerivedI> &grad_input_const)
+ {
+ UNCONST(DerivedI, grad_input_const, grad_input);
+ grad_input.setZero();
+#pragma omp parallel for
+ for (int train_id = 0; train_id < output.cols(); train_id++)
{
- UNCONST(DerivedI, grad_input_const, grad_input);
- grad_input.setZero();
- #pragma omp parallel for
- for (int train_id = 0; train_id < output.cols(); train_id++)
- {
- grad_input(output_words(train_id), train_id) += 1.;
- grad_input.col(train_id) -= output.col(train_id).array().exp().matrix();
- }
+ grad_input(output_words(train_id), train_id) += 1.;
+ grad_input.col(train_id) -= output.col(train_id).array().exp().matrix();
}
+ }
};
///// Softmax layer plus NCE loss function.
@@ -81,55 +80,55 @@ struct SoftmaxLogLoss
template <typename Multinomial>
class SoftmaxNCELoss
{
- const Multinomial &unigram;
+ const Multinomial &unigram;
-public:
- SoftmaxNCELoss(const Multinomial &unigram)
+ public:
+ SoftmaxNCELoss(const Multinomial &unigram)
: unigram(unigram)
+ {
+ }
+
+ template <typename DerivedI, typename DerivedW, typename DerivedO>
+ void fProp(const MatrixBase<DerivedI> &scores,
+ const MatrixBase<DerivedW> &minibatch_samples,
+ const MatrixBase<DerivedO> &output_const, double &loss)
+ {
+ UNCONST(DerivedO, output_const, output);
+ double log_likelihood = 0.0;
+ int num_noise_samples = minibatch_samples.rows()-1;
+ double log_num_noise_samples = std::log(num_noise_samples);
+#pragma omp parallel for reduction(+:log_likelihood) schedule(static)
+ for (int train_id = 0; train_id < scores.cols(); train_id++)
{
+ for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++)
+ {
+ int sample = minibatch_samples(sample_id, train_id);
+ // To avoid zero or infinite probabilities,
+ // never take exp of score without normalizing first,
+ // even if it's a little slower...
+ double score = scores(sample_id, train_id);
+ double score_noise = log_num_noise_samples + unigram.logprob(sample);
+ double z = logadd(score, score_noise);
+ double logprob = score - z;
+ double logprob_noise = score_noise - z;
+ output(sample_id, train_id) = std::exp(logprob);
+ log_likelihood += sample_id == 0 ? logprob : logprob_noise;
+ }
}
-
- template <typename DerivedI, typename DerivedW, typename DerivedO>
- void fProp(const MatrixBase<DerivedI> &scores,
- const MatrixBase<DerivedW> &minibatch_samples,
- const MatrixBase<DerivedO> &output_const, double &loss)
- {
- UNCONST(DerivedO, output_const, output);
- double log_likelihood = 0.0;
- int num_noise_samples = minibatch_samples.rows()-1;
- double log_num_noise_samples = std::log(num_noise_samples);
- #pragma omp parallel for reduction(+:log_likelihood) schedule(static)
- for (int train_id = 0; train_id < scores.cols(); train_id++)
- {
- for (int sample_id = 0;sample_id < minibatch_samples.rows(); sample_id++)
- {
- int sample = minibatch_samples(sample_id, train_id);
- // To avoid zero or infinite probabilities,
- // never take exp of score without normalizing first,
- // even if it's a little slower...
- double score = scores(sample_id, train_id);
- double score_noise = log_num_noise_samples + unigram.logprob(sample);
- double z = logadd(score, score_noise);
- double logprob = score - z;
- double logprob_noise = score_noise - z;
- output(sample_id, train_id) = std::exp(logprob);
- log_likelihood += sample_id == 0 ? logprob : logprob_noise;
- }
- }
- loss = log_likelihood;
- }
-
- template <typename DerivedO, typename DerivedI>
- void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const)
+ loss = log_likelihood;
+ }
+
+ template <typename DerivedO, typename DerivedI>
+ void bProp(const MatrixBase<DerivedO> &probs, const MatrixBase<DerivedI> &output_const)
+ {
+ UNCONST(DerivedI, output_const, output);
+#pragma omp parallel for schedule(static)
+ for (int train_id = 0; train_id < probs.cols(); train_id++)
{
- UNCONST(DerivedI, output_const, output);
- #pragma omp parallel for schedule(static)
- for (int train_id = 0; train_id < probs.cols(); train_id++)
- {
- output.col(train_id) = -probs.col(train_id);
- output(0, train_id) += 1.0;
- }
+ output.col(train_id) = -probs.col(train_id);
+ output(0, train_id) += 1.0;
}
+ }
};
} // namespace nplm
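
For orientation (an editorial gloss, not part of the diff), the math the reindented loss code implements: SoftmaxLogLoss::fProp writes the log-softmax, output.col(t) = input.col(t) - logsum(input.col(t)), and sums the log-probability of each target word, while its bProp yields the familiar indicator-minus-softmax gradient. SoftmaxNCELoss::fProp evaluates the standard noise-contrastive estimation posterior in log space: with k = num_noise_samples, noise distribution q, and score s = scores(sample_id, train_id),

    P(data | sample) = exp(s) / (exp(s) + k * q(sample))

is computed as z = logadd(s, log k + log q(sample)), logprob = s - z, logprob_noise = (log k + log q(sample)) - z, so exp() is only ever applied to a normalized quantity, exactly as the "never take exp of score without normalizing first" comment warns. Row 0 of minibatch_samples evidently holds the true word: the likelihood adds logprob at sample_id == 0 and logprob_noise for the noise rows, and bProp sets the gradient to minus the probabilities and adds 1.0 only at row 0.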
diff --git a/src/USCMatrix.h b/src/USCMatrix.h
index 02aeb33..784fa1b 100644
--- a/src/USCMatrix.h
+++ b/src/USCMatrix.h
@@ -1,7 +1,6 @@
#ifndef USCMATRIX_H
#define USCMATRIX_H
-//#include <../3rdparty/Eigen/Dense>
#include <Eigen/Dense>
#include "maybe_omp.h"
#include "util.h"
@@ -34,108 +33,108 @@ template <typename Scalar, typename Index=int> // should be EIGEN_DEFAULT_DENSE_
class USCMatrix
{
-public:
- Matrix<Index,Dynamic,Dynamic> indexes;
- Matrix<Scalar,Dynamic,Dynamic> values;
- int m_rows;
+ public:
+ Matrix<Index,Dynamic,Dynamic> indexes;
+ Matrix<Scalar,Dynamic,Dynamic> values;
+ int m_rows;
- USCMatrix() : m_rows(0) { }
+ USCMatrix() : m_rows(0) { }
- template <typename Indexes, typename Values>
- USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values)
- :
- indexes(indexes),
- values(values),
- m_rows(rows)
- { }
+ template <typename Indexes, typename Values>
+ USCMatrix(Index rows, const MatrixBase<Indexes> &indexes, const MatrixBase<Values> &values)
+ :
+ indexes(indexes),
+ values(values),
+ m_rows(rows)
+ { }
- USCMatrix(Index rows, Index nnz, Index cols)
- :
- indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)),
+ USCMatrix(Index rows, Index nnz, Index cols)
+ :
+ indexes(Matrix<Index,Dynamic,Dynamic>(nnz, cols)),
values(Matrix<Scalar,Dynamic,Dynamic>(nnz, cols)),
m_rows(rows)
- {
- this->indexes.fill(-1);
- }
-
- Index rows() const { return m_rows; }
- Index cols() const { return indexes.cols(); }
-
- void resize(Index rows, Index nnz, Index cols) {
- indexes.resize(nnz, cols);
- values.resize(nnz, cols);
- m_rows = rows;
- }
+ {
+ this->indexes.fill(-1);
+ }
+
+ Index rows() const { return m_rows; }
+ Index cols() const { return indexes.cols(); }
+
+ void resize(Index rows, Index nnz, Index cols) {
+ indexes.resize(nnz, cols);
+ values.resize(nnz, cols);
+ m_rows = rows;
+ }
};
// Dense matrix - sparse matrix product
// a is presumably very wide
template <typename DerivedA, typename ScalarB, typename Index, typename DerivedC>
-void uscgemm(double alpha, const MatrixBase<DerivedA> &a,
- const USCMatrix<ScalarB,Index> &b,
- const MatrixBase<DerivedC> &c_const)
+void uscgemm(double alpha, const MatrixBase<DerivedA> &a,
+ const USCMatrix<ScalarB,Index> &b,
+ const MatrixBase<DerivedC> &c_const)
{
- UNCONST(DerivedC, c_const, c);
- eigen_assert(a.rows() == c.rows());
- eigen_assert(a.cols() == b.rows());
- eigen_assert(b.cols() == c.cols());
-
- #pragma omp parallel for
- for (Index k=0; k<b.cols(); k++)
- for (Index r=0; r<b.indexes.rows(); r++)
- {
- Index j = b.indexes(r,k);
- eigen_assert(j >= 0);
- eigen_assert(j < a.cols());
- c.col(k) += alpha * a.col(j) * b.values(r,k);
- }
+ UNCONST(DerivedC, c_const, c);
+ eigen_assert(a.rows() == c.rows());
+ eigen_assert(a.cols() == b.rows());
+ eigen_assert(b.cols() == c.cols());
+
+#pragma omp parallel for
+ for (Index k=0; k<b.cols(); k++)
+ for (Index r=0; r<b.indexes.rows(); r++)
+ {
+ Index j = b.indexes(r,k);
+ eigen_assert(j >= 0);
+ eigen_assert(j < a.cols());
+ c.col(k) += alpha * a.col(j) * b.values(r,k);
+ }
}
// sparse matrix - dense matrix product
template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC>
-void uscgemm(double alpha,
- const USCMatrix<ScalarA,Index> &a,
- const MatrixBase<DerivedB> &b,
- const MatrixBase<DerivedC> &c_const)
+void uscgemm(double alpha,
+ const USCMatrix<ScalarA,Index> &a,
+ const MatrixBase<DerivedB> &b,
+ const MatrixBase<DerivedC> &c_const)
{
- UNCONST(DerivedC, c_const, c);
- eigen_assert(a.rows() == c.rows());
- eigen_assert(a.cols() == b.rows());
- eigen_assert(b.cols() == c.cols());
-
- // This needs to be tuned for each system, unfortunately,
- // and seems to vary a lot. A lot.
- int i_blocks = omp_get_num_threads()*16;
-
- // Assume only one block in k direction.
- // We don't need to explicitly block in the j direction.
- #pragma omp parallel for
- for (Index ib=0; ib<i_blocks; ib++)
- for (Index j=0; j<a.cols(); j++)
- for (Index r=0; r<a.indexes.rows(); r++)
- {
- Index i = a.indexes(r,j);
- eigen_assert(i >= 0);
- eigen_assert(i < c.rows());
- if (i % i_blocks == ib)
- c.row(i) += alpha * a.values(r,j) * b.row(j);
- }
-
- /*
+ UNCONST(DerivedC, c_const, c);
+ eigen_assert(a.rows() == c.rows());
+ eigen_assert(a.cols() == b.rows());
+ eigen_assert(b.cols() == c.cols());
+
+ // This needs to be tuned for each system, unfortunately,
+ // and seems to vary a lot. A lot.
+ int i_blocks = omp_get_num_threads()*16;
+
+ // Assume only one block in k direction.
+ // We don't need to explicitly block in the j direction.
+#pragma omp parallel for
+ for (Index ib=0; ib<i_blocks; ib++)
+ for (Index j=0; j<a.cols(); j++)
+ for (Index r=0; r<a.indexes.rows(); r++)
+ {
+ Index i = a.indexes(r,j);
+ eigen_assert(i >= 0);
+ eigen_assert(i < c.rows());
+ if (i % i_blocks == ib)
+ c.row(i) += alpha * a.values(r,j) * b.row(j);
+ }
+
+ /*
If c.cols() is really large, then theoretically it seems like we should do:
parallel for blocks in i direction
- for blocks in j direction
- pack block of a into smaller sparse matrix
- for blocks in k direction
- for k
- for i (sparse)
- for j
- c(i,k) += a(i,j) * b(j,k)
+ for blocks in j direction
+ pack block of a into smaller sparse matrix
+ for blocks in k direction
+ for k
+ for i (sparse)
+ for j
+ c(i,k) += a(i,j) * b(j,k)
However, the copying of blocks of a doesn't seem practical for any realistic
sizes of c.cols().
- */
+ */
}
// Dense matrix - dense matrix product, but masked by a sparse matrix,
@@ -147,45 +146,45 @@ void uscgemm(double alpha,
template <typename DerivedA, typename DerivedB, typename ScalarC, typename Index>
void uscgemm_masked(double alpha,
- const MatrixBase<DerivedA> &a,
- const MatrixBase<DerivedB> &b,
- USCMatrix<ScalarC,Index> &c)
+ const MatrixBase<DerivedA> &a,
+ const MatrixBase<DerivedB> &b,
+ USCMatrix<ScalarC,Index> &c)
{
- eigen_assert(a.rows() == c.rows());
- eigen_assert(a.cols() == b.rows());
- eigen_assert(b.cols() == c.cols());
-
- #pragma omp parallel for
- for (Index k=0; k<b.cols(); k++)
- for (Index r=0; r<c.indexes.rows(); r++)
- {
- Index i = c.indexes(r, k);
- eigen_assert(i >= 0);
- eigen_assert(i < a.rows());
- c.values(r, k) += alpha * a.row(i) * b.col(k);
- }
+ eigen_assert(a.rows() == c.rows());
+ eigen_assert(a.cols() == b.rows());
+ eigen_assert(b.cols() == c.cols());
+
+#pragma omp parallel for
+ for (Index k=0; k<b.cols(); k++)
+ for (Index r=0; r<c.indexes.rows(); r++)
+ {
+ Index i = c.indexes(r, k);
+ eigen_assert(i >= 0);
+ eigen_assert(i < a.rows());
+ c.values(r, k) += alpha * a.row(i) * b.col(k);
+ }
}
// sparse matrix - dense vector product
template <typename ScalarA, typename Index, typename DerivedB, typename DerivedC>
-void uscgemv(double alpha,
- const USCMatrix<ScalarA,Index> &a,
- const MatrixBase<DerivedB> &b,
- const MatrixBase<DerivedC> &c_const)
+void uscgemv(double alpha,
+ const USCMatrix<ScalarA,Index> &a,
+ const MatrixBase<DerivedB> &b,
+ const MatrixBase<DerivedC> &c_const)
{
- UNCONST(DerivedC, c_const, c);
- eigen_assert(a.rows() == c.rows());
- eigen_assert(a.cols() == b.rows());
- eigen_assert(b.cols() == 1 && c.cols() == 1);
-
- for (Index j=0; j<a.cols(); j++)
- for (Index r=0; r<a.indexes.rows(); r++)
- {
- Index i = a.indexes(r,j);
- eigen_assert(i >= 0);
- eigen_assert(i < c.rows());
- c(i) += alpha * a.values(r,j) * b(j);
- }
+ UNCONST(DerivedC, c_const, c);
+ eigen_assert(a.rows() == c.rows());
+ eigen_assert(a.cols() == b.rows());
+ eigen_assert(b.cols() == 1 && c.cols() == 1);
+
+ for (Index j=0; j<a.cols(); j++)
+ for (Index r=0; r<a.indexes.rows(); r++)
+ {
+ Index i = a.indexes(r,j);
+ eigen_assert(i >= 0);
+ eigen_assert(i < c.rows());
+ c(i) += alpha * a.values(r,j) * b(j);
+ }
}
}
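
A hedged sketch (not from the patch) of how USCMatrix stores a sparse matrix with a fixed number of nonzeros per column, and how the uscgemm routines above consume it; the nplm namespace and the sizes are assumptions for illustration:

#include <Eigen/Dense>
#include "USCMatrix.h"

int main() {
  // 5x3 matrix with two stored entries per column, given as (row, col, value):
  //   (0,0,1.0) (4,0,2.0)   (2,1,3.0) (3,1,4.0)   (1,2,5.0) (4,2,6.0)
  nplm::USCMatrix<double> m(5 /*rows*/, 2 /*nnz per column*/, 3 /*cols*/);
  m.indexes << 0, 2, 1,
               4, 3, 4;        // indexes(r,k) = row of the r-th stored entry of column k
  m.values  << 1.0, 3.0, 5.0,
               2.0, 4.0, 6.0;  // values(r,k)  = the entry itself

  Eigen::MatrixXd a = Eigen::MatrixXd::Random(7, 5);  // dense, very wide in practice
  Eigen::MatrixXd c = Eigen::MatrixXd::Zero(7, 3);
  nplm::uscgemm(1.0, a, m, c);  // c += 1.0 * a * m, touching only the stored entries
  return 0;
}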
diff --git a/src/find_string.hpp b/src/find_string.hpp
new file mode 100644
index 0000000..d26f6fe
--- /dev/null
+++ b/src/find_string.hpp
@@ -0,0 +1,89 @@
+/** \file \author Jonathan Graehl <graehl@gmail.com>
+
+ find_string(boost::unordered_map<std::string, ...>, pair<char const*, char
+ const*>) pair is [begin, end), a key: map.find(std:string(key.first,
+ key.second)) read-only since unordered_map doesn't support lazy construction
+ of string from a pair key.
+
+ To the extent possible under law, the author(s) have dedicated all copyright
+ and related and neighboring rights to this software to the public domain
+ worldwide. This software is distributed without any warranty.
+*/
+
+#ifndef FIND_STRING_GRAEHL_2015_06_24_HPP
+#define FIND_STRING_GRAEHL_2015_06_24_HPP
+#pragma once
+
+#include <utility>
+#include <algorithm>
+#include <cstddef>
+#include <boost/functional/hash.hpp>
+
+namespace std {
+/// we do not change standard semantics of any supported comparison e.g. pair vs
+/// pair, but simply allow string to be compared against pair of char pointers.
+inline bool operator==(std::string const& str, std::pair<char const*, char const*> slice) {
+ return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+inline bool operator==(std::pair<char const*, char const*> slice, std::string const& str) {
+ return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+inline bool operator==(std::string const& str, std::pair<char*, char*> slice) {
+ return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+inline bool operator==(std::pair<char*, char*> slice, std::string const& str) {
+ return str.size() == (slice.second - slice.first) && std::equal(slice.first, slice.second, str.begin());
+}
+/// technically not allowed but easiest route to ADL. we could rename these instead.
+inline std::size_t hash_value(std::pair<char const*, char const*> slice) {
+ return boost::hash_range(slice.first, slice.second);
+}
+inline std::size_t hash_value(std::pair<char*, char*> slice) {
+ return boost::hash_range(slice.first, slice.second);
+}
+inline std::size_t hash_value(std::string const& str) {
+ return boost::hash_range(str.begin(), str.end());
+}
+}
+
+struct slice_or_string_eq {
+ typedef bool result_type;
+ template <class A, class B>
+ bool operator()(A const& a, B const& b) const {
+ return a == b;
+ }
+};
+
+struct slice_or_string_hash {
+ typedef std::size_t result_type;
+ template <class Slice>
+ std::size_t operator()(Slice const& slice) const {
+ return hash_value(slice);
+ }
+};
+
+/// \return map.find(std::string(key.first, key.second)) but faster
+template <class UnorderedMap, class Slice>
+typename UnorderedMap::const_iterator find_string(UnorderedMap const& map, Slice const& key) {
+ return map.find(key, slice_or_string_hash(), slice_or_string_eq());
+}
+
+/// \return map.find(std::string(key.first, key.second)) but faster
+template <class UnorderedMap, class Slice>
+typename UnorderedMap::iterator find_string(UnorderedMap& map, Slice const& key) {
+ return map.find(key, slice_or_string_hash(), slice_or_string_eq());
+}
+
+
+template <class UnorderedMap, class Slice>
+typename UnorderedMap::const_iterator find_string(UnorderedMap const& map, char const* key) {
+ return map.find(std::pair<char const*, char const*>(key, key+std::strlen(key)), slice_or_string_hash(), slice_or_string_eq());
+}
+
+/// \return map.find(std::string(key.first, key.second)) but faster
+template <class UnorderedMap, class Slice>
+typename UnorderedMap::iterator find_string(UnorderedMap& map, char const* key) {
+ return map.find(std::pair<char const*, char const*>(key, key+std::strlen(key)), slice_or_string_hash(), slice_or_string_eq());
+}
+
+#endif
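
A hedged usage sketch (not part of the new header): looking up a [begin, end) slice of a larger buffer in a boost::unordered_map without constructing a temporary std::string, which is the case the hash/equality shims above enable. The map contents and the parsed line are illustrative:

#include <string>
#include <boost/unordered_map.hpp>
#include "find_string.hpp"

int main() {
  boost::unordered_map<std::string, int> vocab;
  vocab["tanh"] = 1;
  vocab["rectifier"] = 2;

  const char* line = "rectifier 0.5";
  // Key is the half-open range [line, line + 9), i.e. "rectifier", with no copy made.
  std::pair<char const*, char const*> slice(line, line + 9);

  boost::unordered_map<std::string, int>::const_iterator it = find_string(vocab, slice);
  return (it != vocab.end() && it->second == 2) ? 0 : 1;
}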
diff --git a/src/graphClasses.h b/src/graphClasses.h
index d3c0c4a..cd80a4c 100644
--- a/src/graphClasses.h
+++ b/src/graphClasses.h
@@ -3,7 +3,6 @@
#include <cstdlib>
#include "neuralClasses.h"
-//#include <../3rdparty/Eigen/Dense>
#include <Eigen/Dense>
namespace nplm
@@ -11,50 +10,50 @@ namespace nplm
template <class X>
class Node {
- public:
- X * param; //what parameter is this
- //vector <void *> children;
- //vector <void *> parents;
- Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix;
- Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix;
- int minibatch_size;
-
- public:
- Node() : param(NULL), minibatch_size(0) { }
-
- Node(X *input_param, int minibatch_size)
- : param(input_param),
- minibatch_size(minibatch_size)
- {
- resize(minibatch_size);
- }
-
- void resize(int minibatch_size)
- {
- this->minibatch_size = minibatch_size;
- if (param->n_outputs() != -1)
- {
- fProp_matrix.setZero(param->n_outputs(), minibatch_size);
- }
- if (param->n_inputs() != -1)
- {
- bProp_matrix.setZero(param->n_inputs(), minibatch_size);
- }
- }
-
- void resize() { resize(minibatch_size); }
-
- /*
- void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols)
- {
- param->fProp(input,fProp_matrix,0,0,n_cols);
- }
- void Fprop(Matrix<double,1,Dynamic> & input,int n_cols)
- {
- param->fProp(input,fProp_matrix,0,0,n_cols);
- }
- */
- //for f prop, just call the fProp node of the particular parameter.
+ public:
+ X * param; //what parameter is this
+ //vector <void *> children;
+ //vector <void *> parents;
+ Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> fProp_matrix;
+ Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> bProp_matrix;
+ int minibatch_size;
+
+ public:
+ Node() : param(NULL), minibatch_size(0) { }
+
+ Node(X *input_param, int minibatch_size)
+ : param(input_param),
+ minibatch_size(minibatch_size)
+ {
+ resize(minibatch_size);
+ }
+
+ void resize(int minibatch_size)
+ {
+ this->minibatch_size = minibatch_size;
+ if (param->n_outputs() != -1)
+ {
+ fProp_matrix.setZero(param->n_outputs(), minibatch_size);
+ }
+ if (param->n_inputs() != -1)
+ {
+ bProp_matrix.setZero(param->n_inputs(), minibatch_size);
+ }
+ }
+
+ void resize() { resize(minibatch_size); }
+
+ /*
+ void Fprop(Matrix<double,Dynamic,Dynamic> & input,int n_cols)
+ {
+ param->fProp(input,fProp_matrix,0,0,n_cols);
+ }
+ void Fprop(Matrix<double,1,Dynamic> & input,int n_cols)
+ {
+ param->fProp(input,fProp_matrix,0,0,n_cols);
+ }
+ */
+ //for f prop, just call the fProp node of the particular parameter.
};
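
A short sketch (not from the patch) of how the reindented Node template pairs a parameter object with per-minibatch fProp/bProp buffers; Linear_layer and the sizes are illustrative choices, assuming the usual nplm namespace:

#include "graphClasses.h"

int main() {
  nplm::Linear_layer layer(8 /*outputs*/, 16 /*inputs*/);
  nplm::Node<nplm::Linear_layer> node(&layer, 64 /*minibatch size*/);
  // node.fProp_matrix is 8 x 64 and node.bProp_matrix is 16 x 64, both zero-initialized.
  return (node.fProp_matrix.rows() == 8 && node.bProp_matrix.rows() == 16) ? 0 : 1;
}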
diff --git a/src/model.cpp b/src/model.cpp
index 3767f4b..db7f006 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -13,295 +13,295 @@ namespace nplm
{
void model::resize(int ngram_size,
- int input_vocab_size,
- int output_vocab_size,
- int input_embedding_dimension,
- int num_hidden,
- int output_embedding_dimension)
+ int input_vocab_size,
+ int output_vocab_size,
+ int input_embedding_dimension,
+ int num_hidden,
+ int output_embedding_dimension)
{
- input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1);
- if (num_hidden == 0)
- {
- first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1));
- first_hidden_activation.resize(output_embedding_dimension);
- second_hidden_linear.resize(1,1);
- second_hidden_activation.resize(1);
- }
- else
- {
- first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
- first_hidden_activation.resize(num_hidden);
- second_hidden_linear.resize(output_embedding_dimension, num_hidden);
- second_hidden_activation.resize(output_embedding_dimension);
- }
- output_layer.resize(output_vocab_size, output_embedding_dimension);
- this->ngram_size = ngram_size;
- this->input_vocab_size = input_vocab_size;
- this->output_vocab_size = output_vocab_size;
- this->input_embedding_dimension = input_embedding_dimension;
- this->num_hidden = num_hidden;
- this->output_embedding_dimension = output_embedding_dimension;
- premultiplied = false;
+ input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1);
+ if (num_hidden == 0)
+ {
+ first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1));
+ first_hidden_activation.resize(output_embedding_dimension);
+ second_hidden_linear.resize(1,1);
+ second_hidden_activation.resize(1);
+ }
+ else
+ {
+ first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
+ first_hidden_activation.resize(num_hidden);
+ second_hidden_linear.resize(output_embedding_dimension, num_hidden);
+ second_hidden_activation.resize(output_embedding_dimension);
+ }
+ output_layer.resize(output_vocab_size, output_embedding_dimension);
+ this->ngram_size = ngram_size;
+ this->input_vocab_size = input_vocab_size;
+ this->output_vocab_size = output_vocab_size;
+ this->input_embedding_dimension = input_embedding_dimension;
+ this->num_hidden = num_hidden;
+ this->output_embedding_dimension = output_embedding_dimension;
+ premultiplied = false;
}
-
-void model::initialize(mt19937 &init_engine,
- bool init_normal,
- double init_range,
- double init_bias,
- string &parameter_update,
- double adagrad_epsilon)
+
+void model::initialize(boost::random::mt19937 &init_engine,
+ bool init_normal,
+ double init_range,
+ double init_bias,
+ string &parameter_update,
+ double adagrad_epsilon)
{
- input_layer.initialize(init_engine,
- init_normal,
- init_range,
- parameter_update,
- adagrad_epsilon);
- output_layer.initialize(init_engine,
- init_normal,
- init_range,
- init_bias,
- parameter_update,
- adagrad_epsilon);
- first_hidden_linear.initialize(init_engine,
- init_normal,
- init_range,
- parameter_update,
- adagrad_epsilon);
- second_hidden_linear.initialize(init_engine,
- init_normal,
- init_range,
- parameter_update,
- adagrad_epsilon);
+ input_layer.initialize(init_engine,
+ init_normal,
+ init_range,
+ parameter_update,
+ adagrad_epsilon);
+ output_layer.initialize(init_engine,
+ init_normal,
+ init_range,
+ init_bias,
+ parameter_update,
+ adagrad_epsilon);
+ first_hidden_linear.initialize(init_engine,
+ init_normal,
+ init_range,
+ parameter_update,
+ adagrad_epsilon);
+ second_hidden_linear.initialize(init_engine,
+ init_normal,
+ init_range,
+ parameter_update,
+ adagrad_epsilon);
}
void model::premultiply()
{
- // Since input and first_hidden_linear are both linear,
- // we can multiply them into a single linear layer *if* we are not training
- int context_size = ngram_size-1;
- Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
- if (num_hidden == 0)
- {
- first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size);
- }
- else
- {
- first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
- }
- for (int i=0; i<context_size; i++)
- first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose();
- input_layer.W->resize(1,1); // try to save some memory
- premultiplied = true;
+ // Since input and first_hidden_linear are both linear,
+ // we can multiply them into a single linear layer *if* we are not training
+ int context_size = ngram_size-1;
+ Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
+ if (num_hidden == 0)
+ {
+ first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size);
+ }
+ else
+ {
+ first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
+ }
+ for (int i=0; i<context_size; i++)
+ first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose();
+ input_layer.W->resize(1,1); // try to save some memory
+ premultiplied = true;
}
void model::readConfig(ifstream &config_file)
{
- string line;
- vector<string> fields;
- int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension;
- activation_function_type activation_function = this->activation_function;
- while (getline(config_file, line) && line != "")
+ string line;
+ vector<string> fields;
+ int ngram_size, vocab_size, input_embedding_dimension, num_hidden, output_embedding_dimension;
+ activation_function_type activation_function = this->activation_function;
+ while (getline(config_file, line) && line != "")
+ {
+ splitBySpace(line, fields);
+ if (fields[0] == "ngram_size")
+ ngram_size = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "vocab_size")
+ input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "input_vocab_size")
+ input_vocab_size = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "output_vocab_size")
+ output_vocab_size = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "input_embedding_dimension")
+ input_embedding_dimension = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "num_hidden")
+ num_hidden = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "output_embedding_dimension")
+ output_embedding_dimension = lexical_cast<int>(fields[1]);
+ else if (fields[0] == "activation_function")
+ activation_function = string_to_activation_function(fields[1]);
+ else if (fields[0] == "version")
{
- splitBySpace(line, fields);
- if (fields[0] == "ngram_size")
- ngram_size = lexical_cast<int>(fields[1]);
- else if (fields[0] == "vocab_size")
- input_vocab_size = output_vocab_size = lexical_cast<int>(fields[1]);
- else if (fields[0] == "input_vocab_size")
- input_vocab_size = lexical_cast<int>(fields[1]);
- else if (fields[0] == "output_vocab_size")
- output_vocab_size = lexical_cast<int>(fields[1]);
- else if (fields[0] == "input_embedding_dimension")
- input_embedding_dimension = lexical_cast<int>(fields[1]);
- else if (fields[0] == "num_hidden")
- num_hidden = lexical_cast<int>(fields[1]);
- else if (fields[0] == "output_embedding_dimension")
- output_embedding_dimension = lexical_cast<int>(fields[1]);
- else if (fields[0] == "activation_function")
- activation_function = string_to_activation_function(fields[1]);
- else if (fields[0] == "version")
- {
- int version = lexical_cast<int>(fields[1]);
- if (version != 1)
- {
- cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl;
- exit(1);
- }
- }
- else
- cerr << "warning: unrecognized field in config: " << fields[0] << endl;
+ int version = lexical_cast<int>(fields[1]);
+ if (version != 1)
+ {
+ cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl;
+ exit(1);
+ }
}
- resize(ngram_size,
- input_vocab_size,
- output_vocab_size,
- input_embedding_dimension,
- num_hidden,
- output_embedding_dimension);
- set_activation_function(activation_function);
+ else
+ cerr << "warning: unrecognized field in config: " << fields[0] << endl;
+ }
+ resize(ngram_size,
+ input_vocab_size,
+ output_vocab_size,
+ input_embedding_dimension,
+ num_hidden,
+ output_embedding_dimension);
+ set_activation_function(activation_function);
}
void model::readConfig(const string &filename)
{
- ifstream config_file(filename.c_str());
- if (!config_file)
- {
- cerr << "error: could not open config file " << filename << endl;
- exit(1);
- }
- readConfig(config_file);
- config_file.close();
+ ifstream config_file(filename.c_str());
+ if (!config_file)
+ {
+ cerr << "error: could not open config file " << filename << endl;
+ exit(1);
+ }
+ readConfig(config_file);
+ config_file.close();
}
-
+
void model::read(const string &filename)
{
- vector<string> input_words;
- vector<string> output_words;
- read(filename, input_words, output_words);
+ vector<string> input_words;
+ vector<string> output_words;
+ read(filename, input_words, output_words);
}
void model::read(const string &filename, vector<string> &words)
{
- vector<string> output_words;
- read(filename, words, output_words);
+ vector<string> output_words;
+ read(filename, words, output_words);
}
void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words)
{
- ifstream file(filename.c_str());
- if (!file) throw runtime_error("Could not open file " + filename);
-
- param myParam;
- string line;
-
- while (getline(file, line))
+ ifstream file(filename.c_str());
+ if (!file) throw runtime_error("Could not open file " + filename);
+
+ param myParam;
+ string line;
+
+ while (getline(file, line))
+ {
+ if (line == "\\config")
+ {
+ readConfig(file);
+ }
+
+ else if (line == "\\vocab")
+ {
+ input_words.clear();
+ readWordsFile(file, input_words);
+ output_words = input_words;
+ }
+
+ else if (line == "\\input_vocab")
{
- if (line == "\\config")
- {
- readConfig(file);
- }
-
- else if (line == "\\vocab")
- {
- input_words.clear();
- readWordsFile(file, input_words);
- output_words = input_words;
- }
-
- else if (line == "\\input_vocab")
- {
- input_words.clear();
- readWordsFile(file, input_words);
- }
-
- else if (line == "\\output_vocab")
- {
- output_words.clear();
- readWordsFile(file, output_words);
- }
-
- else if (line == "\\input_embeddings")
- input_layer.read(file);
- else if (line == "\\hidden_weights 1")
- first_hidden_linear.read_weights(file);
- else if (line == "\\hidden_biases 1")
- first_hidden_linear.read_biases (file);
- else if (line == "\\hidden_weights 2")
- second_hidden_linear.read_weights(file);
- else if (line == "\\hidden_biases 2")
- second_hidden_linear.read_biases (file);
- else if (line == "\\output_weights")
- output_layer.read_weights(file);
- else if (line == "\\output_biases")
- output_layer.read_biases(file);
- else if (line == "\\end")
- break;
- else if (line == "")
- continue;
- else
- {
- cerr << "warning: unrecognized section: " << line << endl;
- // skip over section
- while (getline(file, line) && line != "") { }
- }
+ input_words.clear();
+ readWordsFile(file, input_words);
}
- file.close();
+
+ else if (line == "\\output_vocab")
+ {
+ output_words.clear();
+ readWordsFile(file, output_words);
+ }
+
+ else if (line == "\\input_embeddings")
+ input_layer.read(file);
+ else if (line == "\\hidden_weights 1")
+ first_hidden_linear.read_weights(file);
+ else if (line == "\\hidden_biases 1")
+ first_hidden_linear.read_biases (file);
+ else if (line == "\\hidden_weights 2")
+ second_hidden_linear.read_weights(file);
+ else if (line == "\\hidden_biases 2")
+ second_hidden_linear.read_biases (file);
+ else if (line == "\\output_weights")
+ output_layer.read_weights(file);
+ else if (line == "\\output_biases")
+ output_layer.read_biases(file);
+ else if (line == "\\end")
+ break;
+ else if (line == "")
+ continue;
+ else
+ {
+ cerr << "warning: unrecognized section: " << line << endl;
+ // skip over section
+ while (getline(file, line) && line != "") { }
+ }
+ }
+ file.close();
}
void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words)
-{
- write(filename, &input_words, &output_words);
+{
+ write(filename, &input_words, &output_words);
}
void model::write(const string &filename, const vector<string> &words)
-{
- write(filename, &words, NULL);
+{
+ write(filename, &words, NULL);
}
-void model::write(const string &filename)
-{
- write(filename, NULL, NULL);
+void model::write(const string &filename)
+{
+ write(filename, NULL, NULL);
}
void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords)
{
- ofstream file(filename.c_str());
- if (!file) throw runtime_error("Could not open file " + filename);
-
- file << "\\config" << endl;
- file << "version 1" << endl;
- file << "ngram_size " << ngram_size << endl;
- file << "input_vocab_size " << input_vocab_size << endl;
- file << "output_vocab_size " << output_vocab_size << endl;
- file << "input_embedding_dimension " << input_embedding_dimension << endl;
- file << "num_hidden " << num_hidden << endl;
- file << "output_embedding_dimension " << output_embedding_dimension << endl;
- file << "activation_function " << activation_function_to_string(activation_function) << endl;
- file << endl;
-
- if (input_pwords)
- {
- file << "\\input_vocab" << endl;
- writeWordsFile(*input_pwords, file);
- file << endl;
- }
+ ofstream file(filename.c_str());
+ if (!file) throw runtime_error("Could not open file " + filename);
- if (output_pwords)
- {
- file << "\\output_vocab" << endl;
- writeWordsFile(*output_pwords, file);
- file << endl;
- }
+ file << "\\config" << endl;
+ file << "version 1" << endl;
+ file << "ngram_size " << ngram_size << endl;
+ file << "input_vocab_size " << input_vocab_size << endl;
+ file << "output_vocab_size " << output_vocab_size << endl;
+ file << "input_embedding_dimension " << input_embedding_dimension << endl;
+ file << "num_hidden " << num_hidden << endl;
+ file << "output_embedding_dimension " << output_embedding_dimension << endl;
+ file << "activation_function " << activation_function_to_string(activation_function) << endl;
+ file << endl;
- file << "\\input_embeddings" << endl;
- input_layer.write(file);
- file << endl;
-
- file << "\\hidden_weights 1" << endl;
- first_hidden_linear.write_weights(file);
+ if (input_pwords)
+ {
+ file << "\\input_vocab" << endl;
+ writeWordsFile(*input_pwords, file);
file << endl;
+ }
- file << "\\hidden_biases 1" << endl;
- first_hidden_linear.write_biases(file);
- file <<endl;
-
- file << "\\hidden_weights 2" << endl;
- second_hidden_linear.write_weights(file);
+ if (output_pwords)
+ {
+ file << "\\output_vocab" << endl;
+ writeWordsFile(*output_pwords, file);
file << endl;
+ }
- file << "\\hidden_biases 2" << endl;
- second_hidden_linear.write_biases(file);
- file << endl;
-
- file << "\\output_weights" << endl;
- output_layer.write_weights(file);
- file << endl;
-
- file << "\\output_biases" << endl;
- output_layer.write_biases(file);
- file << endl;
-
- file << "\\end" << endl;
- file.close();
+ file << "\\input_embeddings" << endl;
+ input_layer.write(file);
+ file << endl;
+
+ file << "\\hidden_weights 1" << endl;
+ first_hidden_linear.write_weights(file);
+ file << endl;
+
+ file << "\\hidden_biases 1" << endl;
+ first_hidden_linear.write_biases(file);
+ file <<endl;
+
+ file << "\\hidden_weights 2" << endl;
+ second_hidden_linear.write_weights(file);
+ file << endl;
+
+ file << "\\hidden_biases 2" << endl;
+ second_hidden_linear.write_biases(file);
+ file << endl;
+
+ file << "\\output_weights" << endl;
+ output_layer.write_weights(file);
+ file << endl;
+
+ file << "\\output_biases" << endl;
+ output_layer.write_biases(file);
+ file << endl;
+
+ file << "\\end" << endl;
+ file.close();
}
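
One step above is worth spelling out (an editorial gloss, not part of the diff): model::premultiply folds the two purely linear stages into one. If W is the input embedding matrix (evidently one embedding per row, given the W->transpose() call) and U_i is the block of first_hidden_linear.U belonging to context position i, then for a one-hot word vector e_w

    U_i (W^T e_w) = (U_i W^T) e_w,

so each embedding-width block U_i is replaced by the vocabulary-width block U_i W^T, a context word's contribution becomes a plain column lookup in the merged matrix, and the separate embedding table can be released (the input_layer.W->resize(1,1) line). As the comment notes, this is only valid once the parameters are no longer being trained.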
diff --git a/src/neuralClasses.h b/src/neuralClasses.h
index 26dae06..458f80e 100644
--- a/src/neuralClasses.h
+++ b/src/neuralClasses.h
@@ -6,8 +6,7 @@
#include <cmath>
#include <vector>
-#include <boost/unordered_map.hpp>
-//#include <../3rdparty/Eigen/Dense>
+#include <boost/unordered_map.hpp>
#include <Eigen/Dense>
#include "maybe_omp.h"
@@ -35,7 +34,7 @@ using Eigen::Dynamic;
typedef boost::unordered_map<int,bool> int_map;
struct Clipper{
- double operator() (double x) const {
+ double operator() (double x) const {
return std::min(0.5, std::max(x,-0.5));
//return(x);
}
@@ -44,978 +43,997 @@ struct Clipper{
class Linear_layer
{
- private:
- Matrix<double,Dynamic,Dynamic> U;
- Matrix<double,Dynamic,Dynamic> U_gradient;
- Matrix<double,Dynamic,Dynamic> U_velocity;
- Matrix<double,Dynamic,Dynamic> U_running_gradient;
- Matrix<double,Dynamic,Dynamic> U_running_parameter_update;
- // Biases
- Matrix<double,Dynamic,1> b;
- Matrix<double,Dynamic,1> b_velocity;
- Matrix<double,Dynamic,1> b_running_gradient;
- Matrix<double,Dynamic,1> b_running_parameter_update;
- Matrix<double,Dynamic,1> b_gradient;
-
- friend class model;
-
- public:
- Linear_layer() { }
- Linear_layer(int rows, int cols) { resize(rows, cols); }
-
- void resize(int rows, int cols)
- {
- U.setZero(rows, cols);
- U_gradient.setZero(rows, cols);
- //U_running_gradient.setZero(rows, cols);
- //U_running_parameter_updates.setZero(rows, cols);
- //U_velocity.setZero(rows, cols);
- b.resize(rows);
- b_gradient.setZero(rows);
- //b_running_gradient.resize(rows);
- //b_velocity.resize(rows);
- }
-
- void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); }
- void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); }
+ private:
+ Matrix<double,Dynamic,Dynamic> U;
+ Matrix<double,Dynamic,Dynamic> U_gradient;
+ Matrix<double,Dynamic,Dynamic> U_velocity;
+ Matrix<double,Dynamic,Dynamic> U_running_gradient;
+ Matrix<double,Dynamic,Dynamic> U_running_parameter_update;
+ // Biases
+ Matrix<double,Dynamic,1> b;
+ Matrix<double,Dynamic,1> b_velocity;
+ Matrix<double,Dynamic,1> b_running_gradient;
+ Matrix<double,Dynamic,1> b_running_parameter_update;
+ Matrix<double,Dynamic,1> b_gradient;
+
+ friend class model;
+
+ public:
+ Linear_layer() { }
+ Linear_layer(int rows, int cols) { resize(rows, cols); }
+
+ void resize(int rows, int cols)
+ {
+ U.setZero(rows, cols);
+ U_gradient.setZero(rows, cols);
+ //U_running_gradient.setZero(rows, cols);
+ //U_running_parameter_updates.setZero(rows, cols);
+ //U_velocity.setZero(rows, cols);
+ b.resize(rows);
+ b_gradient.setZero(rows);
+ //b_running_gradient.resize(rows);
+ //b_velocity.resize(rows);
+ }
+
+ void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); }
+ void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); }
void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
- template <typename Engine>
- void initialize(Engine &engine,
- bool init_normal,
- double init_range,
- string &parameter_update,
- double adagrad_epsilon)
- {
- if (parameter_update == "ADA") {
- U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
- b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
- }
- if (parameter_update == "ADAD") {
- U_running_gradient.setZero(U.rows(),U.cols());
- b_running_gradient.setZero(b.size());
- U_running_parameter_update.setZero(U.rows(),U.cols());
- b_running_parameter_update.setZero(b.size());
- }
+ template <typename Engine>
+ void initialize(Engine &engine,
+ bool init_normal,
+ double init_range,
+ string &parameter_update,
+ double adagrad_epsilon)
+ {
+ if (parameter_update == "ADA") {
+ U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
+ b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+ }
+ if (parameter_update == "ADAD") {
+ U_running_gradient.setZero(U.rows(),U.cols());
+ b_running_gradient.setZero(b.size());
+ U_running_parameter_update.setZero(U.rows(),U.cols());
+ b_running_parameter_update.setZero(b.size());
+ }
- initMatrix(engine, U, init_normal, init_range);
- initBias(engine, b, init_normal, init_range);
- }
+ initMatrix(engine, U, init_normal, init_range);
+ initBias(engine, b, init_normal, init_range);
+ }
- int n_inputs () const { return U.cols(); }
- int n_outputs () const { return U.rows(); }
+ int n_inputs () const { return U.cols(); }
+ int n_outputs () const { return U.rows(); }
template <typename DerivedIn, typename DerivedOut>
- void fProp(const MatrixBase<DerivedIn> &input,
- const MatrixBase<DerivedOut> &output) const
+ void fProp(const MatrixBase<DerivedIn> &input,
+ const MatrixBase<DerivedOut> &output) const
{
- UNCONST(DerivedOut, output, my_output);
- my_output.leftCols(input.cols()).noalias() = U*input;
- int num_examples = input.cols();
- for (int example = 0;example < num_examples;example++)
- {
- my_output.leftCols(input.cols()).col(example) += b;
- }
+ UNCONST(DerivedOut, output, my_output);
+ my_output.leftCols(input.cols()).noalias() = U*input;
+ int num_examples = input.cols();
+ for (int example = 0;example < num_examples;example++)
+ {
+ my_output.leftCols(input.cols()).col(example) += b;
+ }
}
- // Sparse input
+ // Sparse input
template <typename ScalarIn, typename DerivedOut>
- void fProp(const USCMatrix<ScalarIn> &input,
- const MatrixBase<DerivedOut> &output_const) const
- {
- UNCONST(DerivedOut, output_const, output);
- output.setZero();
- uscgemm(1.0, U, input, output.leftCols(input.cols()));
- // Each column corresponds to a training example. We
- // parallelize the adding of biases per dimension.
- int num_examples = input.cols();
- for (int example = 0;example < num_examples;example++)
- {
- output.leftCols(input.cols()).col(example) += b;
- }
+ void fProp(const USCMatrix<ScalarIn> &input,
+ const MatrixBase<DerivedOut> &output_const) const
+ {
+ UNCONST(DerivedOut, output_const, output);
+ output.setZero();
+ uscgemm(1.0, U, input, output.leftCols(input.cols()));
+ // Each column corresponds to a training example. We
+ // parallelize the adding of biases per dimension.
+ int num_examples = input.cols();
+ for (int example = 0;example < num_examples;example++)
+ {
+ output.leftCols(input.cols()).col(example) += b;
+ }
}
template <typename DerivedGOut, typename DerivedGIn>
- void bProp(const MatrixBase<DerivedGOut> &input,
- MatrixBase<DerivedGIn> &output) const
+ void bProp(const MatrixBase<DerivedGOut> &input,
+ MatrixBase<DerivedGIn> &output) const
{
- UNCONST(DerivedGIn, output, my_output);
- my_output.noalias() = U.transpose()*input;
- }
+ UNCONST(DerivedGIn, output, my_output);
+ my_output.noalias() = U.transpose()*input;
+ }
template <typename DerivedGOut, typename DerivedIn>
- void computeGradient( const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate, double momentum, double L2_reg)
- {
- U_gradient.noalias() = bProp_input*fProp_input.transpose();
-
- // get the bias gradient for all dimensions in parallel
- int size = b.size();
- b_gradient = bProp_input.rowwise().sum();
- // This used to be multithreaded, but there was no measurable difference
- if (L2_reg > 0.0)
- {
- U_gradient -= 2*L2_reg*U;
- b_gradient -= 2*L2_reg*b;
- }
- if (momentum > 0.0)
- {
- U_velocity = momentum*U_velocity + U_gradient;
- U += learning_rate * U_velocity;
- b_velocity = momentum*b_velocity + b_gradient;
- b += learning_rate * b_velocity;
- }
- else
- {
- U += learning_rate * U_gradient;
- b += learning_rate * b_gradient;
- /*
- //UPDATE CLIPPING
- U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix();
- b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix();
- //GRADIENT CLIPPING
- //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix();
- //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix();
- */
- }
- }
+ void computeGradient( const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &fProp_input,
+ double learning_rate, double momentum, double L2_reg)
+ {
+ U_gradient.noalias() = bProp_input*fProp_input.transpose();
+
+ // get the bias gradient for all dimensions in parallel
+ int size = b.size();
+ b_gradient = bProp_input.rowwise().sum();
+ // This used to be multithreaded, but there was no measurable difference
+ if (L2_reg > 0.0)
+ {
+ U_gradient -= 2*L2_reg*U;
+ b_gradient -= 2*L2_reg*b;
+ }
+ if (momentum > 0.0)
+ {
+ U_velocity = momentum*U_velocity + U_gradient;
+ U += learning_rate * U_velocity;
+ b_velocity = momentum*b_velocity + b_gradient;
+ b += learning_rate * b_velocity;
+ }
+ else
+ {
+ U += learning_rate * U_gradient;
+ b += learning_rate * b_gradient;
+ /*
+ //UPDATE CLIPPING
+ U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix();
+ b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix();
+ //GRADIENT CLIPPING
+ //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix();
+ //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix();
+ */
+ }
+ }
template <typename DerivedGOut, typename DerivedIn>
- void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate,
- double L2_reg)
+ void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &fProp_input,
+ double learning_rate,
+ double L2_reg)
{
- U_gradient.noalias() = bProp_input*fProp_input.transpose();
+ U_gradient.noalias() = bProp_input*fProp_input.transpose();
-
- // get the bias gradient for all dimensions in parallel
- int size = b.size();
- b_gradient.noalias() = bProp_input.rowwise().sum();
- if (L2_reg != 0)
- {
- U_gradient -= 2*L2_reg*U;
- b_gradient -= 2*L2_reg*b;
- }
+ // get the bias gradient for all dimensions in parallel
+ int size = b.size();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
- // ignore momentum?
- #pragma omp parallel for
- for (int col=0; col<U.cols(); col++) {
- U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix();
- U.col(col) += learning_rate * (U_gradient.col(col).array() /
- U_running_gradient.col(col).array().sqrt()).matrix();
- /*
- //UPDATE CLIPPING
- U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())).
- unaryExpr(Clipper()).matrix();
- */
- }
- b_running_gradient += b_gradient.array().square().matrix();
- b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+ if (L2_reg != 0)
+ {
+ U_gradient -= 2*L2_reg*U;
+ b_gradient -= 2*L2_reg*b;
+ }
+
+ // ignore momentum?
+#pragma omp parallel for
+ for (int col=0; col<U.cols(); col++) {
+ U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix();
+ U.col(col) += learning_rate * (U_gradient.col(col).array() /
+ U_running_gradient.col(col).array().sqrt()).matrix();
/*
//UPDATE CLIPPING
- b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())).
+ unaryExpr(Clipper()).matrix();
*/
+ }
+ b_running_gradient += b_gradient.array().square().matrix();
+ b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+ /*
+ //UPDATE CLIPPING
+ b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ */
}
template <typename DerivedGOut, typename DerivedIn>
- void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- double learning_rate,
- double L2_reg,
- double conditioning_constant,
- double decay)
- {
- //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl;
- U_gradient.noalias() = bProp_input*fProp_input.transpose();
-
- Array<double,Dynamic,1> b_current_parameter_update;
-
- // get the bias gradient for all dimensions in parallel
- int size = b.size();
- b_gradient.noalias() = bProp_input.rowwise().sum();
-
- if (L2_reg != 0)
- {
- U_gradient -= 2*L2_reg*U;
- b_gradient -= 2*L2_reg*b;
- }
+ void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &fProp_input,
+ double learning_rate,
+ double L2_reg,
+ double conditioning_constant,
+ double decay)
+ {
+ //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl;
+ U_gradient.noalias() = bProp_input*fProp_input.transpose();
- // ignore momentum?
- #pragma omp parallel for
- //cerr<<"U gradient is "<<U_gradient<<endl;
- for (int col=0; col<U.cols(); col++) {
- Array<double,Dynamic,1> U_current_parameter_update;
- U_running_gradient.col(col) = decay*U_running_gradient.col(col) +
- (1-decay)*U_gradient.col(col).array().square().matrix();
- //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl;
- //getchar();
- U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/
- (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) *
- U_gradient.col(col).array();
- //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl;
- //getchar();
- //update the running parameter update
- U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) +
- (1.-decay)*U_current_parameter_update.square().matrix();
- U.col(col) += learning_rate*U_current_parameter_update.matrix();
- }
- b_running_gradient = decay*b_running_gradient +
- (1.-decay)*b_gradient.array().square().matrix();
- b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
- (b_running_gradient.array()+conditioning_constant).sqrt()) *
- b_gradient.array();
- b_running_parameter_update = decay*(b_running_parameter_update) +
- (1.-decay)*b_current_parameter_update.square().matrix();
- b += learning_rate*b_current_parameter_update.matrix();
+ Array<double,Dynamic,1> b_current_parameter_update;
+
+ // get the bias gradient for all dimensions in parallel
+ int size = b.size();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
+
+ if (L2_reg != 0)
+ {
+ U_gradient -= 2*L2_reg*U;
+ b_gradient -= 2*L2_reg*b;
+ }
+
+ // ignore momentum?
+#pragma omp parallel for
+ //cerr<<"U gradient is "<<U_gradient<<endl;
+ for (int col=0; col<U.cols(); col++) {
+ Array<double,Dynamic,1> U_current_parameter_update;
+ U_running_gradient.col(col) = decay*U_running_gradient.col(col) +
+ (1-decay)*U_gradient.col(col).array().square().matrix();
+ //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl;
+ //getchar();
+ U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/
+ (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) *
+ U_gradient.col(col).array();
+ //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl;
+ //getchar();
+ //update the running parameter update
+ U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) +
+ (1.-decay)*U_current_parameter_update.square().matrix();
+ U.col(col) += learning_rate*U_current_parameter_update.matrix();
+ }
+ b_running_gradient = decay*b_running_gradient +
+ (1.-decay)*b_gradient.array().square().matrix();
+ b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+ (b_running_gradient.array()+conditioning_constant).sqrt()) *
+ b_gradient.array();
+ b_running_parameter_update = decay*(b_running_parameter_update) +
+ (1.-decay)*b_current_parameter_update.square().matrix();
+ b += learning_rate*b_current_parameter_update.matrix();
}
template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
- void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &fProp_input,
- const MatrixBase<DerivedGW> &gradient) const
+ void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &fProp_input,
+ const MatrixBase<DerivedGW> &gradient) const
{
- UNCONST(DerivedGW, gradient, my_gradient);
- my_gradient.noalias() = bProp_input*fProp_input.transpose();
+ UNCONST(DerivedGW, gradient, my_gradient);
+ my_gradient.noalias() = bProp_input*fProp_input.transpose();
}
};
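
Both computeGradientAdadelta methods above follow the same per-parameter recurrence: a decayed running average of squared gradients, a decayed running average of squared updates, and an update scaled by the ratio of their conditioned square roots. Below is a minimal stand-alone Eigen sketch of one such step; the variable names are illustrative, and a constant stands in for the real gradient bProp_input * fProp_input.transpose().

    #include <Eigen/Dense>
    #include <iostream>

    // One Adadelta step of the form used above (illustrative only, not nplm API).
    int main() {
      using Eigen::ArrayXd;
      const double decay = 0.95, eps = 1e-6, learning_rate = 1.0;

      ArrayXd theta = ArrayXd::Zero(4);              // parameters (e.g. one column of U, or b)
      ArrayXd running_grad_sq = ArrayXd::Zero(4);    // running average of g^2
      ArrayXd running_update_sq = ArrayXd::Zero(4);  // running average of update^2
      ArrayXd g = ArrayXd::Constant(4, 0.1);         // stand-in gradient for one minibatch

      // Accumulate the running average of squared gradients.
      running_grad_sq = decay * running_grad_sq + (1. - decay) * g.square();

      // Scale the gradient by RMS[update] / RMS[g], both conditioned by eps.
      ArrayXd update = ((running_update_sq + eps).sqrt() /
                        (running_grad_sq + eps).sqrt()) * g;

      // Accumulate the running average of squared updates, then apply the step.
      running_update_sq = decay * running_update_sq + (1. - decay) * update.square();
      theta += learning_rate * update;

      std::cout << theta.transpose() << std::endl;
      return 0;
    }
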
class Output_word_embeddings
{
- private:
- // row-major is better for uscgemm
- //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W;
- // Having W be a pointer to a matrix allows ease of sharing
- // input and output word embeddings
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
- std::vector<double> W_data;
- Matrix<double,Dynamic,1> b;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
- Matrix<double,Dynamic,1> b_running_gradient;
- Matrix<double,Dynamic,1> b_gradient;
- Matrix<double,Dynamic,1> b_running_parameter_update;
-
- public:
- Output_word_embeddings() { }
- Output_word_embeddings(int rows, int cols) { resize(rows, cols); }
-
- void resize(int rows, int cols)
- {
- W->setZero(rows, cols);
- b.setZero(rows);
- }
- void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
- W = input_W;
- }
- void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); }
- void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); }
- void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
- void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
-
- template <typename Engine>
- void initialize(Engine &engine,
- bool init_normal,
- double init_range,
- double init_bias,
- string &parameter_update,
- double adagrad_epsilon)
- {
+ private:
+ // row-major is better for uscgemm
+ //Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W;
+ // Having W be a pointer to a matrix allows ease of sharing
+ // input and output word embeddings
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
+ std::vector<double> W_data;
+ Matrix<double,Dynamic,1> b;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
+ Matrix<double,Dynamic,1> b_running_gradient;
+ Matrix<double,Dynamic,1> b_gradient;
+ Matrix<double,Dynamic,1> b_running_parameter_update;
+
+ public:
+ Output_word_embeddings() { }
+ Output_word_embeddings(int rows, int cols) { resize(rows, cols); }
+
+ void resize(int rows, int cols)
+ {
+ W->setZero(rows, cols);
+ b.setZero(rows);
+ }
+ void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
+ W = input_W;
+ }
+ void read_weights(std::ifstream &W_file) { readMatrix(W_file, *W); }
+ void write_weights(std::ofstream &W_file) { writeMatrix(*W, W_file); }
+ void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
+ void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
- W_gradient.setZero(W->rows(),W->cols());
- b_gradient.setZero(b.size());
- if (parameter_update == "ADA") {
- W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
- b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
- //W_gradient.setZero(W->rows(),W->cols());
- //b_gradient.setZero(b.size());
- }
- if (parameter_update == "ADAD") {
- W_running_gradient.setZero(W->rows(),W->cols());
- b_running_gradient.setZero(b.size());
- W_gradient.setZero(W->rows(),W->cols());
- //b_gradient.setZero(b.size());
- //W_running_parameter_update.setZero(W->rows(),W->cols());
- b_running_parameter_update.setZero(b.size());
- }
-
- initMatrix(engine, *W, init_normal, init_range);
- b.fill(init_bias);
- }
-
- int n_inputs () const { return W->cols(); }
- int n_outputs () const { return W->rows(); }
-
- template <typename DerivedIn, typename DerivedOut>
- void fProp(const MatrixBase<DerivedIn> &input,
- const MatrixBase<DerivedOut> &output) const
- {
- UNCONST(DerivedOut, output, my_output);
- my_output = ((*W) * input).colwise() + b;
- }
-
- // Sparse output version
- template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
- void fProp(const MatrixBase<DerivedIn> &input,
- const MatrixBase<DerivedOutI> &samples,
- const MatrixBase<DerivedOutV> &output) const
- {
- UNCONST(DerivedOutV, output, my_output);
- #pragma omp parallel for
- for (int instance_id = 0; instance_id < samples.cols(); instance_id++)
- {
- for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
- {
- my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
- }
- }
- USCMatrix<double> sparse_output(W->rows(), samples, my_output);
- uscgemm_masked(1.0, *W, input, sparse_output);
- my_output = sparse_output.values; // too bad, so much copying
- }
-
- // Return single element of output matrix
- template <typename DerivedIn>
- double fProp(const MatrixBase<DerivedIn> &input,
- int word,
- int instance) const
- {
- return W->row(word).dot(input.col(instance)) + b(word);
+ template <typename Engine>
+ void initialize(Engine &engine,
+ bool init_normal,
+ double init_range,
+ double init_bias,
+ string &parameter_update,
+ double adagrad_epsilon)
+ {
+
+ W_gradient.setZero(W->rows(),W->cols());
+ b_gradient.setZero(b.size());
+ if (parameter_update == "ADA") {
+ W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+ b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+ //W_gradient.setZero(W->rows(),W->cols());
+ //b_gradient.setZero(b.size());
+ }
+ if (parameter_update == "ADAD") {
+ W_running_gradient.setZero(W->rows(),W->cols());
+ b_running_gradient.setZero(b.size());
+ W_gradient.setZero(W->rows(),W->cols());
+ //b_gradient.setZero(b.size());
+ //W_running_parameter_update.setZero(W->rows(),W->cols());
+ b_running_parameter_update.setZero(b.size());
}
- // Dense versions (for log-likelihood loss)
+ initMatrix(engine, *W, init_normal, init_range);
+ b.fill(init_bias);
+ }
+
+ int n_inputs () const { return W->cols(); }
+ int n_outputs () const { return W->rows(); }
- template <typename DerivedGOut, typename DerivedGIn>
- void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix,
- const MatrixBase<DerivedGIn> &bProp_matrix) const
+ template <typename DerivedIn, typename DerivedOut>
+ void fProp(const MatrixBase<DerivedIn> &input,
+ const MatrixBase<DerivedOut> &output) const
+ {
+ UNCONST(DerivedOut, output, my_output);
+ my_output = ((*W) * input).colwise() + b;
+ /* TODO: without EIGEN_NO_DEBUG - is this a bug?
+ ProductBase.h:102: Eigen::ProductBase<Derived, Lhs, Rhs>::ProductBase(const Lhs&
+ , const Rhs&) [with Derived = Eigen::GeneralProduct<Eigen::Matrix<double, -1, -1
+ , 1>, Eigen::Matrix<double, -1, -1>, 5>; Lhs = Eigen::Matrix<double, -1, -1, 1>;
+ Rhs = Eigen::Matrix<double, -1, -1>]: Assertion `a_lhs.cols() == a_rhs.rows() &
+ & "invalid matrix product" && "if you wanted a coeff-wise or a dot product use t
+ he respective explicit functions"' failed.
+
+ (gdb) p a_lhs.cols()
+ $3 = 50
+ (gdb) p a_rhs.rows()
+ $4 = 100
+
+ (gdb) p a_lhs.rows()
+ $5 = 2
+ (gdb) p a_rhs.cols()
+ $6 = 1
+
+ from lookup_ngram normalization prop.skip_hidden in neuralNetwork.h:100
+ */
+ }
+
+ // Sparse output version
+ template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
+ void fProp(const MatrixBase<DerivedIn> &input,
+ const MatrixBase<DerivedOutI> &samples,
+ const MatrixBase<DerivedOutV> &output) const
+ {
+ UNCONST(DerivedOutV, output, my_output);
+#pragma omp parallel for
+ for (int instance_id = 0; instance_id < samples.cols(); instance_id++)
{
- // W is vocab_size x output_embedding_dimension
- // input_bProp_matrix is vocab_size x minibatch_size
- // bProp_matrix is output_embedding_dimension x minibatch_size
- UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
- my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() =
+ for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
+ {
+ my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
+ }
+ }
+ USCMatrix<double> sparse_output(W->rows(), samples, my_output);
+ uscgemm_masked(1.0, *W, input, sparse_output);
+ my_output = sparse_output.values; // too bad, so much copying
+ }
+
+ // Return single element of output matrix
+ template <typename DerivedIn>
+ double fProp(const MatrixBase<DerivedIn> &input,
+ int word,
+ int instance) const
+ {
+ return W->row(word).dot(input.col(instance)) + b(word);
+ }
+
+ // Dense versions (for log-likelihood loss)
+
+ template <typename DerivedGOut, typename DerivedGIn>
+ void bProp(const MatrixBase<DerivedGOut> &input_bProp_matrix,
+ const MatrixBase<DerivedGIn> &bProp_matrix) const
+ {
+ // W is vocab_size x output_embedding_dimension
+ // input_bProp_matrix is vocab_size x minibatch_size
+ // bProp_matrix is output_embedding_dimension x minibatch_size
+ UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
+ my_bProp_matrix.leftCols(input_bProp_matrix.cols()).noalias() =
W->transpose() * input_bProp_matrix;
- }
+ }
- template <typename DerivedIn, typename DerivedGOut>
- void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOut> &bProp_input,
- double learning_rate,
- double momentum) //not sure if we want to use momentum here
- {
- // W is vocab_size x output_embedding_dimension
- // b is vocab_size x 1
- // predicted_embeddings is output_embedding_dimension x minibatch_size
- // bProp_input is vocab_size x minibatch_size
- W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose();
- b += learning_rate * bProp_input.rowwise().sum();
-
- /*
- //GRADIENT CLIPPING
- W->noalias() += learning_rate *
- ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix();
- b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix();
- //UPDATE CLIPPING
- W->noalias() += (learning_rate *
- (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix();
- b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix();
- */
- }
-
- template <typename DerivedIn, typename DerivedGOut>
- void computeGradientAdagrad(
- const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOut> &bProp_input,
- double learning_rate) //not sure if we want to use momentum here
- {
- // W is vocab_size x output_embedding_dimension
- // b is vocab_size x 1
- // predicted_embeddings is output_embedding_dimension x minibatch_size
- // bProp_input is vocab_size x minibatch_sizea
- W_gradient.setZero(W->rows(), W->cols());
- b_gradient.setZero(b.size());
- W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
- b_gradient.noalias() = bProp_input.rowwise().sum();
- W_running_gradient += W_gradient.array().square().matrix();
- b_running_gradient += b_gradient.array().square().matrix();
- W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix();
- b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
- /*
- //UPDATE CLIPPING
- *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
- b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
- */
- }
-
- template <typename DerivedIn, typename DerivedGOut>
- void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOut> &bProp_input,
- double learning_rate,
- double conditioning_constant,
- double decay) //not sure if we want to use momentum here
- {
- // W is vocab_size x output_embedding_dimension
- // b is vocab_size x 1
- // predicted_embeddings is output_embedding_dimension x minibatch_size
- // bProp_input is vocab_size x minibatch_size
- Array<double,Dynamic,Dynamic> W_current_parameter_update;
- Array<double,Dynamic,1> b_current_parameter_update;
- W_gradient.setZero(W->rows(), W->cols());
- b_gradient.setZero(b.size());
- W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
- b_gradient.noalias() = bProp_input.rowwise().sum();
- W_running_gradient = decay*W_running_gradient +
- (1.-decay)*W_gradient.array().square().matrix();
- b_running_gradient = decay*b_running_gradient+
- (1.-decay)*b_gradient.array().square().matrix();
- W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/
- (W_running_gradient.array()+conditioning_constant).sqrt())*
- W_gradient.array();
- b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
- (b_running_gradient.array()+conditioning_constant).sqrt())*
- b_gradient.array();
- W_running_parameter_update = decay*W_running_parameter_update +
- (1.-decay)*W_current_parameter_update.square().matrix();
- b_running_parameter_update = decay*b_running_parameter_update +
- (1.-decay)*b_current_parameter_update.square().matrix();
-
- *W += learning_rate*W_current_parameter_update.matrix();
- b += learning_rate*b_current_parameter_update.matrix();
- }
-
- // Sparse versions
-
- template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn>
- void bProp(const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- const MatrixBase<DerivedGIn> &bProp_matrix) const
- {
- UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
- my_bProp_matrix.setZero();
- uscgemm(1.0,
- W->transpose(),
+ template <typename DerivedIn, typename DerivedGOut>
+ void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOut> &bProp_input,
+ double learning_rate,
+ double momentum) //not sure if we want to use momentum here
+ {
+ // W is vocab_size x output_embedding_dimension
+ // b is vocab_size x 1
+ // predicted_embeddings is output_embedding_dimension x minibatch_size
+ // bProp_input is vocab_size x minibatch_size
+ W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose();
+ b += learning_rate * bProp_input.rowwise().sum();
+
+ /*
+ //GRADIENT CLIPPING
+ W->noalias() += learning_rate *
+ ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix();
+ b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix();
+ //UPDATE CLIPPING
+ W->noalias() += (learning_rate *
+ (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix();
+ b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix();
+ */
+ }
+
+ template <typename DerivedIn, typename DerivedGOut>
+ void computeGradientAdagrad(
+ const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOut> &bProp_input,
+ double learning_rate) //not sure if we want to use momentum here
+ {
+ // W is vocab_size x output_embedding_dimension
+ // b is vocab_size x 1
+ // predicted_embeddings is output_embedding_dimension x minibatch_size
+      // bProp_input is vocab_size x minibatch_size
+ W_gradient.setZero(W->rows(), W->cols());
+ b_gradient.setZero(b.size());
+ W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
+ W_running_gradient += W_gradient.array().square().matrix();
+ b_running_gradient += b_gradient.array().square().matrix();
+ W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix();
+ b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+ /*
+ //UPDATE CLIPPING
+ *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+ */
+ }
+
+ template <typename DerivedIn, typename DerivedGOut>
+ void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOut> &bProp_input,
+ double learning_rate,
+ double conditioning_constant,
+ double decay) //not sure if we want to use momentum here
+ {
+ // W is vocab_size x output_embedding_dimension
+ // b is vocab_size x 1
+ // predicted_embeddings is output_embedding_dimension x minibatch_size
+ // bProp_input is vocab_size x minibatch_size
+ Array<double,Dynamic,Dynamic> W_current_parameter_update;
+ Array<double,Dynamic,1> b_current_parameter_update;
+ W_gradient.setZero(W->rows(), W->cols());
+ b_gradient.setZero(b.size());
+ W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+ b_gradient.noalias() = bProp_input.rowwise().sum();
+ W_running_gradient = decay*W_running_gradient +
+ (1.-decay)*W_gradient.array().square().matrix();
+ b_running_gradient = decay*b_running_gradient+
+ (1.-decay)*b_gradient.array().square().matrix();
+ W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/
+ (W_running_gradient.array()+conditioning_constant).sqrt())*
+ W_gradient.array();
+ b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+ (b_running_gradient.array()+conditioning_constant).sqrt())*
+ b_gradient.array();
+ W_running_parameter_update = decay*W_running_parameter_update +
+ (1.-decay)*W_current_parameter_update.square().matrix();
+ b_running_parameter_update = decay*b_running_parameter_update +
+ (1.-decay)*b_current_parameter_update.square().matrix();
+
+ *W += learning_rate*W_current_parameter_update.matrix();
+ b += learning_rate*b_current_parameter_update.matrix();
+ }
+
+ // Sparse versions
+
+ template <typename DerivedGOutI, typename DerivedGOutV, typename DerivedGIn>
+ void bProp(const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ const MatrixBase<DerivedGIn> &bProp_matrix) const
+ {
+ UNCONST(DerivedGIn, bProp_matrix, my_bProp_matrix);
+ my_bProp_matrix.setZero();
+ uscgemm(1.0,
+ W->transpose(),
USCMatrix<double>(W->rows(), samples, weights),
my_bProp_matrix.leftCols(samples.cols())); // narrow bProp_matrix for possible short minibatch
+ }
+
+ template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+ void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ double learning_rate, double momentum) //not sure if we want to use momentum here
+ {
+ //cerr<<"in gradient"<<endl;
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(learning_rate,
+ gradient_output,
+ predicted_embeddings.leftCols(gradient_output.cols()).transpose(),
+ *W); // narrow predicted_embeddings for possible short minibatch
+ uscgemv(learning_rate,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
+ b);
+ /*
+ //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT
+ //FIRST
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ W_gradient);
+ uscgemv(1.0,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ b_gradient);
+
+ int_map update_map; //stores all the parameters that have been updated
+ for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+ for (int train_id=0; train_id<samples.cols(); train_id++)
+ update_map[samples(sample_id, train_id)] = 1;
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ update_items.push_back(it->first);
+ int num_items = update_items.size();
+
+ //#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ //W->row(update_item) += learning_rate * W_gradient.row(update_item);
+ //b(update_item) += learning_rate * b_gradient(update_item);
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix();
+ double update = learning_rate * b_gradient(update_item);
+ b(update_item) += std::min(0.5, std::max(update,-0.5));
+ //GRADIENT CLIPPING
+ W_gradient.row(update_item).setZero();
+ b_gradient(update_item) = 0.;
}
+ */
+ //cerr<<"Finished gradient"<<endl;
+ }
- template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
- void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- double learning_rate, double momentum) //not sure if we want to use momentum here
- {
- //cerr<<"in gradient"<<endl;
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(learning_rate,
- gradient_output,
- predicted_embeddings.leftCols(gradient_output.cols()).transpose(),
- *W); // narrow predicted_embeddings for possible short minibatch
- uscgemv(learning_rate,
- gradient_output,
- Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
- b);
+ template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+ void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ double learning_rate) //not sure if we want to use momentum here
+ {
+ //W_gradient.setZero(W->rows(), W->cols());
+ //b_gradient.setZero(b.size());
+ //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ W_gradient);
+ uscgemv(1.0,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ b_gradient);
+
+ int_map update_map; //stores all the parameters that have been updated
+ for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+ for (int train_id=0; train_id<samples.cols(); train_id++)
+ update_map[samples(sample_id, train_id)] = 1;
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ update_items.push_back(it->first);
+ int num_items = update_items.size();
+
+ //#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
+ b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item);
+ W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
+ b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
/*
- //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT
- //FIRST
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(1.0,
- gradient_output,
- predicted_embeddings.leftCols(samples.cols()).transpose(),
- W_gradient);
- uscgemv(1.0,
- gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()),
- b_gradient);
-
- int_map update_map; //stores all the parameters that have been updated
- for (int sample_id=0; sample_id<samples.rows(); sample_id++)
- for (int train_id=0; train_id<samples.cols(); train_id++)
- update_map[samples(sample_id, train_id)] = 1;
-
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- update_items.push_back(it->first);
- int num_items = update_items.size();
-
- //#pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- int update_item = update_items[item_id];
- //W->row(update_item) += learning_rate * W_gradient.row(update_item);
- //b(update_item) += learning_rate * b_gradient(update_item);
- //UPDATE CLIPPING
- W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix();
- double update = learning_rate * b_gradient(update_item);
- b(update_item) += std::min(0.5, std::max(update,-0.5));
- //GRADIENT CLIPPING
- W_gradient.row(update_item).setZero();
- b_gradient(update_item) = 0.;
- }
- */
- //cerr<<"Finished gradient"<<endl;
- }
-
- template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
- void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- double learning_rate) //not sure if we want to use momentum here
- {
- //W_gradient.setZero(W->rows(), W->cols());
- //b_gradient.setZero(b.size());
- //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(1.0,
- gradient_output,
- predicted_embeddings.leftCols(samples.cols()).transpose(),
- W_gradient);
- uscgemv(1.0,
- gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()),
- b_gradient);
-
- int_map update_map; //stores all the parameters that have been updated
- for (int sample_id=0; sample_id<samples.rows(); sample_id++)
- for (int train_id=0; train_id<samples.cols(); train_id++)
- update_map[samples(sample_id, train_id)] = 1;
-
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- update_items.push_back(it->first);
- int num_items = update_items.size();
-
- //#pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- int update_item = update_items[item_id];
- W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
- b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item);
- W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
- b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
- /*
- //UPDATE CLIPPING
- W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix();
- double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
- b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5));
- */
- W_gradient.row(update_item).setZero();
- b_gradient(update_item) = 0.;
- }
- }
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix();
+ double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+ b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5));
+ */
+ W_gradient.row(update_item).setZero();
+ b_gradient(update_item) = 0.;
+ }
+ }
- template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
- void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- double learning_rate,
- double conditioning_constant,
- double decay) //not sure if we want to use momentum here
- {
- //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl;
- //W_gradient.setZero(W->rows(), W->cols());
- //b_gradient.setZero(b.size());
-
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(1.0,
- gradient_output,
- predicted_embeddings.leftCols(samples.cols()).transpose(),
- W_gradient);
- uscgemv(1.0,
- gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()),
- b_gradient);
-
- int_map update_map; //stores all the parameters that have been updated
- for (int sample_id=0; sample_id<samples.rows(); sample_id++)
- for (int train_id=0; train_id<samples.cols(); train_id++)
- update_map[samples(sample_id, train_id)] = 1;
-
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- update_items.push_back(it->first);
- int num_items = update_items.size();
-
- #pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- Array<double,1,Dynamic> W_current_parameter_update;
- double b_current_parameter_update;
-
- int update_item = update_items[item_id];
- W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
- (1.-decay)*W_gradient.row(update_item).array().square().matrix();
- b_running_gradient(update_item) = decay*b_running_gradient(update_item)+
- (1.-decay)*b_gradient(update_item)*b_gradient(update_item);
- //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl;
- //getchar();
-
- //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl;
- //getchar();
- W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
- (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
- W_gradient.row(update_item).array();
- b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/
- sqrt(b_running_gradient(update_item)+conditioning_constant))*
- b_gradient(update_item);
- //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl;
- //getchar();
- //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl;
- //getchar();
- //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl;
- W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
- (1.-decay)*(W_current_parameter_update.square().matrix());
- b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+
- (1.-decay)*b_current_parameter_update*b_current_parameter_update;
- //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl;
- //getchar();
- W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
- b(update_item) += learning_rate*b_current_parameter_update;
- W_gradient.row(update_item).setZero();
- b_gradient(update_item) = 0.;
- }
- }
+ template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+ void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ double learning_rate,
+ double conditioning_constant,
+ double decay) //not sure if we want to use momentum here
+ {
+ //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl;
+ //W_gradient.setZero(W->rows(), W->cols());
+ //b_gradient.setZero(b.size());
+
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ W_gradient);
+ uscgemv(1.0,
+ gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()),
+ b_gradient);
+
+ int_map update_map; //stores all the parameters that have been updated
+ for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+ for (int train_id=0; train_id<samples.cols(); train_id++)
+ update_map[samples(sample_id, train_id)] = 1;
+
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+ update_items.push_back(it->first);
+ int num_items = update_items.size();
+
+#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ Array<double,1,Dynamic> W_current_parameter_update;
+ double b_current_parameter_update;
+
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
+ (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+ b_running_gradient(update_item) = decay*b_running_gradient(update_item)+
+ (1.-decay)*b_gradient(update_item)*b_gradient(update_item);
+ //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl;
+ //getchar();
+
+ //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl;
+ //getchar();
+ W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
+ (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+ W_gradient.row(update_item).array();
+ b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/
+ sqrt(b_running_gradient(update_item)+conditioning_constant))*
+ b_gradient(update_item);
+ //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl;
+ //getchar();
+ //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl;
+ //getchar();
+ //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl;
+ W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
+ (1.-decay)*(W_current_parameter_update.square().matrix());
+ b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+
+ (1.-decay)*b_current_parameter_update*b_current_parameter_update;
+ //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl;
+ //getchar();
+ W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+ b(update_item) += learning_rate*b_current_parameter_update;
+ W_gradient.row(update_item).setZero();
+ b_gradient(update_item) = 0.;
+ }
+ }
- template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb>
- void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings,
- const MatrixBase<DerivedGOutI> &samples,
- const MatrixBase<DerivedGOutV> &weights,
- const MatrixBase<DerivedGW> &gradient_W,
- const MatrixBase<DerivedGb> &gradient_b) const
- {
- UNCONST(DerivedGW, gradient_W, my_gradient_W);
- UNCONST(DerivedGb, gradient_b, my_gradient_b);
- my_gradient_W.setZero();
- my_gradient_b.setZero();
- USCMatrix<double> gradient_output(W->rows(), samples, weights);
- uscgemm(1.0,
- gradient_output,
- predicted_embeddings.leftCols(samples.cols()).transpose(),
- my_gradient_W);
- uscgemv(1.0, gradient_output,
- Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b);
+ template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb>
+ void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings,
+ const MatrixBase<DerivedGOutI> &samples,
+ const MatrixBase<DerivedGOutV> &weights,
+ const MatrixBase<DerivedGW> &gradient_W,
+ const MatrixBase<DerivedGb> &gradient_b) const
+ {
+ UNCONST(DerivedGW, gradient_W, my_gradient_W);
+ UNCONST(DerivedGb, gradient_b, my_gradient_b);
+ my_gradient_W.setZero();
+ my_gradient_b.setZero();
+ USCMatrix<double> gradient_output(W->rows(), samples, weights);
+ uscgemm(1.0,
+ gradient_output,
+ predicted_embeddings.leftCols(samples.cols()).transpose(),
+ my_gradient_W);
+ uscgemv(1.0, gradient_output,
+ Matrix<double,Dynamic,1>::Ones(weights.cols()), my_gradient_b);
}
};
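
The sparse fProp above only evaluates the rows of W named in samples, which is what keeps NCE-style training independent of the vocabulary size. Stripped of the USCMatrix/uscgemm_masked machinery, the score it computes is score(s, i) = W.row(samples(s, i)).dot(input.col(i)) + b(samples(s, i)). A dense, illustrative sketch of that computation, with assumed names and dimensions:

    #include <Eigen/Dense>
    #include <iostream>

    // Dense equivalent of the sampled output scoring above (illustrative only).
    int main() {
      using namespace Eigen;
      const int vocab = 6, dim = 3, minibatch = 2, n_samples = 2;

      MatrixXd W = MatrixXd::Random(vocab, dim);            // output embeddings, vocab x dim
      VectorXd b = VectorXd::Random(vocab);                 // output biases
      MatrixXd input = MatrixXd::Random(dim, minibatch);    // predicted hidden vectors

      MatrixXi samples(n_samples, minibatch);               // sampled word ids per instance
      samples << 1, 4,
                 3, 0;

      MatrixXd scores(n_samples, minibatch);
      for (int i = 0; i < minibatch; ++i)
        for (int s = 0; s < n_samples; ++s) {
          int w = samples(s, i);
          scores(s, i) = W.row(w).dot(input.col(i)) + b(w);
        }

      std::cout << scores << std::endl;
      return 0;
    }
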
class Input_word_embeddings
{
- private:
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
- int context_size, vocab_size;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
- Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
-
- friend class model;
-
- public:
- Input_word_embeddings() : context_size(0), vocab_size(0) { }
- Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); }
-
- void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
- W = input_W;
- }
+ private:
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
+ int context_size, vocab_size;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
+ Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
- void resize(int rows, int cols, int context)
- {
- context_size = context;
- vocab_size = rows;
- W->setZero(rows, cols);
- }
+ friend class model;
- void read(std::ifstream &W_file) { readMatrix(W_file, *W); }
- void write(std::ofstream &W_file) { writeMatrix(*W, W_file); }
+ public:
+ Input_word_embeddings() : context_size(0), vocab_size(0) { }
+ Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); }
- template <typename Engine>
- void initialize(Engine &engine,
- bool init_normal,
- double init_range,
- string &parameter_update,
- double adagrad_epsilon)
- {
- W_gradient.setZero(W->rows(),W->cols());
-
- if (parameter_update == "ADA") {
- W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
- //W_gradient.setZero(W->rows(),W->cols());
- }
- if (parameter_update == "ADAD") {
- W_running_gradient.setZero(W->rows(),W->cols());
- //W_gradient.setZero(W->rows(),W->cols());
- W_running_parameter_update.setZero(W->rows(),W->cols());
- }
- initMatrix(engine,
- *W,
- init_normal,
- init_range);
- }
+ void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
+ W = input_W;
+ }
- int n_inputs() const { return -1; }
- int n_outputs() const { return W->cols() * context_size; }
-
- // set output_id's embedding to the weighted average of all embeddings
- template <typename Dist>
- void average(const Dist &dist, int output_id)
- {
- W->row(output_id).setZero();
- for (int i=0; i < W->rows(); i++)
- if (i != output_id)
- W->row(output_id) += dist.prob(i) * W->row(i);
- }
-
- template <typename DerivedIn, typename DerivedOut>
- void fProp(const MatrixBase<DerivedIn> &input,
- const MatrixBase<DerivedOut> &output) const
- {
- int embedding_dimension = W->cols();
-
- // W is vocab_size x embedding_dimension
- // input is ngram_size*vocab_size x minibatch_size
- // output is ngram_size*embedding_dimension x minibatch_size
-
- /*
- // Dense version:
- for (int ngram=0; ngram<context_size; ngram++)
- output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size);
- */
-
- UNCONST(DerivedOut, output, my_output);
- my_output.setZero();
- for (int ngram=0; ngram<context_size; ngram++)
- {
- // input might be narrower than expected due to a short minibatch,
- // so narrow output to match
- uscgemm(1.0,
- W->transpose(),
- USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())),
- my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols()));
- }
- }
-
- // When model is premultiplied, this layer doesn't get used,
- // but this method is used to get the input into a sparse matrix.
- // Hopefully this can get eliminated someday
- template <typename DerivedIn, typename ScalarOut>
- void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const
- {
- output.resize(vocab_size*context_size, context_size, input.cols());
- for (int i=0; i < context_size; i++)
- output.indexes.row(i).array() = input.row(i).array() + i*vocab_size;
- output.values.fill(1.0);
- }
+ void resize(int rows, int cols, int context)
+ {
+ context_size = context;
+ vocab_size = rows;
+ W->setZero(rows, cols);
+ }
+
+ void read(std::ifstream &W_file) { readMatrix(W_file, *W); }
+ void write(std::ofstream &W_file) { writeMatrix(*W, W_file); }
+
+ template <typename Engine>
+ void initialize(Engine &engine,
+ bool init_normal,
+ double init_range,
+ string &parameter_update,
+ double adagrad_epsilon)
+ {
+ W_gradient.setZero(W->rows(),W->cols());
+
+ if (parameter_update == "ADA") {
+ W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+ //W_gradient.setZero(W->rows(),W->cols());
+ }
+ if (parameter_update == "ADAD") {
+ W_running_gradient.setZero(W->rows(),W->cols());
+ //W_gradient.setZero(W->rows(),W->cols());
+ W_running_parameter_update.setZero(W->rows(),W->cols());
+ }
+ initMatrix(engine,
+ *W,
+ init_normal,
+ init_range);
+ }
+
+ int n_inputs() const { return -1; }
+ int n_outputs() const { return W->cols() * context_size; }
+
+ // set output_id's embedding to the weighted average of all embeddings
+ template <typename Dist>
+ void average(const Dist &dist, int output_id)
+ {
+ W->row(output_id).setZero();
+ for (int i=0; i < W->rows(); i++)
+ if (i != output_id)
+ W->row(output_id) += dist.prob(i) * W->row(i);
+ }
+
+ template <typename DerivedIn, typename DerivedOut>
+ void fProp(const MatrixBase<DerivedIn> &input,
+ const MatrixBase<DerivedOut> &output) const
+ {
+ int embedding_dimension = W->cols();
+
+ // W is vocab_size x embedding_dimension
+ // input is ngram_size*vocab_size x minibatch_size
+ // output is ngram_size*embedding_dimension x minibatch_size
+
+ /*
+ // Dense version:
+ for (int ngram=0; ngram<context_size; ngram++)
+ output.middleRows(ngram*embedding_dimension, embedding_dimension) = W.transpose() * input.middleRows(ngram*vocab_size, vocab_size);
+ */
+
+ UNCONST(DerivedOut, output, my_output);
+ my_output.setZero();
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ // input might be narrower than expected due to a short minibatch,
+ // so narrow output to match
+ uscgemm(1.0,
+ W->transpose(),
+ USCMatrix<double>(W->rows(),input.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input.cols())),
+ my_output.block(ngram*embedding_dimension, 0, embedding_dimension, input.cols()));
+ }
+ }
+
+ // When model is premultiplied, this layer doesn't get used,
+ // but this method is used to get the input into a sparse matrix.
+ // Hopefully this can get eliminated someday
+ template <typename DerivedIn, typename ScalarOut>
+ void munge(const MatrixBase<DerivedIn> &input, USCMatrix<ScalarOut> &output) const
+ {
+ output.resize(vocab_size*context_size, context_size, input.cols());
+ for (int i=0; i < context_size; i++)
+ output.indexes.row(i).array() = input.row(i).array() + i*vocab_size;
+ output.values.fill(1.0);
+ }
template <typename DerivedGOut, typename DerivedIn>
void computeGradient(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &input_words,
- double learning_rate, double momentum, double L2_reg)
+ const MatrixBase<DerivedIn> &input_words,
+ double learning_rate, double momentum, double L2_reg)
{
- int embedding_dimension = W->cols();
+ int embedding_dimension = W->cols();
- // W is vocab_size x embedding_dimension
- // input is ngram_size*vocab_size x minibatch_size
- // bProp_input is ngram_size*embedding_dimension x minibatch_size
+ // W is vocab_size x embedding_dimension
+ // input is ngram_size*vocab_size x minibatch_size
+ // bProp_input is ngram_size*embedding_dimension x minibatch_size
- /*
- // Dense version:
- for (int ngram=0; ngram<context_size; ngram++)
- W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose()
- */
+ /*
+ // Dense version:
+ for (int ngram=0; ngram<context_size; ngram++)
+ W += learning_rate * input_words.middleRows(ngram*vocab_size, vocab_size) * bProp_input.middleRows(ngram*embedding_dimension, embedding_dimension).transpose()
+ */
- for (int ngram=0; ngram<context_size; ngram++)
- {
- uscgemm(learning_rate,
- USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
- *W);
- }
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(learning_rate,
+ USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
+ *W);
+ }
- /*
- //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN
- //PERFORM CLIPPING WHILE UPDATING
-
- for (int ngram=0; ngram<context_size; ngram++)
- {
- uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
- W_gradient);
- }
- int_map update_map; //stores all the parameters that have been updated
- for (int ngram=0; ngram<context_size; ngram++)
- {
- for (int train_id=0; train_id<input_words.cols(); train_id++)
- {
- update_map[input_words(ngram,train_id)] = 1;
- }
- }
+ /*
+ //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN
+ //PERFORM CLIPPING WHILE UPDATING
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- {
- update_items.push_back(it->first);
- }
- int num_items = update_items.size();
-
- #pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- int update_item = update_items[item_id];
- //UPDATE CLIPPING
- W->row(update_item) += (learning_rate*
- W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix();
- //GRADIENT CLIPPING
- //W->row(update_item) += learning_rate*
- // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix();
- //SETTING THE GRADIENT TO ZERO
- W_gradient.row(update_item).setZero();
- }
- */
- }
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ W_gradient);
+ }
+ int_map update_map; //stores all the parameters that have been updated
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ for (int train_id=0; train_id<input_words.cols(); train_id++)
+ {
+ update_map[input_words(ngram,train_id)] = 1;
+ }
+ }
- template <typename DerivedGOut, typename DerivedIn>
- void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &input_words,
- double learning_rate,
- double L2_reg)
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
{
- int embedding_dimension = W->cols();
- //W_gradient.setZero(W->rows(), W->cols());
- /*
+ update_items.push_back(it->first);
+ }
+ int num_items = update_items.size();
+
+ #pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate*
+ W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix();
+ //GRADIENT CLIPPING
+ //W->row(update_item) += learning_rate*
+ // W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix();
+ //SETTING THE GRADIENT TO ZERO
+ W_gradient.row(update_item).setZero();
+ }
+ */
+ }
+
+ template <typename DerivedGOut, typename DerivedIn>
+ void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &input_words,
+ double learning_rate,
+ double L2_reg)
+ {
+ int embedding_dimension = W->cols();
+ //W_gradient.setZero(W->rows(), W->cols());
+ /*
if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
- W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
- */
- for (int ngram=0; ngram<context_size; ngram++)
- {
- uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
- W_gradient);
- }
- int_map update_map; //stores all the parameters that have been updated
- for (int ngram=0; ngram<context_size; ngram++)
- {
- for (int train_id=0; train_id<input_words.cols(); train_id++)
- {
- update_map[input_words(ngram,train_id)] = 1;
- }
+ W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+ */
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ W_gradient);
+ }
+ int_map update_map; //stores all the parameters that have been updated
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ for (int train_id=0; train_id<input_words.cols(); train_id++)
+ {
+ update_map[input_words(ngram,train_id)] = 1;
}
+ }
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- {
- update_items.push_back(it->first);
- }
- int num_items = update_items.size();
-
- #pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
- int update_item = update_items[item_id];
- W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
- W->row(update_item) += learning_rate *
- (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
- /*
- //UPDATE CLIPPING
- W->row(update_item) += (learning_rate *
- (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()))
- .unaryExpr(Clipper()).matrix();
- */
- W_gradient.row(update_item).setZero();
- }
- }
-
- template <typename DerivedGOut, typename DerivedIn>
- void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &input_words,
- double learning_rate,
- double L2_reg,
- double conditioning_constant,
- double decay)
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
{
- int embedding_dimension = W->cols();
+ update_items.push_back(it->first);
+ }
+ int num_items = update_items.size();
- //W_gradient.setZero(W->rows(), W->cols());
+#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
+ W->row(update_item) += learning_rate *
+ (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
/*
- if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
- W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+ //UPDATE CLIPPING
+ W->row(update_item) += (learning_rate *
+ (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()))
+ .unaryExpr(Clipper()).matrix();
*/
- for (int ngram=0; ngram<context_size; ngram++)
- {
- uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
- W_gradient);
- }
- int_map update_map; //stores all the parameters that have been updated
- for (int ngram=0; ngram<context_size; ngram++)
- {
- for (int train_id=0; train_id<input_words.cols(); train_id++)
- {
- update_map[input_words(ngram,train_id)] = 1;
- }
+ W_gradient.row(update_item).setZero();
+ }
+ }
+
+ template <typename DerivedGOut, typename DerivedIn>
+ void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &input_words,
+ double learning_rate,
+ double L2_reg,
+ double conditioning_constant,
+ double decay)
+ {
+ int embedding_dimension = W->cols();
+
+ //W_gradient.setZero(W->rows(), W->cols());
+ /*
+ if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+ W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+ */
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ W_gradient);
+ }
+ int_map update_map; //stores all the parameters that have been updated
+ for (int ngram=0; ngram<context_size; ngram++)
+ {
+ for (int train_id=0; train_id<input_words.cols(); train_id++)
+ {
+ update_map[input_words(ngram,train_id)] = 1;
}
+ }
- // Convert to std::vector for parallelization
- std::vector<int> update_items;
- for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
- {
- update_items.push_back(it->first);
- }
- int num_items = update_items.size();
-
- #pragma omp parallel for
- for (int item_id=0; item_id<num_items; item_id++)
- {
-
- Array<double,1,Dynamic> W_current_parameter_update;
- int update_item = update_items[item_id];
- W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
- (1.-decay)*W_gradient.row(update_item).array().square().matrix();
-
- W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
- (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
- W_gradient.row(update_item).array();
-
- //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl;
- //getchar();
- W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
- (1.-decay)*W_current_parameter_update.square().matrix();
-
- W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
- //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl;
- //getchar();
- W_gradient.row(update_item).setZero();
- }
-
- }
-
- template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
- void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
- const MatrixBase<DerivedIn> &input_words,
- int x, int minibatch_size,
- const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+ // Convert to std::vector for parallelization
+ std::vector<int> update_items;
+ for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
{
- UNCONST(DerivedGW, gradient, my_gradient);
- int embedding_dimension = W->cols();
- my_gradient.setZero();
- for (int ngram=0; ngram<context_size; ngram++)
- uscgemm(1.0,
- USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
- bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
- my_gradient);
+ update_items.push_back(it->first);
}
+ int num_items = update_items.size();
+
+#pragma omp parallel for
+ for (int item_id=0; item_id<num_items; item_id++)
+ {
+
+ Array<double,1,Dynamic> W_current_parameter_update;
+ int update_item = update_items[item_id];
+ W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+
+ (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+
+ W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
+ (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+ W_gradient.row(update_item).array();
+
+ //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl;
+ //getchar();
+ W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+
+ (1.-decay)*W_current_parameter_update.square().matrix();
+
+ W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+ //cerr<<"Input: After update, W is "<<W->row(update_item)<<endl;
+ //getchar();
+ W_gradient.row(update_item).setZero();
+ }
+
+ }
+
+ template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+ void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+ const MatrixBase<DerivedIn> &input_words,
+ int x, int minibatch_size,
+ const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+ {
+ UNCONST(DerivedGW, gradient, my_gradient);
+ int embedding_dimension = W->cols();
+ my_gradient.setZero();
+ for (int ngram=0; ngram<context_size; ngram++)
+ uscgemm(1.0,
+ USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+ bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+ my_gradient);
+ }
};
} // namespace nplm
-
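For readers following the reformatted computeGradientAdadelta above: the sketch below isolates the per-row Adadelta step it applies to every embedding row that appears in the minibatch. It is not the library's code; names are illustrative, Eigen is assumed, and the uscgemm accumulation and OpenMP loop are omitted.

#include <Eigen/Dense>

// One Adadelta step for a single embedding row, mirroring the update rule in
// computeGradientAdadelta above (including its extra learning_rate factor).
void adadelta_row_update(Eigen::MatrixXd &W,                  // embedding matrix
                         Eigen::MatrixXd &running_grad_sq,    // running E[g^2]
                         Eigen::MatrixXd &running_update_sq,  // running E[dx^2]
                         Eigen::MatrixXd &gradient,           // accumulated minibatch gradient
                         int row,
                         double learning_rate,
                         double decay,
                         double epsilon)                      // the "conditioning_constant"
{
    // Decay the running average of squared gradients for this row.
    running_grad_sq.row(row) = decay * running_grad_sq.row(row)
        + (1. - decay) * gradient.row(row).array().square().matrix();

    // RMS-scaled step: sqrt(E[dx^2] + eps) / sqrt(E[g^2] + eps) * g
    Eigen::Array<double, 1, Eigen::Dynamic> update =
        ((running_update_sq.row(row).array() + epsilon).sqrt()
         / (running_grad_sq.row(row).array() + epsilon).sqrt())
        * gradient.row(row).array();

    // Decay the running average of squared updates.
    running_update_sq.row(row) = decay * running_update_sq.row(row)
        + (1. - decay) * update.square().matrix();

    // Apply the step and clear the accumulated gradient, as the code above does.
    W.row(row) += learning_rate * update.matrix();
    gradient.row(row).setZero();
}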
diff --git a/src/neuralLM.h b/src/neuralLM.h
index 2004596..f0eebd8 100644
--- a/src/neuralLM.h
+++ b/src/neuralLM.h
@@ -6,119 +6,138 @@
#include <cstdlib>
#include <boost/shared_ptr.hpp>
-//#include <../3rdparty/Eigen/Dense>
#include <Eigen/Dense>
#include "util.h"
#include "vocabulary.h"
#include "neuralNetwork.h"
+#include "replace_digits.hpp"
/*
To do:
- move digit mapping into vocabulary.h
- */
+*/
namespace nplm
{
-class neuralLM : public neuralNetwork
+class neuralLM : public neuralNetwork, graehl::replace_digits
{
- char map_digits;
- boost::shared_ptr<vocabulary> vocab;
- int start, null;
+ boost::shared_ptr<vocabulary> vocab;
+ int start, null;
-public:
- neuralLM()
+ public:
+ neuralLM()
: neuralNetwork(),
- vocab(new vocabulary()),
- map_digits(0)
- {
- }
+ graehl::replace_digits(0),
+ vocab(new vocabulary())
+ {
+ }
- void set_map_digits(char value) { map_digits = value; }
+ void set_map_digits(char value) { map_digits = value; }
- void set_vocabulary(const vocabulary &vocab)
- {
- *(this->vocab) = vocab;
- start = vocab.lookup_word("<s>");
- null = vocab.lookup_word("<null>");
- }
+ void set_vocabulary(const vocabulary &vocab)
+ {
+ *(this->vocab) = vocab;
+ start = vocab.lookup_word("<s>");
+ null = vocab.lookup_word("<null>");
+ }
- const vocabulary &get_vocabulary() const { return *(this->vocab); }
+ const vocabulary &get_vocabulary() const { return *(this->vocab); }
- int lookup_word(const std::string &word) const
- {
- if (map_digits)
- for (int i=0; i<word.length(); i++)
- if (isdigit(word[i]))
- {
- std::string mapped_word(word);
- for (; i<word.length(); i++)
- if (isdigit(word[i]))
- mapped_word[i] = map_digits;
- return vocab->lookup_word(mapped_word);
- }
- return vocab->lookup_word(word);
- }
+ int lookup_input_word(const std::string &word) const
+ {
+ return lookup_word(word);
+ }
- double lookup_ngram(const int *ngram_a, int n)
- {
- Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
- for (int i=0; i<m->ngram_size; i++)
- {
- if (i-m->ngram_size+n < 0)
- {
- if (ngram_a[0] == start)
- ngram(i) = start;
- else
- ngram(i) = null;
- }
- else
- {
- ngram(i) = ngram_a[i-m->ngram_size+n];
- }
- }
- return neuralNetwork::lookup_ngram(ngram);
- }
+ int lookup_input_word(std::pair<char const*, char const*> word) const
+ {
+ return lookup_word(word);
+ }
- double lookup_ngram(const std::vector<int> &ngram_v)
- {
- return lookup_ngram(ngram_v.data(), ngram_v.size());
- }
- template <typename Derived>
- double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
- {
- return neuralNetwork::lookup_ngram(ngram);
- }
-
- template <typename DerivedA, typename DerivedB>
- void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
- {
- return neuralNetwork::lookup_ngram(ngram, log_probs_const);
- }
+ int lookup_word(const std::string &word) const
+ {
+ if (map_digits)
+ for (int i=0, n=word.size(); i<n; ++i)
+ if (graehl::ascii_digit(word[i])) {
+ std::string mapped_word(word);
+ replace(mapped_word, i);
+ return vocab->lookup_word(mapped_word);
+ }
+ return vocab->lookup_word(word);
+ }
+
+ int lookup_word(std::pair<char const*, char const*> slice) const
+ {
+ if (map_digits)
+ for (char const* i = slice.first; i != slice.second; ++i)
+ if (graehl::ascii_digit(*i)) {
+ std::string mapped_word(slice.first, slice.second);
+ replace(mapped_word, i - slice.first);
+ return vocab->lookup_word(mapped_word);
+ }
+ return vocab->lookup_word(slice);
+ }
- void read(const std::string &filename)
+ double lookup_ngram(const int *ngram_a, int n)
+ {
+ Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
+ for (int i=0; i<m->ngram_size; ++i)
{
- std::vector<std::string> words;
- m->read(filename, words);
- set_vocabulary(vocabulary(words));
- resize();
- // this is faster but takes more memory
- //m->premultiply();
+ if (i-m->ngram_size+n < 0)
+ {
+ if (ngram_a[0] == start)
+ ngram(i) = start;
+ else
+ ngram(i) = null;
+ }
+ else
+ {
+ ngram(i) = ngram_a[i-m->ngram_size+n];
+ }
}
+ return neuralNetwork::lookup_ngram(ngram);
+ }
+
+ double lookup_ngram(const std::vector<int> &ngram_v)
+ {
+ return lookup_ngram(ngram_v.data(), ngram_v.size());
+ }
+
+ template <typename Derived>
+ double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+ {
+ return neuralNetwork::lookup_ngram(ngram);
+ }
+
+ template <typename DerivedA, typename DerivedB>
+ void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+ {
+ return neuralNetwork::lookup_ngram(ngram, log_probs_const);
+ }
+
+ void read(const std::string &filename)
+ {
+ std::vector<std::string> words;
+ m->read(filename, words);
+ set_vocabulary(vocabulary(words));
+ resize();
+ // this is faster but takes more memory
+ //m->premultiply();
+ }
};
template <typename T>
void addStartStop(std::vector<T> &input, std::vector<T> &output, int ngram_size, const T &start, const T &stop)
{
- output.clear();
- output.resize(input.size()+ngram_size);
- for (int i=0; i<ngram_size-1; i++)
- output[i] = start;
- std::copy(input.begin(), input.end(), output.begin()+ngram_size-1);
- output[output.size()-1] = stop;
+ output.clear();
+ output.resize(input.size()+ngram_size);
+ for (int i=0; i<ngram_size-1; ++i)
+ output[i] = start;
+ std::copy(input.begin(), input.end(), output.begin()+ngram_size-1);
+ output[output.size()-1] = stop;
}
template <typename T>
@@ -127,21 +146,21 @@ void makeNgrams(const std::vector<T> &input, std::vector<std::vector<T> > &outpu
output.clear();
for (int j=ngram_size-1; j<input.size(); j++)
{
- std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1);
- output.push_back(ngram);
+ std::vector<T> ngram(input.begin() + (j-ngram_size+1), input.begin() + j+1);
+ output.push_back(ngram);
}
}
-inline void preprocessWords(const std::vector<std::string> &words,
- std::vector< std::vector<int> > &ngrams,
- int ngram_size,
- const vocabulary &vocab,
- bool numberize,
- bool add_start_stop,
- bool ngramize) {
+inline void preprocessWords(const std::vector<std::string> &words,
+ std::vector< std::vector<int> > &ngrams,
+ int ngram_size,
+ const vocabulary &vocab,
+ bool numberize,
+ bool add_start_stop,
+ bool ngramize) {
int start = vocab.lookup_word("<s>");
int stop = vocab.lookup_word("</s>");
-
+
// convert words to ints
std::vector<int> nums;
if (numberize) {
@@ -152,9 +171,9 @@ inline void preprocessWords(const std::vector<std::string> &words,
else {
for (int j=0; j<words.size(); j++) {
nums.push_back(boost::lexical_cast<int>(words[j]));
- }
+ }
}
-
+
// convert sequence to n-grams
ngrams.clear();
if (ngramize) {
@@ -168,10 +187,10 @@ inline void preprocessWords(const std::vector<std::string> &words,
}
else {
if (nums.size() != ngram_size)
- {
- std::cerr << "error: wrong number of fields in line" << std::endl;
- std::exit(1);
- }
+ {
+ std::cerr << "error: wrong number of fields in line\n";
+ std::exit(1);
+ }
ngrams.push_back(nums);
}
}
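The lookup_ngram(const int *, int n) overload above left-pads histories shorter than the model order before scoring. Below is a minimal standalone sketch of that padding rule; pad_ngram is a hypothetical helper name, not part of the library.

#include <vector>

// Pads a right-aligned history of n word ids out to ngram_size ids: positions
// that fall before the supplied history get <s> if the history begins at the
// sentence start, and <null> otherwise.
std::vector<int> pad_ngram(const int *ngram_a, int n, int ngram_size,
                           int start_id /* id of <s> */, int null_id /* id of <null> */)
{
    std::vector<int> ngram(ngram_size);
    for (int i = 0; i < ngram_size; ++i)
    {
        if (i - ngram_size + n < 0)
            ngram[i] = (ngram_a[0] == start_id) ? start_id : null_id;
        else
            ngram[i] = ngram_a[i - ngram_size + n];
    }
    return ngram;
}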
diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h
index ef96488..6386a0f 100644
--- a/src/neuralNetwork.h
+++ b/src/neuralNetwork.h
@@ -3,7 +3,6 @@
#include <vector>
#include <boost/shared_ptr.hpp>
-//#include <../3rdparty/Eigen/Dense>
#include <Eigen/Dense>
#include "util.h"
@@ -16,191 +15,191 @@ namespace nplm
class neuralNetwork
{
-protected:
- boost::shared_ptr<model> m;
+ protected:
+ boost::shared_ptr<model> m;
-private:
- bool normalization;
- double weight;
+ private:
+ bool normalization;
+ double weight;
- propagator prop;
+ propagator prop;
- std::size_t cache_size;
- Eigen::Matrix<int,Dynamic,Dynamic> cache_keys;
- std::vector<double> cache_values;
- int cache_lookups, cache_hits;
+ std::size_t cache_size;
+ Eigen::Matrix<int,Dynamic,Dynamic> cache_keys;
+ std::vector<double> cache_values;
+ int cache_lookups, cache_hits;
-public:
- neuralNetwork()
+ public:
+ neuralNetwork()
: m(new model()),
normalization(false),
- weight(1.),
- prop(*m, 1),
+ weight(1.),
+ prop(*m, 1),
cache_size(0)
- {
- }
+ {
+ }
- void set_normalization(bool value) { normalization = value; }
- void set_log_base(double value) { weight = 1./std::log(value); }
-
- // This must be called if the underlying model is resized.
- void resize() {
- if (cache_size)
- {
- cache_keys.resize(m->ngram_size, cache_size);
- cache_keys.fill(-1);
- }
- prop.resize();
- }
+ void set_normalization(bool value) { normalization = value; }
+ void set_log_base(double value) { weight = 1./std::log(value); }
- void set_width(int width)
+ // This must be called if the underlying model is resized.
+ void resize() {
+ if (cache_size)
{
- prop.resize(width);
+ cache_keys.resize(m->ngram_size, cache_size);
+ cache_keys.fill(-1);
}
-
- template <typename Derived>
- double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+ prop.resize();
+ }
+
+ void set_width(int width)
+ {
+ prop.resize(width);
+ }
+
+ template <typename Derived>
+ double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+ {
+ assert (ngram.rows() == m->ngram_size);
+ assert (ngram.cols() == 1);
+
+ std::size_t hash;
+ if (cache_size)
{
- assert (ngram.rows() == m->ngram_size);
- assert (ngram.cols() == 1);
-
- std::size_t hash;
- if (cache_size)
- {
- // First look in cache
- hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h
- cache_lookups++;
- if (cache_keys.col(hash) == ngram)
- {
- cache_hits++;
- return cache_values[hash];
- }
- }
-
- // Make sure that we're single threaded. Multithreading doesn't help,
- // and in some cases can hurt quite a lot
- int save_threads = omp_get_max_threads();
- omp_set_num_threads(1);
- int save_eigen_threads = Eigen::nbThreads();
- Eigen::setNbThreads(1);
- #ifdef __INTEL_MKL__
- int save_mkl_threads = mkl_get_max_threads();
- mkl_set_num_threads(1);
- #endif
-
- prop.fProp(ngram.col(0));
-
- int output = ngram(m->ngram_size-1, 0);
- double log_prob;
-
- start_timer(3);
- if (normalization)
- {
- Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size);
- if (prop.skip_hidden)
- prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
- else
- prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
- double logz = logsum(scores.col(0));
- log_prob = weight * (scores(output, 0) - logz);
- }
- else
- {
- if (prop.skip_hidden)
- log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0);
- else
- log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
- }
- stop_timer(3);
-
- if (cache_size)
- {
- // Update cache
- cache_keys.col(hash) = ngram;
- cache_values[hash] = log_prob;
- }
-
- #ifdef __INTEL_MKL__
- mkl_set_num_threads(save_mkl_threads);
- #endif
- Eigen::setNbThreads(save_eigen_threads);
- omp_set_num_threads(save_threads);
-
- return log_prob;
+ // First look in cache
+ hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h
+ cache_lookups++;
+ if (cache_keys.col(hash) == ngram)
+ {
+ cache_hits++;
+ return cache_values[hash];
+ }
}
- // Look up many n-grams in parallel.
- template <typename DerivedA, typename DerivedB>
- void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
- {
- UNCONST(DerivedB, log_probs_const, log_probs);
- assert (ngram.rows() == m->ngram_size);
- //assert (ngram.cols() <= prop.get_minibatch_size());
-
- prop.fProp(ngram);
-
- if (normalization)
- {
- Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols());
- if (prop.skip_hidden)
- prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
- else
- prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
-
- // And softmax and loss
- Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols());
- double minibatch_log_likelihood;
- SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood);
- for (int j=0; j<ngram.cols(); j++)
- {
- int output = ngram(m->ngram_size-1, j);
- log_probs(0, j) = weight * output_probs(output, j);
- }
- }
- else
- {
- for (int j=0; j<ngram.cols(); j++)
- {
- int output = ngram(m->ngram_size-1, j);
- if (prop.skip_hidden)
- log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j);
- else
- log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
- }
- }
- }
+ // Make sure that we're single threaded. Multithreading doesn't help,
+ // and in some cases can hurt quite a lot
+ int save_threads = omp_get_max_threads();
+ omp_set_num_threads(1);
+ int save_eigen_threads = Eigen::nbThreads();
+ Eigen::setNbThreads(1);
+#ifdef __INTEL_MKL__
+ int save_mkl_threads = mkl_get_max_threads();
+ mkl_set_num_threads(1);
+#endif
+
+ prop.fProp(ngram.col(0));
- int get_order() const { return m->ngram_size; }
+ int output = ngram(m->ngram_size-1, 0);
+ double log_prob;
- void read(const std::string &filename)
+ start_timer(3);
+ if (normalization)
{
- m->read(filename);
- resize();
- // this is faster but takes more memory
- //m->premultiply();
+ Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size);
+ if (prop.skip_hidden)
+ prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+ else
+ prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+ double logz = logsum(scores.col(0));
+ log_prob = weight * (scores(output, 0) - logz);
}
-
- void set_cache(std::size_t cache_size)
+ else
{
- this->cache_size = cache_size;
- cache_keys.resize(m->ngram_size, cache_size);
- cache_keys.fill(-1); // clears cache
- cache_values.resize(cache_size);
- cache_lookups = cache_hits = 0;
+ if (prop.skip_hidden)
+ log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0);
+ else
+ log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
}
+ stop_timer(3);
- double cache_hit_rate()
+ if (cache_size)
{
- return static_cast<double>(cache_hits)/cache_lookups;
+ // Update cache
+ cache_keys.col(hash) = ngram;
+ cache_values[hash] = log_prob;
}
- void premultiply()
+#ifdef __INTEL_MKL__
+ mkl_set_num_threads(save_mkl_threads);
+#endif
+ Eigen::setNbThreads(save_eigen_threads);
+ omp_set_num_threads(save_threads);
+
+ return log_prob;
+ }
+
+ // Look up many n-grams in parallel.
+ template <typename DerivedA, typename DerivedB>
+ void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+ {
+ UNCONST(DerivedB, log_probs_const, log_probs);
+ assert (ngram.rows() == m->ngram_size);
+ //assert (ngram.cols() <= prop.get_minibatch_size());
+
+ prop.fProp(ngram);
+
+ if (normalization)
+ {
+ Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols());
+ if (prop.skip_hidden)
+ prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
+ else
+ prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+
+ // And softmax and loss
+ Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols());
+ double minibatch_log_likelihood;
+ SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood);
+ for (int j=0; j<ngram.cols(); j++)
+ {
+ int output = ngram(m->ngram_size-1, j);
+ log_probs(0, j) = weight * output_probs(output, j);
+ }
+ }
+ else
+ {
+ for (int j=0; j<ngram.cols(); j++)
+ {
+ int output = ngram(m->ngram_size-1, j);
+ if (prop.skip_hidden)
+ log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j);
+ else
+ log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
+ }
+ }
+ }
+
+ int get_order() const { return m->ngram_size; }
+
+ void read(const std::string &filename)
+ {
+ m->read(filename);
+ resize();
+ // this is faster but takes more memory
+ //m->premultiply();
+ }
+
+ void set_cache(std::size_t cache_size)
+ {
+ this->cache_size = cache_size;
+ cache_keys.resize(m->ngram_size, cache_size);
+ cache_keys.fill(-1); // clears cache
+ cache_values.resize(cache_size);
+ cache_lookups = cache_hits = 0;
+ }
+
+ double cache_hit_rate()
+ {
+ return static_cast<double>(cache_hits)/cache_lookups;
+ }
+
+ void premultiply()
+ {
+ if (!m->premultiplied)
{
- if (!m->premultiplied)
- {
- m->premultiply();
- }
+ m->premultiply();
}
+ }
};
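The single-ngram lookup_ngram above consults a direct-mapped cache before paying for a forward pass. The sketch below shows that cache in isolation: one slot per hash bucket, the full key stored so collisions are detected, and a miss simply overwrites the slot. NgramCache is an illustrative name, and the hash is only a stand-in for Eigen::hash_value from util.h.

#include <vector>
#include <cstddef>

class NgramCache {
    std::size_t size_;
    std::vector<std::vector<int> > keys_;   // keys_[slot] is the cached n-gram (empty = unused)
    std::vector<double> values_;
    long lookups_, hits_;

    static std::size_t hash(const std::vector<int> &ngram) {
        std::size_t h = 0;
        for (std::size_t i = 0; i < ngram.size(); ++i)
            h = h * 31 + static_cast<std::size_t>(ngram[i]);
        return h;
    }

public:
    explicit NgramCache(std::size_t size)
        : size_(size), keys_(size), values_(size), lookups_(0), hits_(0) {}

    // Returns true and fills log_prob on a hit.
    bool find(const std::vector<int> &ngram, double &log_prob) {
        ++lookups_;
        std::size_t slot = hash(ngram) % size_;
        if (keys_[slot] == ngram) { ++hits_; log_prob = values_[slot]; return true; }
        return false;
    }

    // Called after the (expensive) forward pass, mirroring the cache update above.
    void insert(const std::vector<int> &ngram, double log_prob) {
        std::size_t slot = hash(ngram) % size_;
        keys_[slot] = ngram;
        values_[slot] = log_prob;
    }

    double hit_rate() const { return lookups_ ? double(hits_) / lookups_ : 0.0; }
};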
diff --git a/src/neuralTM.h b/src/neuralTM.h
index 14bc7bf..9bb6d16 100644
--- a/src/neuralTM.h
+++ b/src/neuralTM.h
@@ -6,125 +6,139 @@
#include <cstdlib>
#include <boost/shared_ptr.hpp>
-#include <../3rdparty/Eigen/Dense>
+#include <Eigen/Dense>
#include "util.h"
#include "vocabulary.h"
#include "neuralNetwork.h"
+#include "replace_digits.hpp"
namespace nplm
{
-class neuralTM : public neuralNetwork
+class neuralTM : public neuralNetwork, graehl::replace_digits
{
- char map_digits;
- boost::shared_ptr<vocabulary> input_vocab, output_vocab;
- int start, null;
+ boost::shared_ptr<vocabulary> input_vocab, output_vocab;
+ int start, null;
-public:
- neuralTM()
+ public:
+ neuralTM()
: neuralNetwork(),
- map_digits(0),
+ graehl::replace_digits(0),
input_vocab(new vocabulary()),
output_vocab(new vocabulary())
- {
- }
-
- void set_map_digits(char value) { map_digits = value; }
-
- void set_input_vocabulary(const vocabulary &vocab)
- {
- *(this->input_vocab) = vocab;
- start = vocab.lookup_word("<s>");
- null = vocab.lookup_word("<null>");
- }
-
- void set_output_vocabulary(const vocabulary &vocab)
- {
- *(this->output_vocab) = vocab;
- }
-
- const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); }
- const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); }
-
- int lookup_input_word(const std::string &word) const
- {
- if (map_digits)
- for (int i=0; i<word.length(); i++)
- if (isdigit(word[i]))
- {
- std::string mapped_word(word);
- for (; i<word.length(); i++)
- if (isdigit(word[i]))
- mapped_word[i] = map_digits;
- return input_vocab->lookup_word(mapped_word);
- }
- return input_vocab->lookup_word(word);
- }
-
- int lookup_output_word(const std::string &word) const
- {
- if (map_digits)
- for (int i=0; i<word.length(); i++)
- if (isdigit(word[i]))
- {
- std::string mapped_word(word);
- for (; i<word.length(); i++)
- if (isdigit(word[i]))
- mapped_word[i] = map_digits;
- return output_vocab->lookup_word(mapped_word);
- }
- return output_vocab->lookup_word(word);
- }
-
- double lookup_ngram(const int *ngram_a, int n)
- {
- Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
- for (int i=0; i<m->ngram_size; i++)
- {
- if (i-m->ngram_size+n < 0)
- {
- if (ngram_a[0] == start)
- ngram(i) = start;
- else
- ngram(i) = null;
- }
- else
- {
- ngram(i) = ngram_a[i-m->ngram_size+n];
- }
- }
- return neuralNetwork::lookup_ngram(ngram);
- }
-
- double lookup_ngram(const std::vector<int> &ngram_v)
- {
- return lookup_ngram(ngram_v.data(), ngram_v.size());
- }
-
- template <typename Derived>
- double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
- {
- return neuralNetwork::lookup_ngram(ngram);
- }
-
- template <typename DerivedA, typename DerivedB>
- void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
- {
- return neuralNetwork::lookup_ngram(ngram, log_probs_const);
- }
-
- void read(const std::string &filename)
+ {
+ }
+
+ void set_map_digits(char value) { map_digits = value; }
+
+ void set_input_vocabulary(const vocabulary &vocab)
+ {
+ *(this->input_vocab) = vocab;
+ start = vocab.lookup_word("<s>");
+ null = vocab.lookup_word("<null>");
+ }
+
+ void set_output_vocabulary(const vocabulary &vocab)
+ {
+ *(this->output_vocab) = vocab;
+ }
+
+ const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); }
+ const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); }
+
+ int lookup_word(const std::string &word, vocabulary const& vocab) const
+ {
+ if (map_digits)
+ for (int i=0, n=word.size(); i<n; ++i)
+ if (graehl::ascii_digit(word[i])) {
+ std::string mapped_word(word);
+ replace(mapped_word, i);
+ return vocab.lookup_word(mapped_word);
+ }
+ return vocab.lookup_word(word);
+ }
+
+ int lookup_word(std::pair<char const*, char const*> slice, vocabulary const& vocab) const
+ {
+ if (map_digits)
+ for (char const* i = slice.first; i != slice.second; ++i)
+ if (graehl::ascii_digit(*i)) {
+ std::string mapped_word(slice.first, slice.second);
+ replace(mapped_word, i - slice.first);
+ return vocab.lookup_word(mapped_word);
+ }
+ return vocab.lookup_word(slice);
+ }
+
+ int lookup_input_word(const std::string &word) const
+ {
+ return lookup_word(word, *input_vocab);
+ }
+
+ int lookup_output_word(const std::string &word) const
+ {
+ return lookup_word(word, *output_vocab);
+ }
+
+ int lookup_input_word(std::pair<char const*, char const*> word) const
+ {
+ return lookup_word(word, *input_vocab);
+ }
+
+ int lookup_output_word(std::pair<char const*, char const*> word) const
+ {
+ return lookup_word(word, *output_vocab);
+ }
+
+ double lookup_ngram(const int *ngram_a, int n)
+ {
+ Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
+ for (int i=0; i<m->ngram_size; i++)
{
- std::vector<std::string> input_words;
- std::vector<std::string> output_words;
- m->read(filename, input_words, output_words);
- set_input_vocabulary(vocabulary(input_words));
- set_output_vocabulary(vocabulary(output_words));
- resize();
- // this is faster but takes more memory
- //m->premultiply();
+ if (i-m->ngram_size+n < 0)
+ {
+ if (ngram_a[0] == start)
+ ngram(i) = start;
+ else
+ ngram(i) = null;
+ }
+ else
+ {
+ ngram(i) = ngram_a[i-m->ngram_size+n];
+ }
}
+ return neuralNetwork::lookup_ngram(ngram);
+ }
+
+ double lookup_ngram(const std::vector<int> &ngram_v)
+ {
+ return lookup_ngram(ngram_v.data(), ngram_v.size());
+ }
+
+ template <typename Derived>
+ double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+ {
+ return neuralNetwork::lookup_ngram(ngram);
+ }
+
+ template <typename DerivedA, typename DerivedB>
+ void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+ {
+ return neuralNetwork::lookup_ngram(ngram, log_probs_const);
+ }
+
+ void read(const std::string &filename)
+ {
+ std::vector<std::string> input_words;
+ std::vector<std::string> output_words;
+ m->read(filename, input_words, output_words);
+ set_input_vocabulary(vocabulary(input_words));
+ set_output_vocabulary(vocabulary(output_words));
+ resize();
+ // this is faster but takes more memory
+ //m->premultiply();
+ }
};
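A hedged usage sketch of the neuralTM interface above. The model file name is made up, and the assumption that the final n-gram position is scored against the output vocabulary while the earlier positions use the input vocabulary should be checked against how the model was prepared.

#include <iostream>
#include <vector>

#include "neuralTM.h"

int main()
{
    nplm::neuralTM tm;
    tm.read("model.nntm");        // hypothetical model file
    tm.set_map_digits('@');       // collapse 0-9 to '@' before vocabulary lookup
    tm.set_normalization(false);  // rely on the model's self-normalized scores

    std::vector<int> ngram(tm.get_order());
    // Context positions (input vocabulary); <null> is just a placeholder here.
    for (int i = 0; i + 1 < tm.get_order(); ++i)
        ngram[i] = tm.lookup_input_word("<null>");
    // Word being scored (output vocabulary); "house" is arbitrary.
    ngram.back() = tm.lookup_output_word("house");

    std::cout << "log prob: " << tm.lookup_ngram(ngram) << std::endl;
    return 0;
}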
diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp
index adedc72..d5fc16b 100644
--- a/src/prepareNeuralLM.cpp
+++ b/src/prepareNeuralLM.cpp
@@ -2,19 +2,19 @@
#include <vector>
#include <queue>
#include <deque>
-# include <fstream>
-# include <iterator>
-
-# include <boost/unordered_map.hpp>
-# include <boost/algorithm/string/join.hpp>
-# include <boost/interprocess/managed_shared_memory.hpp>
-# include <boost/interprocess/allocators/allocator.hpp>
-# include <boost/interprocess/managed_mapped_file.hpp>
+#include <fstream>
+#include <iterator>
+
+#include <boost/unordered_map.hpp>
+#include <boost/algorithm/string/join.hpp>
+#include <boost/interprocess/managed_shared_memory.hpp>
+#include <boost/interprocess/allocators/allocator.hpp>
+#include <boost/interprocess/managed_mapped_file.hpp>
#include <boost/interprocess/containers/vector.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int_distribution.hpp>
-# include <tclap/CmdLine.h>
+#include <tclap/CmdLine.h>
#include "neuralLM.h"
#include "util.h"
@@ -36,314 +36,313 @@ typedef std::vector<vec,vecAllocator> vecvec;
typedef long long int data_size_t; // training data can easily exceed 2G instances
template<typename T>
-void writeNgrams(const T &data,
- int ngram_size,
- const vocabulary &vocab,
- bool numberize,
- bool add_start_stop,
- bool ngramize,
- const string &filename)
- {
- ofstream file(filename.c_str());
- if (!file)
+void writeNgrams(const T &data,
+ int ngram_size,
+ const vocabulary &vocab,
+ bool numberize,
+ bool add_start_stop,
+ bool ngramize,
+ const string &filename)
+{
+ ofstream file(filename.c_str());
+ if (!file)
+ {
+ cerr << "error: could not open " << filename << endl;
+ exit(1);
+ }
+
+ vector<vector<int> > ngrams;
+
+ for (int i=0; i<data.size(); i++) {
+ preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
+ // write out n-grams
+ for (int j=0; j<ngrams.size(); j++)
{
- cerr << "error: could not open " << filename << endl;
- exit(1);
- }
-
- vector<vector<int> > ngrams;
-
- for (int i=0; i<data.size(); i++) {
- preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
- // write out n-grams
- for (int j=0; j<ngrams.size(); j++)
- {
- for (int k=0; k<ngram_size; k++)
- {
- file << ngrams[j][k] << " ";
- }
- file << endl;
- }
+ for (int k=0; k<ngram_size; k++)
+ {
+ file << ngrams[j][k] << " ";
+ }
+ file << endl;
}
- file.close();
+ }
+ file.close();
}
// Space efficient version for writing the n-grams.
// They are not read into memory.
-void writeNgrams(const string &input_filename,
- int ngram_size,
- const vocabulary &vocab,
- bool numberize,
- bool add_start_stop,
- bool ngramize,
- const string &filename,
- int train_data_size,
- vector<float> &sent_weights,
- const string &sent_weights_filename)
+void writeNgrams(const string &input_filename,
+ int ngram_size,
+ const vocabulary &vocab,
+ bool numberize,
+ bool add_start_stop,
+ bool ngramize,
+ const string &filename,
+ int train_data_size,
+ vector<float> &sent_weights,
+ const string &sent_weights_filename)
{
- ofstream file(filename.c_str());
- ofstream output_sent_weights_file(sent_weights_filename.c_str());
- if (!file)
- {
- cerr << "error: could not open " << filename << endl;
- exit(1);
+ ofstream file(filename.c_str());
+ ofstream output_sent_weights_file(sent_weights_filename.c_str());
+ if (!file)
+ {
+ cerr << "error: could not open " << filename << endl;
+ exit(1);
+ }
+
+ ifstream input_file(input_filename.c_str());
+ vector<vector<int> > ngrams;
+ //for (int i=0; i<train_data.size(); i++) {
+ string line;
+ int counter = 0;
+ cerr<<"Processed ... ";
+ while (getline(input_file,line) && train_data_size-- > 0) {
+ counter++;
+ if ((counter % 100000) == 0) {
+ cerr<<counter<<" training lines ... ";
}
-
- ifstream input_file(input_filename.c_str());
- vector<vector<int> > ngrams;
- //for (int i=0; i<train_data.size(); i++) {
- string line;
- int counter = 0;
- cerr<<"Processed ... ";
- while (getline(input_file,line) && train_data_size-- > 0) {
- counter++;
- if ((counter % 100000) == 0) {
- cerr<<counter<<" training lines ... ";
- }
- //stringstream lstr(line);
- vector<string> lstr_items;
- splitBySpace(line,lstr_items);
+ //stringstream lstr(line);
+ vector<string> lstr_items;
+ splitBySpace(line,lstr_items);
//for (int i=0; i<data.size(); i++) {
- preprocessWords(lstr_items,
- ngrams,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize);
-
- // write out n-grams
- for (int j=0; j<ngrams.size(); j++)
- {
- if (sent_weights.size() != 0) {
- output_sent_weights_file <<sent_weights[counter-1]<<endl;
- }
- for (int k=0; k<ngram_size; k++)
- {
- file << ngrams[j][k] << " ";
- }
- file << endl;
- }
+ preprocessWords(lstr_items,
+ ngrams,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize);
+
+ // write out n-grams
+ for (int j=0; j<ngrams.size(); j++)
+ {
+ if (sent_weights.size() != 0) {
+ output_sent_weights_file <<sent_weights[counter-1]<<endl;
+ }
+ for (int k=0; k<ngram_size; k++)
+ {
+ file << ngrams[j][k] << " ";
+ }
+ file << endl;
}
- cerr<<endl;
- input_file.close();
- file.close();
- output_sent_weights_file.close();
+ }
+ cerr<<endl;
+ input_file.close();
+ file.close();
+ output_sent_weights_file.close();
}
// Space efficient version for writing the n-grams.
// They are not read into memory.
-void writeMmapNgrams(const string &input_filename,
- int ngram_size,
- const vocabulary &vocab,
- bool numberize,
- bool add_start_stop,
- bool ngramize,
- const string &filename,
- unsigned long train_data_size,
- data_size_t num_tokens,
- bool randomize)
+void writeMmapNgrams(const string &input_filename,
+ int ngram_size,
+ const vocabulary &vocab,
+ bool numberize,
+ bool add_start_stop,
+ bool ngramize,
+ const string &filename,
+ unsigned long train_data_size,
+ data_size_t num_tokens,
+ bool randomize)
{
- cerr<<"Num tokens is "<<num_tokens<<endl;
- cerr<<"Training data size is "<<train_data_size<<endl;
- // Open the memory mapped file and create the allocators
- ip::managed_mapped_file mfile(ip::create_only,
- filename.c_str(),
- num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
- intAllocator ialloc(mfile.get_segment_manager());
- vecAllocator valloc (mfile.get_segment_manager());
- //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc);
-
- vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
-
- cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl;
- // Going over every line in the input file and
- // printing the memory mapped ngrams into the
- // output file
- ifstream input_file(input_filename.c_str());
- //for (int i=0; i<train_data.size(); i++) {
- string line;
- int counter = 0;
- cerr<<"Processed ... ";
- long int train_ngram_counter = 0;
- vector<vector<int> > ngrams;
- while (getline(input_file,line) && train_data_size-- > 0) {
- counter++;
- if ((counter % 100000) ==0) {
- //cerr<<"counter is "<<counter<<endl;
- cerr<<counter<<" training lines ... ";
- }
- //stringstream lstr(line);
- vector<string> lstr_items;
- splitBySpace(line,lstr_items);
+ cerr<<"Num tokens is "<<num_tokens<<endl;
+ cerr<<"Training data size is "<<train_data_size<<endl;
+ // Open the memory mapped file and create the allocators
+ ip::managed_mapped_file mfile(ip::create_only,
+ filename.c_str(),
+ num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
+ intAllocator ialloc(mfile.get_segment_manager());
+ vecAllocator valloc (mfile.get_segment_manager());
+ //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc);
+
+ vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
+
+  cerr<<"The size of mmapped vec is "<<mMapVec->size()<<endl;
+ // Going over every line in the input file and
+ // printing the memory mapped ngrams into the
+ // output file
+ ifstream input_file(input_filename.c_str());
+ //for (int i=0; i<train_data.size(); i++) {
+ string line;
+ int counter = 0;
+ cerr<<"Processed ... ";
+ long int train_ngram_counter = 0;
+ vector<vector<int> > ngrams;
+ while (getline(input_file,line) && train_data_size-- > 0) {
+ counter++;
+ if ((counter % 100000) ==0) {
+ //cerr<<"counter is "<<counter<<endl;
+ cerr<<counter<<" training lines ... ";
+ }
+ //stringstream lstr(line);
+ vector<string> lstr_items;
+ splitBySpace(line,lstr_items);
//for (int i=0; i<data.size(); i++) {
- preprocessWords(lstr_items, ngrams,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize);
- /*
+ preprocessWords(lstr_items, ngrams,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize);
+ /*
cerr<<"line is "<<endl;
cerr<<line<<endl;
cerr<<"Number of ngrams is "<<ngrams.size()<<endl;
- if (ngrams.size() ==1 ){
- cerr<<"The line number was "<<counter<<endl;
- cerr<<line<<endl;
+ if (ngrams.size() ==1 ){
+ cerr<<"The line number was "<<counter<<endl;
+ cerr<<line<<endl;
+ }
+ */
+ // write out n-grams in mmapped file
+ for (int j=0; j<ngrams.size(); j++)
+ {
+ /*
+ for (int k=0; k<ngram_size; k++)
+ {
+ cerr << ngrams[j][k] << " ";
}
+ cerr<< endl;
*/
- // write out n-grams in mmapped file
- for (int j=0; j<ngrams.size(); j++)
- {
- /*
- for (int k=0; k<ngram_size; k++)
- {
- cerr << ngrams[j][k] << " ";
- }
- cerr<< endl;
- */
- for (int k=0; k<ngram_size; k++) {
- mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k];
- }
- train_ngram_counter++;
- //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl;
- }
+ for (int k=0; k<ngram_size; k++) {
+ mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k];
+ }
+ train_ngram_counter++;
+ //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl;
}
- cerr<<endl;
- input_file.close();
-
- // Shrink the file if it was overused
- ip::managed_mapped_file::shrink_to_fit(filename.c_str());
- //now to randomize the items if the randomize flag was set
- if (randomize == true) {
- unsigned seed = 1234; //for testing only
- mt19937 rng(seed);
- cerr<<"Randomly shuffling data...";
- data_size_t counter =0;
- while (counter < num_tokens) {
- data_size_t upper_limit = counter+5000000;
- long int vector_size = 5000000;
- if (counter + 10000000 >= num_tokens) {
- upper_limit = num_tokens;
- vector_size = num_tokens - counter;
- }
- vector<int> temp(vector_size*ngram_size,0);
- for (int i=0;i<vector_size;i++){
- for (int k=0;k<ngram_size;k++) {
- temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k);
- }
- }
- for (data_size_t i=vector_size-1; i>0; i--)
- {
- if (i %500000 == 0) {
- cerr<<"Shuffled "<<num_tokens-1<<" instances...";
- }
- data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
- for (int k=0;k<ngram_size;k++) {
- int temp_val = temp.at(i*ngram_size+k);
- temp.at(i*ngram_size+k) =
- temp.at(j*ngram_size+k);
- temp.at(j*ngram_size+k) = temp_val;
- }
- }
- //Putting it back
- for (int i=0;i<vector_size;i++){
- for (int k=0;k<ngram_size;k++) {
- mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k];
- }
- }
- counter = upper_limit;
+ }
+ cerr<<endl;
+ input_file.close();
+
+ // Shrink the file if it was overused
+ ip::managed_mapped_file::shrink_to_fit(filename.c_str());
+ //now to randomize the items if the randomize flag was set
+ if (randomize == true) {
+ unsigned seed = 1234; //for testing only
+ boost::random::mt19937 rng(seed);
+ cerr<<"Randomly shuffling data...";
+ data_size_t counter =0;
+ while (counter < num_tokens) {
+ data_size_t upper_limit = counter+5000000;
+ long int vector_size = 5000000;
+ if (counter + 10000000 >= num_tokens) {
+ upper_limit = num_tokens;
+ vector_size = num_tokens - counter;
+ }
+ vector<int> temp(vector_size*ngram_size,0);
+ for (int i=0;i<vector_size;i++){
+ for (int k=0;k<ngram_size;k++) {
+ temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k);
}
-
- /*
- for (data_size_t i=num_tokens-1; i>0; i--)
+ }
+ for (data_size_t i=vector_size-1; i>0; i--)
{
if (i %500000 == 0) {
cerr<<"Shuffled "<<num_tokens-1<<" instances...";
}
data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
for (int k=0;k<ngram_size;k++) {
- int temp_val = mMapVec->at(i*ngram_size+k);
- mMapVec->at(i*ngram_size+k) =
- mMapVec->at(j*ngram_size+k);
- mMapVec->at(j*ngram_size+k) = temp_val;
+ int temp_val = temp.at(i*ngram_size+k);
+ temp.at(i*ngram_size+k) =
+ temp.at(j*ngram_size+k);
+ temp.at(j*ngram_size+k) = temp_val;
}
}
- */
- cerr<<endl;
+ //Putting it back
+ for (int i=0;i<vector_size;i++){
+ for (int k=0;k<ngram_size;k++) {
+ mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k];
+ }
+ }
+ counter = upper_limit;
}
+
+ /*
+ for (data_size_t i=num_tokens-1; i>0; i--)
+ {
+ if (i %500000 == 0) {
+ cerr<<"Shuffled "<<num_tokens-1<<" instances...";
+ }
+ data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
+ for (int k=0;k<ngram_size;k++) {
+ int temp_val = mMapVec->at(i*ngram_size+k);
+ mMapVec->at(i*ngram_size+k) =
+ mMapVec->at(j*ngram_size+k);
+ mMapVec->at(j*ngram_size+k) = temp_val;
+ }
+ }
+ */
+ cerr<<endl;
+ }
}
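// A minimal, self-contained sketch of the block-wise shuffle that writeMmapNgrams
// above applies to the memory-mapped n-gram array: rows are copied out in blocks
// of roughly 5M n-grams, shuffled within the block, and written back, so random
// access never spans the whole mapped file at once.  std::vector and std::mt19937
// stand in for the boost::interprocess vector and boost::random::mt19937, and the
// block-boundary handling (merging a short final block) is simplified here.

#include <algorithm>
#include <random>
#include <utility>
#include <vector>

typedef long long int shuffle_size_t;

void shuffle_ngrams_blockwise(std::vector<int> &data, shuffle_size_t num_tokens,
                              int ngram_size, unsigned seed = 1234,
                              shuffle_size_t block_rows = 5000000)
{
    std::mt19937 rng(seed);
    shuffle_size_t row = 0;
    while (row < num_tokens) {
        shuffle_size_t rows = std::min(block_rows, num_tokens - row);
        // Copy the block out of the (potentially memory-mapped) storage.
        std::vector<int> block(data.begin() + row * ngram_size,
                               data.begin() + (row + rows) * ngram_size);
        // In-block shuffle of whole n-grams, mirroring the original's choice of
        // drawing j from [0, i-1].
        for (shuffle_size_t i = rows - 1; i > 0; --i) {
            std::uniform_int_distribution<shuffle_size_t> dist(0, i - 1);
            shuffle_size_t j = dist(rng);
            for (int k = 0; k < ngram_size; ++k)
                std::swap(block[i * ngram_size + k], block[j * ngram_size + k]);
        }
        // Write the shuffled block back.
        std::copy(block.begin(), block.end(), data.begin() + row * ngram_size);
        row += rows;
    }
}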
int main(int argc, char *argv[])
{
- ios::sync_with_stdio(false);
- int ngram_size, vocab_size, validation_size;
- bool numberize,
- ngramize,
- add_start_stop,
- mmap_file,
- randomize;
-
- string train_text,
- train_file,
- validation_text,
- validation_file,
- words_file,
- write_words_file,
- sent_weights_text,
- output_sent_weights_text;
-
- try
- {
- CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
-
- // The options are printed in reverse order
+ ios::sync_with_stdio(false);
+ int ngram_size, vocab_size, validation_size;
+ bool numberize,
+ ngramize,
+ add_start_stop,
+ mmap_file,
+ randomize;
+
+ string train_text,
+ train_file,
+ validation_text,
+ validation_file,
+ words_file,
+ write_words_file,
+ sent_weights_text,
+ output_sent_weights_text;
+
+ try
+ {
+ CmdLine cmd("Prepares training data for training a language model.", ' ', "0.1");
+
+ // The options are printed in reverse order
ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
ValueArg<bool> arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is "
- "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd);
+ "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd);
    ValueArg<bool> arg_randomize("", "randomize", "If true, randomly shuffle the training ngrams. Can only be used with mmap_file=1. Default: false.", false, false, "bool", cmd);
ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd);
ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd);
- ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd);
+ ValueArg<string> arg_write_words_file("", "write_words_file", "Output vocabulary.", false, "", "string", cmd);
ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd);
- ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
- ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
- ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
- ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
- //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd);
- //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd);
-
-
-
- cmd.parse(argc, argv);
-
- train_text = arg_train_text.getValue();
- train_file = arg_train_file.getValue();
- validation_text = arg_validation_text.getValue();
- validation_file = arg_validation_file.getValue();
- validation_size = arg_validation_size.getValue();
- write_words_file = arg_write_words_file.getValue();
- ngram_size = arg_ngram_size.getValue();
- vocab_size = arg_vocab_size.getValue();
- words_file = arg_words_file.getValue();
- numberize = arg_numberize.getValue();
- ngramize = arg_ngramize.getValue();
- add_start_stop = arg_add_start_stop.getValue();
- mmap_file = arg_mmap_file.getValue();
- randomize = arg_randomize.getValue();
- //sent_weights_text = arg_sent_weights_text.getValue();
- //output_sent_weights_text = arg_sent_weights_file.getValue();
- sent_weights_text = "";
- output_sent_weights_text = "";
+ ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
+ ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
+ ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
+ ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
+ //ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd);
+ //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd);
+
+
+ cmd.parse(argc, argv);
+
+ train_text = arg_train_text.getValue();
+ train_file = arg_train_file.getValue();
+ validation_text = arg_validation_text.getValue();
+ validation_file = arg_validation_file.getValue();
+ validation_size = arg_validation_size.getValue();
+ write_words_file = arg_write_words_file.getValue();
+ ngram_size = arg_ngram_size.getValue();
+ vocab_size = arg_vocab_size.getValue();
+ words_file = arg_words_file.getValue();
+ numberize = arg_numberize.getValue();
+ ngramize = arg_ngramize.getValue();
+ add_start_stop = arg_add_start_stop.getValue();
+ mmap_file = arg_mmap_file.getValue();
+ randomize = arg_randomize.getValue();
+ //sent_weights_text = arg_sent_weights_text.getValue();
+ //output_sent_weights_text = arg_sent_weights_file.getValue();
+ sent_weights_text = "";
+ output_sent_weights_text = "";
// check command line arguments
@@ -364,292 +363,292 @@ int main(int argc, char *argv[])
cerr << "Command line: " << endl;
cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
-
- const string sep(" Value: ");
- cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
- cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
- cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
- cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
- cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
- cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl;
- cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
- cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl;
- cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl;
- cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
- cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
- cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
- cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl;
- //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl;
- }
- catch (TCLAP::ArgException &e)
- {
- cerr << "error: " << e.error() << " for arg " << e.argId() << endl;
- exit(1);
- }
- // VLF: why is this true?
- // DC: it's because the vocabulary has to be constructed from the training data only.
- // If the vocabulary is preset, we can't create the validation data.
- // - if --numberize 0 is set, then --validation_size cannot be used.
- // if (!numberize && (validation_size > 0)) {
- // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl;
- // }
-
- // Read in training data and validation data
- // vector<vector<string> > train_data;
- // readSentFile(train_text, train_data);
- // @vaswani: No more reading the entire training file into memory
- // Reading it per line with file io
-
- //for (int i=0; i<train_data.size(); i++) {
- // Go over every line in the file and
- // 1. if the !ngramize then you should check if
- // we have the correct number of items per line
- // 2. build the vocabulary if the words file has not
- // been specified.
- // Construct vocabulary
- vocabulary vocab;
- int start, stop;
- // Add start stop if the vocabulary has not been supplied
- if (words_file == "") {
- vocab.insert_word("<s>");
- vocab.insert_word("</s>");
- vocab.insert_word("<null>");
- // warn user that if --numberize is not set, there will be no vocabulary!
- if (!numberize) {
- cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
- }
- }
- if (mmap_file == false && randomize == true) {
- cerr<<"Randomize option can only be used with mmap_file = 1"<<endl;
- exit(1);
+ const string sep(" Value: ");
+ cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
+ cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
+ cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
+ cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
+ cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
+ cerr << arg_write_words_file.getDescription() << sep << arg_write_words_file.getValue() << endl;
+ cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
+ cerr << arg_vocab_size.getDescription() << sep << arg_vocab_size.getValue() << endl;
+ cerr << arg_words_file.getDescription() << sep << arg_words_file.getValue() << endl;
+ cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
+ cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
+ cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+ cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl;
+ //cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl;
+ }
+ catch (TCLAP::ArgException &e)
+ {
+ cerr << "error: " << e.error() << " for arg " << e.argId() << endl;
+ exit(1);
+ }
+
+ // VLF: why is this true?
+ // DC: it's because the vocabulary has to be constructed from the training data only.
+ // If the vocabulary is preset, we can't create the validation data.
+ // - if --numberize 0 is set, then --validation_size cannot be used.
+ // if (!numberize && (validation_size > 0)) {
+ // cerr << "Warning: without setting --numberize to 1, --validation_size cannot be used." << endl;
+ // }
+
+ // Read in training data and validation data
+ // vector<vector<string> > train_data;
+ // readSentFile(train_text, train_data);
+ // @vaswani: No more reading the entire training file into memory
+ // Reading it per line with file io
+
+ //for (int i=0; i<train_data.size(); i++) {
+ // Go over every line in the file and
+ // 1. if the !ngramize then you should check if
+ // we have the correct number of items per line
+ // 2. build the vocabulary if the words file has not
+ // been specified.
+ // Construct vocabulary
+ vocabulary vocab;
+ int start, stop;
+ // Add start stop if the vocabulary has not been supplied
+ if (words_file == "") {
+ vocab.insert_word("<s>");
+ vocab.insert_word("</s>");
+ vocab.insert_word("<null>");
+ // warn user that if --numberize is not set, there will be no vocabulary!
+ if (!numberize) {
+      cerr << "Warning: with --numberize 0 and --words_file == \"\", there will be no vocabulary!" << endl;
}
- unordered_map<string,int> count; // For keeping word counts if no supplied vocab
-
- deque<vector<string> > validation_data;
- int train_data_size=0;
- cerr<<"Processed ... ";
- data_size_t num_tokens=0;
-
- ifstream training(train_text.c_str());
-
- string line;
- while (getline(training,line)) {
- train_data_size++;
- //stringstream lstr(line);
- vector<string> lstr_items;
- splitBySpace(line,lstr_items);
- // if data is already ngramized, set/check ngram_size
- if (!ngramize) {
- if (ngram_size > 0) {
- if (ngram_size != lstr_items.size()) {
- cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
- }
- }
- // else if --ngram_size has not been specified, set it now
- else {
- ngram_size=lstr_items.size();
- }
+ }
+ if (mmap_file == false && randomize == true) {
+ cerr<<"Randomize option can only be used with mmap_file = 1"<<endl;
+ exit(1);
+ }
+ unordered_map<string,int> count; // For keeping word counts if no supplied vocab
+
+ deque<vector<string> > validation_data;
+ int train_data_size=0;
+ cerr<<"Processed ... ";
+ data_size_t num_tokens=0;
+
+ ifstream training(train_text.c_str());
+
+ string line;
+ while (getline(training,line)) {
+ train_data_size++;
+ //stringstream lstr(line);
+ vector<string> lstr_items;
+ splitBySpace(line,lstr_items);
+ // if data is already ngramized, set/check ngram_size
+ if (!ngramize) {
+ if (ngram_size > 0) {
+ if (ngram_size != lstr_items.size()) {
+ cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
+ }
}
- if ((train_data_size%100000)==0){
- cerr<<train_data_size<<" lines ... ";
+ // else if --ngram_size has not been specified, set it now
+ else {
+ ngram_size=lstr_items.size();
}
- //break;
- /*
+ }
+ if ((train_data_size%100000)==0){
+ cerr<<train_data_size<<" lines ... ";
+ }
+ //break;
+ /*
if (lstr_items.size() ==1) {
- cerr<<"line :"<<endl;
- cerr<<line<<endl;
- cerr<<"The number of items was 1"<<endl;
- getchar();
- }
- */
- num_tokens += lstr_items.size()+1;
- if (words_file == "") {
- for (int j=0; j<lstr_items.size(); j++) {
- count[lstr_items[j]] += 1;
- }
+ cerr<<"line :"<<endl;
+ cerr<<line<<endl;
+ cerr<<"The number of items was 1"<<endl;
+ getchar();
}
- // Add to validation set if the validation size
- // has not been specified
- if (validation_text == "" && validation_size > 0) {
- //cerr<<"validation size is "<<validation_data.size()<<endl;
- if (validation_data.size() == validation_size) {
- //validation_data.erase(validation_data.begin());
- validation_data.pop_front();
- }
- validation_data.push_back(lstr_items);
+ */
+ num_tokens += lstr_items.size()+1;
+ if (words_file == "") {
+ for (int j=0; j<lstr_items.size(); j++) {
+ count[lstr_items[j]] += 1;
}
}
- cerr<<endl;
- training.close();
- //cerr<<"validation size is "<<validation_data.size()<<endl;
- //getchar();
- if (validation_data.size() < validation_size) {
- cerr<<"validation size is "<<validation_data.size()<<endl;
- cerr << "error: requested validation size is greater than training data size" << endl;
- exit(1);
+ // Add to validation set if the validation size
+ // has not been specified
+ if (validation_text == "" && validation_size > 0) {
+ //cerr<<"validation size is "<<validation_data.size()<<endl;
+ if (validation_data.size() == validation_size) {
+ //validation_data.erase(validation_data.begin());
+ validation_data.pop_front();
+ }
+ validation_data.push_back(lstr_items);
}
-
- train_data_size -= validation_size;
- cerr<<"Training data size is "<<train_data_size<<endl;
-
- // The items in the validation data have already been counted
- // Decrementing the counts of those words before building the vocabulary
- for(int i=0; i<validation_data.size(); i++){
- num_tokens -= (validation_data[i].size() +1);
- for (int j=0; j<validation_data[i].size();j++){
- count[validation_data[i][j]] -= 1;
- if (count[validation_data[i][j]] == 0) {
- count.erase(validation_data[i][j]);
- }
+ }
+ cerr<<endl;
+ training.close();
+ //cerr<<"validation size is "<<validation_data.size()<<endl;
+ //getchar();
+ if (validation_data.size() < validation_size) {
+ cerr<<"validation size is "<<validation_data.size()<<endl;
+ cerr << "error: requested validation size is greater than training data size" << endl;
+ exit(1);
+ }
+
+ train_data_size -= validation_size;
+ cerr<<"Training data size is "<<train_data_size<<endl;
+
+ // The items in the validation data have already been counted
+ // Decrementing the counts of those words before building the vocabulary
+ for(int i=0; i<validation_data.size(); i++){
+ num_tokens -= (validation_data[i].size() +1);
+ for (int j=0; j<validation_data[i].size();j++){
+ count[validation_data[i][j]] -= 1;
+ if (count[validation_data[i][j]] == 0) {
+ count.erase(validation_data[i][j]);
}
}
+ }
- // Getting the top n frequent words for the vocabulary
- if (words_file == "") {
- vocab.insert_most_frequent(count, vocab_size);
- if (vocab.size() < vocab_size) {
- cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
- }
+ // Getting the top n frequent words for the vocabulary
+ if (words_file == "") {
+ vocab.insert_most_frequent(count, vocab_size);
+ if (vocab.size() < vocab_size) {
+ cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
}
- //vector<vector<string> > validation_data;
- if (validation_text != "") {
- readSentFile(validation_text, validation_data);
- for (int i=0; i<validation_data.size(); i++) {
- // if data is already ngramized, set/check ngram_size
- if (!ngramize) {
- // if --ngram_size has been specified, check that it does not conflict with --ngram_size
- if (ngram_size > 0) {
- if (ngram_size != validation_data[i].size()) {
- cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl;
- }
- }
- // else if --ngram_size has not been specified, set it now
- else {
- ngram_size=validation_data[i].size();
- }
- }
+ }
+ //vector<vector<string> > validation_data;
+ if (validation_text != "") {
+ readSentFile(validation_text, validation_data);
+ for (int i=0; i<validation_data.size(); i++) {
+ // if data is already ngramized, set/check ngram_size
+ if (!ngramize) {
+        // if --ngram_size has been specified, check that it matches the size of the validation ngrams
+ if (ngram_size > 0) {
+ if (ngram_size != validation_data[i].size()) {
+ cerr << "Error: size of validation ngrams does not match specified value of --ngram_size!" << endl;
+ }
}
+ // else if --ngram_size has not been specified, set it now
+ else {
+ ngram_size=validation_data[i].size();
+ }
+ }
}
- //READING SENTENCE WEIGHTS IF THERE ARE ANY
- vector<float> sent_weights;
- if (sent_weights_text != "") {
- cerr<<"Reading sentence weights from "<<sent_weights_text<<endl;
- ifstream sent_weights_file(sent_weights_text.c_str());
- string line;
- readWeightsFile(sent_weights_file,sent_weights);
- sent_weights_file.close();
- if (sent_weights_text.size() != train_data_size) {
- cerr<<"The number of sentence weights does not match the number of training sentences"<<endl;
- }
+ }
+ //READING SENTENCE WEIGHTS IF THERE ARE ANY
+ vector<float> sent_weights;
+ if (sent_weights_text != "") {
+ cerr<<"Reading sentence weights from "<<sent_weights_text<<endl;
+ ifstream sent_weights_file(sent_weights_text.c_str());
+ string line;
+ readWeightsFile(sent_weights_file,sent_weights);
+ sent_weights_file.close();
+      if (sent_weights.size() != train_data_size) {
+ cerr<<"The number of sentence weights does not match the number of training sentences"<<endl;
}
-
- /*
+ }
+
+ /*
else if (validation_size > 0)
{
- // Create validation data
- if (validation_size > train_data.size())
- {
- cerr << "error: requested validation size is greater than training data size" << endl;
- exit(1);
- }
- validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
- train_data.resize(train_data.size() - validation_size);
+ // Create validation data
+ if (validation_size > train_data.size())
+ {
+ cerr << "error: requested validation size is greater than training data size" << endl;
+ exit(1);
}
- */
-
- // Construct vocabulary
- //vocabulary vocab;
- //int start, stop;
-
- // read vocabulary from file
- if (words_file != "") {
- vector<string> words;
- readWordsFile(words_file,words);
- for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
- vocab.insert_word(*it);
- }
-
- // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
- if (vocab_size > 0) {
- if (vocab.size() != vocab_size) {
- cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl;
- }
- }
- // else, set it to the size of vocabulary read from file
- else {
- vocab_size = vocab.size();
- }
-
+ validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
+ train_data.resize(train_data.size() - validation_size);
}
- /*
- // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
- else {
- vocab.insert_word("<s>");
- vocab.insert_word("</s>");
- vocab.insert_word("<null>");
-
- // warn user that if --numberize is not set, there will be no vocabulary!
- if (!numberize) {
- cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
- }
- unordered_map<string,int> count;
- for (int i=0; i<train_data.size(); i++) {
- for (int j=0; j<train_data[i].size(); j++) {
- count[train_data[i][j]] += 1;
- }
- }
-
- vocab.insert_most_frequent(count, vocab_size);
- if (vocab.size() < vocab_size) {
- cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
- }
+ */
+
+ // Construct vocabulary
+ //vocabulary vocab;
+ //int start, stop;
+
+ // read vocabulary from file
+ if (words_file != "") {
+ vector<string> words;
+ readWordsFile(words_file,words);
+ for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
+ vocab.insert_word(*it);
}
- */
- // write vocabulary to file
- if (write_words_file != "") {
- cerr << "Writing vocabulary to " << write_words_file << endl;
- writeWordsFile(vocab.words(), write_words_file);
+ // was vocab_size set? if so, verify that it does not conflict with size of vocabulary read from file
+ if (vocab_size > 0) {
+ if (vocab.size() != vocab_size) {
+ cerr << "Error: size of vocabulary file " << vocab.size() << " != --vocab_size " << vocab_size << endl;
+ }
}
-
- // Write out numberized n-grams
- if (train_file != "")
- {
- cerr << "Writing training data to " << train_file << endl;
- if (mmap_file == true) {
- writeMmapNgrams(train_text,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize,
- train_file,
- train_data_size,
- num_tokens,
- randomize);
- } else {
- writeNgrams(train_text,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize,
- train_file,
- train_data_size,
- sent_weights,
- output_sent_weights_text);
- }
+ // else, set it to the size of vocabulary read from file
+ else {
+ vocab_size = vocab.size();
}
- if (validation_file != "")
- {
- cerr << "Writing validation data to " << validation_file << endl;
- writeNgrams(validation_data,
- ngram_size,
- vocab,
- numberize,
- add_start_stop,
- ngramize,
- validation_file);
+
+ }
+ /*
+ // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
+ else {
+ vocab.insert_word("<s>");
+ vocab.insert_word("</s>");
+ vocab.insert_word("<null>");
+
+ // warn user that if --numberize is not set, there will be no vocabulary!
+ if (!numberize) {
+ cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
+ }
+ unordered_map<string,int> count;
+ for (int i=0; i<train_data.size(); i++) {
+ for (int j=0; j<train_data[i].size(); j++) {
+ count[train_data[i][j]] += 1;
+ }
+ }
+
+ vocab.insert_most_frequent(count, vocab_size);
+ if (vocab.size() < vocab_size) {
+ cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
+ }
+ }
+ */
+
+ // write vocabulary to file
+ if (write_words_file != "") {
+ cerr << "Writing vocabulary to " << write_words_file << endl;
+ writeWordsFile(vocab.words(), write_words_file);
+ }
+
+ // Write out numberized n-grams
+ if (train_file != "")
+ {
+ cerr << "Writing training data to " << train_file << endl;
+ if (mmap_file == true) {
+ writeMmapNgrams(train_text,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize,
+ train_file,
+ train_data_size,
+ num_tokens,
+ randomize);
+ } else {
+ writeNgrams(train_text,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize,
+ train_file,
+ train_data_size,
+ sent_weights,
+ output_sent_weights_text);
}
+ }
+ if (validation_file != "")
+ {
+ cerr << "Writing validation data to " << validation_file << endl;
+ writeNgrams(validation_data,
+ ngram_size,
+ vocab,
+ numberize,
+ add_start_stop,
+ ngramize,
+ validation_file);
+ }
}
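
The prepareNeuralLM.cpp changes above carve the validation set out of the training text when no --validation_text is given: while streaming the corpus they keep a deque of the last validation_size sentences, then subtract the counts of those held-out tokens before insert_most_frequent() builds the vocabulary. The following is a minimal standalone sketch of that sliding-window idea; the names mirror the code above, but the program itself is illustrative only and is not part of nplm.

// Sketch: keep the last N lines of a stream as a held-out set while
// counting tokens for the rest (illustrative, not nplm code).
#include <deque>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    const std::size_t validation_size = 2;   // assumed, like --validation_size
    std::deque<std::vector<std::string> > validation_data;
    std::unordered_map<std::string, long> count;

    std::string line;
    while (std::getline(std::cin, line)) {
        std::vector<std::string> words;
        std::istringstream iss(line);
        for (std::string w; iss >> w; ) words.push_back(w);

        for (std::size_t i = 0; i < words.size(); ++i) ++count[words[i]];  // count everything first

        if (validation_data.size() == validation_size)
            validation_data.pop_front();              // slide the window
        validation_data.push_back(words);
    }

    // Remove the held-out tail from the counts, as prepareNeuralLM does
    // before calling insert_most_frequent().
    for (std::size_t i = 0; i < validation_data.size(); ++i)
        for (std::size_t j = 0; j < validation_data[i].size(); ++j)
            if (--count[validation_data[i][j]] == 0)
                count.erase(validation_data[i][j]);

    std::cerr << "held-out sentences: " << validation_data.size()
              << ", remaining types: " << count.size() << std::endl;
}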
diff --git a/src/propagator.h b/src/propagator.h
index 9f214de..6344f2f 100644
--- a/src/propagator.h
+++ b/src/propagator.h
@@ -13,360 +13,359 @@ using Eigen::MatrixBase;
using Eigen::Dynamic;
class propagator {
- int minibatch_size;
- model *pnn;
-
-public:
- Node<Input_word_embeddings> input_layer_node;
- Node<Linear_layer> first_hidden_linear_node;
- Node<Activation_function> first_hidden_activation_node;
- Node<Linear_layer> second_hidden_linear_node;
- Node<Activation_function> second_hidden_activation_node;
- Node<Output_word_embeddings> output_layer_node;
- bool skip_hidden;
-
-public:
- propagator () : minibatch_size(0), pnn(0) { }
-
- propagator (model &nn, int minibatch_size)
+ int minibatch_size;
+ model *pnn;
+
+ public:
+ Node<Input_word_embeddings> input_layer_node;
+ Node<Linear_layer> first_hidden_linear_node;
+ Node<Activation_function> first_hidden_activation_node;
+ Node<Linear_layer> second_hidden_linear_node;
+ Node<Activation_function> second_hidden_activation_node;
+ Node<Output_word_embeddings> output_layer_node;
+ bool skip_hidden;
+
+ public:
+ propagator () : minibatch_size(0), pnn(0) { }
+
+ propagator (model &nn, int minibatch_size)
:
- pnn(&nn),
- input_layer_node(&nn.input_layer, minibatch_size),
- first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size),
- first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size),
- second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size),
- second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size),
- output_layer_node(&nn.output_layer, minibatch_size),
- minibatch_size(minibatch_size)
- {
- skip_hidden = (nn.num_hidden == 0);
- }
+ pnn(&nn),
+ input_layer_node(&nn.input_layer, minibatch_size),
+ first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size),
+ first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size),
+ second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size),
+ second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size),
+ output_layer_node(&nn.output_layer, minibatch_size),
+ minibatch_size(minibatch_size)
+ {
+ skip_hidden = (nn.num_hidden == 0);
+ }
- // This must be called if the underlying model is resized.
- void resize(int minibatch_size) {
- this->minibatch_size = minibatch_size;
- input_layer_node.resize(minibatch_size);
- first_hidden_linear_node.resize(minibatch_size);
- first_hidden_activation_node.resize(minibatch_size);
- second_hidden_linear_node.resize(minibatch_size);
- second_hidden_activation_node.resize(minibatch_size);
- output_layer_node.resize(minibatch_size);
- }
+ // This must be called if the underlying model is resized.
+ void resize(int minibatch_size) {
+ this->minibatch_size = minibatch_size;
+ input_layer_node.resize(minibatch_size);
+ first_hidden_linear_node.resize(minibatch_size);
+ first_hidden_activation_node.resize(minibatch_size);
+ second_hidden_linear_node.resize(minibatch_size);
+ second_hidden_activation_node.resize(minibatch_size);
+ output_layer_node.resize(minibatch_size);
+ }
- void resize() { resize(minibatch_size); }
+ void resize() { resize(minibatch_size); }
- template <typename Derived>
- void fProp(const MatrixBase<Derived> &data)
+ template <typename Derived>
+ void fProp(const MatrixBase<Derived> &data)
+ {
+ if (!pnn->premultiplied)
{
- if (!pnn->premultiplied)
- {
- start_timer(0);
- input_layer_node.param->fProp(data, input_layer_node.fProp_matrix);
- stop_timer(0);
-
- start_timer(1);
- first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix,
- first_hidden_linear_node.fProp_matrix);
- }
- else
- {
- int n_inputs = first_hidden_linear_node.param->n_inputs();
- USCMatrix<double> sparse_data;
- input_layer_node.param->munge(data, sparse_data);
-
- start_timer(1);
- first_hidden_linear_node.param->fProp(sparse_data,
- first_hidden_linear_node.fProp_matrix);
- }
- first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix,
- first_hidden_activation_node.fProp_matrix);
- //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl;
- //std::getchar();
- stop_timer(1);
-
-
- if (!skip_hidden) {
- start_timer(2);
- second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix,
- second_hidden_linear_node.fProp_matrix);
- second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix,
- second_hidden_activation_node.fProp_matrix);
- stop_timer(2);
- }
-
- // The propagation stops here because the last layer is very expensive.
- }
+ start_timer(0);
+ input_layer_node.param->fProp(data, input_layer_node.fProp_matrix);
+ stop_timer(0);
- // Dense version (for standard log-likelihood)
- template <typename DerivedIn, typename DerivedOut>
- void bProp(const MatrixBase<DerivedIn> &data,
- const MatrixBase<DerivedOut> &output,
- double learning_rate,
- double momentum,
- double L2_reg,
- std::string &parameter_update,
- double conditioning_constant,
- double decay)
+ start_timer(1);
+ first_hidden_linear_node.param->fProp(input_layer_node.fProp_matrix,
+ first_hidden_linear_node.fProp_matrix);
+ }
+ else
{
- // Output embedding layer
-
- start_timer(7);
- output_layer_node.param->bProp(output,
- output_layer_node.bProp_matrix);
- stop_timer(7);
-
- start_timer(8);
- Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
- if (parameter_update == "SGD") {
- output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
- output,
- learning_rate,
- momentum);
- } else if (parameter_update == "ADA") {
- output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
- output,
- learning_rate);
- } else if (parameter_update == "ADAD") {
- //std::cerr<<"Adadelta gradient"<<endl;
- int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
- output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
- output,
- 1.0/current_minibatch_size,
- conditioning_constant,
- decay);
- } else {
- std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
- }
- stop_timer(8);
-
- bPropRest(data,
- learning_rate,
- momentum,
- L2_reg,
- parameter_update,
- conditioning_constant,
- decay);
+ int n_inputs = first_hidden_linear_node.param->n_inputs();
+ USCMatrix<double> sparse_data;
+ input_layer_node.param->munge(data, sparse_data);
+
+ start_timer(1);
+ first_hidden_linear_node.param->fProp(sparse_data,
+ first_hidden_linear_node.fProp_matrix);
+ }
+ first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix,
+ first_hidden_activation_node.fProp_matrix);
+ //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl;
+ //std::getchar();
+ stop_timer(1);
+
+
+ if (!skip_hidden) {
+ start_timer(2);
+ second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix,
+ second_hidden_linear_node.fProp_matrix);
+ second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix,
+ second_hidden_activation_node.fProp_matrix);
+ stop_timer(2);
}
- // Sparse version (for NCE log-likelihood)
- template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
- void bProp(const MatrixBase<DerivedIn> &data,
- const MatrixBase<DerivedOutI> &samples,
- const MatrixBase<DerivedOutV> &weights,
- double learning_rate,
- double momentum,
- double L2_reg,
- std::string &parameter_update,
- double conditioning_constant,
- double decay)
- {
+ // The propagation stops here because the last layer is very expensive.
+ }
+
+ // Dense version (for standard log-likelihood)
+ template <typename DerivedIn, typename DerivedOut>
+ void bProp(const MatrixBase<DerivedIn> &data,
+ const MatrixBase<DerivedOut> &output,
+ double learning_rate,
+ double momentum,
+ double L2_reg,
+ std::string &parameter_update,
+ double conditioning_constant,
+ double decay)
+ {
+ // Output embedding layer
+
+ start_timer(7);
+ output_layer_node.param->bProp(output,
+ output_layer_node.bProp_matrix);
+ stop_timer(7);
+
+ start_timer(8);
+ Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
+ if (parameter_update == "SGD") {
+ output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
+ output,
+ learning_rate,
+ momentum);
+ } else if (parameter_update == "ADA") {
+ output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
+ output,
+ learning_rate);
+ } else if (parameter_update == "ADAD") {
+ //std::cerr<<"Adadelta gradient"<<endl;
+ int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
+ output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
+ output,
+ 1.0/current_minibatch_size,
+ conditioning_constant,
+ decay);
+ } else {
+ std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
+ }
+ stop_timer(8);
- // Output embedding layer
-
- start_timer(7);
- output_layer_node.param->bProp(samples,
- weights,
- output_layer_node.bProp_matrix);
- stop_timer(7);
-
-
- start_timer(8);
- Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
- if (parameter_update == "SGD") {
- output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
- samples,
- weights,
- learning_rate,
- momentum);
- } else if (parameter_update == "ADA") {
- output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
- samples,
- weights,
- learning_rate);
- } else if (parameter_update == "ADAD") {
- int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
- //std::cerr<<"Adadelta gradient"<<endl;
- output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
- samples,
- weights,
- 1.0/current_minibatch_size,
- conditioning_constant,
- decay);
- } else {
- std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
+ bPropRest(data,
+ learning_rate,
+ momentum,
+ L2_reg,
+ parameter_update,
+ conditioning_constant,
+ decay);
}
- stop_timer(8);
+ // Sparse version (for NCE log-likelihood)
+ template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
+ void bProp(const MatrixBase<DerivedIn> &data,
+ const MatrixBase<DerivedOutI> &samples,
+ const MatrixBase<DerivedOutV> &weights,
+ double learning_rate,
+ double momentum,
+ double L2_reg,
+ std::string &parameter_update,
+ double conditioning_constant,
+ double decay)
+ {
- bPropRest(data,
- learning_rate,
- momentum,
- L2_reg,
- parameter_update,
- conditioning_constant,
- decay);
+ // Output embedding layer
+
+ start_timer(7);
+ output_layer_node.param->bProp(samples,
+ weights,
+ output_layer_node.bProp_matrix);
+ stop_timer(7);
+
+
+ start_timer(8);
+ Node<Activation_function> & final_hidden_activation_node = skip_hidden ? first_hidden_activation_node : second_hidden_activation_node;
+ if (parameter_update == "SGD") {
+ output_layer_node.param->computeGradient(final_hidden_activation_node.fProp_matrix,
+ samples,
+ weights,
+ learning_rate,
+ momentum);
+ } else if (parameter_update == "ADA") {
+ output_layer_node.param->computeGradientAdagrad(final_hidden_activation_node.fProp_matrix,
+ samples,
+ weights,
+ learning_rate);
+ } else if (parameter_update == "ADAD") {
+ int current_minibatch_size = final_hidden_activation_node.fProp_matrix.cols();
+ //std::cerr<<"Adadelta gradient"<<endl;
+ output_layer_node.param->computeGradientAdadelta(final_hidden_activation_node.fProp_matrix,
+ samples,
+ weights,
+ 1.0/current_minibatch_size,
+ conditioning_constant,
+ decay);
+ } else {
+ std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
}
-private:
- template <typename DerivedIn>
- void bPropRest(const MatrixBase<DerivedIn> &data,
- double learning_rate, double momentum, double L2_reg,
- std::string &parameter_update,
- double conditioning_constant,
- double decay)
- {
- // Second hidden layer
+ stop_timer(8);
+ bPropRest(data,
+ learning_rate,
+ momentum,
+ L2_reg,
+ parameter_update,
+ conditioning_constant,
+ decay);
+ }
-
- // All the compute gradient functions are together and the backprop
- // functions are together
- ////////BACKPROP////////////
- start_timer(9);
- if (skip_hidden)
+ private:
+ template <typename DerivedIn>
+ void bPropRest(const MatrixBase<DerivedIn> &data,
+ double learning_rate, double momentum, double L2_reg,
+ std::string &parameter_update,
+ double conditioning_constant,
+ double decay)
{
- start_timer(9);
- first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+ // Second hidden layer
+
+
+
+ // All the compute gradient functions are together and the backprop
+ // functions are together
+ ////////BACKPROP////////////
+ start_timer(9);
+ if (skip_hidden)
+ {
+ start_timer(9);
+ first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
first_hidden_activation_node.bProp_matrix,
first_hidden_linear_node.fProp_matrix,
first_hidden_activation_node.fProp_matrix);
- first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
- first_hidden_linear_node.bProp_matrix);
- stop_timer(9);
+ first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+ first_hidden_linear_node.bProp_matrix);
+ stop_timer(9);
- }
- else
- {
- second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
- second_hidden_activation_node.bProp_matrix,
- second_hidden_linear_node.fProp_matrix,
- second_hidden_activation_node.fProp_matrix);
+ }
+ else
+ {
+ second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+ second_hidden_activation_node.bProp_matrix,
+ second_hidden_linear_node.fProp_matrix,
+ second_hidden_activation_node.fProp_matrix);
- second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix,
- second_hidden_linear_node.bProp_matrix);
- stop_timer(9);
+ second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix,
+ second_hidden_linear_node.bProp_matrix);
+ stop_timer(9);
- start_timer(11);
- first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix,
- first_hidden_activation_node.bProp_matrix,
- first_hidden_linear_node.fProp_matrix,
- first_hidden_activation_node.fProp_matrix);
+ start_timer(11);
+ first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix,
+ first_hidden_activation_node.bProp_matrix,
+ first_hidden_linear_node.fProp_matrix,
+ first_hidden_activation_node.fProp_matrix);
- first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
- first_hidden_linear_node.bProp_matrix);
- stop_timer(11);
- }
- //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl;
- //std::getchar();
- ////COMPUTE GRADIENT/////////
- if (parameter_update == "SGD") {
- if (!skip_hidden)
- {
- start_timer(10);
- second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
- first_hidden_activation_node.fProp_matrix,
- learning_rate,
- momentum,
- L2_reg);
- stop_timer(10);
+ first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+ first_hidden_linear_node.bProp_matrix);
+ stop_timer(11);
}
-
- // First hidden layer
-
-
- start_timer(12);
- first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix,
- input_layer_node.fProp_matrix,
- learning_rate, momentum, L2_reg);
- stop_timer(12);
-
- // Input word embeddings
-
- start_timer(13);
- input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix,
- data,
- learning_rate, momentum, L2_reg);
- stop_timer(13);
- } else if (parameter_update == "ADA") {
- if (!skip_hidden)
- {
- start_timer(10);
- second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix,
- first_hidden_activation_node.fProp_matrix,
- learning_rate,
- L2_reg);
- stop_timer(10);
+ //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl;
+ //std::getchar();
+ ////COMPUTE GRADIENT/////////
+ if (parameter_update == "SGD") {
+ if (!skip_hidden)
+ {
+ start_timer(10);
+ second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
+ first_hidden_activation_node.fProp_matrix,
+ learning_rate,
+ momentum,
+ L2_reg);
+ stop_timer(10);
+ }
+
+ // First hidden layer
+
+
+ start_timer(12);
+ first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix,
+ input_layer_node.fProp_matrix,
+ learning_rate, momentum, L2_reg);
+ stop_timer(12);
+
+ // Input word embeddings
+
+ start_timer(13);
+ input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix,
+ data,
+ learning_rate, momentum, L2_reg);
+ stop_timer(13);
+ } else if (parameter_update == "ADA") {
+ if (!skip_hidden)
+ {
+ start_timer(10);
+ second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix,
+ first_hidden_activation_node.fProp_matrix,
+ learning_rate,
+ L2_reg);
+ stop_timer(10);
+ }
+
+ // First hidden layer
+
+
+ start_timer(12);
+ first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix,
+ input_layer_node.fProp_matrix,
+ learning_rate,
+ L2_reg);
+ stop_timer(12);
+
+ // Input word embeddings
+
+ start_timer(13);
+ input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix,
+ data,
+ learning_rate,
+ L2_reg);
+ stop_timer(13);
+ } else if (parameter_update == "ADAD") {
+ int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols();
+ //std::cerr<<"Adadelta gradient"<<endl;
+ if (!skip_hidden)
+ {
+ start_timer(10);
+ second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix,
+ first_hidden_activation_node.fProp_matrix,
+ 1.0/current_minibatch_size,
+ L2_reg,
+ conditioning_constant,
+ decay);
+ stop_timer(10);
+ }
+ //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl;
+
+ // First hidden layer
+
+
+ start_timer(12);
+ first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix,
+ input_layer_node.fProp_matrix,
+ 1.0/current_minibatch_size,
+ L2_reg,
+ conditioning_constant,
+ decay);
+ stop_timer(12);
+
+ //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl;
+ // Input word embeddings
+
+ start_timer(13);
+ input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix,
+ data,
+ 1.0/current_minibatch_size,
+ L2_reg,
+ conditioning_constant,
+ decay);
+ stop_timer(13);
+
+ //std::cerr<<"Finished gradient for first input layer"<<std::endl;
+ } else {
+ std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
}
- // First hidden layer
-
-
- start_timer(12);
- first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix,
- input_layer_node.fProp_matrix,
- learning_rate,
- L2_reg);
- stop_timer(12);
-
- // Input word embeddings
-
- start_timer(13);
- input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix,
- data,
- learning_rate,
- L2_reg);
- stop_timer(13);
- } else if (parameter_update == "ADAD") {
- int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols();
- //std::cerr<<"Adadelta gradient"<<endl;
- if (!skip_hidden)
- {
- start_timer(10);
- second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix,
- first_hidden_activation_node.fProp_matrix,
- 1.0/current_minibatch_size,
- L2_reg,
- conditioning_constant,
- decay);
- stop_timer(10);
- }
- //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl;
-
- // First hidden layer
-
-
- start_timer(12);
- first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix,
- input_layer_node.fProp_matrix,
- 1.0/current_minibatch_size,
- L2_reg,
- conditioning_constant,
- decay);
- stop_timer(12);
-
- //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl;
- // Input word embeddings
-
- start_timer(13);
- input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix,
- data,
- 1.0/current_minibatch_size,
- L2_reg,
- conditioning_constant,
- decay);
- stop_timer(13);
-
- //std::cerr<<"Finished gradient for first input layer"<<std::endl;
- } else {
- std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
}
-
- }
};
} // namespace nplm
#endif
-
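
The propagator above wires one Node per layer, runs fProp only up to the last hidden activation (the output layer is handled separately because it is expensive), and selects the gradient routine from the --parameter_update string: SGD, ADA (Adagrad) or ADAD (Adadelta). The toy program below sketches only that string-dispatch pattern on a flat parameter vector; toy_layer and its update rules are illustrative and are not nplm's API.

// Illustrative only: dispatch on a parameter_update string, as the
// propagator does for SGD / ADA / ADAD, applied to a toy parameter vector.
#include <cmath>
#include <iostream>
#include <string>
#include <vector>

struct toy_layer {
    std::vector<double> w, grad, running_sq;   // running_sq is the Adagrad accumulator
    explicit toy_layer(std::size_t n) : w(n, 0.0), grad(n, 0.1), running_sq(n, 0.0) {}

    void sgd_update(double lr) {
        for (std::size_t i = 0; i < w.size(); ++i) w[i] -= lr * grad[i];
    }
    void adagrad_update(double lr) {
        for (std::size_t i = 0; i < w.size(); ++i) {
            running_sq[i] += grad[i] * grad[i];
            w[i] -= lr * grad[i] / (std::sqrt(running_sq[i]) + 1e-8);
        }
    }
};

void update(toy_layer &layer, const std::string &parameter_update, double lr) {
    if (parameter_update == "SGD")      layer.sgd_update(lr);
    else if (parameter_update == "ADA") layer.adagrad_update(lr);
    else std::cerr << "Parameter update: " << parameter_update << " is unrecognized" << std::endl;
}

int main() {
    toy_layer layer(4);
    update(layer, "ADA", 0.5);
    std::cout << "w[0] after one Adagrad step: " << layer.w[0] << std::endl;
}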
diff --git a/src/replace_digits.hpp b/src/replace_digits.hpp
new file mode 100644
index 0000000..e8ac957
--- /dev/null
+++ b/src/replace_digits.hpp
@@ -0,0 +1,62 @@
+/** \file \author Jonathan Graehl <graehl@gmail.com>
+
+ replace 0-9 ascii chars with another ascii replacement
+
+ To the extent possible under law, the author(s) have dedicated all copyright
+ and related and neighboring rights to this software to the public domain
+ worldwide. This software is distributed without any warranty.
+*/
+
+#ifndef REPLACEDIGITS_GRAEHL_2015_06_25_H
+#define REPLACEDIGITS_GRAEHL_2015_06_25_H
+#pragma once
+
+#include <string>
+#include <utility>
+
+namespace graehl {
+
+inline bool ascii_digit(char c) {
+ return c >= '0' && c <= '9';
+}
+
+struct replace_digits {
+ char map_digits;
+ replace_digits(char map_digits = '@') : map_digits(map_digits) {}
+
+ /// \return whether anything was replaced
+ bool replaced(char* i, char* end) const {
+ for (; i != end; ++i)
+ if (ascii_digit(*i)) {
+ *i = map_digits;
+ while (++i != end)
+ if (ascii_digit(*i)) *i = map_digits;
+ return true;
+ }
+ return false;
+ }
+  /// like replaced(), but a no-op (returns false) unless map_digits is non-zero
+ bool maybe_replaced(char* i, char* end) const { return map_digits && replaced(i, end); }
+
+ void replace(char* i, char* end) const {
+ for (; i != end; ++i)
+ if (ascii_digit(*i)) *i = map_digits;
+ }
+ void maybe_replace(char* i, char* end) const {
+ if (map_digits) replace(i, end);
+ }
+
+ void replace(std::string& str, std::string::size_type i = 0) const {
+ std::string::size_type n = str.size();
+ char* d = (char *)str.data(); // although only C++11 officially allows this, in reality everyone does
+ replace(d + i, d + n);
+ }
+ void maybe_replace(std::string& str, std::string::size_type i = 0) const {
+ if (map_digits) replace(str, i);
+ }
+};
+
+
+}
+
+#endif
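
The new header above replaces ASCII digits in place: replace() rewrites every digit to map_digits, replaced() additionally reports whether anything changed, and the maybe_* variants do nothing when map_digits is 0. A small usage sketch, assuming src/ is on the include path:

// Usage sketch for graehl::replace_digits from the header added above.
#include <iostream>
#include <string>
#include "replace_digits.hpp"   // assumes src/ is on the include path

int main() {
    graehl::replace_digits rd('@');          // '@' is also the default replacement
    std::string s = "room 221b, floor 3";
    rd.replace(s);                           // in place: every digit -> '@'
    std::cout << s << std::endl;             // prints "room @@@b, floor @"

    graehl::replace_digits off('\0');        // map_digits == 0 disables the maybe_* variants
    std::string t = "call 555-0100";
    off.maybe_replace(t);                    // leaves t unchanged
    std::cout << t << std::endl;
}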
diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp
index 4f3713d..abaab34 100644
--- a/src/testNeuralLM.cpp
+++ b/src/testNeuralLM.cpp
@@ -6,7 +6,6 @@
#include <tclap/CmdLine.h>
#include <Eigen/Core>
-//#include <../3rdparty/Eigen/Dense>
#include <Eigen/Dense>
#include "param.h"
@@ -21,174 +20,174 @@ using namespace Eigen;
using namespace nplm;
void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<int> > &ngrams,
- vector<double> &out) {
- if (ngrams.size() == 0) return;
- int ngram_size = ngrams[0].size();
-
- if (minibatch_size == 0)
+ vector<double> &out) {
+ if (ngrams.size() == 0) return;
+ int ngram_size = ngrams[0].size();
+
+ if (minibatch_size == 0)
+ {
+ // Score one n-gram at a time. This is how the LM would be queried from a decoder.
+ for (int sent_id=0; sent_id<start.size()-1; sent_id++)
{
- // Score one n-gram at a time. This is how the LM would be queried from a decoder.
- for (int sent_id=0; sent_id<start.size()-1; sent_id++)
- {
- double sent_log_prob = 0.0;
- for (int j=start[sent_id]; j<start[sent_id+1]; j++)
- sent_log_prob += lm.lookup_ngram(ngrams[j]);
- out.push_back(sent_log_prob);
- }
+ double sent_log_prob = 0.0;
+ for (int j=start[sent_id]; j<start[sent_id+1]; j++)
+ sent_log_prob += lm.lookup_ngram(ngrams[j]);
+ out.push_back(sent_log_prob);
}
- else
+ }
+ else
+ {
+ // Score a whole minibatch at a time.
+ Matrix<double,1,Dynamic> log_probs(ngrams.size());
+
+ Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size);
+ minibatch.setZero();
+ for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size)
{
- // Score a whole minibatch at a time.
- Matrix<double,1,Dynamic> log_probs(ngrams.size());
-
- Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size);
- minibatch.setZero();
- for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size)
- {
- int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id;
- for (int j=0; j<current_minibatch_size; j++)
- minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size);
- lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size));
- }
-
- for (int sent_id=0; sent_id<start.size()-1; sent_id++)
- {
- double sent_log_prob = 0.0;
- for (int j=start[sent_id]; j<start[sent_id+1]; j++)
- sent_log_prob += log_probs[j];
- out.push_back(sent_log_prob);
- }
+ int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id;
+ for (int j=0; j<current_minibatch_size; j++)
+ minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size);
+ lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size));
}
+
+ for (int sent_id=0; sent_id<start.size()-1; sent_id++)
+ {
+ double sent_log_prob = 0.0;
+ for (int j=start[sent_id]; j<start[sent_id+1]; j++)
+ sent_log_prob += log_probs[j];
+ out.push_back(sent_log_prob);
+ }
+ }
}
-int main (int argc, char *argv[])
+int main (int argc, char *argv[])
{
- param myParam;
- bool normalization;
- bool numberize, ngramize, add_start_stop;
+ param myParam;
+ bool normalization;
+ bool numberize, ngramize, add_start_stop;
- try {
- // program options //
- CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1");
+ try {
+ // program options //
+ CmdLine cmd("Tests a two-layer neural probabilistic language model.", ' ' , "0.1");
- ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd);
- ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd);
+ ValueArg<int> num_threads("", "num_threads", "Number of threads. Default: maximum.", false, 0, "int", cmd);
+ ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size. Default: none.", false, 0, "int", cmd);
- ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
- ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
- ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
+ ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
+ ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
+ ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
- ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
+ ValueArg<bool> arg_normalization("", "normalization", "Normalize probabilities. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
- ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd);
+ ValueArg<string> arg_test_file("", "test_file", "Test file (one tokenized sentence per line).", true, "", "string", cmd);
- ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd);
+ ValueArg<string> arg_model_file("", "model_file", "Language model file.", true, "", "string", cmd);
- cmd.parse(argc, argv);
+ cmd.parse(argc, argv);
- myParam.model_file = arg_model_file.getValue();
- myParam.test_file = arg_test_file.getValue();
+ myParam.model_file = arg_model_file.getValue();
+ myParam.test_file = arg_test_file.getValue();
- normalization = arg_normalization.getValue();
- numberize = arg_numberize.getValue();
- ngramize = arg_ngramize.getValue();
- add_start_stop = arg_add_start_stop.getValue();
+ normalization = arg_normalization.getValue();
+ numberize = arg_numberize.getValue();
+ ngramize = arg_ngramize.getValue();
+ add_start_stop = arg_add_start_stop.getValue();
- myParam.minibatch_size = minibatch_size.getValue();
- myParam.num_threads = num_threads.getValue();
+ myParam.minibatch_size = minibatch_size.getValue();
+ myParam.num_threads = num_threads.getValue();
- cerr << "Command line: " << endl;
- cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
-
- const string sep(" Value: ");
- cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl;
- cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl;
+ cerr << "Command line: " << endl;
+ cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
- cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl;
- cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
- cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+ const string sep(" Value: ");
+ cerr << arg_test_file.getDescription() << sep << arg_test_file.getValue() << endl;
+ cerr << arg_model_file.getDescription() << sep << arg_model_file.getValue() << endl;
- cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
- cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl;
- }
- catch (TCLAP::ArgException &e)
- {
- cerr << "error: " << e.error() << " for arg " << e.argId() << endl;
- exit(1);
- }
+ cerr << arg_normalization.getDescription() << sep << arg_normalization.getValue() << endl;
+ cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
+ cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
- myParam.num_threads = setup_threads(myParam.num_threads);
+ cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
+ cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl;
+ }
+ catch (TCLAP::ArgException &e)
+ {
+ cerr << "error: " << e.error() << " for arg " << e.argId() << endl;
+ exit(1);
+ }
- ///// Create language model
+ myParam.num_threads = setup_threads(myParam.num_threads);
- neuralLM lm;
- lm.read(myParam.model_file);
- lm.set_normalization(normalization);
- lm.set_log_base(10);
- lm.set_cache(1048576);
- int ngram_size = lm.get_order();
- int minibatch_size = myParam.minibatch_size;
- if (minibatch_size)
- lm.set_width(minibatch_size);
+ ///// Create language model
- ///// Read test data
-
- ifstream test_file(myParam.test_file.c_str());
- if (!test_file)
- {
- cerr << "error: could not open " << myParam.test_file << endl;
- exit(1);
- }
- string line;
+ neuralLM lm;
+ lm.read(myParam.model_file);
+ lm.set_normalization(normalization);
+ lm.set_log_base(10);
+ lm.set_cache(1048576);
+ int ngram_size = lm.get_order();
+ int minibatch_size = myParam.minibatch_size;
+ if (minibatch_size)
+ lm.set_width(minibatch_size);
- vector<int> start;
- vector<vector<int> > ngrams;
+ ///// Read test data
- while (getline(test_file, line))
- {
- vector<string> words;
- splitBySpace(line, words);
+ ifstream test_file(myParam.test_file.c_str());
+ if (!test_file)
+ {
+ cerr << "error: could not open " << myParam.test_file << endl;
+ exit(1);
+ }
+ string line;
- vector<vector<int> > sent_ngrams;
- preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize);
+ vector<int> start;
+ vector<vector<int> > ngrams;
- start.push_back(ngrams.size());
- copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams));
- }
- start.push_back(ngrams.size());
+ while (getline(test_file, line))
+ {
+ vector<string> words;
+ splitBySpace(line, words);
- int num_threads = 1;
- vector< vector<double> > sent_log_probs(num_threads);
+ vector<vector<int> > sent_ngrams;
+ preprocessWords(words, sent_ngrams, ngram_size, lm.get_vocabulary(), numberize, add_start_stop, ngramize);
- /*
- // Test thread safety
- boost::thread_group tg;
- for (int t=0; t < num_threads; t++) {
- tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm
- }
- tg.join_all();
- */
- score(lm, minibatch_size, start, ngrams, sent_log_probs[0]);
-
- vector<double> log_likelihood(num_threads);
- std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0);
- for (int i=0; i<sent_log_probs[0].size(); i++) {
- for (int t=0; t<num_threads; t++)
- cout << sent_log_probs[t][i] << "\t";
- cout << endl;
- for (int t=0; t<num_threads; t++)
- log_likelihood[t] += sent_log_probs[t][i];
- }
-
- cerr << "Test log10-likelihood: ";
+ start.push_back(ngrams.size());
+ copy(sent_ngrams.begin(), sent_ngrams.end(), back_inserter(ngrams));
+ }
+ start.push_back(ngrams.size());
+
+ int num_threads = 1;
+ vector< vector<double> > sent_log_probs(num_threads);
+
+ /*
+ // Test thread safety
+ boost::thread_group tg;
+ for (int t=0; t < num_threads; t++) {
+ tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm
+ }
+ tg.join_all();
+ */
+ score(lm, minibatch_size, start, ngrams, sent_log_probs[0]);
+
+ vector<double> log_likelihood(num_threads);
+ std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0);
+ for (int i=0; i<sent_log_probs[0].size(); i++) {
for (int t=0; t<num_threads; t++)
- cerr << log_likelihood[t] << " ";
- cerr << endl;
- #ifdef USE_CHRONO
- cerr << "Propagation times:";
- for (int i=0; i<timer.size(); i++)
- cerr << " " << timer.get(i);
- cerr << endl;
- #endif
-
+ cout << sent_log_probs[t][i] << "\t";
+ cout << endl;
+ for (int t=0; t<num_threads; t++)
+ log_likelihood[t] += sent_log_probs[t][i];
+ }
+
+ cerr << "Test log10-likelihood: ";
+ for (int t=0; t<num_threads; t++)
+ cerr << log_likelihood[t] << " ";
+ cerr << endl;
+#ifdef USE_CHRONO
+ cerr << "Propagation times:";
+ for (int i=0; i<timer.size(); i++)
+ cerr << " " << timer.get(i);
+ cerr << endl;
+#endif
+
}
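
score() above supports two modes: with --minibatch_size 0 it looks up one n-gram at a time (the way a decoder would query the LM); otherwise it fills an ngram_size x minibatch_size Eigen matrix and scores whole columns at once. Either way, per-sentence log probabilities come from summing over the ranges start[sent_id]..start[sent_id+1). The snippet below illustrates only that offset-based summation with made-up numbers; it is not nplm code.

// Illustrative only: per-sentence sums over "start offset" ranges,
// matching the convention used by score() above (start.back() == total n-grams).
#include <iostream>
#include <vector>

int main() {
    std::vector<double> log_probs = {-1.0, -0.5, -2.0, -0.25, -0.75};  // per n-gram
    std::vector<int> start = {0, 2, 5};                                // two sentences

    for (std::size_t sent_id = 0; sent_id + 1 < start.size(); ++sent_id) {
        double sent_log_prob = 0.0;
        for (int j = start[sent_id]; j < start[sent_id + 1]; ++j)
            sent_log_prob += log_probs[j];
        std::cout << "sentence " << sent_id << ": " << sent_log_prob << std::endl;
    }
}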
diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp
index 97af03b..d4720ef 100644
--- a/src/trainNeuralNetwork.cpp
+++ b/src/trainNeuralNetwork.cpp
@@ -6,17 +6,16 @@
#include <vector>
#include <algorithm>
-#include <boost/unordered_map.hpp>
+#include <boost/unordered_map.hpp>
#include <boost/functional.hpp>
#include <boost/lexical_cast.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <boost/algorithm/string/join.hpp>
-# include <boost/interprocess/managed_shared_memory.hpp>
-# include <boost/interprocess/allocators/allocator.hpp>
-# include <boost/interprocess/managed_mapped_file.hpp>
+#include <boost/interprocess/managed_shared_memory.hpp>
+#include <boost/interprocess/allocators/allocator.hpp>
+#include <boost/interprocess/managed_mapped_file.hpp>
#include <boost/interprocess/containers/vector.hpp>
-//#include <../3rdparty/Eigen/Dense>
#include <Eigen/Dense>
#include <Eigen/Sparse>
#include "maybe_omp.h"
@@ -29,7 +28,6 @@
#include "graphClasses.h"
#include "util.h"
#include "multinomial.h"
-//#include "gradientCheck.h"
//#define EIGEN_DONT_PARALLELIZE
@@ -65,7 +63,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va
int validation_minibatch_start_index = validation_minibatch_size * validation_batch;
int current_minibatch_size = min(validation_minibatch_size,
validation_data_size - validation_minibatch_start_index);
- minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index,
+ minibatch.leftCols(current_minibatch_size) = validation_data.middleCols(validation_minibatch_start_index,
current_minibatch_size);
prop_validation.fProp(minibatch.topRows(ngram_size-1));
@@ -80,7 +78,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va
// And softmax and loss. Be careful of short minibatch
double minibatch_log_likelihood;
start_timer(5);
- SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
+ SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
minibatch.row(ngram_size-1),
output_probs,
minibatch_log_likelihood);
@@ -93,7 +91,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va
// If the validation log-likelihood decreases (i.e. perplexity increases), halve the learning rate.
if (current_validation_ll != 0.0 && log_likelihood < current_validation_ll && myParam.parameter_update != "ADA")
- {
+ {
current_learning_rate /= 2;
}
current_validation_ll = log_likelihood;
@@ -101,7 +99,7 @@ void compute_validation_perplexity(int ngram_size, int output_vocab_size, int va
int main(int argc, char** argv)
-{
+{
ios::sync_with_stdio(false);
bool use_mmap_file, randomize;
param myParam;
@@ -183,7 +181,7 @@ int main(int argc, char** argv)
myParam.input_words_file = input_words_file.getValue();
myParam.output_words_file = output_words_file.getValue();
if (words_file.getValue() != "")
- myParam.input_words_file = myParam.output_words_file = words_file.getValue();
+ myParam.input_words_file = myParam.output_words_file = words_file.getValue();
myParam.model_prefix = model_prefix.getValue();
@@ -192,7 +190,7 @@ int main(int argc, char** argv)
myParam.input_vocab_size = input_vocab_size.getValue();
myParam.output_vocab_size = output_vocab_size.getValue();
if (vocab_size.getValue() > 0) {
- myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue();
+ myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue();
}
myParam.num_hidden = num_hidden.getValue();
myParam.activation_function = activation_function.getValue();
@@ -205,7 +203,7 @@ int main(int argc, char** argv)
myParam.input_embedding_dimension = input_embedding_dimension.getValue();
myParam.output_embedding_dimension = output_embedding_dimension.getValue();
if (embedding_dimension.getValue() >= 0) {
- myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue();
+ myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue();
}
myParam.minibatch_size = minibatch_size.getValue();
@@ -243,33 +241,33 @@ int main(int argc, char** argv)
if (embedding_dimension.getValue() >= 0)
{
- cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl;
+ cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl;
}
else
{
- cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl;
- cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl;
+ cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl;
+ cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl;
}
cerr << share_embeddings.getDescription() << sep << share_embeddings.getValue() << endl;
if (share_embeddings.getValue() && input_embedding_dimension.getValue() != output_embedding_dimension.getValue())
{
- cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl;
- exit(1);
+ cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl;
+ exit(1);
}
cerr << num_hidden.getDescription() << sep << num_hidden.getValue() << endl;
if (string_to_activation_function(activation_function.getValue()) == InvalidFunction)
{
- cerr << "error: invalid activation function: " << activation_function.getValue() << endl;
- exit(1);
+ cerr << "error: invalid activation function: " << activation_function.getValue() << endl;
+ exit(1);
}
cerr << activation_function.getDescription() << sep << activation_function.getValue() << endl;
if (string_to_loss_function(loss_function.getValue()) == InvalidLoss)
{
- cerr << "error: invalid loss function: " << loss_function.getValue() << endl;
- exit(1);
+ cerr << "error: invalid loss function: " << loss_function.getValue() << endl;
+ exit(1);
}
cerr << loss_function.getDescription() << sep << loss_function.getValue() << endl;
@@ -279,7 +277,7 @@ int main(int argc, char** argv)
cerr << num_epochs.getDescription() << sep << num_epochs.getValue() << endl;
cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
if (myParam.validation_file != "") {
- cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl;
+ cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl;
}
cerr << learning_rate.getDescription() << sep << learning_rate.getValue() << endl;
cerr << L2_reg.getDescription() << sep << L2_reg.getValue() << endl;
@@ -288,7 +286,7 @@ int main(int argc, char** argv)
cerr << normalization.getDescription() << sep << normalization.getValue() << endl;
if (myParam.normalization){
- cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl;
+ cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl;
}
cerr << use_momentum.getDescription() << sep << use_momentum.getValue() << endl;
@@ -302,7 +300,7 @@ int main(int argc, char** argv)
if (unigram_probs_file.getValue() != "")
{
- cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl;
+ cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl;
}
}
catch (TCLAP::ArgException &e)
@@ -316,7 +314,7 @@ int main(int argc, char** argv)
//unsigned seed = std::time(0);
unsigned seed = 1234; //for testing only
- mt19937 rng(seed);
+ boost::random::mt19937 rng(seed);
/////////////////////////READING IN THE TRAINING AND VALIDATION DATA///////////////////
/////////////////////////////////////////////////////////////////////////////////////
@@ -337,7 +335,7 @@ int main(int argc, char** argv)
training_data_flat_mmap = mmap_file.find<vec>("vector").first;
cerr<<"Size of mmaped vector is "<<training_data_flat_mmap->size()<<endl;
training_data_size = training_data_flat_mmap->size()/myParam.ngram_size;
- //randomly shuffle the data for better learning. The shuffling will
+ //randomly shuffle the data for better learning. The shuffling will
//be different for a standard stl vector
// Randomly shuffle training data to improve learning
if (randomize == true) {
@@ -413,10 +411,10 @@ int main(int argc, char** argv)
//cerr<<"Num tokens "<<num_tokens<<endl;
//data_size_t training_data_size = num_tokens / myParam.ngram_size;
cerr << "Number of training instances: "<< training_data_size << endl;
-
+
Matrix<int,Dynamic,Dynamic> training_data;
//(training_data_flat.data(), myParam.ngram_size, training_data_size);
-
+
#ifdef MAP
cerr<<"Setting up eigen map"<<endl;
if (use_mmap_file == false) {
@@ -425,11 +423,11 @@ int main(int argc, char** argv)
training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat_mmap->data().get(), myParam.ngram_size, training_data_size);
}
cerr<<"Created eigen map"<<endl;
- #else
+ #else
if (use_mmap_file == false) {
training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size);
}
- #endif
+ #endif
// If neither --input_vocab_size nor --input_words_file is given, set input_vocab_size to the maximum word index
if (myParam.input_vocab_size == 0 and myParam.input_words_file == "")
{
@@ -454,7 +452,7 @@ int main(int argc, char** argv)
// Read validation data
vector<int> validation_data_flat;
int validation_data_size = 0;
-
+
if (myParam.validation_file != "")
{
readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat);
@@ -470,16 +468,16 @@ int main(int argc, char** argv)
if (myParam.input_words_file != "")
{
readWordsFile(myParam.input_words_file, input_words);
- if (myParam.input_vocab_size == 0)
- myParam.input_vocab_size = input_words.size();
+ if (myParam.input_vocab_size == 0)
+ myParam.input_vocab_size = input_words.size();
}
vector<string> output_words;
if (myParam.output_words_file != "")
{
readWordsFile(myParam.output_words_file, output_words);
- if (myParam.output_vocab_size == 0)
- myParam.output_vocab_size = output_words.size();
+ if (myParam.output_vocab_size == 0)
+ myParam.output_vocab_size = output_words.size();
}
///// Construct unigram model and sampler that will be used for NCE
@@ -491,17 +489,17 @@ int main(int argc, char** argv)
if (use_mmap_file == false) {
output_word = training_data(myParam.ngram_size-1, train_id);
} else {
- //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl;
+ //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl;
output_word = training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1);
}
- //cerr<<"output word is "<<output_word<<endl;
- unigram_counts[output_word] += 1;
+ //cerr<<"output word is "<<output_word<<endl;
+ unigram_counts[output_word] += 1;
}
multinomial<data_size_t> unigram (unigram_counts);
///// Create and initialize the neural network and associated propagators.
model nn;
- // IF THE MODEL FILE HAS BEEN DEFINED, THEN
+ // IF THE MODEL FILE HAS BEEN DEFINED, THEN
// LOAD THE NEURAL NETWORK MODEL
if (myParam.model_file != ""){
nn.read(myParam.model_file);
@@ -529,7 +527,7 @@ int main(int argc, char** argv)
SoftmaxNCELoss<multinomial<data_size_t> > softmax_loss(unigram);
// normalization parameters
vector_map c_h, c_h_running_gradient;
-
+
///////////////////////TRAINING THE NEURAL NETWORK////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////
@@ -540,8 +538,8 @@ int main(int argc, char** argv)
if (validation_data_size > 0)
{
num_validation_batches = (validation_data_size-1)/myParam.validation_minibatch_size+1;
- cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl;
- }
+ cerr<<"Number of validation minibatches: "<<num_validation_batches<<endl;
+ }
double current_momentum = myParam.initial_momentum;
double momentum_delta = (myParam.final_momentum - myParam.initial_momentum)/(myParam.num_epochs-1);
@@ -568,36 +566,36 @@ int main(int argc, char** argv)
}
for (int epoch=0; epoch<myParam.num_epochs; epoch++)
- {
+ {
cerr << "Epoch " << epoch+1 << endl;
cerr << "Current learning rate: " << current_learning_rate << endl;
- if (myParam.use_momentum)
- cerr << "Current momentum: " << current_momentum << endl;
- else
+ if (myParam.use_momentum)
+ cerr << "Current momentum: " << current_momentum << endl;
+ else
current_momentum = -1;
- cerr << "Training minibatches: ";
+ cerr << "Training minibatches: ";
- double log_likelihood = 0.0;
+ double log_likelihood = 0.0;
- int num_samples = 0;
- if (loss_function == LogLoss)
- num_samples = output_vocab_size;
- else if (loss_function == NCELoss)
- num_samples = 1+num_noise_samples;
+ int num_samples = 0;
+ if (loss_function == LogLoss)
+ num_samples = output_vocab_size;
+ else if (loss_function == NCELoss)
+ num_samples = 1+num_noise_samples;
- Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size);
- Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size);
- Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size);
- Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size);
+ Matrix<double,Dynamic,Dynamic> minibatch_weights(num_samples, minibatch_size);
+ Matrix<int,Dynamic,Dynamic> minibatch_samples(num_samples, minibatch_size);
+ Matrix<double,Dynamic,Dynamic> scores(num_samples, minibatch_size);
+ Matrix<double,Dynamic,Dynamic> probs(num_samples, minibatch_size);
for(data_size_t batch=0;batch<num_batches;batch++)
{
if (batch > 0 && batch % 10000 == 0)
{
- cerr << batch <<"...";
- }
+ cerr << batch <<"...";
+ }
if (batch > 0 && batch % 500000 == 0)
{
@@ -605,31 +603,31 @@ int main(int argc, char** argv)
compute_validation_perplexity(ngram_size, output_vocab_size, validation_minibatch_size, validation_data_size, num_validation_batches, myParam, prop_validation, validation_data, current_learning_rate, current_validation_ll);
cerr << "Current learning rate: " << current_learning_rate << endl;
}
-
+
data_size_t minibatch_start_index = minibatch_size * batch;
int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index);
#ifdef MAP
- Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
- #else
+ Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
+ #else
//ALTERNATIVE OPTION IF YOU'RE NOT USING eigen map interface on the mmapped file
- Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size);
- //cerr<<"Minibatch start index "<<minibatch_start_index<<endl;
- //cerr<<"Minibatch size "<<current_minibatch_size<<endl;
+ Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size);
+ //cerr<<"Minibatch start index "<<minibatch_start_index<<endl;
+ //cerr<<"Minibatch size "<<current_minibatch_size<<endl;
if (use_mmap_file == true) {
minibatch.setZero(ngram_size,current_minibatch_size);
//now reading the ngrams from the mmaped file
for (int k=0; k<ngram_size; k++){
for (data_size_t index = 0 ; index<current_minibatch_size; index++) {
- data_size_t current_index = index + minibatch_start_index;
- //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl;
+ data_size_t current_index = index + minibatch_start_index;
+ //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl;
minibatch(k,index) = training_data_flat_mmap->at(current_index*ngram_size+k);
}
}
} else {
minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
}
- #endif
+ #endif
double adjusted_learning_rate = current_learning_rate/minibatch_size;
//cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl;
@@ -648,20 +646,20 @@ int main(int argc, char** argv)
prop.fProp(minibatch.topRows(ngram_size-1));
- if (loss_function == NCELoss)
- {
- ///// Noise-contrastive estimation
+ if (loss_function == NCELoss)
+ {
+ ///// Noise-contrastive estimation
- // Generate noise samples. Gather positive and negative samples into matrix.
+ // Generate noise samples. Gather positive and negative samples into matrix.
- start_timer(3);
+ start_timer(3);
minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1);
-
+
for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++)
for (int train_id = 0; train_id < current_minibatch_size; train_id++)
minibatch_samples(sample_id, train_id) = unigram.sample(rng);
-
+
stop_timer(3);
// Final forward propagation step (sparse)
@@ -686,7 +684,7 @@ int main(int argc, char** argv)
double minibatch_log_likelihood;
start_timer(5);
- softmax_loss.fProp(scores.leftCols(current_minibatch_size),
+ softmax_loss.fProp(scores.leftCols(current_minibatch_size),
minibatch_samples,
probs, minibatch_log_likelihood);
stop_timer(5);
@@ -697,9 +695,9 @@ int main(int argc, char** argv)
start_timer(6);
softmax_loss.bProp(probs, minibatch_weights);
stop_timer(6);
-
+
// Update the normalization parameters
-
+
if (myParam.normalization)
{
for (int train_id = 0;train_id < current_minibatch_size;train_id++)
@@ -711,19 +709,19 @@ int main(int argc, char** argv)
// Be careful of short minibatch
prop.bProp(minibatch.topRows(ngram_size-1),
- minibatch_samples.leftCols(current_minibatch_size),
+ minibatch_samples.leftCols(current_minibatch_size),
minibatch_weights.leftCols(current_minibatch_size),
- adjusted_learning_rate,
+ adjusted_learning_rate,
current_momentum,
myParam.L2_reg,
myParam.parameter_update,
myParam.conditioning_constant,
myParam.decay);
- }
- else if (loss_function == LogLoss)
- {
- ///// Standard log-likelihood
- start_timer(4);
+ }
+ else if (loss_function == LogLoss)
+ {
+ ///// Standard log-likelihood
+ start_timer(4);
if (prop.skip_hidden)
prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
else
@@ -732,21 +730,21 @@ int main(int argc, char** argv)
double minibatch_log_likelihood;
start_timer(5);
- SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
- minibatch.row(ngram_size-1),
- probs,
+ SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size),
+ minibatch.row(ngram_size-1),
+ probs,
minibatch_log_likelihood);
stop_timer(5);
log_likelihood += minibatch_log_likelihood;
///// Backward propagation
-
+
start_timer(6);
- SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size),
- probs.leftCols(current_minibatch_size),
+ SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size),
+ probs.leftCols(current_minibatch_size),
minibatch_weights);
stop_timer(6);
-
+
prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size),
minibatch_weights,
adjusted_learning_rate,
@@ -757,33 +755,33 @@ int main(int argc, char** argv)
myParam.decay);
}
}
- cerr << "done." << endl;
+ cerr << "done." << endl;
- if (loss_function == LogLoss)
- {
- cerr << "Training log-likelihood: " << log_likelihood << endl;
+ if (loss_function == LogLoss)
+ {
+ cerr << "Training log-likelihood: " << log_likelihood << endl;
cerr << " perplexity: "<< exp(-log_likelihood/training_data_size) << endl;
- }
- else if (loss_function == NCELoss)
- cerr << "Training NCE log-likelihood: " << log_likelihood << endl;
+ }
+ else if (loss_function == NCELoss)
+ cerr << "Training NCE log-likelihood: " << log_likelihood << endl;
current_momentum += momentum_delta;
- #ifdef USE_CHRONO
- cerr << "Propagation times:";
- for (int i=0; i<timer.size(); i++)
- cerr << " " << timer.get(i);
- cerr << endl;
- #endif
-
- if (myParam.model_prefix != "")
- {
- cerr << "Writing model" << endl;
- if (myParam.input_words_file != "")
- nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words);
- else
- nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1));
- }
+ #ifdef USE_CHRONO
+ cerr << "Propagation times:";
+ for (int i=0; i<timer.size(); i++)
+ cerr << " " << timer.get(i);
+ cerr << endl;
+ #endif
+
+ if (myParam.model_prefix != "")
+ {
+ cerr << "Writing model" << endl;
+ if (myParam.input_words_file != "")
+ nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1), input_words, output_words);
+ else
+ nn.write(myParam.model_prefix + "." + lexical_cast<string>(epoch+1));
+ }
if (epoch % 1 == 0 && validation_data_size > 0)
{
@@ -793,4 +791,3 @@ int main(int argc, char** argv)
}
return 0;
}
-
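The noise-contrastive estimation hunks above fill row 0 of minibatch_samples with the observed target words (the last row of the minibatch) and rows 1..num_noise_samples with draws from a unigram noise distribution, so that the subsequent sparse forward pass can score positives and negatives together. A minimal standalone sketch of that sampling step, using boost::random::discrete_distribution as a stand-in for the project's own unigram sampler (the sizes and the toy minibatch below are illustrative assumptions, not the file's actual declarations):

    #include <Eigen/Dense>
    #include <boost/random/mersenne_twister.hpp>
    #include <boost/random/discrete_distribution.hpp>
    #include <vector>

    using Eigen::Matrix;
    using Eigen::Dynamic;

    int main()
    {
        // Illustrative sizes; in trainNeuralNetwork.cpp these come from myParam.
        const int num_noise_samples = 3, minibatch_size = 4, vocab_size = 10;

        boost::random::mt19937 rng;
        // Stand-in for the unigram noise distribution (uniform weights here).
        std::vector<double> weights(vocab_size, 1.0);
        boost::random::discrete_distribution<> unigram(weights.begin(), weights.end());

        // Toy n-gram minibatch: the last row holds the target word of each example.
        Matrix<int,Dynamic,Dynamic> minibatch(2, minibatch_size);
        minibatch.setZero();

        // Row 0: positive examples; rows 1..num_noise_samples: noise samples.
        Matrix<int,Dynamic,Dynamic> minibatch_samples(1 + num_noise_samples, minibatch_size);
        minibatch_samples.row(0) = minibatch.bottomRows(1);
        for (int sample_id = 1; sample_id < num_noise_samples + 1; sample_id++)
            for (int train_id = 0; train_id < minibatch_size; train_id++)
                minibatch_samples(sample_id, train_id) = unigram(rng);

        return 0;
    }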
diff --git a/src/types.hpp b/src/types.hpp
deleted file mode 100644
index 08b010f..0000000
--- a/src/types.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef TYPES_HPP
-#define TYPES_HPP
-
-#include <cmath>
-#include <string>
-#include <vector>
-#include <boost/cstdint.hpp>
-#include <limits>
-
-namespace biglm{
-
-typedef double weight_type;
-const weight_type IMPOSSIBLE = -HUGE_VAL;
-
-typedef unsigned long block_type;
-const size_t bits_per_block = (std::numeric_limits<block_type>::digits);
- //typedef std::size_t size_type;
-typedef boost::uint64_t size_type;
-typedef unsigned char byte_type;
-
-template<typename T>
-struct bytes {
- static const byte_type *data(const T& key) { return reinterpret_cast<const byte_type *>(&key); }
- static size_type size(const T& key) { return sizeof(T); }
-};
-
-template<>
-struct bytes<std::string> {
- static const byte_type *data(const std::string& key) { return reinterpret_cast<const byte_type *>(key.data()); }
- static size_type size(const std::string& key) { return key.size(); }
-};
-
-template<typename U>
-struct bytes<std::vector<U> > {
- static const byte_type *data(const std::vector<U>& key) { return reinterpret_cast<const byte_type *>(&key[0]); }
- static size_type size(const std::vector<U>& key) { return key.size() * sizeof(U); }
-};
-
-} //namespace nplm
-
-#endif
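The deleted types.hpp supplied byte-view traits (bytes<T>) that expose a key, a std::string, or a std::vector as a raw (pointer, length) range, typically so the key can be fed to a byte-oriented hash; the file is removed outright in this merge. For context, a small sketch of how such a trait is consumed; the hash_bytes helper below is hypothetical, not something from the original code:

    #include <cstddef>
    #include <string>
    #include <boost/functional/hash.hpp>

    typedef unsigned char byte_type;

    // Same shape as the removed trait: expose a key as a (pointer, length) byte range.
    template<typename T>
    struct bytes {
        static const byte_type *data(const T &key) { return reinterpret_cast<const byte_type *>(&key); }
        static std::size_t size(const T &)         { return sizeof(T); }
    };

    template<>
    struct bytes<std::string> {
        static const byte_type *data(const std::string &key) { return reinterpret_cast<const byte_type *>(key.data()); }
        static std::size_t size(const std::string &key)      { return key.size(); }
    };

    // Hypothetical consumer: hash the raw bytes of any key with boost::hash_range.
    template<typename T>
    std::size_t hash_bytes(const T &key)
    {
        const byte_type *p = bytes<T>::data(key);
        return boost::hash_range(p, p + bytes<T>::size(key));
    }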
diff --git a/src/util.h b/src/util.h
index a8453aa..6cbde9d 100644
--- a/src/util.h
+++ b/src/util.h
@@ -15,7 +15,6 @@
#include <boost/chrono.hpp>
#endif
-//#include <../3rdparty/Eigen/Dense>
#include <Eigen/Dense>
#include "maybe_omp.h"
@@ -23,15 +22,15 @@
// Make matrices hashable
namespace Eigen {
- template <typename Derived>
- size_t hash_value(const DenseBase<Derived> &m)
- {
- size_t h=0;
- for (int i=0; i<m.rows(); i++)
- for (int j=0; j<m.cols(); j++)
- boost::hash_combine(h, m(i,j));
- return h;
- }
+template <typename Derived>
+size_t hash_value(const DenseBase<Derived> &m)
+{
+ size_t h=0;
+ for (int i=0; i<m.rows(); i++)
+ for (int j=0; j<m.cols(); j++)
+ boost::hash_combine(h, m(i,j));
+ return h;
+}
}
namespace nplm
@@ -73,9 +72,9 @@ void readSentFile(const std::string &file, T &sentences)
}
inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngram){
- int ngram_size = ngram.size();
- for (int i=0;i<ngram_size;i++)
- int_ngram.push_back(boost::lexical_cast<int>(ngram[i]));
+ int ngram_size = ngram.size();
+ for (int i=0;i<ngram_size;i++)
+ int_ngram.push_back(boost::lexical_cast<int>(ngram[i]));
}
// Functions that take non-const matrices as arguments
@@ -85,194 +84,194 @@ inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngra
template <typename Derived>
void initMatrix(boost::random::mt19937 &engine,
- const Eigen::MatrixBase<Derived> &p_const,
- bool init_normal, double range)
+ const Eigen::MatrixBase<Derived> &p_const,
+ bool init_normal, double range)
{
- UNCONST(Derived, p_const, p);
- if (init_normal == 0)
- // initialize with uniform distribution in [-range, range]
+ UNCONST(Derived, p_const, p);
+ if (init_normal == 0)
+ // initialize with uniform distribution in [-range, range]
+ {
+ boost::random::uniform_real_distribution<> unif_real(-range, range);
+ for (int i = 0; i < p.rows(); i++)
{
- boost::random::uniform_real_distribution<> unif_real(-range, range);
- for (int i = 0; i < p.rows(); i++)
- {
- for (int j = 0; j< p.cols(); j++)
- {
- p(i,j) = unif_real(engine);
- }
- }
-
+ for (int j = 0; j< p.cols(); j++)
+ {
+ p(i,j) = unif_real(engine);
+ }
}
- else
- // initialize with gaussian distribution with mean 0 and stdev range
+
+ }
+ else
+ // initialize with gaussian distribution with mean 0 and stdev range
+ {
+ boost::random::normal_distribution<double> unif_normal(0., range);
+ for (int i = 0; i < p.rows(); i++)
{
- boost::random::normal_distribution<double> unif_normal(0., range);
- for (int i = 0; i < p.rows(); i++)
- {
- for (int j = 0; j < p.cols(); j++)
- {
- p(i,j) = unif_normal(engine);
- }
- }
+ for (int j = 0; j < p.cols(); j++)
+ {
+ p(i,j) = unif_normal(engine);
+ }
}
+ }
}
template <typename Derived>
void initBias(boost::random::mt19937 &engine,
- const Eigen::MatrixBase<Derived> &p_const,
- bool init_normal, double range)
+ const Eigen::MatrixBase<Derived> &p_const,
+ bool init_normal, double range)
{
- UNCONST(Derived, p_const, p);
- if (init_normal == 0)
- // initialize with uniform distribution in [-range, range]
+ UNCONST(Derived, p_const, p);
+ if (init_normal == 0)
+ // initialize with uniform distribution in [-range, range]
+ {
+ boost::random::uniform_real_distribution<> unif_real(-range, range);
+ for (int i = 0; i < p.size(); i++)
{
- boost::random::uniform_real_distribution<> unif_real(-range, range);
- for (int i = 0; i < p.size(); i++)
- {
- p(i) = unif_real(engine);
- }
-
+ p(i) = unif_real(engine);
}
- else
- // initialize with gaussian distribution with mean 0 and stdev range
+
+ }
+ else
+ // initialize with gaussian distribution with mean 0 and stdev range
+ {
+ boost::random::normal_distribution<double> unif_normal(0., range);
+ for (int i = 0; i < p.size(); i++)
{
- boost::random::normal_distribution<double> unif_normal(0., range);
- for (int i = 0; i < p.size(); i++)
- {
- p(i) = unif_normal(engine);
- }
+ p(i) = unif_normal(engine);
}
+ }
}
template <typename Derived>
void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase<Derived> &param_const)
{
- UNCONST(Derived, param_const, param);
+ UNCONST(Derived, param_const, param);
+
+ int i = 0;
+ std::string line;
+ std::vector<std::string> fields;
+
+ while (std::getline(TRAININ, line) && line != "")
+ {
+ splitBySpace(line, fields);
+ if (fields.size() != param.cols())
+ {
+ std::ostringstream err;
+ err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")";
+ throw std::runtime_error(err.str());
+ }
- int i = 0;
- std::string line;
- std::vector<std::string> fields;
-
- while (std::getline(TRAININ, line) && line != "")
+ if (i >= param.rows())
{
- splitBySpace(line, fields);
- if (fields.size() != param.cols())
- {
- std::ostringstream err;
- err << "error: wrong number of columns (expected " << param.cols() << ", found " << fields.size() << ")";
- throw std::runtime_error(err.str());
- }
-
- if (i >= param.rows())
- {
- std::ostringstream err;
- err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")";
- throw std::runtime_error(err.str());
- }
-
- for (int j=0; j<fields.size(); j++)
- {
- param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]);
- }
- i++;
+ std::ostringstream err;
+ err << "error: wrong number of rows (expected " << param.rows() << ", found " << i << ")";
+ throw std::runtime_error(err.str());
}
-
- if (i != param.rows())
+
+ for (int j=0; j<fields.size(); j++)
{
- std::ostringstream err;
- err << "error: wrong number of rows (expected " << param.rows() << ", found more)";
- throw std::runtime_error(err.str());
+ param(i,j) = boost::lexical_cast<typename Derived::Scalar>(fields[j]);
}
+ i++;
+ }
+
+ if (i != param.rows())
+ {
+ std::ostringstream err;
+ err << "error: wrong number of rows (expected " << param.rows() << ", found more)";
+ throw std::runtime_error(err.str());
+ }
}
template <typename Derived>
void readMatrix(const std::string &param_file, const Eigen::MatrixBase<Derived> &param_const)
{
- UNCONST(Derived, param_const, param);
- std::cerr << "Reading data from file: " << param_file << std::endl;
-
- std::ifstream TRAININ(param_file.c_str());
- if (!TRAININ)
- {
- std::cerr << "Error: can't read training data from file " << param_file << std::endl;
- exit(-1);
- }
- readMatrix(TRAININ, param);
- TRAININ.close();
+ UNCONST(Derived, param_const, param);
+ std::cerr << "Reading data from file: " << param_file << std::endl;
+
+ std::ifstream TRAININ(param_file.c_str());
+ if (!TRAININ)
+ {
+ std::cerr << "Error: can't read training data from file " << param_file << std::endl;
+ exit(-1);
+ }
+ readMatrix(TRAININ, param);
+ TRAININ.close();
}
template <typename Derived>
void writeMatrix(const Eigen::MatrixBase<Derived> &param, const std::string &filename)
{
- std::cerr << "Writing parameters to " << filename << std::endl;
+ std::cerr << "Writing parameters to " << filename << std::endl;
- std::ofstream OUT;
- OUT.precision(16);
- OUT.open(filename.c_str());
- if (! OUT)
- {
- std::cerr << "Error: can't write to file " << filename<< std::endl;
- exit(-1);
- }
- writeMatrix(param, OUT);
- OUT.close();
+ std::ofstream OUT;
+ OUT.precision(16);
+ OUT.open(filename.c_str());
+ if (! OUT)
+ {
+ std::cerr << "Error: can't write to file " << filename<< std::endl;
+ exit(-1);
+ }
+ writeMatrix(param, OUT);
+ OUT.close();
}
template <typename Derived>
void writeMatrix(const Eigen::MatrixBase<Derived> &param, std::ofstream &OUT)
{
- for (int row = 0;row < param.rows();row++)
+ for (int row = 0;row < param.rows();row++)
+ {
+ int col;
+ for (col = 0;col < param.cols()-1;col++)
{
- int col;
- for (col = 0;col < param.cols()-1;col++)
- {
- OUT<<param(row,col)<<"\t";
- }
- //dont want an extra tab at the end
- OUT<<param(row,col)<<std::endl;
+ OUT<<param(row,col)<<"\t";
}
+ //dont want an extra tab at the end
+ OUT<<param(row,col)<<std::endl;
+ }
}
template <typename Derived>
double logsum(const Eigen::MatrixBase<Derived> &v)
{
- int mi;
- double m = v.maxCoeff(&mi);
- double logz = 0.0;
- for (int i=0; i<v.rows(); i++)
- if (i != mi)
- logz += std::exp(v(i) - m);
- logz = log1p(logz) + m;
- return logz;
+ int mi;
+ double m = v.maxCoeff(&mi);
+ double logz = 0.0;
+ for (int i=0; i<v.rows(); i++)
+ if (i != mi)
+ logz += std::exp(v(i) - m);
+ logz = log1p(logz) + m;
+ return logz;
}
double logadd(double x, double y);
#ifdef USE_CHRONO
-class Timer
+class Timer
{
- typedef boost::chrono::high_resolution_clock clock_type;
- typedef clock_type::time_point time_type;
- typedef clock_type::duration duration_type;
- std::vector<time_type> m_start;
- std::vector<duration_type> m_total;
-public:
- Timer() { }
- Timer(int n) { resize(n); }
- void resize(int n) { m_start.resize(n); m_total.resize(n); }
- int size() const { return m_start.size(); }
- void start(int i);
- void stop(int i);
- void reset(int i);
- double get(int i) const;
+ typedef boost::chrono::high_resolution_clock clock_type;
+ typedef clock_type::time_point time_type;
+ typedef clock_type::duration duration_type;
+ std::vector<time_type> m_start;
+ std::vector<duration_type> m_total;
+ public:
+ Timer() { }
+ Timer(int n) { resize(n); }
+ void resize(int n) { m_start.resize(n); m_total.resize(n); }
+ int size() const { return m_start.size(); }
+ void start(int i);
+ void stop(int i);
+ void reset(int i);
+ double get(int i) const;
};
extern Timer timer;
#define start_timer(x) timer.start(x)
#define stop_timer(x) timer.stop(x)
#else
-#define start_timer(x) 0
-#define stop_timer(x) 0
+#define start_timer(x) (void)0
+#define stop_timer(x) (void)0
#endif
int setup_threads(int n_threads);
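Two notes on the util.h hunks: the start_timer/stop_timer fallbacks now expand to (void)0 instead of a bare 0, which keeps them valid as standalone statements without provoking unused-value warnings; and the re-indented logsum is the standard log-sum-exp trick, computing log(sum_i exp(v(i))) while subtracting the maximum element so that no individual exp() can overflow. A minimal usage sketch of the latter, assuming only Eigen (the vector of scores is illustrative):

    #include <Eigen/Dense>
    #include <cmath>
    #include <iostream>

    // Same computation as util.h's logsum: log(sum_i exp(v(i))), stabilised by the max.
    double logsum(const Eigen::VectorXd &v)
    {
        int mi;
        double m = v.maxCoeff(&mi);
        double z = 0.0;
        for (int i = 0; i < v.rows(); i++)
            if (i != mi)
                z += std::exp(v(i) - m);
        return std::log1p(z) + m;
    }

    int main()
    {
        Eigen::VectorXd scores(3);
        scores << 1000.0, 999.0, 998.0;            // naive exp() would overflow here
        std::cout << logsum(scores) << std::endl;  // prints roughly 1000.41
        return 0;
    }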
diff --git a/src/vocabulary.h b/src/vocabulary.h
index a987522..c8cd518 100644
--- a/src/vocabulary.h
+++ b/src/vocabulary.h
@@ -5,6 +5,9 @@
#include <string>
#include <queue>
#include <boost/unordered_map.hpp>
+#include "find_string.hpp"
+
+#define NPLM_HAVE_FIND_STRING_PIECE 1
namespace nplm
{
@@ -16,80 +19,83 @@ struct compare_second
};
class vocabulary {
- std::vector<std::string> m_words;
- boost::unordered_map<std::string, int> m_index;
- int unk;
-
-public:
- vocabulary()
- {
- unk = insert_word("<unk>");
- }
-
- vocabulary(const std::vector<std::string> &words)
+ std::vector<std::string> m_words;
+ typedef boost::unordered_map<std::string, int> WordId;
+ WordId m_index;
+ int unk;
+
+ public:
+ vocabulary()
+ {
+ unk = insert_word("<unk>");
+ }
+
+ vocabulary(const std::vector<std::string> &words)
:
m_words(words)
+ {
+ for (int i=0; i<words.size(); i++)
+ m_index[words[i]] = i;
+ unk = m_index["<unk>"];
+ }
+
+ int lookup_word(const std::string &word) const
+ {
+ return lookup_word(word, unk);
+ }
+
+ // lookup word using custom unknown-word id
+ int lookup_word(const std::string &word, int unkid) const
+ {
+ WordId::const_iterator pos = m_index.find(word);
+ return pos == m_index.end() ? unkid : pos->second;
+ }
+
+ int lookup_word(std::pair<char const*, char const*> slice) const {
+ return lookup_word(slice, unk);
+ }
+
+ int lookup_word(std::pair<char const*, char const*> slice, int unkid) const
+ {
+ WordId::const_iterator pos = find_string(m_index, slice);
+ return pos == m_index.end() ? unkid : pos->second;
+ }
+
+ int insert_word(const std::string &word)
+ {
+ int i = size();
+ bool inserted = m_index.insert(make_pair(word, i)).second;
+ if (inserted)
{
- for (int i=0; i<words.size(); i++)
- m_index[words[i]] = i;
- unk = m_index["<unk>"];
- }
-
- int lookup_word(const std::string &word) const
- {
- boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word);
- if (pos != m_index.end())
- return pos->second;
- else
- return unk;
+ m_words.push_back(word);
}
+ return i;
+ }
- // lookup word using custom unknown-word id
- int lookup_word(const std::string &word, int unk) const
- {
- boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word);
- if (pos != m_index.end())
- return pos->second;
- else
- return unk;
- }
+ int size() const { return m_words.size(); }
- int insert_word(const std::string &word)
- {
- int i = size();
- bool inserted = m_index.insert(make_pair(word, i)).second;
- if (inserted)
- {
- m_words.push_back(word);
- }
- return i;
- }
+ // Inserts the most-frequent words from counts until vocab_size words are reached.
+ // counts is a collection of pair<string,int>
+ template <typename Map>
+ int insert_most_frequent(const Map &counts, int vocab_size)
+ {
+ typedef std::pair<std::string,int> stringint;
- int size() const { return m_words.size(); }
+ std::priority_queue<stringint,std::vector<stringint>,compare_second<stringint> >
+ q(compare_second<stringint>(), std::vector<stringint>(counts.begin(), counts.end()));
- // Inserts the most-frequent words from counts until vocab_size words are reached.
- // counts is a collection of pair<string,int>
- template <typename Map>
- int insert_most_frequent(const Map &counts, int vocab_size)
+ int inserted = 0;
+ while (size() < vocab_size && !q.empty())
{
- typedef std::pair<std::string,int> stringint;
-
- std::priority_queue<stringint,std::vector<stringint>,compare_second<stringint> >
- q(compare_second<stringint>(), std::vector<stringint>(counts.begin(), counts.end()));
-
- int inserted = 0;
- while (size() < vocab_size && !q.empty())
- {
- insert_word(q.top().first);
- q.pop();
- inserted++;
- }
- return inserted;
+ insert_word(q.top().first);
+ q.pop();
+ inserted++;
}
+ return inserted;
+ }
- const std::vector<std::string> &words() const { return m_words; }
+ const std::vector<std::string> &words() const { return m_words; }
- const boost::unordered_map<std::string, int>& get_idmap() const { return m_index; }
};
} // namespace nplm
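The reworked vocabulary class adds a second lookup_word overload that takes a std::pair<char const*, char const*> character range, with find_string (from the newly added find_string.hpp) playing the role of unordered_map::find over that slice; the NPLM_HAVE_FIND_STRING_PIECE define presumably lets client code detect this capability at compile time. The overload lets callers look up tokens directly inside a larger buffer without building a temporary std::string. A minimal usage sketch against the class as shown, assuming the nplm headers and Boost are on the include path (the whitespace-splitting loop and the sample line are illustrative):

    #include <cstring>
    #include <iostream>
    #include <utility>
    #include "vocabulary.h"

    int main()
    {
        nplm::vocabulary vocab;           // constructor pre-inserts "<unk>"
        vocab.insert_word("the");
        vocab.insert_word("cat");

        const char *line = "the cat sat";
        const char *p = line, *end = line + std::strlen(line);
        while (p < end)
        {
            const char *q = p;
            while (q < end && *q != ' ') ++q;                  // find end of token
            int id = vocab.lookup_word(std::make_pair(p, q));  // slice lookup, no std::string
            std::cout << std::string(p, q) << " -> " << id << std::endl;
            p = (q < end) ? q + 1 : end;
        }
        return 0;
    }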