github.com/moses-smt/vowpal_wabbit.git

-rw-r--r--  Makefile                            |   2
-rw-r--r--  Makefile.am                         |   2
-rwxr-xr-x  test/RunTests                       |   7
-rw-r--r--  test/train-sets/3parity             |   8
-rw-r--r--  test/train-sets/ref/3parity.stderr  |  32
-rw-r--r--  test/train-sets/ref/3parity.stdout  |   0
-rw-r--r--  vowpalwabbit/Makefile.am            |   2
-rw-r--r--  vowpalwabbit/nn.cc                  | 378
-rw-r--r--  vowpalwabbit/nn.h                   |  22
-rw-r--r--  vowpalwabbit/parse_args.cc          |  14
-rw-r--r--  vowpalwabbit/parser.h               |   1
11 files changed, 462 insertions, 6 deletions
diff --git a/Makefile b/Makefile
index 5f758d20..36bde0b6 100644
--- a/Makefile
+++ b/Makefile
@@ -33,7 +33,7 @@ FLAGS = $(ARCH) $(WARN_FLAGS) $(OPTIM_FLAGS) -D_FILE_OFFSET_BITS=64 -I $(BOOST_I
#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -pg -g
# for valgrind
-FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O0
+#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O0
# for valgrind profiling: run 'valgrind --tool=callgrind PROGRAM' then 'callgrind_annotate --tree=both --inclusive=yes'
#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O3 -fomit-frame-pointer -ffast-math -fno-strict-aliasing
diff --git a/Makefile.am b/Makefile.am
index 648de3b7..d09026c8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -17,7 +17,7 @@ noinst_HEADERS = vowpalwabbit/accumulate.h vowpalwabbit/oaa.h \
vowpalwabbit/io.h vowpalwabbit/v_array.h \
vowpalwabbit/lda_core.h vowpalwabbit/v_hashmap.h \
vowpalwabbit/loss_functions.h vowpalwabbit/network.h \
- vowpalwabbit/wap.h vowpalwabbit/noop.h
+ vowpalwabbit/wap.h vowpalwabbit/noop.h vowpalwabbit/nn.h
ACLOCAL_AMFLAGS = -I acinclude.d
diff --git a/test/RunTests b/test/RunTests
index cb37a92a..cefeb5e2 100755
--- a/test/RunTests
+++ b/test/RunTests
@@ -343,7 +343,7 @@ sub check_for_time_regression() {
" try removing it\n"
}
$overall_time0 = $cuser0 + $csystem0;
- $pct_change = 100 * ($overall_time1 - $overall_time0) / $overall_time0;
+ $pct_change = 100 * ($overall_time1 - $overall_time0) / (1e-4+$overall_time0);
if ($overall_time0 == 0) {
die "$0: Bad times in saved times file: $LastTimeFile," .
@@ -580,3 +580,8 @@ __DATA__
{VW} -k -c -d train-sets/seq_small --passes 12 --invariant --searn_passes_per_policy 4 --searn 4 --searn_task sequence
train-sets/ref/searn_small.stdout
train-sets/ref/searn_small.stderr
+
+# Test 21: neural network 3-parity with 2 hidden units
+{VW} -k -c -d train-sets/3parity --hash all --passes 2000 -b 16 --nn 2 -l 10 --invariant
+ train-sets/ref/3parity.stdout
+ train-sets/ref/3parity.stderr
diff --git a/test/train-sets/3parity b/test/train-sets/3parity
new file mode 100644
index 00000000..74fe86b4
--- /dev/null
+++ b/test/train-sets/3parity
@@ -0,0 +1,8 @@
+-1 |f 1:1 2:-1 3:-1
+-1 |f 1:-1 2:1 3:-1
+1 |f 1:1 2:1 3:-1
+-1 |f 1:-1 2:-1 3:1
+1 |f 1:1 2:-1 3:1
+1 |f 1:-1 2:1 3:1
+-1 |f 1:1 2:1 3:1
+1 |f 1:-1 2:-1 3:-1
diff --git a/test/train-sets/ref/3parity.stderr b/test/train-sets/ref/3parity.stderr
new file mode 100644
index 00000000..295b006b
--- /dev/null
+++ b/test/train-sets/ref/3parity.stderr
@@ -0,0 +1,32 @@
+Num weight bits = 16
+learning rate = 10
+initial_t = 1
+power_t = 0.5
+decay_learning_rate = 1
+randomly initializing neural network output weights and hidden bias
+creating cache_file = train-sets/3parity.cache
+Reading from train-sets/3parity
+num sources = 1
+average since example example current current current
+loss last counter weight label predict features
+1.550870 1.550870 3 3.0 1.0000 -1.0000 4
+1.919601 2.288332 6 6.0 1.0000 0.7762 4
+2.011137 2.120980 11 11.0 1.0000 -1.0000 4
+2.154878 2.298620 22 22.0 1.0000 0.3713 4
+2.354256 2.553635 44 44.0 -1.0000 1.0000 4
+2.286332 2.216827 87 87.0 -1.0000 1.0000 4
+2.222494 2.158657 174 174.0 1.0000 0.8935 4
+1.716414 1.210335 348 348.0 -1.0000 -0.9598 4
+1.368982 1.021549 696 696.0 1.0000 0.9744 4
+1.151838 0.934694 1392 1392.0 1.0000 1.0000 4
+0.976327 0.800816 2784 2784.0 1.0000 1.0000 4
+0.756642 0.536958 5568 5568.0 1.0000 1.0000 4
+0.378355 0.000000 11135 11135.0 -1.0000 -1.0000 4
+
+finished run
+number of examples = 16000
+weighted example sum = 1.6e+04
+weighted label sum = 0
+average loss = 0.2633
+best constant = -6.25e-05
+total feature number = 64000
diff --git a/test/train-sets/ref/3parity.stdout b/test/train-sets/ref/3parity.stdout
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/test/train-sets/ref/3parity.stdout
diff --git a/vowpalwabbit/Makefile.am b/vowpalwabbit/Makefile.am
index dc307f67..22aa0eb4 100644
--- a/vowpalwabbit/Makefile.am
+++ b/vowpalwabbit/Makefile.am
@@ -5,7 +5,7 @@ include_HEADERS = allreduce.h
bin_PROGRAMS = vw active_interactor
-libvw_la_SOURCES = hash.cc global_data.cc io.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc simple_label.cc oaa.cc ect.cc csoaa.cc cb.cc wap.cc beam.cc searn.cc searn_sequencetask.cc sequence.cc parse_example.cc sparse_dense.cc network.cc parse_args.cc accumulate.cc gd.cc lda_core.cc gd_mf.cc bfgs.cc noop.cc example.cc parser.cc vw.cc loss_functions.cc sender.cc
+libvw_la_SOURCES = hash.cc global_data.cc io.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc simple_label.cc oaa.cc ect.cc csoaa.cc cb.cc wap.cc beam.cc searn.cc searn_sequencetask.cc sequence.cc parse_example.cc sparse_dense.cc network.cc parse_args.cc accumulate.cc gd.cc lda_core.cc gd_mf.cc bfgs.cc noop.cc example.cc parser.cc vw.cc loss_functions.cc sender.cc nn.cc
vw_SOURCES = vw.cc
vw_CXXFLAGS = $(AM_CXXFLAGS)
diff --git a/vowpalwabbit/nn.cc b/vowpalwabbit/nn.cc
new file mode 100644
index 00000000..6c4de7ee
--- /dev/null
+++ b/vowpalwabbit/nn.cc
@@ -0,0 +1,378 @@
+/*
+Copyright (c) by respective owners including Yahoo!, Microsoft, and
+individual contributors. All rights reserved. Released under a BSD (revised)
+license as described in the file LICENSE.
+ */
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <sstream>
+
+#include "constant.h"
+#include "oaa.h"
+#include "simple_label.h"
+#include "cache.h"
+#include "v_hashmap.h"
+
+using namespace std;
+
+namespace NN {
+ //nonreentrant
+ size_t k=0;
+ size_t increment=0;
+ size_t total_increment=0;
+ loss_function* squared_loss;
+ example output_layer;
+ const float hidden_min_activation = -3;
+ const float hidden_max_activation = 3;
+ const int nn_constant = 533357803;
+ bool dropout = false;
+ unsigned short xsubi[3];
+ unsigned short save_xsubi[3];
+ size_t nn_current_pass = 0;
+
+ static void
+ free_squared_loss (void)
+ {
+ delete squared_loss;
+ }
+
+ static void
+ free_output_layer (void)
+ {
+ free (output_layer.indices.begin);
+ free (output_layer.atomics[nn_output_namespace].begin);
+ }
+
+#define cast_uint32_t static_cast<uint32_t>
+
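+ // Approximate 2^p with the "fastapprox"-style bit trick: write the
+ // exponent field of an IEEE-754 float directly from p, with a small
+ // rational correction for the fraction. Plenty accurate for an activation.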
+ static inline float
+ fastpow2 (float p)
+ {
+ float offset = (p < 0) ? 1.0f : 0.0f;
+ float clipp = (p < -126) ? -126.0f : p;
+ int w = clipp;
+ float z = clipp - w + offset;
+ union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) };
+
+ return v.f;
+ }
+
+ static inline float
+ fastexp (float p)
+ {
+ return fastpow2 (1.442695040f * p);
+ }
+
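+ // tanh(p) = 2 / (1 + exp(-2p)) - 1, built on fastexp above.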
+ static inline float
+ fasttanh (float p)
+ {
+ return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p));
+ }
+
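+ // Multiply every feature index (and audit index) by `amount`, spreading
+ // the example's features across a proportionally larger weight space.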
+ void scale_example_indicies(bool audit, example* ec, size_t amount)
+ {
+ for (size_t* i = ec->indices.begin; i != ec->indices.end; i++)
+ {
+ feature* end = ec->atomics[*i].end;
+ for (feature* f = ec->atomics[*i].begin; f!= end; f++)
+ f->weight_index *= amount;
+ }
+ if (audit)
+ {
+ for (size_t* i = ec->indices.begin; i != ec->indices.end; i++)
+ if (ec->audit_features[*i].begin != ec->audit_features[*i].end)
+ for (audit_data *f = ec->audit_features[*i].begin; f != ec->audit_features[*i].end; f++)
+ f->weight_index *= amount;
+ }
+ }
+
+ void (*base_learner)(void*,example*) = NULL;
+
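+ // One reduction step: predict each of the k hidden units with the base
+ // learner (offsetting feature indices so each unit owns a weight slice),
+ // feed the tanh activations into a synthetic output-layer example, and,
+ // when training, hand-propagate the output gradient back into the units.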
+ void learn_with_output(vw*all, example* ec, bool shouldOutput)
+ {
+ if (command_example(*all, ec)) {
+ return;
+ }
+
+ if (all->bfgs && ec->pass != nn_current_pass) {
+ memcpy (xsubi, save_xsubi, sizeof (xsubi));
+ nn_current_pass = ec->pass;
+ }
+
+ label_data* ld = (label_data*)ec->ld;
+ float save_label = ld->label;
+ void (*save_set_minmax) (shared_data*, float) = all->set_minmax;
+ float save_min_label;
+ float save_max_label;
+ float dropscale = dropout ? 2.0 : 1.0;
+ loss_function* save_loss = all->loss;
+
+ float* hidden_units = (float*) alloca (k * sizeof (float));
+ bool* dropped_out = (bool*) alloca (k * sizeof (bool));
+
+ string outputString;
+ stringstream outputStringStream(outputString);
+
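+ // Swap in squared loss and clamp the label range to the hidden units'
+ // pre-activation range: for the next k calls the base learner is fitting
+ // pre-activations, not the final output.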
+ all->set_minmax = noop_mm;
+ all->loss = squared_loss;
+ save_min_label = all->sd->min_label;
+ all->sd->min_label = hidden_min_activation;
+ save_max_label = all->sd->max_label;
+ all->sd->max_label = hidden_max_activation;
+ ld->label = FLT_MAX;
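+ // Forward pass. FLT_MAX marks the example as unlabeled, so each
+ // base_learner call below only predicts; update_example_indicies shifts
+ // the features into hidden unit i's slice of the weight vector.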
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ if (i != 0)
+ update_example_indicies(all->audit, ec, increment);
+
+ base_learner(all,ec);
+ hidden_units[i] = finalize_prediction (*all, ec->partial_prediction);
+
+ dropped_out[i] = (dropout && erand48 (xsubi) < 0.5);
+
+ if (shouldOutput) {
+ if (i > 0) outputStringStream << ' ';
+ outputStringStream << i << ':' << ec->partial_prediction << ',' << fasttanh (hidden_units[i]);
+ }
+
+ ec->partial_prediction = 0;
+ }
+ ld->label = save_label;
+ all->loss = save_loss;
+ all->set_minmax = save_set_minmax;
+ all->sd->min_label = save_min_label;
+ all->sd->max_label = save_max_label;
+
+ bool converse = false;
+
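+ // Under dropout the example is evaluated twice: once with the sampled
+ // mask and once (via the goto below) with its complement; dropscale = 2
+ // keeps the expected hidden activation unchanged.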
+CONVERSE: // That's right, I'm using goto. So sue me.
+
+ output_layer.ld = ec->ld;
+ output_layer.total_sum_feat_sq = 1;
+ output_layer.sum_feat_sq[nn_output_namespace] = 1;
+
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ float sigmah =
+ (dropped_out[i]) ? 0.0 : dropscale * fasttanh (hidden_units[i]);
+ output_layer.atomics[nn_output_namespace][i+1].x = sigmah;
+
+ output_layer.total_sum_feat_sq += sigmah * sigmah;
+ output_layer.sum_feat_sq[nn_output_namespace] += sigmah * sigmah;
+ }
+
+ output_layer.pass = ec->pass;
+ output_layer.partial_prediction = 0;
+ output_layer.eta_round = ec->eta_round;
+ output_layer.eta_global = ec->eta_global;
+ output_layer.global_weight = ec->global_weight;
+ output_layer.example_t = ec->example_t;
+ base_learner(all,&output_layer);
+ output_layer.final_prediction = finalize_prediction (*all, output_layer.partial_prediction);
+ output_layer.ld = 0;
+
+ if (shouldOutput) {
+ outputStringStream << ' ' << output_layer.partial_prediction;
+ all->print_text(all->raw_prediction, outputStringStream.str(), ec->tag);
+ }
+
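+ // Backward pass: chain the output-loss derivative through the output
+ // weight (nu) and the tanh slope (sigmahprime) to get each hidden
+ // unit's gradient, then realize the update by re-running the base
+ // learner on a synthetic label shifted by that gradient.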
+ if (all->training && ld->label != FLT_MAX) {
+ float gradient = all->loss->first_derivative(all->sd,
+ output_layer.final_prediction,
+ ld->label);
+
+ if (fabs (gradient) > 0) {
+ all->loss = squared_loss;
+ all->set_minmax = noop_mm;
+ save_min_label = all->sd->min_label;
+ all->sd->min_label = hidden_min_activation;
+ save_max_label = all->sd->max_label;
+ all->sd->max_label = hidden_max_activation;
+
+ for (unsigned int i = k; i > 0; --i) {
+ if (! dropped_out[i-1]) {
+ float sigmah =
+ output_layer.atomics[nn_output_namespace][i].x / dropscale;
+ float sigmahprime = dropscale * (1.0 - sigmah * sigmah);
+ float nu = all->reg.weight_vectors[output_layer.atomics[nn_output_namespace][i].weight_index & all->weight_mask];
+ float gradhw = 0.5 * nu * gradient * sigmahprime;
+
+ ld->label = finalize_prediction (*all, hidden_units[i-1] - gradhw);
+ if (ld->label != hidden_units[i-1]) {
+ ec->partial_prediction = 0.;
+ base_learner(all,ec);
+ }
+ }
+ if (i != 1) {
+ update_example_indicies(all->audit, ec, -increment);
+ }
+ }
+
+ all->loss = save_loss;
+ all->set_minmax = save_set_minmax;
+ all->sd->min_label = save_min_label;
+ all->sd->max_label = save_max_label;
+ }
+ else {
+ update_example_indicies(all->audit, ec, -total_increment);
+ }
+ }
+
+ ec->partial_prediction = output_layer.partial_prediction;
+ ec->final_prediction = output_layer.final_prediction;
+ ec->loss = output_layer.loss;
+ ld->label = save_label;
+
+ if (dropout && ! converse)
+ {
+ update_example_indicies (all->audit, ec, total_increment);
+
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ dropped_out[i] = ! dropped_out[i];
+ }
+
+ converse = true;
+ goto CONVERSE;
+ }
+ }
+
+ void learn(void*a, example* ec) {
+ vw* all = (vw*)a;
+ learn_with_output(all, ec, false);
+ }
+
+ void drive_nn(void *in)
+ {
+ vw* all = (vw*)in;
+ example* ec = NULL;
+ while ( true )
+ {
+ if ((ec = get_example(all->p)) != NULL)//semiblocking operation.
+ {
+ learn_with_output(all, ec, all->raw_prediction > 0);
+ int save_raw_prediction = all->raw_prediction;
+ all->raw_prediction = -1;
+ return_simple_example(*all, ec);
+ all->raw_prediction = save_raw_prediction;
+ }
+ else if (parser_done(all->p))
+ return;
+ else
+ ;
+ }
+ }
+
+ void parse_flags(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+ {
+ //first parse for number of hidden units
+ k = 0;
+ if( vm_file.count("nn") ) {
+ k = vm_file["nn"].as<size_t>();
+ if( vm.count("nn") && vm["nn"].as<size_t>() != k )
+ std::cerr << "warning: you specified a different number of hidden units through --nn than the one loaded from the predictor; proceeding with the loaded value: " << k << endl;
+ }
+ else {
+ k = vm["nn"].as<size_t>();
+
+ //append --nn with the number of hidden units to options_from_file so it is saved to the regressor later
+ std::stringstream ss;
+ ss << " --nn " << k;
+ all.options_from_file.append(ss.str());
+ }
+
+ if( vm_file.count("dropout") ) {
+ dropout = true;
+ }
+ else if ( vm.count("dropout") ) {
+ dropout = true;
+
+ std::stringstream ss;
+ ss << " --dropout ";
+ all.options_from_file.append(ss.str());
+ }
+
+ if ( vm.count("meanfield") ) {
+ dropout = false;
+ if (! all.quiet)
+ std::cerr << "using mean field for neural network "
+ << (all.training ? "training" : "testing")
+ << std::endl;
+ }
+
+ if (dropout) {
+ if (! all.quiet)
+ std::cerr << "using dropout for neural network "
+ << (all.training ? "training" : "testing")
+ << std::endl;
+ }
+
+ all.driver = drive_nn;
+ base_learner = all.learn;
+ all.base_learn = all.learn;
+ all.learn = learn;
+
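+ // Reserve k copies of the base learner's weight space; `increment` is
+ // the index offset between consecutive hidden units' slices.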
+ all.base_learner_nb_w *= k;
+ increment = (all.length()/all.base_learner_nb_w) * all.stride;
+ total_increment = increment*(k-1);
+
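+ // Build the synthetic output-layer example: one constant bias feature
+ // plus one feature per hidden unit, all in nn_output_namespace.
+ // `initialize` stays true only if every output weight is still zero,
+ // i.e. we are not warm-starting from a saved regressor.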
+ bool initialize = true;
+
+ memset (&output_layer, 0, sizeof (output_layer));
+ push(output_layer.indices, nn_output_namespace);
+ feature output = {1., nn_constant*all.stride};
+ push(output_layer.atomics[nn_output_namespace], output);
+ initialize &= (all.reg.weight_vectors[output_layer.atomics[nn_output_namespace][0].weight_index & all.weight_mask] == 0);
+
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ output.weight_index += all.stride;
+ push(output_layer.atomics[nn_output_namespace], output);
+ initialize &= (all.reg.weight_vectors[output_layer.atomics[nn_output_namespace][i+1].weight_index & all.weight_mask] == 0);
+ }
+
+ output_layer.num_features = k + 1;
+ output_layer.in_use = true;
+
+ if (initialize) {
+ if (! all.quiet)
+ std::cerr << "randomly initializing neural network output weights and hidden bias" << std::endl;
+
+ // output weights
+
+ float sqrtk = sqrt (k);
+ for (unsigned int i = 0; i <= k; ++i)
+ {
+ weight* w = &all.reg.weight_vectors[output_layer.atomics[nn_output_namespace][i].weight_index & all.weight_mask];
+
+ w[0] = (float) (drand48 () - 0.5) / sqrtk;
+
+ // prevent divide by zero error
+ if (dropout && all.normalized_updates)
+ w[all.normalized_idx] = 1e-4;
+ }
+
+ // hidden biases
+
+ unsigned int weight_index = constant * all.stride;
+
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ all.reg.weight_vectors[weight_index & all.weight_mask] = (float) (drand48 () - 0.5);
+ weight_index += increment;
+ }
+ }
+
+ squared_loss = getLossFunction (0, "squared", 0);
+
+ atexit (free_output_layer);
+ atexit (free_squared_loss);
+
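+ // Snapshot the current erand48 seed so dropout masks can be replayed:
+ // learn_with_output restores save_xsubi at the start of each bfgs pass.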
+ memset (xsubi, 0, sizeof (xsubi));
+ unsigned short *old = seed48 (xsubi);
+ memcpy (xsubi, old, sizeof (xsubi));
+ memcpy (save_xsubi, old, sizeof (save_xsubi));
+ seed48 (xsubi);
+ }
+}
diff --git a/vowpalwabbit/nn.h b/vowpalwabbit/nn.h
new file mode 100644
index 00000000..f6f2c0cd
--- /dev/null
+++ b/vowpalwabbit/nn.h
@@ -0,0 +1,22 @@
+/*
+Copyright (c) by respective owners including Yahoo!, Microsoft, and
+individual contributors. All rights reserved. Released under a BSD
+license as described in the file LICENSE.
+ */
+#ifndef NN_H
+#define NN_H
+
+#include "io.h"
+#include "parse_primitives.h"
+#include "global_data.h"
+#include "example.h"
+#include "parse_args.h"
+#include "v_hashmap.h"
+#include "simple_label.h"
+
+namespace NN
+{
+ void parse_flags(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+}
+
+#endif
diff --git a/vowpalwabbit/parse_args.cc b/vowpalwabbit/parse_args.cc
index d0311bad..c1d2e301 100644
--- a/vowpalwabbit/parse_args.cc
+++ b/vowpalwabbit/parse_args.cc
@@ -14,6 +14,7 @@ license as described in the file LICENSE.
#include "sender.h"
#include "network.h"
#include "global_data.h"
+#include "nn.h"
#include "oaa.h"
#include "ect.h"
#include "csoaa.h"
@@ -97,6 +98,9 @@ vw parse_args(int argc, char *argv[])
("min_prediction", po::value<float>(&all.sd->min_label), "Smallest prediction to output")
("max_prediction", po::value<float>(&all.sd->max_label), "Largest prediction to output")
("mem", po::value<int>(&all.m), "memory in bfgs")
+ ("nn", po::value<size_t>(), "Use sigmoidal feedforward network with <k> hidden units")
+ ("dropout", "Train or test sigmoidal feedforward network using dropout.")
+ ("meanfield", "Train or test sigmoidal feedforward network using mean field.")
("noconstant", "Don't add a constant feature")
("noop","do no learning")
("oaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> labels")
@@ -591,7 +595,13 @@ vw parse_args(int argc, char *argv[])
bool got_mc = false;
bool got_cs = false;
bool got_cb = false;
+ bool got_nn = false;
+ if(vm.count("nn") || vm_file.count("nn") ) {
+ NN::parse_flags(all, to_pass_further, vm, vm_file);
+ got_nn = true;
+ }
+
if(vm.count("oaa") || vm_file.count("oaa") ) {
if (got_mc) { cerr << "error: cannot specify multiple MC learners" << endl; exit(-1); }
@@ -676,8 +686,8 @@ vw parse_args(int argc, char *argv[])
Searn::parse_flags(all, to_pass_further, vm, vm_file);
}
- if (got_cs && got_mc) {
- cerr << "error: doesn't make sense to do both MC learning and CS learning" << endl;
+ if (got_nn && (got_cs || got_mc)) {
+ cerr << "error: NN learning doesn't compose with other reductions (yet)" << endl;
exit(-1);
}
diff --git a/vowpalwabbit/parser.h b/vowpalwabbit/parser.h
index 07905745..5e6e16c0 100644
--- a/vowpalwabbit/parser.h
+++ b/vowpalwabbit/parser.h
@@ -13,6 +13,7 @@ license as described in the file LICENSE.
const size_t wap_ldf_namespace = 126;
const size_t history_namespace = 127;
const size_t constant_namespace = 128;
+const size_t nn_output_namespace = 129;
parser* new_parser();
#include <boost/program_options.hpp>