github.com/moses-smt/vowpal_wabbit.git

-rw-r--r--  Makefile                            |   2
-rw-r--r--  Makefile.am                         |   2
-rwxr-xr-x  test/RunTests                       |   7
-rw-r--r--  test/train-sets/3parity             |   8
-rw-r--r--  test/train-sets/ref/3parity.stderr  |  32
-rw-r--r--  test/train-sets/ref/3parity.stdout  |   0
-rw-r--r--  vowpalwabbit/Makefile.am            |   2
-rw-r--r--  vowpalwabbit/nn.cc                  | 378
-rw-r--r--  vowpalwabbit/nn.h                   |  22
-rw-r--r--  vowpalwabbit/parse_args.cc          |  14
-rw-r--r--  vowpalwabbit/parser.h               |   1
11 files changed, 462 insertions, 6 deletions
diff --git a/Makefile b/Makefile
index 5f758d20..36bde0b6 100644
--- a/Makefile
+++ b/Makefile
@@ -33,7 +33,7 @@ FLAGS = $(ARCH) $(WARN_FLAGS) $(OPTIM_FLAGS) -D_FILE_OFFSET_BITS=64 -I $(BOOST_I
#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -pg -g
# for valgrind
-FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O0
+#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O0
# for valgrind profiling: run 'valgrind --tool=callgrind PROGRAM' then 'callgrind_annotate --tree=both --inclusive=yes'
#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O3 -fomit-frame-pointer -ffast-math -fno-strict-aliasing
diff --git a/Makefile.am b/Makefile.am
index 648de3b7..d09026c8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -17,7 +17,7 @@ noinst_HEADERS = vowpalwabbit/accumulate.h vowpalwabbit/oaa.h \
vowpalwabbit/io.h vowpalwabbit/v_array.h \
vowpalwabbit/lda_core.h vowpalwabbit/v_hashmap.h \
vowpalwabbit/loss_functions.h vowpalwabbit/network.h \
- vowpalwabbit/wap.h vowpalwabbit/noop.h
+ vowpalwabbit/wap.h vowpalwabbit/noop.h vowpalwabbit/nn.h
ACLOCAL_AMFLAGS = -I acinclude.d
diff --git a/test/RunTests b/test/RunTests
index cb37a92a..cefeb5e2 100755
--- a/test/RunTests
+++ b/test/RunTests
@@ -343,7 +343,7 @@ sub check_for_time_regression() {
" try removing it\n"
}
$overall_time0 = $cuser0 + $csystem0;
- $pct_change = 100 * ($overall_time1 - $overall_time0) / $overall_time0;
+ $pct_change = 100 * ($overall_time1 - $overall_time0) / (1e-4+$overall_time0);
if ($overall_time0 == 0) {
die "$0: Bad times in saved times file: $LastTimeFile," .
@@ -580,3 +580,8 @@ __DATA__
{VW} -k -c -d train-sets/seq_small --passes 12 --invariant --searn_passes_per_policy 4 --searn 4 --searn_task sequence
train-sets/ref/searn_small.stdout
train-sets/ref/searn_small.stderr
+
+# Test 21: neural network 3-parity with 2 hidden units
+{VW} -k -c -d train-sets/3parity --hash all --passes 2000 -b 16 --nn 2 -l 10 --invariant
+ train-sets/ref/3parity.stdout
+ train-sets/ref/3parity.stderr
diff --git a/test/train-sets/3parity b/test/train-sets/3parity
new file mode 100644
index 00000000..74fe86b4
--- /dev/null
+++ b/test/train-sets/3parity
@@ -0,0 +1,8 @@
+-1 |f 1:1 2:-1 3:-1
+-1 |f 1:-1 2:1 3:-1
+1 |f 1:1 2:1 3:-1
+-1 |f 1:-1 2:-1 3:1
+1 |f 1:1 2:-1 3:1
+1 |f 1:-1 2:1 3:1
+-1 |f 1:1 2:1 3:1
+1 |f 1:-1 2:-1 3:-1
diff --git a/test/train-sets/ref/3parity.stderr b/test/train-sets/ref/3parity.stderr
new file mode 100644
index 00000000..295b006b
--- /dev/null
+++ b/test/train-sets/ref/3parity.stderr
@@ -0,0 +1,32 @@
+Num weight bits = 16
+learning rate = 10
+initial_t = 1
+power_t = 0.5
+decay_learning_rate = 1
+randomly initializing neural network output weights and hidden bias
+creating cache_file = train-sets/3parity.cache
+Reading from train-sets/3parity
+num sources = 1
+average since example example current current current
+loss last counter weight label predict features
+1.550870 1.550870 3 3.0 1.0000 -1.0000 4
+1.919601 2.288332 6 6.0 1.0000 0.7762 4
+2.011137 2.120980 11 11.0 1.0000 -1.0000 4
+2.154878 2.298620 22 22.0 1.0000 0.3713 4
+2.354256 2.553635 44 44.0 -1.0000 1.0000 4
+2.286332 2.216827 87 87.0 -1.0000 1.0000 4
+2.222494 2.158657 174 174.0 1.0000 0.8935 4
+1.716414 1.210335 348 348.0 -1.0000 -0.9598 4
+1.368982 1.021549 696 696.0 1.0000 0.9744 4
+1.151838 0.934694 1392 1392.0 1.0000 1.0000 4
+0.976327 0.800816 2784 2784.0 1.0000 1.0000 4
+0.756642 0.536958 5568 5568.0 1.0000 1.0000 4
+0.378355 0.000000 11135 11135.0 -1.0000 -1.0000 4
+
+finished run
+number of examples = 16000
+weighted example sum = 1.6e+04
+weighted label sum = 0
+average loss = 0.2633
+best constant = -6.25e-05
+total feature number = 64000
diff --git a/test/train-sets/ref/3parity.stdout b/test/train-sets/ref/3parity.stdout
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/test/train-sets/ref/3parity.stdout
diff --git a/vowpalwabbit/Makefile.am b/vowpalwabbit/Makefile.am
index dc307f67..22aa0eb4 100644
--- a/vowpalwabbit/Makefile.am
+++ b/vowpalwabbit/Makefile.am
@@ -5,7 +5,7 @@ include_HEADERS = allreduce.h
bin_PROGRAMS = vw active_interactor
-libvw_la_SOURCES = hash.cc global_data.cc io.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc simple_label.cc oaa.cc ect.cc csoaa.cc cb.cc wap.cc beam.cc searn.cc searn_sequencetask.cc sequence.cc parse_example.cc sparse_dense.cc network.cc parse_args.cc accumulate.cc gd.cc lda_core.cc gd_mf.cc bfgs.cc noop.cc example.cc parser.cc vw.cc loss_functions.cc sender.cc
+libvw_la_SOURCES = hash.cc global_data.cc io.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc simple_label.cc oaa.cc ect.cc csoaa.cc cb.cc wap.cc beam.cc searn.cc searn_sequencetask.cc sequence.cc parse_example.cc sparse_dense.cc network.cc parse_args.cc accumulate.cc gd.cc lda_core.cc gd_mf.cc bfgs.cc noop.cc example.cc parser.cc vw.cc loss_functions.cc sender.cc nn.cc
vw_SOURCES = vw.cc
vw_CXXFLAGS = $(AM_CXXFLAGS)
diff --git a/vowpalwabbit/nn.cc b/vowpalwabbit/nn.cc
new file mode 100644
index 00000000..6c4de7ee
--- /dev/null
+++ b/vowpalwabbit/nn.cc
@@ -0,0 +1,378 @@
+/*
+Copyright (c) by respective owners including Yahoo!, Microsoft, and
+individual contributors. All rights reserved. Released under a BSD (revised)
+license as described in the file LICENSE.
+ */
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <sstream>
+
+#include "constant.h"
+#include "oaa.h"
+#include "simple_label.h"
+#include "cache.h"
+#include "v_hashmap.h"
+
+using namespace std;
+
+namespace NN {
+ //nonreentrant
+ size_t k=0;
+ size_t increment=0;
+ size_t total_increment=0;
+ loss_function* squared_loss;
+ example output_layer;
+ const float hidden_min_activation = -3;
+ const float hidden_max_activation = 3;
+ const int nn_constant = 533357803;
+ bool dropout = false;
+ unsigned short xsubi[3];
+ unsigned short save_xsubi[3];
+ size_t nn_current_pass = 0;
+
+ static void
+ free_squared_loss (void)
+ {
+ delete squared_loss;
+ }
+
+ static void
+ free_output_layer (void)
+ {
+ free (output_layer.indices.begin);
+ free (output_layer.atomics[nn_output_namespace].begin);
+ }
+
+#define cast_uint32_t static_cast<uint32_t>
+
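+ // Approximate 2^p with the "fastapprox"-style bit trick: write the
+ // exponent field of an IEEE-754 float directly from p, with a small
+ // rational correction for the fraction. Plenty accurate for an activation.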
+ static inline float
+ fastpow2 (float p)
+ {
+ float offset = (p < 0) ? 1.0f : 0.0f;
+ float clipp = (p < -126) ? -126.0f : p;
+ int w = clipp;
+ float z = clipp - w + offset;
+ union { uint32_t i; float f; } v = { cast_uint32_t ( (1 << 23) * (clipp + 121.2740575f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z) ) };
+
+ return v.f;
+ }
+
+ static inline float
+ fastexp (float p)
+ {
+ return fastpow2 (1.442695040f * p);
+ }
+
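+ // tanh(p) = 2 / (1 + exp(-2p)) - 1, built on fastexp above.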
+ static inline float
+ fasttanh (float p)
+ {
+ return -1.0f + 2.0f / (1.0f + fastexp (-2.0f * p));
+ }
+
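+ // Multiply every feature index (and audit index) by `amount`, spreading
+ // the example's features across a proportionally larger weight space.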
+ void scale_example_indicies(bool audit, example* ec, size_t amount)
+ {
+ for (size_t* i = ec->indices.begin; i != ec->indices.end; i++)
+ {
+ feature* end = ec->atomics[*i].end;
+ for (feature* f = ec->atomics[*i].begin; f!= end; f++)
+ f->weight_index *= amount;
+ }
+ if (audit)
+ {
+ for (size_t* i = ec->indices.begin; i != ec->indices.end; i++)
+ if (ec->audit_features[*i].begin != ec->audit_features[*i].end)
+ for (audit_data *f = ec->audit_features[*i].begin; f != ec->audit_features[*i].end; f++)
+ f->weight_index *= amount;
+ }
+ }
+
+ void (*base_learner)(void*,example*) = NULL;
+
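+ // One reduction step: predict each of the k hidden units with the base
+ // learner (offsetting feature indices so each unit owns a weight slice),
+ // feed the tanh activations into a synthetic output-layer example, and,
+ // when training, hand-propagate the output gradient back into the units.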
+ void learn_with_output(vw*all, example* ec, bool shouldOutput)
+ {
+ if (command_example(*all, ec)) {
+ return;
+ }
+
+ if (all->bfgs && ec->pass != nn_current_pass) {
+ memcpy (xsubi, save_xsubi, sizeof (xsubi));
+ nn_current_pass = ec->pass;
+ }
+
+ label_data* ld = (label_data*)ec->ld;
+ float save_label = ld->label;
+ void (*save_set_minmax) (shared_data*, float) = all->set_minmax;
+ float save_min_label;
+ float save_max_label;
+ float dropscale = dropout ? 2.0 : 1.0;
+ loss_function* save_loss = all->loss;
+
+ float* hidden_units = (float*) alloca (k * sizeof (float));
+ bool* dropped_out = (bool*) alloca (k * sizeof (bool));
+
+ string outputString;
+ stringstream outputStringStream(outputString);
+
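+ // Swap in squared loss and clamp the label range to the hidden units'
+ // pre-activation range: for the next k calls the base learner is fitting
+ // pre-activations, not the final output.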
+ all->set_minmax = noop_mm;
+ all->loss = squared_loss;
+ save_min_label = all->sd->min_label;
+ all->sd->min_label = hidden_min_activation;
+ save_max_label = all->sd->max_label;
+ all->sd->max_label = hidden_max_activation;
+ ld->label = FLT_MAX;
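+ // Forward pass. FLT_MAX marks the example as unlabeled, so each
+ // base_learner call below only predicts; update_example_indicies shifts
+ // the features into hidden unit i's slice of the weight vector.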
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ if (i != 0)
+ update_example_indicies(all->audit, ec, increment);
+
+ base_learner(all,ec);
+ hidden_units[i] = finalize_prediction (*all, ec->partial_prediction);
+
+ dropped_out[i] = (dropout && erand48 (xsubi) < 0.5);
+
+ if (shouldOutput) {
+ if (i > 0) outputStringStream << ' ';
+ outputStringStream << i << ':' << ec->partial_prediction << ',' << fasttanh (hidden_units[i]);
+ }
+
+ ec->partial_prediction = 0;
+ }
+ ld->label = save_label;
+ all->loss = save_loss;
+ all->set_minmax = save_set_minmax;
+ all->sd->min_label = save_min_label;
+ all->sd->max_label = save_max_label;
+
+ bool converse = false;
+
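+ // Under dropout the example is evaluated twice: once with the sampled
+ // mask and once (via the goto below) with its complement; dropscale = 2
+ // keeps the expected hidden activation unchanged.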
+CONVERSE: // That's right, I'm using goto. So sue me.
+
+ output_layer.ld = ec->ld;
+ output_layer.total_sum_feat_sq = 1;
+ output_layer.sum_feat_sq[nn_output_namespace] = 1;
+
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ float sigmah =
+ (dropped_out[i]) ? 0.0 : dropscale * fasttanh (hidden_units[i]);
+ output_layer.atomics[nn_output_namespace][i+1].x = sigmah;
+
+ output_layer.total_sum_feat_sq += sigmah * sigmah;
+ output_layer.sum_feat_sq[nn_output_namespace] += sigmah * sigmah;
+ }
+
+ output_layer.pass = ec->pass;
+ output_layer.partial_prediction = 0;
+ output_layer.eta_round = ec->eta_round;
+ output_layer.eta_global = ec->eta_global;
+ output_layer.global_weight = ec->global_weight;
+ output_layer.example_t = ec->example_t;
+ base_learner(all,&output_layer);
+ output_layer.final_prediction = finalize_prediction (*all, output_layer.partial_prediction);
+ output_layer.ld = 0;
+
+ if (shouldOutput) {
+ outputStringStream << ' ' << output_layer.partial_prediction;
+ all->print_text(all->raw_prediction, outputStringStream.str(), ec->tag);
+ }
+
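+ // Backward pass: chain the output-loss derivative through the output
+ // weight (nu) and the tanh slope (sigmahprime) to get each hidden
+ // unit's gradient, then realize the update by re-running the base
+ // learner on a synthetic label shifted by that gradient.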
+ if (all->training && ld->label != FLT_MAX) {
+ float gradient = all->loss->first_derivative(all->sd,
+ output_layer.final_prediction,
+ ld->label);
+
+ if (fabs (gradient) > 0) {
+ all->loss = squared_loss;
+ all->set_minmax = noop_mm;
+ save_min_label = all->sd->min_label;
+ all->sd->min_label = hidden_min_activation;
+ save_max_label = all->sd->max_label;
+ all->sd->max_label = hidden_max_activation;
+
+ for (unsigned int i = k; i > 0; --i) {
+ if (! dropped_out[i-1]) {
+ float sigmah =
+ output_layer.atomics[nn_output_namespace][i].x / dropscale;
+ float sigmahprime = dropscale * (1.0 - sigmah * sigmah);
+ float nu = all->reg.weight_vectors[output_layer.atomics[nn_output_namespace][i].weight_index & all->weight_mask];
+ float gradhw = 0.5 * nu * gradient * sigmahprime;
+
+ ld->label = finalize_prediction (*all, hidden_units[i-1] - gradhw);
+ if (ld->label != hidden_units[i-1]) {
+ ec->partial_prediction = 0.;
+ base_learner(all,ec);
+ }
+ }
+ if (i != 1) {
+ update_example_indicies(all->audit, ec, -increment);
+ }
+ }
+
+ all->loss = save_loss;
+ all->set_minmax = save_set_minmax;
+ all->sd->min_label = save_min_label;
+ all->sd->max_label = save_max_label;
+ }
+ else {
+ update_example_indicies(all->audit, ec, -total_increment);
+ }
+ }
+
+ ec->partial_prediction = output_layer.partial_prediction;
+ ec->final_prediction = output_layer.final_prediction;
+ ec->loss = output_layer.loss;
+ ld->label = save_label;
+
+ if (dropout && ! converse)
+ {
+ update_example_indicies (all->audit, ec, total_increment);
+
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ dropped_out[i] = ! dropped_out[i];
+ }
+
+ converse = true;
+ goto CONVERSE;
+ }
+ }
+
+ void learn(void*a, example* ec) {
+ vw* all = (vw*)a;
+ learn_with_output(all, ec, false);
+ }
+
+ void drive_nn(void *in)
+ {
+ vw* all = (vw*)in;
+ example* ec = NULL;
+ while ( true )
+ {
+ if ((ec = get_example(all->p)) != NULL)//semiblocking operation.
+ {
+ learn_with_output(all, ec, all->raw_prediction > 0);
+ int save_raw_prediction = all->raw_prediction;
+ all->raw_prediction = -1;
+ return_simple_example(*all, ec);
+ all->raw_prediction = save_raw_prediction;
+ }
+ else if (parser_done(all->p))
+ return;
+ else
+ ;
+ }
+ }
+
+ void parse_flags(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+ {
+ //first parse for number of hidden units
+ k = 0;
+ if( vm_file.count("nn") ) {
+ k = vm_file["nn"].as<size_t>();
+ if( vm.count("nn") && vm["nn"].as<size_t>() != k )
+ std::cerr << "warning: you specified a different number of hidden units through --nn than the one loaded from the predictor; proceeding with the loaded value: " << k << endl;
+ }
+ else {
+ k = vm["nn"].as<size_t>();
+
+ //append --nn with the number of hidden units to options_from_file so it is saved to the regressor later
+ std::stringstream ss;
+ ss << " --nn " << k;
+ all.options_from_file.append(ss.str());
+ }
+
+ if( vm_file.count("dropout") ) {
+ dropout = true;
+ }
+ else if ( vm.count("dropout") ) {
+ dropout = true;
+
+ std::stringstream ss;
+ ss << " --dropout ";
+ all.options_from_file.append(ss.str());
+ }
+
+ if ( vm.count("meanfield") ) {
+ dropout = false;
+ if (! all.quiet)
+ std::cerr << "using mean field for neural network "
+ << (all.training ? "training" : "testing")
+ << std::endl;
+ }
+
+ if (dropout) {
+ if (! all.quiet)
+ std::cerr << "using dropout for neural network "
+ << (all.training ? "training" : "testing")
+ << std::endl;
+ }
+
+ all.driver = drive_nn;
+ base_learner = all.learn;
+ all.base_learn = all.learn;
+ all.learn = learn;
+
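+ // Reserve k copies of the base learner's weight space; `increment` is
+ // the index offset between consecutive hidden units' slices.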
+ all.base_learner_nb_w *= k;
+ increment = (all.length()/all.base_learner_nb_w) * all.stride;
+ total_increment = increment*(k-1);
+
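+ // Build the synthetic output-layer example: one constant bias feature
+ // plus one feature per hidden unit, all in nn_output_namespace.
+ // `initialize` stays true only if every output weight is still zero,
+ // i.e. we are not warm-starting from a saved regressor.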
+ bool initialize = true;
+
+ memset (&output_layer, 0, sizeof (output_layer));
+ push(output_layer.indices, nn_output_namespace);
+ feature output = {1., nn_constant*all.stride};
+ push(output_layer.atomics[nn_output_namespace], output);
+ initialize &= (all.reg.weight_vectors[output_layer.atomics[nn_output_namespace][0].weight_index & all.weight_mask] == 0);
+
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ output.weight_index += all.stride;
+ push(output_layer.atomics[nn_output_namespace], output);
+ initialize &= (all.reg.weight_vectors[output_layer.atomics[nn_output_namespace][i+1].weight_index & all.weight_mask] == 0);
+ }
+
+ output_layer.num_features = k + 1;
+ output_layer.in_use = true;
+
+ if (initialize) {
+ if (! all.quiet)
+ std::cerr << "randomly initializing neural network output weights and hidden bias" << std::endl;
+
+ // output weights
+
+ float sqrtk = sqrt (k);
+ for (unsigned int i = 0; i <= k; ++i)
+ {
+ weight* w = &all.reg.weight_vectors[output_layer.atomics[nn_output_namespace][i].weight_index & all.weight_mask];
+
+ w[0] = (float) (drand48 () - 0.5) / sqrtk;
+
+ // prevent divide by zero error
+ if (dropout && all.normalized_updates)
+ w[all.normalized_idx] = 1e-4;
+ }
+
+ // hidden biases
+
+ unsigned int weight_index = constant * all.stride;
+
+ for (unsigned int i = 0; i < k; ++i)
+ {
+ all.reg.weight_vectors[weight_index & all.weight_mask] = (float) (drand48 () - 0.5);
+ weight_index += increment;
+ }
+ }
+
+ squared_loss = getLossFunction (0, "squared", 0);
+
+ atexit (free_output_layer);
+ atexit (free_squared_loss);
+
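+ // Snapshot the current erand48 seed so dropout masks can be replayed:
+ // learn_with_output restores save_xsubi at the start of each bfgs pass.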
+ memset (xsubi, 0, sizeof (xsubi));
+ unsigned short *old = seed48 (xsubi);
+ memcpy (xsubi, old, sizeof (xsubi));
+ memcpy (save_xsubi, old, sizeof (save_xsubi));
+ seed48 (xsubi);
+ }
+}
diff --git a/vowpalwabbit/nn.h b/vowpalwabbit/nn.h
new file mode 100644
index 00000000..f6f2c0cd
--- /dev/null
+++ b/vowpalwabbit/nn.h
@@ -0,0 +1,22 @@
+/*
+Copyright (c) by respective owners including Yahoo!, Microsoft, and
+individual contributors. All rights reserved. Released under a BSD
+license as described in the file LICENSE.
+ */
+#ifndef NN_H
+#define NN_H
+
+#include "io.h"
+#include "parse_primitives.h"
+#include "global_data.h"
+#include "example.h"
+#include "parse_args.h"
+#include "v_hashmap.h"
+#include "simple_label.h"
+
+namespace NN
+{
+ void parse_flags(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+}
+
+#endif
diff --git a/vowpalwabbit/parse_args.cc b/vowpalwabbit/parse_args.cc
index d0311bad..c1d2e301 100644
--- a/vowpalwabbit/parse_args.cc
+++ b/vowpalwabbit/parse_args.cc
@@ -14,6 +14,7 @@ license as described in the file LICENSE.
#include "sender.h"
#include "network.h"
#include "global_data.h"
+#include "nn.h"
#include "oaa.h"
#include "ect.h"
#include "csoaa.h"
@@ -97,6 +98,9 @@ vw parse_args(int argc, char *argv[])
("min_prediction", po::value<float>(&all.sd->min_label), "Smallest prediction to output")
("max_prediction", po::value<float>(&all.sd->max_label), "Largest prediction to output")
("mem", po::value<int>(&all.m), "memory in bfgs")
+ ("nn", po::value<size_t>(), "Use sigmoidal feedforward network with <k> hidden units")
+ ("dropout", "Train or test sigmoidal feedforward network using dropout.")
+ ("meanfield", "Train or test sigmoidal feedforward network using mean field.")
("noconstant", "Don't add a constant feature")
("noop","do no learning")
("oaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> labels")
@@ -591,7 +595,13 @@ vw parse_args(int argc, char *argv[])
bool got_mc = false;
bool got_cs = false;
bool got_cb = false;
+ bool got_nn = false;
+ if(vm.count("nn") || vm_file.count("nn") ) {
+ NN::parse_flags(all, to_pass_further, vm, vm_file);
+ got_nn = true;
+ }
+
if(vm.count("oaa") || vm_file.count("oaa") ) {
if (got_mc) { cerr << "error: cannot specify multiple MC learners" << endl; exit(-1); }
@@ -676,8 +686,8 @@ vw parse_args(int argc, char *argv[])
Searn::parse_flags(all, to_pass_further, vm, vm_file);
}
- if (got_cs && got_mc) {
- cerr << "error: doesn't make sense to do both MC learning and CS learning" << endl;
+ if (got_nn && (got_cs || got_mc)) {
+ cerr << "error: NN learning doesn't compose with other reductions (yet)" << endl;
exit(-1);
}
diff --git a/vowpalwabbit/parser.h b/vowpalwabbit/parser.h
index 07905745..5e6e16c0 100644
--- a/vowpalwabbit/parser.h
+++ b/vowpalwabbit/parser.h
@@ -13,6 +13,7 @@ license as described in the file LICENSE.
const size_t wap_ldf_namespace = 126;
const size_t history_namespace = 127;
const size_t constant_namespace = 128;
+const size_t nn_output_namespace = 129;
parser* new_parser();
#include <boost/program_options.hpp>