github.com/moses-smt/vowpal_wabbit.git

author    ariel faigon <github.2009@yendor.com>    2015-01-01 14:50:40 +0300
committer ariel faigon <github.2009@yendor.com>    2015-01-01 14:50:40 +0300
commit    f1f859a19c8a3c70ad7b4706d78ae5cac6bdca36 (patch)
tree      f014f81b35fd0c25b1fec5eebe5380d0a3a4dd4e
parent    165198fc8d2c8e25e2af274a65eb0b30d6fa5c6d (diff)
parent    ca852d8cfeef77ca60aa426c9070f1568ecf644e (diff)
Merge branch 'master' of git://github.com/JohnLangford/vowpal_wabbit
-rw-r--r--  Makefile | 2
-rw-r--r--  README.windows.txt | 16
-rw-r--r--  python/Learning_to_Search.ipynb | 4
-rwxr-xr-x  test/RunTests | 9
-rwxr-xr-x  test/daemon-test.sh | 48
-rw-r--r--  test/pred-sets/ref/0001_ftrl.predict | 200
-rw-r--r--  test/test-sets/ref/0001_ftrl.stderr | 29
-rw-r--r--  test/train-sets/ref/0001_ftrl.stderr | 31
-rw-r--r--  test/train-sets/ref/rcv1_small.stderr | 16
-rw-r--r--  test/train-sets/ref/search_er.stderr | 9
-rw-r--r--  test/train-sets/ref/zero.stderr | 6
-rw-r--r--  vowpalwabbit/Makefile | 1
-rw-r--r--  vowpalwabbit/Makefile.am | 2
-rw-r--r--  vowpalwabbit/accumulate.cc | 23
-rw-r--r--  vowpalwabbit/active.cc | 30
-rw-r--r--  vowpalwabbit/active.h | 2
-rw-r--r--  vowpalwabbit/active_interactor.cc | 6
-rw-r--r--  vowpalwabbit/autolink.cc | 27
-rw-r--r--  vowpalwabbit/autolink.h | 2
-rw-r--r--  vowpalwabbit/best_constant.cc | 2
-rw-r--r--  vowpalwabbit/bfgs.cc | 49
-rw-r--r--  vowpalwabbit/bfgs.h | 2
-rw-r--r--  vowpalwabbit/binary.cc | 33
-rw-r--r--  vowpalwabbit/binary.h | 2
-rw-r--r--  vowpalwabbit/bs.cc | 42
-rw-r--r--  vowpalwabbit/bs.h | 2
-rw-r--r--  vowpalwabbit/cache.cc | 6
-rw-r--r--  vowpalwabbit/cb_algs.cc | 119
-rw-r--r--  vowpalwabbit/cb_algs.h | 2
-rw-r--r--  vowpalwabbit/cbify.cc | 114
-rw-r--r--  vowpalwabbit/cbify.h | 2
-rw-r--r--  vowpalwabbit/csoaa.cc | 98
-rw-r--r--  vowpalwabbit/csoaa.h | 4
-rw-r--r--  vowpalwabbit/ect.cc | 44
-rw-r--r--  vowpalwabbit/ect.h | 2
-rw-r--r--  vowpalwabbit/example.cc | 28
-rw-r--r--  vowpalwabbit/ftrl_proximal.cc | 225
-rw-r--r--  vowpalwabbit/ftrl_proximal.h | 12
-rw-r--r--  vowpalwabbit/gd.cc | 130
-rw-r--r--  vowpalwabbit/gd.h | 3
-rw-r--r--  vowpalwabbit/gd_mf.cc | 21
-rw-r--r--  vowpalwabbit/gd_mf.h | 2
-rw-r--r--  vowpalwabbit/global_data.cc | 6
-rw-r--r--  vowpalwabbit/global_data.h | 10
-rw-r--r--  vowpalwabbit/io_buf.cc | 24
-rw-r--r--  vowpalwabbit/kernel_svm.cc | 130
-rw-r--r--  vowpalwabbit/kernel_svm.h | 2
-rw-r--r--  vowpalwabbit/lda_core.cc | 43
-rw-r--r--  vowpalwabbit/lda_core.h | 2
-rw-r--r--  vowpalwabbit/learner.cc | 42
-rw-r--r--  vowpalwabbit/learner.h | 341
-rw-r--r--  vowpalwabbit/log_multi.cc | 51
-rw-r--r--  vowpalwabbit/log_multi.h | 2
-rw-r--r--  vowpalwabbit/loss_functions.cc | 26
-rw-r--r--  vowpalwabbit/loss_functions.h | 5
-rw-r--r--  vowpalwabbit/lrq.cc | 63
-rw-r--r--  vowpalwabbit/lrq.h | 2
-rw-r--r--  vowpalwabbit/main.cc | 11
-rw-r--r--  vowpalwabbit/memory.cc | 16
-rw-r--r--  vowpalwabbit/memory.h | 24
-rw-r--r--  vowpalwabbit/mf.cc | 14
-rw-r--r--  vowpalwabbit/mf.h | 2
-rw-r--r--  vowpalwabbit/multiclass.cc | 45
-rw-r--r--  vowpalwabbit/multiclass.h | 2
-rw-r--r--  vowpalwabbit/network.cc | 1
-rw-r--r--  vowpalwabbit/nn.cc | 63
-rw-r--r--  vowpalwabbit/nn.h | 2
-rw-r--r--  vowpalwabbit/noop.cc | 10
-rw-r--r--  vowpalwabbit/noop.h | 2
-rw-r--r--  vowpalwabbit/oaa.cc | 54
-rw-r--r--  vowpalwabbit/oaa.h | 4
-rw-r--r--  vowpalwabbit/parse_args.cc | 52
-rw-r--r--  vowpalwabbit/parse_example.cc | 6
-rw-r--r--  vowpalwabbit/parse_primitives.cc | 12
-rw-r--r--  vowpalwabbit/parse_regressor.cc | 12
-rw-r--r--  vowpalwabbit/parser.cc | 22
-rw-r--r--  vowpalwabbit/print.cc | 21
-rw-r--r--  vowpalwabbit/print.h | 2
-rw-r--r--  vowpalwabbit/rand48.cc | 13
-rw-r--r--  vowpalwabbit/scorer.cc | 52
-rw-r--r--  vowpalwabbit/scorer.h | 2
-rw-r--r--  vowpalwabbit/search.cc | 58
-rw-r--r--  vowpalwabbit/search.h | 2
-rw-r--r--  vowpalwabbit/search_dep_parser.cc | 1
-rw-r--r--  vowpalwabbit/search_sequencetask.cc | 2
-rw-r--r--  vowpalwabbit/sender.cc | 33
-rw-r--r--  vowpalwabbit/sender.h | 2
-rw-r--r--  vowpalwabbit/simple_label.cc | 5
-rw-r--r--  vowpalwabbit/stagewise_poly.cc | 64
-rw-r--r--  vowpalwabbit/stagewise_poly.h | 2
-rw-r--r--  vowpalwabbit/topk.cc | 46
-rw-r--r--  vowpalwabbit/topk.h | 2
-rw-r--r--  vowpalwabbit/unique_sort.cc | 4
-rw-r--r--  vowpalwabbit/vw.h | 2
94 files changed, 1501 insertions(+), 1322 deletions(-)
diff --git a/Makefile b/Makefile
index 9a9a14b6..0d53bb54 100644
--- a/Makefile
+++ b/Makefile
@@ -94,7 +94,7 @@ library_example: vw
python: vw
cd python; $(MAKE) things
-
+
ifneq ($(JAVA_HOME),)
java: vw
cd java; $(MAKE) things
diff --git a/README.windows.txt b/README.windows.txt
index 8ca5560c..3775ba12 100644
--- a/README.windows.txt
+++ b/README.windows.txt
@@ -118,12 +118,11 @@ If you use another directory modify the vw solution and project macro definition
open a Windows command shell
- (a) mkdir c:\boost
-
+ (a) mkdir c:\boost
(b) Download boost_1_56_0.zip from http://sourceforge.net/projects/boost/files/boost/1.56.0/boost_1_56_0.zip/download
- (c) Unzip it which creates the directory boost_1_56_0
+ (c) Unzip it which creates the directory boost_1_56_0
(d) mkdir c:\boost\x86
- (e) mkdir c:\boost\x64
+ (e) mkdir c:\boost\x64
build the x86 binaries
(f)"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat x86"
@@ -135,10 +134,11 @@ build the x86 binaries
build the x64 binaries
- (k) mkdir c:\boost\x64
- (l) "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" x86_amd64"
- (m) "bootstrap.bat"
- (n) ".\b2 --prefix=c:\boost\x64 --build-dir=x64 --toolset=msvc-12.0 address-model=64 install --with-program_options"
+ (k) "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\vcvarsall.bat" x86_amd64"
+ (l) mkdir c:\boost\x64
+ (m) cd c:\boost\boost_1_56_0
+ (n) "bootstrap.bat"
+ (o) ".\b2 --prefix=c:\boost\x64 --build-dir=x64 --toolset=msvc-12.0 address-model=64 install --with-program_options"
diff --git a/python/Learning_to_Search.ipynb b/python/Learning_to_Search.ipynb
index 263a3326..961b6c11 100644
--- a/python/Learning_to_Search.ipynb
+++ b/python/Learning_to_Search.ipynb
@@ -137,7 +137,7 @@
"source": [
"Let's unpack this a bit.\n",
"\n",
- "The `__init__` function is simple. It first calls the parent initializer and then sets some options. The options is sets are two things designed to make the programmer's life easier. The first is `AUTO_HAMMING_LOSS`. Remember earlier we said that when the sequence of decision is made, you have to say how bad it was? This says that we want this to be computed automatically by comparing the individual decisions to the oracle decisions, and defining the loss to be the sum of incorrect decisions.\n",
+ "The `__init__` function is simple. It first calls the parent initializer and then sets some options. The options it sets are two things designed to make the programmer's life easier. The first is `AUTO_HAMMING_LOSS`. Remember earlier we said that when the sequence of decision is made, you have to say how bad it was? This says that we want this to be computed automatically by comparing the individual decisions to the oracle decisions, and defining the loss to be the sum of incorrect decisions.\n",
"\n",
"The second is `AUTO_CONDITION_FEATURES`. This is a bit subtler. Later in the `_run` function, we will say that the label of the `n`th word depends on the label of the `n-1`th word. In order to get the underlying classifier to *pay attention* to that conditioning, we need to add features. We could do that manually (we'll do this later) or we can ask vw to do it automatically for us. For simplicity, we choose the latter.\n",
"\n",
@@ -778,4 +778,4 @@
"metadata": {}
}
]
-} \ No newline at end of file
+}
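
The notebook text in the hunk above explains AUTO_CONDITION_FEATURES: to let the underlying classifier exploit the dependence of label n on label n-1, the previous prediction must be fed back in as a feature. A generic C++ sketch of that idea, with hypothetical helpers — this is not VW's implementation, which hashes and conditions features inside the search framework:

    #include <cstddef>
    #include <vector>

    // Toy stand-in for whatever maps a (name, value) pair into feature space.
    inline unsigned hash_feature(const char* name, unsigned value)
    { unsigned h = value;
      while (*name) h = h * 31u + (unsigned)*name++;
      return h;
    }

    // Label a sentence left to right, conditioning decision n on decision n-1
    // by pushing the previous label in as an extra feature -- the step that
    // AUTO_CONDITION_FEATURES automates.
    std::vector<unsigned> label_sequence(
        const std::vector<std::vector<unsigned> >& word_feats,
        unsigned (*predict)(const std::vector<unsigned>&))
    { std::vector<unsigned> labels;
      for (size_t n = 0; n < word_feats.size(); n++)
      { std::vector<unsigned> feats = word_feats[n];
        if (n > 0)
          feats.push_back(hash_feature("prev_label", labels[n - 1]));
        labels.push_back(predict(feats));
      }
      return labels;
    }
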
diff --git a/test/RunTests b/test/RunTests
index 518f9df1..2d04a6f5 100755
--- a/test/RunTests
+++ b/test/RunTests
@@ -1077,3 +1077,12 @@ __DATA__
{VW} -d train-sets/0002.dat --autolink 1 --examples 100 -p 0002.autolink.predict
train-sets/ref/0002.autolink.stderr
train-sets/ref/0002.autolink.predict
+
+# Test 72: train FTRL-Proximal
+{VW} -k -d train-sets/0001.dat -f models/0001_ftrl.model --passes 1 --ftrl --ftrl_alpha 0.01 --ftrl_beta 0 --l1 2
+ train-sets/ref/0001_ftrl.stderr
+
+# Test 73: test FTRL-Proximal
+{VW} -k -t train-sets/0001.dat -i models/0001_ftrl.model -p ftrl_001.predict.tmp
+ test-sets/ref/0001_ftrl.stderr
+ pred-sets/ref/0001_ftrl.predict
diff --git a/test/daemon-test.sh b/test/daemon-test.sh
index 92267508..8c35fc1f 100755
--- a/test/daemon-test.sh
+++ b/test/daemon-test.sh
@@ -1,38 +1,16 @@
-#!/bin/sh
+#!/bin/bash
# -- vw daemon test
#
NAME='vw-daemon-test'
-# This is a ugly hack:
-# Travis doesn't like this test, possibly because of firewall rules
-# on the travis-ci env, so don't bother running it on travis machines.
-HOSTNAME=`hostname`
-case $HOSTNAME in
- *worker-linux*|*travis-ci.org)
- # Don't generate anything to STDERR or it'll fail
- : "travis host: $HOSTNAME detected, skipping test: $0"
- echo "$NAME: OK"
- exit 0
- ;;
-esac
-
export PATH="vowpalwabbit:../vowpalwabbit:${PATH}"
# The VW under test
VW=`which vw`
-#
-# VW=vw-7.20140627 Good
-# VW=vw-7.20140709 Bad
-#
-# 7e138ac19bb3e4be88201d521249d87f52e378f3 BAD
-# cad00a0dd558a34f210b712b34da26e31374b8b9 GOOD
-#
-
MODEL=$NAME.model
TRAINSET=$NAME.train
PREDREF=$NAME.predref
PREDOUT=$NAME.predict
-LOCALHOST=0
PORT=54248
# -- make sure we can find vw first
@@ -87,8 +65,8 @@ stop_daemon() {
start_daemon() {
# echo starting daemon
$DaemonCmd </dev/null >/dev/null &
- # give vw some time to load the model and be ready
- mysleep 0.05
+ # give it time to be ready
+ wait; wait; wait
}
cleanup() {
@@ -116,18 +94,20 @@ $VW -b 10 --quiet -d $TRAINSET -f $MODEL
start_daemon
-# Test on train-set, gnu netcat returns immediately, but OpenBSD netcat
-# hangs unless we use '-q 0' (which is GNU netcat incompatible)
-# Hacky solution is to start netcat in the background and wait for
-# it to output two lines.
-touch $PREDOUT # must exist
-$NETCAT -n $LOCALHOST $PORT < $TRAINSET >> $PREDOUT &
-
+# Test on train-set
+# OpenBSD netcat quits immediately after stdin EOF
+# nc.traditional does not, so let's use -q 1.
+#$NETCAT -q 1 localhost $PORT < $TRAINSET > $PREDOUT
+#wait
+# However, GNU netcat does not know -q, so let's do a work-around
+touch $PREDOUT
+$NETCAT localhost $PORT < $TRAINSET > $PREDOUT &
# Wait until we receive a prediction from the vw daemon, then kill netcat
-until [ `wc -l < $PREDOUT` -eq 2 ]; do mysleep 0.05; done
+until [ `wc -l < $PREDOUT` -eq 2 ]; do :; done
$PKILL -9 $NETCAT
-diff $PREDREF $PREDOUT
+# We should ignore small (< $Epsilon) floating-point differences (fuzzy compare)
+diff <(cut -c-5 $PREDREF) <(cut -c-5 $PREDOUT)
case $? in
0) echo "$NAME: OK"
cleanup
diff --git a/test/pred-sets/ref/0001_ftrl.predict b/test/pred-sets/ref/0001_ftrl.predict
new file mode 100644
index 00000000..b25c5f14
--- /dev/null
+++ b/test/pred-sets/ref/0001_ftrl.predict
@@ -0,0 +1,200 @@
+0.148139
+0.149886
+0.146102
+0.146261
+0.146004
+0.150632
+0.146453
+0.149597
+0.146937
+0.151828
+0.145943
+0.149081
+0.146763
+0.149353
+0.151105
+0.151723
+0.146879
+0.147548
+0.149135
+0.146180
+0.147393
+0.148680
+0.147577
+0.147491
+0.145806
+0.146659
+0.146162
+0.146864
+0.148203
+0.146874
+0.154894
+0.145806
+0.146263
+0.146565
+0.147723
+0.149017
+0.148744
+0.148413
+0.147824
+0.164194
+0.145806
+0.147537
+0.146497
+0.147713
+0.146387
+0.145806
+0.147941
+0.145994
+0.147120
+0.145839
+0.146759
+0.146780
+0.146082
+0.148553
+0.158496
+0.149754
+0.148530
+0.149789
+0.147992
+0.146164
+0.147383
+0.147015
+0.151542
+0.145806
+0.150415
+0.146394
+0.145806
+0.146673
+0.148820
+0.148958
+0.147902
+0.149351
+0.146609
+0.147084
+0.153530
+0.147889
+0.147304
+0.147790
+0.145806
+0.146484
+0.145951
+0.146190
+0.157696
+0.145881
+0.145916
+0.145806
+0.145841
+0.148447
+0.151770
+0.147781
+0.145867
+0.149931
+0.160551
+0.146358
+0.148946
+0.145857
+0.148735
+0.145806
+0.146633
+0.147461
+0.146732
+0.147819
+0.146551
+0.147912
+0.147477
+0.149064
+0.145985
+0.146388
+0.146095
+0.146254
+0.150747
+0.145985
+0.147522
+0.147671
+0.145806
+0.149411
+0.146787
+0.147408
+0.146400
+0.147492
+0.146148
+0.145990
+0.147976
+0.145956
+0.149059
+0.157779
+0.146221
+0.146270
+0.146986
+0.149931
+0.146057
+0.149236
+0.145806
+0.145806
+0.146095
+0.146229
+0.147008
+0.147346
+0.156727
+0.146704
+0.146059
+0.145983
+0.150402
+0.152494
+0.149853
+0.147685
+0.149237
+0.164478
+0.146583
+0.146944
+0.148574
+0.145806
+0.147213
+0.146807
+0.147313
+0.147939
+0.158719
+0.147603
+0.167189
+0.145958
+0.161452
+0.146334
+0.146107
+0.156393
+0.146060
+0.147053
+0.158706
+0.147079
+0.146230
+0.149644
+0.150721
+0.146386
+0.146222
+0.147244
+0.145906
+0.150127
+0.148533
+0.156075
+0.148192
+0.153242
+0.149490
+0.150472
+0.147509
+0.145806
+0.146288
+0.146351
+0.148459
+0.149844
+0.147283
+0.151779
+0.158779
+0.145806
+0.145806
+0.145913
+0.145806
+0.147957
+0.149762
+0.146178
+0.145888
+0.146106
diff --git a/test/test-sets/ref/0001_ftrl.stderr b/test/test-sets/ref/0001_ftrl.stderr
new file mode 100644
index 00000000..4d9c1c54
--- /dev/null
+++ b/test/test-sets/ref/0001_ftrl.stderr
@@ -0,0 +1,29 @@
+only testing
+Num weight bits = 18
+learning rate = 10
+initial_t = 1
+power_t = 0.5
+predictions = ftrl_001.predict.tmp
+using no cache
+Reading datafile = train-sets/0001.dat
+num sources = 1
+average since example example current current current
+loss last counter weight label predict features
+0.725668 0.725668 1 1.0 1.0000 0.1481 51
+0.374067 0.022466 2 2.0 0.0000 0.1499 104
+0.197718 0.021369 4 4.0 0.0000 0.1463 135
+0.197180 0.196643 8 8.0 0.0000 0.1496 146
+0.240375 0.283569 16 16.0 1.0000 0.1517 24
+0.262741 0.285108 32 32.0 0.0000 0.1458 32
+0.273560 0.284380 64 64.0 0.0000 0.1458 61
+0.312129 0.350697 128 128.0 1.0000 0.1463 106
+
+finished run
+number of examples per pass = 200
+passes used = 1
+weighted example sum = 200
+weighted label sum = 91
+average loss = 0.3403
+best constant = 0.455
+best constant's loss = 0.247975
+total feature number = 15482
diff --git a/test/train-sets/ref/0001_ftrl.stderr b/test/train-sets/ref/0001_ftrl.stderr
new file mode 100644
index 00000000..637e4c39
--- /dev/null
+++ b/test/train-sets/ref/0001_ftrl.stderr
@@ -0,0 +1,31 @@
+using l1 regularization = 2
+final_regressor = models/0001_ftrl.model
+Enabling FTRL-Proximal based optimization
+ftrl_alpha = 0.01
+ftrl_beta = 0
+Num weight bits = 18
+learning rate = 0.5
+initial_t = 0
+power_t = 0.5
+using no cache
+Reading datafile = train-sets/0001.dat
+num sources = 1
+average since example example current current current
+loss last counter weight label predict features
+1.000000 1.000000 1 1.0 1.0000 0.0000 51
+0.500000 0.000000 2 2.0 0.0000 0.0000 104
+0.250000 0.000000 4 4.0 0.0000 0.0000 135
+0.250012 0.250025 8 8.0 0.0000 0.0070 146
+0.308000 0.365988 16 16.0 1.0000 0.0173 24
+0.330613 0.353225 32 32.0 0.0000 0.0413 32
+0.334181 0.337749 64 64.0 0.0000 0.0670 61
+0.363877 0.393573 128 128.0 1.0000 0.1077 106
+
+finished run
+number of examples = 200
+weighted example sum = 200
+weighted label sum = 91
+average loss = 0.379717
+best constant = 0.455
+best constant's loss = 0.247975
+total feature number = 15482
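
The reference stderr above records the feature this merge introduces: "Enabling FTRL-Proximal based optimization" with --ftrl_alpha, --ftrl_beta and --l1, matching the new RunTests entries. For orientation, a minimal sketch of the standard per-coordinate FTRL-Proximal update (per McMahan et al.) that the new ftrl_proximal.cc implements a variant of; the names and layout below are illustrative, not the actual VW source:

    #include <cmath>

    // One FTRL-Proximal coordinate: z accumulates adjusted gradients,
    // n accumulates squared gradients.
    struct ftrl_coord { float z; float n; };

    // Standard update for one feature with value x and loss gradient g
    // (w.r.t. the prediction). alpha/beta correspond to --ftrl_alpha /
    // --ftrl_beta above; l1 is the L1 penalty (--l1).
    // Returns the weight that was used for this prediction.
    inline float ftrl_update(ftrl_coord& c, float g, float x,
                             float alpha, float beta, float l1)
    { float gi = g * x;                       // per-coordinate gradient
      // weight implied by (z, n); L1 soft-thresholding yields sparse weights
      float w = 0.f;
      if (std::fabs(c.z) > l1)
        w = -alpha * (c.z - (c.z > 0 ? l1 : -l1)) / (beta + std::sqrt(c.n));
      float sigma = (std::sqrt(c.n + gi * gi) - std::sqrt(c.n)) / alpha;
      c.z += gi - sigma * w;                  // proximal adjustment
      c.n += gi * gi;
      return w;
    }
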
diff --git a/test/train-sets/ref/rcv1_small.stderr b/test/train-sets/ref/rcv1_small.stderr
index 388df567..e5c59fef 100644
--- a/test/train-sets/ref/rcv1_small.stderr
+++ b/test/train-sets/ref/rcv1_small.stderr
@@ -17,13 +17,13 @@ num sources = 1
5 0.47993 0.00007 0.00448 0.617627 0.225808 0.71197 1.00000
6 0.47794 0.00001 0.00174 0.691628 0.377800 0.93214 1.00000
7 0.47685 0.00001 0.00041 0.606087 0.209707 0.18141 1.00000
- 8 0.47668 0.00000 0.00001 0.538841 0.077584 0.00292 1.00000
+ 8 0.47668 0.00000 0.00001 0.538842 0.077584 0.00292 1.00000
finished run
-number of examples = 8256
-weighted example sum = 8256
-weighted label sum = -686
-average loss = 0.458492
-best constant = -0.166566
-best constant's loss = 0.689691
-total feature number = 649768
+number of examples = 8000
+weighted example sum = 8000
+weighted label sum = -656
+average loss = 0.461878
+best constant = -0.164369
+best constant's loss = 0.689781
+total feature number = 629912
diff --git a/test/train-sets/ref/search_er.stderr b/test/train-sets/ref/search_er.stderr
index dafd80f6..27204d74 100644
--- a/test/train-sets/ref/search_er.stderr
+++ b/test/train-sets/ref/search_er.stderr
@@ -12,12 +12,11 @@ loss last counter output prefix output prefix p
2.000000 3.000000 2 [2 4 2 5 10 10 ] [4 4 4 10 10 10 ] 0 0 7 0 7 0.000000
2.875000 3.750000 4 [1 4 4 1 10 10 10 1..] [4 4 4 4 10 10 10 1..] 0 0 32 0 32 0.000000
1.437500 0.000000 8 [1 4 4 1 10 10 10 1..] [1 4 4 1 10 10 10 1..] 1 0 42 0 64 0.000001
-0.718750 0.000000 16 [1 4 4 1 10 10 10 1..] [1 4 4 1 10 10 10 1..] 3 0 52 0 128 0.000001
finished run
number of examples per pass = 4
-passes used = 6
-weighted example sum = 24
+passes used = 3
+weighted example sum = 12
weighted label sum = 0
-average loss = 0.479167
-total feature number = 649
+average loss = 0.958333
+total feature number = 522
diff --git a/test/train-sets/ref/zero.stderr b/test/train-sets/ref/zero.stderr
index ea8f11b1..afcf80f1 100644
--- a/test/train-sets/ref/zero.stderr
+++ b/test/train-sets/ref/zero.stderr
@@ -14,8 +14,8 @@ num sources = 1
1 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
finished run
-number of examples = 25
-weighted example sum = 25
+number of examples = 10
+weighted example sum = 10
weighted label sum = 0
average loss = 0
-total feature number = 15005
+total feature number = 6002
diff --git a/vowpalwabbit/Makefile b/vowpalwabbit/Makefile
index 7cf003ea..79893902 100644
--- a/vowpalwabbit/Makefile
+++ b/vowpalwabbit/Makefile
@@ -10,6 +10,7 @@ all:
test:
cd ..; $(MAKE) test
+
things: config.h $(BINARIES)
%.1: %
diff --git a/vowpalwabbit/Makefile.am b/vowpalwabbit/Makefile.am
index dd8a9787..676d6f4e 100644
--- a/vowpalwabbit/Makefile.am
+++ b/vowpalwabbit/Makefile.am
@@ -4,7 +4,7 @@ liballreduce_la_SOURCES = allreduce.cc
bin_PROGRAMS = vw active_interactor
-libvw_la_SOURCES = hash.cc memory.cc global_data.cc io_buf.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc rand48.cc simple_label.cc multiclass.cc oaa.cc ect.cc autolink.cc binary.cc lrq.cc cost_sensitive.cc csoaa.cc cb.cc cb_algs.cc search.cc search_sequencetask.cc search_dep_parser.cc search_hooktask.cc search_multiclasstask.cc search_entityrelationtask.cc parse_example.cc scorer.cc network.cc parse_args.cc accumulate.cc gd.cc learner.cc lda_core.cc gd_mf.cc mf.cc bfgs.cc noop.cc print.cc example.cc parser.cc loss_functions.cc sender.cc nn.cc bs.cc cbify.cc topk.cc stagewise_poly.cc log_multi.cc active.cc kernel_svm.cc best_constant.cc
+libvw_la_SOURCES = hash.cc memory.cc global_data.cc io_buf.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc rand48.cc simple_label.cc multiclass.cc oaa.cc ect.cc autolink.cc binary.cc lrq.cc cost_sensitive.cc csoaa.cc cb.cc cb_algs.cc search.cc search_sequencetask.cc search_dep_parser.cc search_hooktask.cc search_multiclasstask.cc search_entityrelationtask.cc parse_example.cc scorer.cc network.cc parse_args.cc accumulate.cc gd.cc learner.cc lda_core.cc gd_mf.cc mf.cc bfgs.cc noop.cc print.cc example.cc parser.cc loss_functions.cc sender.cc nn.cc bs.cc cbify.cc topk.cc stagewise_poly.cc log_multi.cc active.cc kernel_svm.cc best_constant.cc ftrl_proximal.cc
libvw_c_wrapper_la_SOURCES = vwdll.cpp
diff --git a/vowpalwabbit/accumulate.cc b/vowpalwabbit/accumulate.cc
index d6c5e71f..49d2e969 100644
--- a/vowpalwabbit/accumulate.cc
+++ b/vowpalwabbit/accumulate.cc
@@ -17,9 +17,7 @@ Alekh Agarwal and John Langford, with help Olivier Chapelle.
using namespace std;
-void add_float(float& c1, const float& c2) {
- c1 += c2;
-}
+void add_float(float& c1, const float& c2) { c1 += c2; }
void accumulate(vw& all, string master_location, regressor& reg, size_t o) {
uint32_t length = 1 << all.num_bits; //This is size of gradient
@@ -27,15 +25,11 @@ void accumulate(vw& all, string master_location, regressor& reg, size_t o) {
float* local_grad = new float[length];
weight* weights = reg.weight_vector;
for(uint32_t i = 0;i < length;i++)
- {
- local_grad[i] = weights[stride*i+o];
- }
+ local_grad[i] = weights[stride*i+o];
all_reduce<float, add_float>(local_grad, length, master_location, all.unique_id, all.total, all.node, all.socks);
for(uint32_t i = 0;i < length;i++)
- {
- weights[stride*i+o] = local_grad[i];
- }
+ weights[stride*i+o] = local_grad[i];
delete[] local_grad;
}
@@ -53,11 +47,11 @@ void accumulate_avg(vw& all, string master_location, regressor& reg, size_t o) {
float numnodes = (float)all.total;
for(uint32_t i = 0;i < length;i++)
- local_grad[i] = weights[stride*i+o];
+ local_grad[i] = weights[stride*i+o];
all_reduce<float, add_float>(local_grad, length, master_location, all.unique_id, all.total, all.node, all.socks);
for(uint32_t i = 0;i < length;i++)
- weights[stride*i+o] = local_grad[i]/numnodes;
+ weights[stride*i+o] = local_grad[i]/numnodes;
delete[] local_grad;
}
@@ -83,17 +77,14 @@ void accumulate_weighted_avg(vw& all, string master_location, regressor& reg) {
uint32_t length = 1 << all.num_bits; //This is the number of parameters
size_t stride = 1 << all.reg.stride_shift;
weight* weights = reg.weight_vector;
-
-
float* local_weights = new float[length];
for(uint32_t i = 0;i < length;i++)
local_weights[i] = weights[stride*i+1];
-
//First compute weights for averaging
all_reduce<float, add_float>(local_weights, length, master_location, all.unique_id, all.total, all.node, all.socks);
-
+
for(uint32_t i = 0;i < length;i++) //Compute weighted versions
if(local_weights[i] > 0) {
float ratio = weights[stride*i+1]/local_weights[i];
@@ -107,7 +98,7 @@ void accumulate_weighted_avg(vw& all, string master_location, regressor& reg) {
local_weights[i] = 0;
weights[stride*i] = 0;
}
-
+
all_reduce<float, add_float>(weights, length*stride, master_location, all.unique_id, all.total, all.node, all.socks);
delete[] local_weights;
diff --git a/vowpalwabbit/active.cc b/vowpalwabbit/active.cc
index 78e8e0e1..a1070be3 100644
--- a/vowpalwabbit/active.cc
+++ b/vowpalwabbit/active.cc
@@ -45,7 +45,7 @@ namespace ACTIVE {
}
template <bool is_learn>
- void predict_or_learn_simulation(active& a, learner& base, example& ec) {
+ void predict_or_learn_simulation(active& a, base_learner& base, example& ec) {
base.predict(ec);
if (is_learn)
@@ -67,7 +67,7 @@ namespace ACTIVE {
}
template <bool is_learn>
- void predict_or_learn_active(active& a, learner& base, example& ec) {
+ void predict_or_learn_active(active& a, base_learner& base, example& ec) {
if (is_learn)
base.learn(ec);
else
@@ -151,35 +151,31 @@ namespace ACTIVE {
VW::finish_example(all,&ec);
}
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{//parse and set arguments
- active* data = (active*)calloc_or_die(1, sizeof(active));
+ active& data = calloc_or_die<active>();
po::options_description active_opts("Active Learning options");
active_opts.add_options()
("simulation", "active learning simulation mode")
- ("mellowness", po::value<float>(&(data->active_c0)), "active learning mellowness parameter c_0. Default 8")
+ ("mellowness", po::value<float>(&(data.active_c0)), "active learning mellowness parameter c_0. Default 8")
;
-
vm = add_options(all, active_opts);
-
- data->all=&all;
+ data.all=&all;
//Create new learner
- learner* ret = new learner(data, all.l);
+ learner<active>* ret;
if (vm.count("simulation"))
- {
- ret->set_learn<active, predict_or_learn_simulation<true> >();
- ret->set_predict<active, predict_or_learn_simulation<false> >();
- }
+ ret = &init_learner(&data, all.l, predict_or_learn_simulation<true>,
+ predict_or_learn_simulation<false>);
else
{
all.active = true;
- ret->set_learn<active, predict_or_learn_active<true> >();
- ret->set_predict<active, predict_or_learn_active<false> >();
- ret->set_finish_example<active, return_active_example>();
+ ret = &init_learner(&data, all.l, predict_or_learn_active<true>,
+ predict_or_learn_active<false>);
+ ret->set_finish_example(return_active_example);
}
- return ret;
+ return make_base(*ret);
}
}
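
The active.cc rewrite above is the pattern this merge applies to every reduction (autolink, binary, bs, cb_algs, cbify, csoaa, ect, ...): instead of new-ing a bare learner and calling set_learn/set_predict member templates, a reduction takes typed storage from calloc_or_die<T>(), wires its callbacks through init_learner(...), and returns a type-erased base_learner* via make_base(...). A condensed sketch of the resulting shape, using a hypothetical my_reduction rather than any file in this merge:

    #include "reductions.h"  // assumed to provide init_learner, make_base, calloc_or_die

    namespace MY_REDUCTION {
      struct my_reduction { vw* all; };

      template <bool is_learn>
      void predict_or_learn(my_reduction& d, LEARNER::base_learner& base, example& ec)
      { if (is_learn) base.learn(ec);
        else base.predict(ec);
      }

      LEARNER::base_learner* setup(vw& all, po::variables_map& vm)
      { my_reduction& data = calloc_or_die<my_reduction>();  // typed, zeroed allocation
        data.all = &all;
        // init_learner binds the typed callbacks to the data blob and the base stack
        LEARNER::learner<my_reduction>& l =
          init_learner(&data, all.l, predict_or_learn<true>, predict_or_learn<false>);
        return make_base(l);  // type-erase for the reduction stack
      }
    }
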
diff --git a/vowpalwabbit/active.h b/vowpalwabbit/active.h
index e8883ae9..d71950ab 100644
--- a/vowpalwabbit/active.h
+++ b/vowpalwabbit/active.h
@@ -1,4 +1,4 @@
#pragma once
namespace ACTIVE {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/active_interactor.cc b/vowpalwabbit/active_interactor.cc
index 1f67fa69..ad4f5869 100644
--- a/vowpalwabbit/active_interactor.cc
+++ b/vowpalwabbit/active_interactor.cc
@@ -19,12 +19,6 @@ license as described in the file LICENSE.
#include <netdb.h>
#endif
-using std::cin;
-using std::endl;
-using std::cout;
-using std::cerr;
-using std::string;
-
using namespace std;
int open_socket(const char* host, unsigned short port)
diff --git a/vowpalwabbit/autolink.cc b/vowpalwabbit/autolink.cc
index 4eebe6d4..7cdaecef 100644
--- a/vowpalwabbit/autolink.cc
+++ b/vowpalwabbit/autolink.cc
@@ -1,8 +1,6 @@
#include "reductions.h"
#include "simple_label.h"
-using namespace LEARNER;
-
namespace ALINK {
const int autoconstant = 524267083;
@@ -11,12 +9,11 @@ namespace ALINK {
uint32_t stride_shift;
};
- template <bool is_learn>
- void predict_or_learn(autolink& b, learner& base, example& ec)
+ template <bool is_learn>
+ void predict_or_learn(autolink& b, LEARNER::base_learner& base, example& ec)
{
base.predict(ec);
float base_pred = ec.pred.scalar;
-
// add features of label
ec.indices.push_back(autolink_namespace);
float sum_sq = 0;
@@ -30,7 +27,6 @@ namespace ALINK {
}
ec.total_sum_feat_sq += sum_sq;
- // apply predict or learn
if (is_learn)
base.learn(ec);
else
@@ -41,19 +37,16 @@ namespace ALINK {
ec.total_sum_feat_sq -= sum_sq;
}
- learner* setup(vw& all, po::variables_map& vm)
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm)
{
- autolink* data = (autolink*)calloc_or_die(1,sizeof(autolink));
- data->d = (uint32_t)vm["autolink"].as<size_t>();
- data->stride_shift = all.reg.stride_shift;
+ autolink& data = calloc_or_die<autolink>();
+ data.d = (uint32_t)vm["autolink"].as<size_t>();
+ data.stride_shift = all.reg.stride_shift;
- std::stringstream ss;
- ss << " --autolink " << data->d;
- all.file_options = all.file_options+ss.str();
+ *all.file_options << " --autolink " << data.d;
- learner* ret = new learner(data, all.l);
- ret->set_learn<autolink, predict_or_learn<true> >();
- ret->set_predict<autolink, predict_or_learn<false> >();
- return ret;
+ LEARNER::learner<autolink>& ret = init_learner(&data, all.l, predict_or_learn<true>,
+ predict_or_learn<false>);
+ return make_base(ret);
}
}
diff --git a/vowpalwabbit/autolink.h b/vowpalwabbit/autolink.h
index 830c0088..3bb70dc1 100644
--- a/vowpalwabbit/autolink.h
+++ b/vowpalwabbit/autolink.h
@@ -1,4 +1,4 @@
#pragma once
namespace ALINK {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/best_constant.cc b/vowpalwabbit/best_constant.cc
index 3642a014..c56cb336 100644
--- a/vowpalwabbit/best_constant.cc
+++ b/vowpalwabbit/best_constant.cc
@@ -76,13 +76,11 @@ bool get_best_constant(vw& all, float& best_constant, float& best_constant_loss)
} else
return false;
-
if (!is_more_than_two_labels_observed)
best_constant_loss = ( all.loss->getLoss(all.sd, best_constant, label1) * label1_cnt +
all.loss->getLoss(all.sd, best_constant, label2) * label2_cnt )
/ (label1_cnt + label2_cnt);
else best_constant_loss = FLT_MIN;
-
return true;
}
diff --git a/vowpalwabbit/bfgs.cc b/vowpalwabbit/bfgs.cc
index b964c218..fc4ee851 100644
--- a/vowpalwabbit/bfgs.cc
+++ b/vowpalwabbit/bfgs.cc
@@ -476,7 +476,7 @@ void preconditioner_to_regularizer(vw& all, bfgs& b, float regularization)
weight* weights = all.reg.weight_vector;
if (b.regularizers == NULL)
{
- b.regularizers = (weight *)calloc_or_die(2*length, sizeof(weight));
+ b.regularizers = calloc_or_die<weight>(2*length);
if (b.regularizers == NULL)
{
@@ -818,13 +818,13 @@ void end_pass(bfgs& b)
}
// placeholder
-void predict(bfgs& b, learner& base, example& ec)
+void predict(bfgs& b, base_learner& base, example& ec)
{
vw* all = b.all;
ec.pred.scalar = bfgs_predict(*all,ec);
}
-void learn(bfgs& b, learner& base, example& ec)
+void learn(bfgs& b, base_learner& base, example& ec)
{
vw* all = b.all;
assert(ec.in_use);
@@ -906,7 +906,7 @@ void save_load(bfgs& b, io_buf& model_file, bool read, bool text)
initialize_regressor(*all);
if (all->per_feature_regularizer_input != "")
{
- b.regularizers = (weight *)calloc_or_die(2*length, sizeof(weight));
+ b.regularizers = calloc_or_die<weight>(2*length);
if (b.regularizers == NULL)
{
cerr << all->program_name << ": Failed to allocate regularizers array: try decreasing -b <bits>" << endl;
@@ -965,19 +965,19 @@ void save_load(bfgs& b, io_buf& model_file, bool read, bool text)
b.backstep_on = true;
}
-learner* setup(vw& all, po::variables_map& vm)
+base_learner* setup(vw& all, po::variables_map& vm)
{
- bfgs* b = (bfgs*)calloc_or_die(1,sizeof(bfgs));
- b->all = &all;
- b->wolfe1_bound = 0.01;
- b->first_hessian_on=true;
- b->first_pass = true;
- b->gradient_pass = true;
- b->preconditioner_pass = true;
- b->backstep_on = false;
- b->final_pass=all.numpasses;
- b->no_win_counter = 0;
- b->early_stop_thres = 3;
+ bfgs& b = calloc_or_die<bfgs>();
+ b.all = &all;
+ b.wolfe1_bound = 0.01;
+ b.first_hessian_on=true;
+ b.first_pass = true;
+ b.gradient_pass = true;
+ b.preconditioner_pass = true;
+ b.backstep_on = false;
+ b.final_pass=all.numpasses;
+ b.no_win_counter = 0;
+ b.early_stop_thres = 3;
po::options_description bfgs_opts("LBFGS options");
@@ -993,7 +993,7 @@ learner* setup(vw& all, po::variables_map& vm)
{
all.sd->holdout_best_loss = FLT_MAX;
if(vm.count("early_terminate"))
- b->early_stop_thres = vm["early_terminate"].as< size_t>();
+ b.early_stop_thres = vm["early_terminate"].as< size_t>();
}
if (vm.count("hessian_on") || all.m==0) {
@@ -1018,14 +1018,13 @@ learner* setup(vw& all, po::variables_map& vm)
all.bfgs = true;
all.reg.stride_shift = 2;
- learner* l = new learner(b, 1 << all.reg.stride_shift);
- l->set_learn<bfgs, learn>();
- l->set_predict<bfgs, predict>();
- l->set_save_load<bfgs,save_load>();
- l->set_init_driver<bfgs,init_driver>();
- l->set_end_pass<bfgs,end_pass>();
- l->set_finish<bfgs,finish>();
+ learner<bfgs>& l = init_learner(&b, learn, 1 << all.reg.stride_shift);
+ l.set_predict(predict);
+ l.set_save_load(save_load);
+ l.set_init_driver(init_driver);
+ l.set_end_pass(end_pass);
+ l.set_finish(finish);
- return l;
+ return make_base(l);
}
}
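
bfgs.cc above swaps the old (weight*)calloc_or_die(nmemb, size) casts for a templated helper (memory.cc and memory.h also change in this merge's diffstat). Judging only from the call sites — calloc_or_die<weight>(2*length) yielding a pointer, and bfgs& b = calloc_or_die<bfgs>() yielding a reference — the helper plausibly looks like the following sketch; this is inferred from usage, not copied from memory.h:

    #include <cstddef>
    #include <cstdlib>
    #include <iostream>

    // Pointer form: a zeroed array of nmemb T's, aborting on allocation failure.
    template <class T>
    T* calloc_or_die(size_t nmemb)
    { void* data = calloc(nmemb, sizeof(T));
      if (data == NULL && nmemb > 0)
      { std::cerr << "internal error: memory allocation failed" << std::endl;
        std::exit(1);
      }
      return (T*)data;
    }

    // Reference form for single structs, as in: bfgs& b = calloc_or_die<bfgs>();
    template <class T>
    T& calloc_or_die()
    { return *calloc_or_die<T>(1); }
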
diff --git a/vowpalwabbit/bfgs.h b/vowpalwabbit/bfgs.h
index 2f30dcb9..1960662b 100644
--- a/vowpalwabbit/bfgs.h
+++ b/vowpalwabbit/bfgs.h
@@ -5,5 +5,5 @@ license as described in the file LICENSE.
*/
#pragma once
namespace BFGS {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/binary.cc b/vowpalwabbit/binary.cc
index 3f78e789..04810e26 100644
--- a/vowpalwabbit/binary.cc
+++ b/vowpalwabbit/binary.cc
@@ -1,13 +1,11 @@
+#include <float.h>
#include "reductions.h"
#include "multiclass.h"
#include "simple_label.h"
-using namespace LEARNER;
-
namespace BINARY {
-
template <bool is_learn>
- void predict_or_learn(float&, learner& base, example& ec) {
+ void predict_or_learn(char&, LEARNER::base_learner& base, example& ec) {
if (is_learn)
base.learn(ec);
else
@@ -18,19 +16,22 @@ namespace BINARY {
else
ec.pred.scalar = -1;
- if (ec.l.simple.label == ec.pred.scalar)
- ec.loss = 0.;
- else
- ec.loss = ec.l.simple.weight;
+ if (ec.l.simple.label != FLT_MAX)
+ {
+ if (fabs(ec.l.simple.label) != 1.f)
+ cout << "You are using a label not -1 or 1 with a loss function expecting that!" << endl;
+ else
+ if (ec.l.simple.label == ec.pred.scalar)
+ ec.loss = 0.;
+ else
+ ec.loss = ec.l.simple.weight;
+ }
}
- learner* setup(vw& all, po::variables_map& vm)
- {//parse and set arguments
- all.sd->binary_label = true;
- //Create new learner
- learner* ret = new learner(NULL, all.l);
- ret->set_learn<float, predict_or_learn<true> >();
- ret->set_predict<float, predict_or_learn<false> >();
- return ret;
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm)
+ {
+ LEARNER::learner<char>& ret =
+ LEARNER::init_learner<char>(NULL, all.l, predict_or_learn<true>, predict_or_learn<false>);
+ return make_base(ret);
}
}
diff --git a/vowpalwabbit/binary.h b/vowpalwabbit/binary.h
index 2df4f2e0..609de90b 100644
--- a/vowpalwabbit/binary.h
+++ b/vowpalwabbit/binary.h
@@ -1,4 +1,4 @@
#pragma once
namespace BINARY {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/bs.cc b/vowpalwabbit/bs.cc
index 8c2ca4b9..78828632 100644
--- a/vowpalwabbit/bs.cc
+++ b/vowpalwabbit/bs.cc
@@ -180,7 +180,7 @@ namespace BS {
}
template <bool is_learn>
- void predict_or_learn(bs& d, learner& base, example& ec)
+ void predict_or_learn(bs& d, base_learner& base, example& ec)
{
vw* all = d.all;
bool shouldOutput = all->raw_prediction > 0;
@@ -239,11 +239,11 @@ namespace BS {
d.pred_vec.~vector();
}
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{
- bs* data = (bs*)calloc_or_die(1, sizeof(bs));
- data->ub = FLT_MAX;
- data->lb = -FLT_MAX;
+ bs& data = calloc_or_die<bs>();
+ data.ub = FLT_MAX;
+ data.lb = -FLT_MAX;
po::options_description bs_options("Bootstrap options");
bs_options.add_options()
@@ -251,12 +251,10 @@ namespace BS {
vm = add_options(all, bs_options);
- data->B = (uint32_t)vm["bootstrap"].as<size_t>();
+ data.B = (uint32_t)vm["bootstrap"].as<size_t>();
//append bs with number of samples to options_from_file so it is saved to regressor later
- std::stringstream ss;
- ss << " --bootstrap " << data->B;
- all.file_options.append(ss.str());
+ *all.file_options << " --bootstrap " << data.B;
std::string type_string("mean");
@@ -265,30 +263,28 @@ namespace BS {
type_string = vm["bs_type"].as<std::string>();
if (type_string.compare("mean") == 0) {
- data->bs_type = BS_TYPE_MEAN;
+ data.bs_type = BS_TYPE_MEAN;
}
else if (type_string.compare("vote") == 0) {
- data->bs_type = BS_TYPE_VOTE;
+ data.bs_type = BS_TYPE_VOTE;
}
else {
std::cerr << "warning: bs_type must be in {'mean','vote'}; resetting to mean." << std::endl;
- data->bs_type = BS_TYPE_MEAN;
+ data.bs_type = BS_TYPE_MEAN;
}
}
else //by default use mean
- data->bs_type = BS_TYPE_MEAN;
- all.file_options.append(" --bs_type ");
- all.file_options.append(type_string);
+ data.bs_type = BS_TYPE_MEAN;
+ *all.file_options << " --bs_type " << type_string;
- data->pred_vec.reserve(data->B);
- data->all = &all;
+ data.pred_vec.reserve(data.B);
+ data.all = &all;
- learner* l = new learner(data, all.l, data->B);
- l->set_learn<bs, predict_or_learn<true> >();
- l->set_predict<bs, predict_or_learn<false> >();
- l->set_finish_example<bs,finish_example>();
- l->set_finish<bs,finish>();
+ learner<bs>& l = init_learner(&data, all.l, predict_or_learn<true>,
+ predict_or_learn<false>, data.B);
+ l.set_finish_example(finish_example);
+ l.set_finish(finish);
- return l;
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/bs.h b/vowpalwabbit/bs.h
index d41593ef..e05bcd05 100644
--- a/vowpalwabbit/bs.h
+++ b/vowpalwabbit/bs.h
@@ -11,7 +11,7 @@ license as described in the file LICENSE.
namespace BS
{
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
void print_result(int f, float res, float weight, v_array<char> tag, float lb, float ub);
void output_example(vw& all, example* ec, float lb, float ub);
diff --git a/vowpalwabbit/cache.cc b/vowpalwabbit/cache.cc
index f91eba3c..9881e7f8 100644
--- a/vowpalwabbit/cache.cc
+++ b/vowpalwabbit/cache.cc
@@ -7,8 +7,6 @@ license as described in the file LICENSE.
#include "unique_sort.h"
#include "global_data.h"
-using namespace std;
-
const size_t neg_1 = 1;
const size_t general = 2;
@@ -40,9 +38,7 @@ size_t read_cached_tag(io_buf& cache, example* ae)
return tag_size+sizeof(tag_size);
}
-struct one_float {
- float f;
-}
+struct one_float { float f; }
#ifndef _WIN32
__attribute__((packed))
#endif
diff --git a/vowpalwabbit/cb_algs.cc b/vowpalwabbit/cb_algs.cc
index 4550efaa..bf9a6e5b 100644
--- a/vowpalwabbit/cb_algs.cc
+++ b/vowpalwabbit/cb_algs.cc
@@ -30,9 +30,6 @@ namespace CB_ALGS
float last_pred_reg;
float last_correct_cost;
- float min_cost;
- float max_cost;
-
cb_class* known_cost;
vw* all;
};
@@ -245,70 +242,12 @@ namespace CB_ALGS
gen_cs_label<is_learn>(all, c, ec, cs_ld, cl->action);
}
- void cb_test_to_cs_test_label(vw& all, example& ec, COST_SENSITIVE::label& cs_ld)
- {
- CB::label ld = ec.l.cb;
-
- cs_ld.costs.erase();
- if(ld.costs.size() > 0)
- {
- //if this is a test example and we specified actions, this means we are only allowed to perform these actions, so copy all actions with their specified costs
- for( cb_class* cl = ld.costs.begin; cl != ld.costs.end; cl++)
- {
- COST_SENSITIVE::wclass wc;
- wc.wap_value = 0.;
-
- wc.x = cl->cost;
- wc.class_index = cl->action;
- wc.partial_prediction = 0.;
- wc.wap_value = 0.;
-
- cs_ld.costs.push_back(wc);
- }
- }
- else
- {
- for (uint32_t i = 0; i < all.sd->k; i++)
- {
- COST_SENSITIVE::wclass wc;
- wc.wap_value = 0.;
-
- wc.x = FLT_MAX;
- wc.class_index = i+1;
- wc.partial_prediction = 0.;
- wc.wap_value = 0.;
-
- cs_ld.costs.push_back(wc);
- }
- }
- }
-
template <bool is_learn>
- void predict_or_learn(cb& c, learner& base, example& ec) {
+ void predict_or_learn(cb& c, base_learner& base, example& ec) {
vw* all = c.all;
CB::label ld = ec.l.cb;
- //check if this is a test example where we just want a prediction
- if( !is_learn )
- {
- //if so just query base cost-sensitive learner
- cb_test_to_cs_test_label(*all,ec,c.cb_cs_ld);
-
- ec.l.cs = c.cb_cs_ld;
- base.predict(ec);
- for (size_t i=0; i<ld.costs.size(); i++)
- ld.costs[i].partial_prediction = c.cb_cs_ld.costs[i].partial_prediction;
-
- ec.l.cb = ld;
-
- return;
- }
-
- //now this is a training example
- c.known_cost = get_observed_cost(ld);
- c.min_cost = min (c.min_cost, c.known_cost->cost);
- c.max_cost = max (c.max_cost, c.known_cost->cost);
-
+ c.known_cost = get_observed_cost(ld);
//generate a cost-sensitive example to update classifiers
switch(c.cb_type)
{
@@ -341,12 +280,12 @@ namespace CB_ALGS
}
}
- void predict_eval(cb& c, learner& base, example& ec) {
+ void predict_eval(cb& c, base_learner& base, example& ec) {
cout << "can not use a test label for evaluation" << endl;
throw exception();
}
- void learn_eval(cb& c, learner& base, example& ec) {
+ void learn_eval(cb& c, base_learner& base, example& ec) {
vw* all = c.all;
CB_EVAL::label ld = ec.l.cb_eval;
@@ -428,6 +367,7 @@ namespace CB_ALGS
float loss = 0.;
if (!is_test_label(ld))
{//need to compute exact loss
+ c.known_cost = get_observed_cost(ld);
float chosen_loss = FLT_MAX;
if( know_all_cost_example(ld) ) {
for (cb_class *cl = ld.costs.begin; cl != ld.costs.end; cl ++) {
@@ -496,12 +436,10 @@ namespace CB_ALGS
VW::finish_example(all, &ec);
}
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{
- cb* c = (cb*)calloc_or_die(1, sizeof(cb));
- c->all = &all;
- c->min_cost = 0.;
- c->max_cost = 1.;
+ cb& c = calloc_or_die<cb>();
+ c.all = &all;
uint32_t nb_actions = (uint32_t)vm["cb"].as<size_t>();
//append cb with nb_actions to file_options so it is saved to regressor later
@@ -514,9 +452,7 @@ namespace CB_ALGS
vm = add_options(all, cb_opts);
- std::stringstream ss;
- ss << " --cb " << nb_actions;
- all.file_options.append(ss.str());
+ *all.file_options << " --cb " << nb_actions;
all.sd->k = nb_actions;
@@ -530,12 +466,10 @@ namespace CB_ALGS
std::string type_string;
type_string = vm["cb_type"].as<std::string>();
+ *all.file_options << " --cb_type " << type_string;
- all.file_options.append(" --cb_type ");
- all.file_options.append(type_string);
-
if (type_string.compare("dr") == 0)
- c->cb_type = CB_TYPE_DR;
+ c.cb_type = CB_TYPE_DR;
else if (type_string.compare("dm") == 0)
{
if (eval)
@@ -543,23 +477,23 @@ namespace CB_ALGS
cout << "direct method can not be used for evaluation --- it is biased." << endl;
throw exception();
}
- c->cb_type = CB_TYPE_DM;
+ c.cb_type = CB_TYPE_DM;
problem_multiplier = 1;
}
else if (type_string.compare("ips") == 0)
{
- c->cb_type = CB_TYPE_IPS;
+ c.cb_type = CB_TYPE_IPS;
problem_multiplier = 1;
}
else {
std::cerr << "warning: cb_type must be in {'ips','dm','dr'}; resetting to dr." << std::endl;
- c->cb_type = CB_TYPE_DR;
+ c.cb_type = CB_TYPE_DR;
}
}
else {
//by default use doubly robust
- c->cb_type = CB_TYPE_DR;
- all.file_options.append(" --cb_type dr");
+ c.cb_type = CB_TYPE_DR;
+ *all.file_options << " --cb_type dr";
}
if (eval)
@@ -567,25 +501,24 @@ namespace CB_ALGS
else
all.p->lp = CB::cb_label;
- learner* l = new learner(c, all.l, problem_multiplier);
+ learner<cb>* l;
if (eval)
{
- l->set_learn<cb, learn_eval>();
- l->set_predict<cb, predict_eval>();
- l->set_finish_example<cb,eval_finish_example>();
+ l = &init_learner(&c, all.l, learn_eval, predict_eval, problem_multiplier);
+ l->set_finish_example(eval_finish_example);
}
else
{
- l->set_learn<cb, predict_or_learn<true> >();
- l->set_predict<cb, predict_or_learn<false> >();
- l->set_finish_example<cb,finish_example>();
+ l = &init_learner(&c, all.l, predict_or_learn<true>, predict_or_learn<false>,
+ problem_multiplier);
+ l->set_finish_example(finish_example);
}
- l->set_init_driver<cb,init_driver>();
- l->set_finish<cb,finish>();
// preserve the increment of the base learner since we are
// _adding_ to the number of problems rather than multiplying.
l->increment = all.l->increment;
-
- return l;
+
+ l->set_init_driver(init_driver);
+ l->set_finish(finish);
+ return make_base(*l);
}
}
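
cb_algs.cc retains the three cost estimators selected by --cb_type: ips, dm and dr, with dr the default and dm refused for evaluation because it is biased. As a reference point for those names, a minimal generic sketch of the inverse-propensity-score (ips) estimate — an illustration of the technique, not the code in cb_algs.cc:

    // IPS estimate of the cost of action a from one logged interaction:
    // the action the logging policy took, its observed cost, and the
    // probability with which the logging policy took it.
    inline float ips_cost(unsigned a, unsigned logged_action,
                          float observed_cost, float logging_prob)
    { if (a == logged_action)
        return observed_cost / logging_prob;  // importance-weight the observed cost
      return 0.f;  // unobserved actions: estimate 0, unbiased in expectation
    }
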
diff --git a/vowpalwabbit/cb_algs.h b/vowpalwabbit/cb_algs.h
index 68da7610..e989b5a1 100644
--- a/vowpalwabbit/cb_algs.h
+++ b/vowpalwabbit/cb_algs.h
@@ -7,7 +7,7 @@ license as described in the file LICENSE.
//TODO: extend to handle CSOAA_LDF and WAP_LDF
namespace CB_ALGS {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
template <bool is_learn>
float get_cost_pred(vw& all, CB::cb_class* known_cost, example& ec, uint32_t index, uint32_t base)
diff --git a/vowpalwabbit/cbify.cc b/vowpalwabbit/cbify.cc
index d29174d9..d8176228 100644
--- a/vowpalwabbit/cbify.cc
+++ b/vowpalwabbit/cbify.cc
@@ -83,7 +83,7 @@ namespace CBIFY {
COST_SENSITIVE::label cs_label;
COST_SENSITIVE::label second_cs_label;
- learner* cs;
+ base_learner* cs;
vw* all;
unique_ptr<vw_policy> policy;
@@ -106,7 +106,7 @@ namespace CBIFY {
}
struct vw_context {
- learner* l;
+ base_learner* l;
example* e;
cbify* data;
bool recorded;
@@ -158,7 +158,7 @@ namespace CBIFY {
}
template <bool is_learn>
- void predict_or_learn_first(cbify& data, learner& base, example& ec)
+ void predict_or_learn_first(cbify& data, base_learner& base, example& ec)
{//Explore tau times, then act according to optimal.
MULTICLASS::multiclass ld = ec.l.multi;
@@ -186,7 +186,7 @@ namespace CBIFY {
}
template <bool is_learn>
- void predict_or_learn_greedy(cbify& data, learner& base, example& ec)
+ void predict_or_learn_greedy(cbify& data, base_learner& base, example& ec)
{//Explore uniform random an epsilon fraction of the time.
MULTICLASS::multiclass ld = ec.l.multi;
@@ -213,7 +213,7 @@ namespace CBIFY {
}
template <bool is_learn>
- void predict_or_learn_bag(cbify& data, learner& base, example& ec)
+ void predict_or_learn_bag(cbify& data, base_learner& base, example& ec)
{//Randomize over predictions from a base set of predictors
//Use CB to find current predictions.
MULTICLASS::multiclass ld = ec.l.multi;
@@ -284,7 +284,7 @@ namespace CBIFY {
}
template <bool is_learn>
- void predict_or_learn_cover(cbify& data, learner& base, example& ec)
+ void predict_or_learn_cover(cbify& data, base_learner& base, example& ec)
{//Randomize over predictions from a base set of predictors
//Use cost sensitive oracle to cover actions to form distribution.
MULTICLASS::multiclass ld = ec.l.multi;
@@ -366,22 +366,16 @@ namespace CBIFY {
void init_driver(cbify&) {}
- void finish_example(vw& all, cbify&, example& ec)
- {
- MULTICLASS::output_example(all, ec);
- VW::finish_example(all, &ec);
- }
+ void finish_example(vw& all, cbify&, example& ec) { MULTICLASS::finish_example(all, ec); }
void finish(cbify& data)
- {
- CB::cb_label.delete_label(&data.cb_label);
- }
+ { CB::cb_label.delete_label(&data.cb_label); }
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{//parse and set arguments
- cbify* data = (cbify*)calloc_or_die(1, sizeof(cbify));
+ cbify& data = calloc_or_die<cbify>();
- data->all = &all;
+ data.all = &all;
po::options_description cb_opts("CBIFY options");
cb_opts.add_options()
("first", po::value<size_t>(), "tau-first exploration")
@@ -391,69 +385,61 @@ namespace CBIFY {
vm = add_options(all, cb_opts);
- data->k = (uint32_t)vm["cbify"].as<size_t>();
-
- //appends nb_actions to options_from_file so it is saved to regressor later
- std::stringstream ss;
- ss << " --cbify " << data->k;
- all.file_options.append(ss.str());
+ data.k = (uint32_t)vm["cbify"].as<size_t>();
+ *all.file_options << " --cbify " << data.k;
all.p->lp = MULTICLASS::mc_label;
- learner* l;
- data->recorder.reset(new vw_recorder());
- data->mwt_explorer.reset(new MwtExplorer<vw_context>("vw", *data->recorder.get()));
+ learner<cbify>* l;
+ data.recorder.reset(new vw_recorder());
+ data.mwt_explorer.reset(new MwtExplorer<vw_context>("vw", *data.recorder.get()));
if (vm.count("cover"))
{
size_t cover = (uint32_t)vm["cover"].as<size_t>();
- data->cs = all.cost_sensitive;
- data->second_cs_label.costs.resize(data->k);
- data->second_cs_label.costs.end = data->second_cs_label.costs.begin+data->k;
- float epsilon = 0.05f;
- if (vm.count("epsilon"))
- epsilon = vm["epsilon"].as<float>();
- data->scorer.reset(new vw_cover_scorer(epsilon, cover, (u32)data->k));
- data->generic_explorer.reset(new GenericExplorer<vw_context>(*data->scorer.get(), (u32)data->k));
- l = new learner(data, all.l, cover + 1);
- l->set_learn<cbify, predict_or_learn_cover<true> >();
- l->set_predict<cbify, predict_or_learn_cover<false> >();
+ data.cs = all.cost_sensitive;
+ data.second_cs_label.costs.resize(data.k);
+ data.second_cs_label.costs.end = data.second_cs_label.costs.begin+data.k;
+ float epsilon = 0.05f;
+ if (vm.count("epsilon"))
+ epsilon = vm["epsilon"].as<float>();
+ data.scorer.reset(new vw_cover_scorer(epsilon, cover, (u32)data.k));
+ data.generic_explorer.reset(new GenericExplorer<vw_context>(*data.scorer.get(), (u32)data.k));
+ l = &init_learner(&data, all.l, predict_or_learn_cover<true>,
+ predict_or_learn_cover<false>, cover + 1);
}
else if (vm.count("bag"))
{
size_t bags = (uint32_t)vm["bag"].as<size_t>();
- for (size_t i = 0; i < bags; i++)
- {
- data->policies.push_back(unique_ptr<IPolicy<vw_context>>(new vw_policy(i)));
- }
- data->bootstrap_explorer.reset(new BootstrapExplorer<vw_context>(data->policies, (u32)data->k));
- l = new learner(data, all.l, bags);
- l->set_learn<cbify, predict_or_learn_bag<true> >();
- l->set_predict<cbify, predict_or_learn_bag<false> >();
+ for (size_t i = 0; i < bags; i++)
+ {
+ data.policies.push_back(unique_ptr<IPolicy<vw_context>>(new vw_policy(i)));
+ }
+ data.bootstrap_explorer.reset(new BootstrapExplorer<vw_context>(data.policies, (u32)data.k));
+ l = &init_learner(&data, all.l, predict_or_learn_bag<true>,
+ predict_or_learn_bag<false>, bags);
}
else if (vm.count("first") )
{
- uint32_t tau = (uint32_t)vm["first"].as<size_t>();
- data->policy.reset(new vw_policy());
- data->tau_explorer.reset(new TauFirstExplorer<vw_context>(*data->policy.get(), (u32)tau, (u32)data->k));
- l = new learner(data, all.l, 1);
- l->set_learn<cbify, predict_or_learn_first<true> >();
- l->set_predict<cbify, predict_or_learn_first<false> >();
+ uint32_t tau = (uint32_t)vm["first"].as<size_t>();
+ data.policy.reset(new vw_policy());
+ data.tau_explorer.reset(new TauFirstExplorer<vw_context>(*data.policy.get(), (u32)tau, (u32)data.k));
+ l = &init_learner(&data, all.l, predict_or_learn_first<true>,
+ predict_or_learn_first<false>, 1);
}
else
{
- float epsilon = 0.05f;
- if (vm.count("epsilon"))
- epsilon = vm["epsilon"].as<float>();
- data->policy.reset(new vw_policy());
- data->greedy_explorer.reset(new EpsilonGreedyExplorer<vw_context>(*data->policy.get(), epsilon, (u32)data->k));
- l = new learner(data, all.l, 1);
- l->set_learn<cbify, predict_or_learn_greedy<true> >();
- l->set_predict<cbify, predict_or_learn_greedy<false> >();
+ float epsilon = 0.05f;
+ if (vm.count("epsilon"))
+ epsilon = vm["epsilon"].as<float>();
+ data.policy.reset(new vw_policy());
+ data.greedy_explorer.reset(new EpsilonGreedyExplorer<vw_context>(*data.policy.get(), epsilon, (u32)data.k));
+ l = &init_learner(&data, all.l, predict_or_learn_greedy<true>,
+ predict_or_learn_greedy<false>, 1);
}
-
- l->set_finish_example<cbify,finish_example>();
- l->set_finish<cbify,finish>();
- l->set_init_driver<cbify,init_driver>();
- return l;
+ l->set_finish_example(finish_example);
+ l->set_finish(finish);
+ l->set_init_driver(init_driver);
+
+ return make_base(*l);
}
}
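
cbify now delegates exploration to the MWT explorer objects seen above (EpsilonGreedyExplorer, TauFirstExplorer, BootstrapExplorer, GenericExplorer). For the default epsilon-greedy path, the decision rule being delegated is the familiar one, sketched generically below — not the exploration library's actual implementation:

    // Epsilon-greedy over k actions (1-based, matching VW multiclass labels):
    // with probability epsilon explore uniformly, otherwise exploit the
    // current policy's choice. frand is any uniform [0,1) source.
    inline unsigned epsilon_greedy(unsigned policy_action, unsigned k,
                                   float epsilon, float (*frand)())
    { if (frand() < epsilon)
        return 1 + (unsigned)(frand() * k);  // explore: uniform over {1..k}
      return policy_action;                  // exploit
    }
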
diff --git a/vowpalwabbit/cbify.h b/vowpalwabbit/cbify.h
index 97e9e6fd..aed26b75 100644
--- a/vowpalwabbit/cbify.h
+++ b/vowpalwabbit/cbify.h
@@ -5,5 +5,5 @@ license as described in the file LICENSE.
*/
#pragma once
namespace CBIFY {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/csoaa.cc b/vowpalwabbit/csoaa.cc
index d6b641b1..350d40a3 100644
--- a/vowpalwabbit/csoaa.cc
+++ b/vowpalwabbit/csoaa.cc
@@ -24,7 +24,7 @@ namespace CSOAA {
};
template <bool is_learn>
- void predict_or_learn(csoaa& c, learner& base, example& ec) {
+ void predict_or_learn(csoaa& c, base_learner& base, example& ec) {
vw* all = c.all;
COST_SENSITIVE::label ld = ec.l.cs;
uint32_t prediction = 1;
@@ -68,28 +68,24 @@ namespace CSOAA {
VW::finish_example(all, &ec);
}
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{
- csoaa* c=(csoaa*)calloc_or_die(1,sizeof(csoaa));
- c->all = &all;
+ csoaa& c = calloc_or_die<csoaa>();
+ c.all = &all;
//first parse for number of actions
uint32_t nb_actions = 0;
nb_actions = (uint32_t)vm["csoaa"].as<size_t>();
-
//append csoaa with nb_actions to file_options so it is saved to regressor later
- std::stringstream ss;
- ss << " --csoaa " << nb_actions;
- all.file_options.append(ss.str());
+ *all.file_options << " --csoaa " << nb_actions;
all.p->lp = cs_label;
all.sd->k = nb_actions;
- learner* l = new learner(c, all.l, nb_actions);
- l->set_learn<csoaa, predict_or_learn<true> >();
- l->set_predict<csoaa, predict_or_learn<false> >();
- l->set_finish_example<csoaa,finish_example>();
- return l;
+ learner<csoaa>& l = init_learner(&c, all.l, predict_or_learn<true>,
+ predict_or_learn<false>, nb_actions);
+ l.set_finish_example(finish_example);
+ return make_base(l);
}
}
@@ -107,7 +103,7 @@ namespace CSOAA_AND_WAP_LDF {
float csoaa_example_t;
vw* all;
- learner* base;
+ base_learner* base;
};
namespace LabelDict {
@@ -296,7 +292,7 @@ namespace LabelDict {
ec->indices.decr();
}
- void make_single_prediction(ldf& data, learner& base, example& ec) {
+ void make_single_prediction(ldf& data, base_learner& base, example& ec) {
COST_SENSITIVE::label ld = ec.l.cs;
label_data simple_label;
simple_label.initial = 0.;
@@ -335,7 +331,7 @@ namespace LabelDict {
return isTest;
}
- void do_actual_learning_wap(vw& all, ldf& data, learner& base, size_t start_K)
+ void do_actual_learning_wap(vw& all, ldf& data, base_learner& base, size_t start_K)
{
size_t K = data.ec_seq.size();
vector<COST_SENSITIVE::wclass*> all_costs;
@@ -390,7 +386,7 @@ namespace LabelDict {
}
}
- void do_actual_learning_oaa(vw& all, ldf& data, learner& base, size_t start_K)
+ void do_actual_learning_oaa(vw& all, ldf& data, base_learner& base, size_t start_K)
{
size_t K = data.ec_seq.size();
float min_cost = FLT_MAX;
@@ -443,7 +439,7 @@ namespace LabelDict {
}
template <bool is_learn>
- void do_actual_learning(vw& all, ldf& data, learner& base)
+ void do_actual_learning(vw& all, ldf& data, base_learner& base)
{
//cdbg << "do_actual_learning size=" << data.ec_seq.size() << endl;
if (data.ec_seq.size() <= 0) return; // nothing to do
@@ -616,7 +612,7 @@ namespace LabelDict {
}
template <bool is_learn>
- void predict_or_learn(ldf& data, learner& base, example &ec) {
+ void predict_or_learn(ldf& data, base_learner& base, example &ec) {
vw* all = data.all;
data.base = &base;
bool is_test_ec = COST_SENSITIVE::example_is_test(ec);
@@ -649,7 +645,7 @@ namespace LabelDict {
}
}
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{
po::options_description ldf_opts("LDF Options");
ldf_opts.add_options()
@@ -658,24 +654,22 @@ namespace LabelDict {
vm = add_options(all, ldf_opts);
- ldf* ld = (ldf*)calloc_or_die(1, sizeof(ldf));
+ ldf& ld = calloc_or_die<ldf>();
- ld->all = &all;
- ld->need_to_clear = true;
- ld->first_pass = true;
+ ld.all = &all;
+ ld.need_to_clear = true;
+ ld.first_pass = true;
string ldf_arg;
if( vm.count("csoaa_ldf") ){
ldf_arg = vm["csoaa_ldf"].as<string>();
- all.file_options.append(" --csoaa_ldf ");
- all.file_options.append(ldf_arg);
+ *all.file_options << " --csoaa_ldf " << ldf_arg;
}
else {
ldf_arg = vm["wap_ldf"].as<string>();
- ld->is_wap = true;
- all.file_options.append(" --wap_ldf ");
- all.file_options.append(ldf_arg);
+ ld.is_wap = true;
+ *all.file_options << " --wap_ldf " << ldf_arg;
}
if ( vm.count("ldf_override") )
ldf_arg = vm["ldf_override"].as<string>();
@@ -684,47 +678,45 @@ namespace LabelDict {
all.sd->k = (uint32_t)-1;
- ld->treat_as_classifier = false;
- ld->is_singleline = false;
+ ld.treat_as_classifier = false;
+ ld.is_singleline = false;
if (ldf_arg.compare("multiline") == 0 || ldf_arg.compare("m") == 0) {
- ld->treat_as_classifier = false;
+ ld.treat_as_classifier = false;
} else if (ldf_arg.compare("multiline-classifier") == 0 || ldf_arg.compare("mc") == 0) {
- ld->treat_as_classifier = true;
+ ld.treat_as_classifier = true;
} else {
if (all.training) {
cerr << "ldf requires either m/multiline or mc/multiline-classifier, except in test-mode which can be s/sc/singleline/singleline-classifier" << endl;
throw exception();
}
if (ldf_arg.compare("singleline") == 0 || ldf_arg.compare("s") == 0) {
- ld->treat_as_classifier = false;
- ld->is_singleline = true;
+ ld.treat_as_classifier = false;
+ ld.is_singleline = true;
} else if (ldf_arg.compare("singleline-classifier") == 0 || ldf_arg.compare("sc") == 0) {
- ld->treat_as_classifier = true;
- ld->is_singleline = true;
+ ld.treat_as_classifier = true;
+ ld.is_singleline = true;
}
}
- all.p->emptylines_separate_examples = true; // TODO: check this to be sure!!! !ld->is_singleline;
+ all.p->emptylines_separate_examples = true; // TODO: check this to be sure!!! !ld.is_singleline;
if (all.add_constant) {
all.add_constant = false;
}
- ld->label_features.init(256, v_array<feature>(), LabelDict::size_t_eq);
- ld->label_features.get(1, 94717244); // TODO: figure this out
-
- ld->read_example_this_loop = 0;
- ld->need_to_clear = false;
- learner* l = new learner(ld, all.l);
- l->set_learn<ldf, predict_or_learn<true> >();
- l->set_predict<ldf, predict_or_learn<false> >();
- if (ld->is_singleline)
- l->set_finish_example<ldf,finish_singleline_example>();
+ ld.label_features.init(256, v_array<feature>(), LabelDict::size_t_eq);
+ ld.label_features.get(1, 94717244); // TODO: figure this out
+
+ ld.read_example_this_loop = 0;
+ ld.need_to_clear = false;
+ learner<ldf>& l = init_learner(&ld, all.l, predict_or_learn<true>, predict_or_learn<false>);
+ if (ld.is_singleline)
+ l.set_finish_example(finish_singleline_example);
else
- l->set_finish_example<ldf,finish_multiline_example>();
- l->set_finish<ldf,finish>();
- l->set_end_examples<ldf,end_examples>();
- l->set_end_pass<ldf,end_pass>();
- return l;
+ l.set_finish_example(finish_multiline_example);
+ l.set_finish(finish);
+ l.set_end_examples(end_examples);
+ l.set_end_pass(end_pass);
+ return make_base(l);
}
void global_print_newline(vw& all)
diff --git a/vowpalwabbit/csoaa.h b/vowpalwabbit/csoaa.h
index bfaca0f2..79f5e4d2 100644
--- a/vowpalwabbit/csoaa.h
+++ b/vowpalwabbit/csoaa.h
@@ -5,11 +5,11 @@ license as described in the file LICENSE.
*/
#pragma once
namespace CSOAA {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
namespace CSOAA_AND_WAP_LDF {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
namespace LabelDict {
bool ec_is_example_header(example& ec); // example headers look like "0:-1" or just "shared"
diff --git a/vowpalwabbit/ect.cc b/vowpalwabbit/ect.cc
index 905549ec..dea87040 100644
--- a/vowpalwabbit/ect.cc
+++ b/vowpalwabbit/ect.cc
@@ -184,7 +184,7 @@ namespace ECT
return e.last_pair + (eliminations-1);
}
- uint32_t ect_predict(vw& all, ect& e, learner& base, example& ec)
+ uint32_t ect_predict(vw& all, ect& e, base_learner& base, example& ec)
{
if (e.k == (size_t)1)
return 1;
@@ -228,7 +228,7 @@ namespace ECT
return false;
}
- void ect_train(vw& all, ect& e, learner& base, example& ec)
+ void ect_train(vw& all, ect& e, base_learner& base, example& ec)
{
if (e.k == 1)//nothing to do
return;
@@ -317,7 +317,7 @@ namespace ECT
}
}
- void predict(ect& e, learner& base, example& ec) {
+ void predict(ect& e, base_learner& base, example& ec) {
vw* all = e.all;
MULTICLASS::multiclass mc = ec.l.multi;
@@ -327,7 +327,7 @@ namespace ECT
ec.l.multi = mc;
}
- void learn(ect& e, learner& base, example& ec)
+ void learn(ect& e, base_learner& base, example& ec)
{
vw* all = e.all;
@@ -360,15 +360,11 @@ namespace ECT
e.tournaments_won.delete_v();
}
- void finish_example(vw& all, ect&, example& ec)
- {
- MULTICLASS::output_example(all, ec);
- VW::finish_example(all, &ec);
- }
+ void finish_example(vw& all, ect&, example& ec) { MULTICLASS::finish_example(all, ec); }
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{
- ect* data = (ect*)calloc_or_die(1, sizeof(ect));
+ ect& data = calloc_or_die<ect>();
po::options_description ect_opts("ECT options");
ect_opts.add_options()
("error", po::value<size_t>(), "error in ECT");
@@ -376,30 +372,24 @@ namespace ECT
vm = add_options(all, ect_opts);
//first parse for number of actions
- data->k = (int)vm["ect"].as<size_t>();
+ data.k = (int)vm["ect"].as<size_t>();
//append ect with nb_actions to options_from_file so it is saved to regressor later
- stringstream ss;
- ss << " --ect " << data->k;
-
if (vm.count("error")) {
- data->errors = (uint32_t)vm["error"].as<size_t>();
+ data.errors = (uint32_t)vm["error"].as<size_t>();
} else
- data->errors = 0;
+ data.errors = 0;
//append error flag to options_from_file so it is saved in regressor file later
- ss << " --error " << data->errors;
- all.file_options.append(ss.str());
+ *all.file_options << " --ect " << data.k << " --error " << data.errors;
all.p->lp = MULTICLASS::mc_label;
- size_t wpp = create_circuit(all, *data, data->k, data->errors+1);
- data->all = &all;
+ size_t wpp = create_circuit(all, data, data.k, data.errors+1);
+ data.all = &all;
- learner* l = new learner(data, all.l, wpp);
- l->set_learn<ect, learn>();
- l->set_predict<ect, predict>();
- l->set_finish_example<ect,finish_example>();
- l->set_finish<ect,finish>();
+ learner<ect>& l = init_learner(&data, all.l, learn, predict, wpp);
+ l.set_finish_example(finish_example);
+ l.set_finish(finish);
- return l;
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/ect.h b/vowpalwabbit/ect.h
index a7c392c0..81129791 100644
--- a/vowpalwabbit/ect.h
+++ b/vowpalwabbit/ect.h
@@ -6,5 +6,5 @@ license as described in the file LICENSE.
#pragma once
namespace ECT
{
- LEARNER::learner* setup(vw&, po::variables_map&);
+ LEARNER::base_learner* setup(vw&, po::variables_map&);
}
diff --git a/vowpalwabbit/example.cc b/vowpalwabbit/example.cc
index 2db36e46..52399839 100644
--- a/vowpalwabbit/example.cc
+++ b/vowpalwabbit/example.cc
@@ -40,9 +40,9 @@ float collision_cleanup(feature* feature_map, size_t& len) {
audit_data copy_audit_data(audit_data &src) {
audit_data dst;
- dst.space = (char*)calloc_or_die(strlen(src.space)+1, sizeof(char));
+ dst.space = calloc_or_die<char>(strlen(src.space)+1);
strcpy(dst.space, src.space);
- dst.feature = (char*)calloc_or_die(strlen(src.feature)+1, sizeof(char));
+ dst.feature = calloc_or_die<char>(strlen(src.feature)+1);
strcpy(dst.feature, src.feature);
dst.weight_index = src.weight_index;
dst.x = src.x;
@@ -135,23 +135,23 @@ void return_features(feature* f)
flat_example* flatten_example(vw& all, example *ec)
{
- flat_example* fec = (flat_example*) calloc_or_die(1,sizeof(flat_example));
- fec->l = ec->l;
+ flat_example& fec = calloc_or_die<flat_example>();
+ fec.l = ec->l;
- fec->tag_len = ec->tag.size();
- if (fec->tag_len >0)
+ fec.tag_len = ec->tag.size();
+ if (fec.tag_len >0)
{
- fec->tag = (char*)calloc_or_die(fec->tag_len+1, sizeof(char));
- memcpy(fec->tag,ec->tag.begin, fec->tag_len);
+ fec.tag = calloc_or_die<char>(fec.tag_len+1);
+ memcpy(fec.tag,ec->tag.begin, fec.tag_len);
}
- fec->example_counter = ec->example_counter;
- fec->ft_offset = ec->ft_offset;
- fec->num_features = ec->num_features;
+ fec.example_counter = ec->example_counter;
+ fec.ft_offset = ec->ft_offset;
+ fec.num_features = ec->num_features;
- fec->feature_map = VW::get_features(all, ec, fec->feature_map_len);
+ fec.feature_map = VW::get_features(all, ec, fec.feature_map_len);
- return fec;
+ return &fec;
}
flat_example* flatten_sort_example(vw& all, example *ec)
@@ -176,7 +176,7 @@ void free_flatten_example(flat_example* fec)
example *alloc_examples(size_t label_size, size_t count=1)
{
- example* ec = (example*)calloc_or_die(count, sizeof(example));
+ example* ec = calloc_or_die<example>(count);
if (ec == NULL) return NULL;
for (size_t i=0; i<count; i++) {
ec[i].in_use = true;
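Throughout this commit, raw `(T*)calloc_or_die(n, sizeof(T))` casts give way to a templated allocator. Its definition lives in memory.h/memory.cc (changed per the diffstat but not shown on this page), so the following is a reconstruction from the call sites above, assuming the two overloads they imply: an array form returning T* and a zero-argument form returning a zero-initialized T&.

    #include <cstdio>
    #include <cstdlib>

    // Reconstructed sketch of the templated calloc_or_die; the real
    // memory.h version may differ in its error handling.
    template <class T>
    T* calloc_or_die(size_t count)
    { // zero-initialized array of count T's; dies instead of returning NULL
      void* p = calloc(count, sizeof(T));
      if (p == NULL) { fprintf(stderr, "out of memory\n"); exit(1); }
      return (T*)p;
    }

    template <class T>
    T& calloc_or_die()
    { // single-object form returning a reference, as the new call sites use
      return *calloc_or_die<T>(1);
    }

    // Usage mirroring flatten_example above:
    struct flat_ex { size_t tag_len; char* tag; };
    int main()
    { flat_ex& fec = calloc_or_die<flat_ex>();
      fec.tag = calloc_or_die<char>(8);
      printf("%zu %d\n", fec.tag_len, (int)fec.tag[0]); // 0 0: both zeroed
      free(fec.tag); free(&fec);
      return 0;
    }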
diff --git a/vowpalwabbit/ftrl_proximal.cc b/vowpalwabbit/ftrl_proximal.cc
new file mode 100644
index 00000000..6216058b
--- /dev/null
+++ b/vowpalwabbit/ftrl_proximal.cc
@@ -0,0 +1,225 @@
+/*
+ Copyright (c) by respective owners including Yahoo!, Microsoft, and
+ individual contributors. All rights reserved. Released under a BSD (revised)
+ license as described in the file LICENSE.
+ */
+#include <fstream>
+#include <float.h>
+#ifndef _WIN32
+#include <netdb.h>
+#endif
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <sys/timeb.h>
+#include "parse_example.h"
+#include "constant.h"
+#include "cache.h"
+#include "simple_label.h"
+#include "vw.h"
+#include "gd.h"
+#include "accumulate.h"
+#include "memory.h"
+#include <exception>
+
+using namespace std;
+using namespace LEARNER;
+
+
+#define W_XT 0 // current parameter w(XT)
+#define W_GT 1 // current gradient g(GT)
+#define W_ZT 2 // accumulated z(t) = z(t-1) + g(t) + sigma*w(t)
+#define W_G2 3 // accumulated gradient square n(t) = n(t-1) + g(t)*g(t)
+
+/********************************************************************/
+/* mem & w definition ***********************************************/
+/********************************************************************/
+// w[0] = current weight
+// w[1] = current first derivative
+// w[2] = accumulated zt
+// w[3] = accumulated g2
+
+namespace FTRL {
+
+ //nonreentrant
+ struct ftrl {
+
+ vw* all;
+ // set by initializer
+ float ftrl_alpha;
+ float ftrl_beta;
+
+ // evaluation file pointer
+ FILE* fo;
+ bool progressive_validation;
+ };
+
+ void update_accumulated_state(weight* w, float ftrl_alpha) {
+ double ng2 = w[W_G2] + w[W_GT]*w[W_GT];
+ double sigma = (sqrt(ng2) - sqrt(w[W_G2]))/ ftrl_alpha;
+ w[W_ZT] += w[W_GT] - sigma * w[W_XT];
+ w[W_G2] = ng2;
+ }
+
+ struct update_data {
+ float update;
+ float ftrl_alpha;
+ float ftrl_beta;
+ float l1_lambda;
+ float l2_lambda;
+ };
+
+ //void update_grad(weight* weights, size_t mask, float loss_grad)
+ void update_grad(update_data& d, float x, float& wref) {
+ float* w = &wref;
+ w[W_GT] = d.update * x;
+ update_accumulated_state(w, d.ftrl_alpha);
+ }
+
+ float ftrl_predict(vw& all, example& ec) {
+ ec.partial_prediction = GD::inline_predict(all, ec);
+ return GD::finalize_prediction(all.sd, ec.partial_prediction);
+ }
+
+ float predict_and_gradient(vw& all, ftrl &b, example& ec) {
+ float fp = ftrl_predict(all, ec);
+ ec.updated_prediction = fp;
+
+ label_data& ld = ec.l.simple;
+ all.set_minmax(all.sd, ld.label);
+
+ struct update_data data;
+
+ data.update = all.loss->first_derivative(all.sd, fp, ld.label) * ld.weight;
+ data.ftrl_alpha = b.ftrl_alpha;
+
+ GD::foreach_feature<update_data,update_grad>(all, ec, data);
+
+ return fp;
+ }
+
+ inline float sign(float w){ if (w < 0.) return -1.; else return 1.;}
+
+ void update_w(update_data& d, float x, float& wref) {
+ float* w = &wref;
+ float flag = sign(w[W_ZT]);
+ float fabs_zt = w[W_ZT] * flag;
+ if (fabs_zt <= d.l1_lambda) {
+ w[W_XT] = 0.;
+ } else {
+ double step = 1/(d.l2_lambda + (d.ftrl_beta + sqrt(w[W_G2]))/d.ftrl_alpha);
+ w[W_XT] = step * flag * (d.l1_lambda - fabs_zt);
+ }
+ }
+
+ void update_weight(vw& all, ftrl &b, example& ec) {
+
+ struct update_data data;
+
+ data.ftrl_alpha = b.ftrl_alpha;
+ data.ftrl_beta = b.ftrl_beta;
+ data.l1_lambda = all.l1_lambda;
+ data.l2_lambda = all.l2_lambda;
+
+ GD::foreach_feature<update_data, update_w>(all, ec, data);
+
+ }
+
+ void evaluate_example(vw& all, ftrl& b , example& ec) {
+ label_data& ld = ec.l.simple;
+ ec.loss = all.loss->getLoss(all.sd, ec.updated_prediction, ld.label) * ld.weight;
+ if (b.progressive_validation) {
+ float v = 1./(1 + exp(-ec.updated_prediction));
+ fprintf(b.fo, "%.6f\t%d\n", v, (int)(ld.label * ld.weight));
+ }
+ }
+
+ //void learn(void* a, void* d, example* ec) {
+ void learn(ftrl& a, base_learner& base, example& ec) {
+ vw* all = a.all;
+ assert(ec.in_use);
+
+ // predict w*x, compute the gradient, update the accumulated state
+ ec.pred.scalar = predict_and_gradient(*all, a, ec);
+ // evaluate the loss (and optionally log progressive validation)
+ evaluate_example(*all, a, ec);
+ // apply the closed-form weight update
+ update_weight(*all, a, ec);
+ }
+
+ void save_load(ftrl& b, io_buf& model_file, bool read, bool text) {
+ vw* all = b.all;
+ if (read) {
+ initialize_regressor(*all);
+ }
+
+ if (model_file.files.size() > 0) {
+ bool resume = all->save_resume;
+ char buff[512];
+ uint32_t text_len = sprintf(buff, ":%d\n", resume);
+ bin_text_read_write_fixed(model_file,(char *)&resume, sizeof (resume), "", read, buff, text_len, text);
+
+ if (resume) {
+ GD::save_load_online_state(*all, model_file, read, text);
+ //save_load_online_state(*all, model_file, read, text);
+ } else {
+ GD::save_load_regressor(*all, model_file, read, text);
+ }
+ }
+
+ }
+
+ // placeholder
+ void predict(ftrl& b, base_learner& base, example& ec)
+ {
+ vw* all = b.all;
+ //ec.l.simple.prediction = ftrl_predict(*all,ec);
+ ec.pred.scalar = ftrl_predict(*all,ec);
+ }
+
+ base_learner* setup(vw& all, po::variables_map& vm)
+ {
+ ftrl& b = calloc_or_die<ftrl>();
+ b.all = &all;
+ b.ftrl_beta = 0.0;
+ b.ftrl_alpha = 0.1;
+
+ po::options_description ftrl_opts("FTRL options");
+
+ ftrl_opts.add_options()
+ ("ftrl_alpha", po::value<float>(&(b.ftrl_alpha)), "Learning rate for FTRL-proximal optimization")
+ ("ftrl_beta", po::value<float>(&(b.ftrl_beta)), "FTRL beta")
+ ("progressive_validation", po::value<string>()->default_value("ftrl.evl"), "File to record progressive validation for ftrl-proximal");
+
+ vm = add_options(all, ftrl_opts);
+
+ if (vm.count("ftrl_alpha")) {
+ b.ftrl_alpha = vm["ftrl_alpha"].as<float>();
+ }
+
+ if (vm.count("ftrl_beta")) {
+ b.ftrl_beta = vm["ftrl_beta"].as<float>();
+ }
+
+ all.reg.stride_shift = 2; // NOTE: for more parameter storage
+
+ b.progressive_validation = false;
+ if (vm.count("progressive_validation")) {
+ std::string filename = vm["progressive_validation"].as<string>();
+ b.fo = fopen(filename.c_str(), "w");
+ assert(b.fo != NULL);
+ b.progressive_validation = true;
+ }
+
+ if (!all.quiet) {
+ cerr << "Enabling FTRL-Proximal based optimization" << endl;
+ cerr << "ftrl_alpha = " << b.ftrl_alpha << endl;
+ cerr << "ftrl_beta = " << b.ftrl_beta << endl;
+ }
+
+ learner<ftrl>& l = init_learner(&b, learn, 1 << all.reg.stride_shift);
+ l.set_predict(predict);
+ l.set_save_load(save_load);
+ return make_base(l);
+ }
+} // end namespace
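The new ftrl_proximal.cc implements per-coordinate FTRL-Proximal in the style of McMahan et al.'s "Ad Click Prediction: a View from the Trenches" (KDD 2013); the four weight slots map as w[W_XT]=w, w[W_GT]=g, w[W_ZT]=z, w[W_G2]=n. In update_accumulated_state above, each gradient g_t drives the recursion

    \sigma_t = \frac{\sqrt{n_{t-1} + g_t^2} - \sqrt{n_{t-1}}}{\alpha}, \qquad
    z_t = z_{t-1} + g_t - \sigma_t w_t, \qquad
    n_t = n_{t-1} + g_t^2

and the closed-form weight in update_w is

    w_{t+1} =
    \begin{cases}
      0, & |z_t| \le \lambda_1,\\
      -\dfrac{z_t - \operatorname{sgn}(z_t)\,\lambda_1}{\lambda_2 + (\beta + \sqrt{n_t})/\alpha}, & \text{otherwise,}
    \end{cases}

which is exactly step * flag * (l1_lambda - fabs_zt) with step = 1/(l2_lambda + (ftrl_beta + sqrt(n))/ftrl_alpha) in the code above.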
diff --git a/vowpalwabbit/ftrl_proximal.h b/vowpalwabbit/ftrl_proximal.h
new file mode 100644
index 00000000..59bf4653
--- /dev/null
+++ b/vowpalwabbit/ftrl_proximal.h
@@ -0,0 +1,12 @@
+/*
+Copyright (c) by respective owners including Yahoo!, Microsoft, and
+individual contributors. All rights reserved. Released under a BSD
+license as described in the file LICENSE.
+ */
+#ifndef FTRL_PROXIMAL_H
+#define FTRL_PROXIMAL_H
+
+namespace FTRL {
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
+}
+#endif
diff --git a/vowpalwabbit/gd.cc b/vowpalwabbit/gd.cc
index 589d39c2..851fbea7 100644
--- a/vowpalwabbit/gd.cc
+++ b/vowpalwabbit/gd.cc
@@ -32,7 +32,7 @@ using namespace LEARNER;
namespace GD
{
struct gd{
- double normalized_sum_norm_x;
+ //double normalized_sum_norm_x;
double total_weight;
size_t no_win_counter;
size_t early_stop_thres;
@@ -40,7 +40,9 @@ namespace GD
float neg_norm_power;
float neg_power_t;
float update_multiplier;
- void (*predict)(gd&, learner&, example&);
+ void (*predict)(gd&, base_learner&, example&);
+ void (*learn)(gd&, base_learner&, example&);
+ void (*update)(gd&, base_learner&, example&);
vw* all;
};
@@ -75,14 +77,14 @@ namespace GD
if (normalized) {
if (sqrt_rate)
{
- float avg_norm = (float) g.total_weight / (float) g.normalized_sum_norm_x;
+ float avg_norm = (float) g.total_weight / (float) g.all->normalized_sum_norm_x;
if (adaptive)
return sqrt(avg_norm);
else
return avg_norm;
}
else
- return powf( (float) g.normalized_sum_norm_x / (float) g.total_weight, g.neg_norm_power);
+ return powf( (float) g.all->normalized_sum_norm_x / (float) g.total_weight, g.neg_norm_power);
}
return 1.f;
}
@@ -346,7 +348,7 @@ float finalize_prediction(shared_data* sd, float ret)
}
template<bool l1, bool audit>
-void predict(gd& g, learner& base, example& ec)
+void predict(gd& g, base_learner& base, example& ec)
{
vw& all = *g.all;
@@ -451,7 +453,7 @@ template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normaliz
foreach_feature<norm_data,pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized, spare> >(all, ec, nd);
if(normalized) {
- g.normalized_sum_norm_x += ld.weight * nd.norm_x;
+ g.all->normalized_sum_norm_x += ld.weight * nd.norm_x;
g.total_weight += ld.weight;
g.update_multiplier = average_update<sqrt_rate, adaptive, normalized>(g, nd.pred_per_update);
@@ -508,7 +510,7 @@ float compute_update(gd& g, example& ec)
}
template<bool invariant, bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
-void update(gd& g, learner& base, example& ec)
+void update(gd& g, base_learner& base, example& ec)
{//invariant: not a test label, importance weight > 0
float update;
if ( (update = compute_update<invariant, sqrt_rate, feature_mask_off, adaptive, normalized, spare> (g, ec)) != 0.)
@@ -519,7 +521,7 @@ void update(gd& g, learner& base, example& ec)
}
template<bool invariant, bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized, size_t spare>
-void learn(gd& g, learner& base, example& ec)
+void learn(gd& g, base_learner& base, example& ec)
{//invariant: not a test label, importance weight > 0
assert(ec.in_use);
assert(ec.l.simple.label != FLT_MAX);
@@ -609,9 +611,10 @@ void save_load_regressor(vw& all, io_buf& model_file, bool read, bool text)
while ((!read && i < length) || (read && brw >0));
}
-void save_load_online_state(gd& g, io_buf& model_file, bool read, bool text)
+//void save_load_online_state(gd& g, io_buf& model_file, bool read, bool text)
+void save_load_online_state(vw& all, io_buf& model_file, bool read, bool text)
{
- vw& all = *g.all;
+ //vw& all = *g.all;
char buff[512];
@@ -620,10 +623,10 @@ void save_load_online_state(gd& g, io_buf& model_file, bool read, bool text)
"", read,
buff, text_len, text);
- text_len = sprintf(buff, "norm normalizer %f\n", g.normalized_sum_norm_x);
- bin_text_read_write_fixed(model_file,(char*)&g.normalized_sum_norm_x, sizeof(g.normalized_sum_norm_x),
- "", read,
- buff, text_len, text);
+ text_len = sprintf(buff, "norm normalizer %f\n", all.normalized_sum_norm_x);
+ bin_text_read_write_fixed(model_file,(char*)&all.normalized_sum_norm_x, sizeof(all.normalized_sum_norm_x),
+ "", read,
+ buff, text_len, text);
text_len = sprintf(buff, "t %f\n", all.sd->t);
bin_text_read_write_fixed(model_file,(char*)&all.sd->t, sizeof(all.sd->t),
@@ -780,56 +783,57 @@ void save_load(gd& g, io_buf& model_file, bool read, bool text)
"", read,
buff, text_len, text);
if (resume)
- save_load_online_state(g, model_file, read, text);
+ //save_load_online_state(g, model_file, read, text);
+ save_load_online_state(all, model_file, read, text);
else
save_load_regressor(all, model_file, read, text);
}
}
template<bool invariant, bool sqrt_rate, uint32_t adaptive, uint32_t normalized, uint32_t spare, uint32_t next>
-uint32_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+uint32_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
all.normalized_idx = normalized;
if (feature_mask_off)
{
- ret->set_learn<gd, learn<invariant, sqrt_rate, true, adaptive, normalized, spare> >();
- ret->set_update<gd, update<invariant, sqrt_rate, true, adaptive, normalized, spare> >();
+ g.learn = learn<invariant, sqrt_rate, true, adaptive, normalized, spare>;
+ g.update = update<invariant, sqrt_rate, true, adaptive, normalized, spare>;
return next;
}
else
{
- ret->set_learn<gd, learn<invariant, sqrt_rate, false, adaptive, normalized, spare> >();
- ret->set_update<gd, update<invariant, sqrt_rate, false, adaptive, normalized, spare> >();
+ g.learn = learn<invariant, sqrt_rate, false, adaptive, normalized, spare>;
+ g.update = update<invariant, sqrt_rate, false, adaptive, normalized, spare>;
return next;
}
}
template<bool sqrt_rate, uint32_t adaptive, uint32_t normalized, uint32_t spare, uint32_t next>
-uint32_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+uint32_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
if (all.invariant_updates)
- return set_learn<true, sqrt_rate, adaptive, normalized, spare, next>(all, ret, feature_mask_off);
+ return set_learn<true, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
else
- return set_learn<false, sqrt_rate, adaptive, normalized, spare, next>(all, ret, feature_mask_off);
+ return set_learn<false, sqrt_rate, adaptive, normalized, spare, next>(all, feature_mask_off, g);
}
template<bool sqrt_rate, uint32_t adaptive, uint32_t spare>
-uint32_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+uint32_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
// select the appropriate learn function based on adaptive, normalization, and feature mask
if (all.normalized_updates)
- return set_learn<sqrt_rate, adaptive, adaptive+1, adaptive+2, adaptive+3>(all, ret, feature_mask_off);
+ return set_learn<sqrt_rate, adaptive, adaptive+1, adaptive+2, adaptive+3>(all, feature_mask_off, g);
else
- return set_learn<sqrt_rate, adaptive, 0, spare, spare+1>(all, ret, feature_mask_off);
+ return set_learn<sqrt_rate, adaptive, 0, spare, spare+1>(all, feature_mask_off, g);
}
template<bool sqrt_rate>
-uint32_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+uint32_t set_learn(vw& all, bool feature_mask_off, gd& g)
{
if (all.adaptive)
- return set_learn<sqrt_rate, 1, 2>(all, ret, feature_mask_off);
+ return set_learn<sqrt_rate, 1, 2>(all, feature_mask_off, g);
else
- return set_learn<sqrt_rate, 0, 0>(all, ret, feature_mask_off);
+ return set_learn<sqrt_rate, 0, 0>(all, feature_mask_off, g);
}
uint32_t ceil_log_2(uint32_t v)
@@ -840,21 +844,21 @@ uint32_t ceil_log_2(uint32_t v)
return 1 + ceil_log_2(v >> 1);
}
-learner* setup(vw& all, po::variables_map& vm)
+base_learner* setup(vw& all, po::variables_map& vm)
{
- gd* g = (gd*)calloc_or_die(1, sizeof(gd));
- g->all = &all;
- g->normalized_sum_norm_x = 0;
- g->no_win_counter = 0;
- g->total_weight = 0.;
- g->early_stop_thres = 3;
- g->neg_norm_power = (all.adaptive ? (all.power_t - 1.f) : -1.f);
- g->neg_power_t = - all.power_t;
-
+ gd& g = calloc_or_die<gd>();
+ g.all = &all;
+ g.all->normalized_sum_norm_x = 0;
+ g.no_win_counter = 0;
+ g.total_weight = 0.;
+ g.early_stop_thres = 3;
+ g.neg_norm_power = (all.adaptive ? (all.power_t - 1.f) : -1.f);
+ g.neg_power_t = - all.power_t;
+
if(all.initial_t > 0)//for the normalized update: if initial_t is bigger than 1 we interpret this as if we had seen (all.initial_t) previous fake datapoints all with norm 1
{
- g->normalized_sum_norm_x = all.initial_t;
- g->total_weight = all.initial_t;
+ g.all->normalized_sum_norm_x = all.initial_t;
+ g.total_weight = all.initial_t;
}
bool feature_mask_off = true;
@@ -865,11 +869,11 @@ learner* setup(vw& all, po::variables_map& vm)
{
all.sd->holdout_best_loss = FLT_MAX;
if(vm.count("early_terminate"))
- g->early_stop_thres = vm["early_terminate"].as< size_t>();
+ g.early_stop_thres = vm["early_terminate"].as< size_t>();
}
if (vm.count("constant")) {
- g->initial_constant = vm["constant"].as<float>();
+ g.initial_constant = vm["constant"].as<float>();
}
if( !all.training || ( ( vm.count("sgd") || vm.count("adaptive") || vm.count("invariant") || vm.count("normalized") ) && !vm.count("exact_adaptive_norm")) )
@@ -896,42 +900,28 @@ learner* setup(vw& all, po::variables_map& vm)
cerr << "Warning: the learning rate for the last pass is multiplied by: " << pow((double)all.eta_decay_rate, (double)all.numpasses)
<< " adjust --decay_learning_rate larger to avoid this." << endl;
- learner* ret = new learner(g, 1);
-
if (all.reg_mode % 2)
if (all.audit || all.hash_inv)
- {
- ret->set_predict<gd, predict<true, true> >();
- g->predict = predict<true, true>;
- }
+ g.predict = predict<true, true>;
else
- {
- ret->set_predict<gd, predict<true, false> >();
- g->predict = predict<true, false>;
- }
+ g.predict = predict<true, false>;
else if (all.audit || all.hash_inv)
- {
- ret->set_predict<gd, predict<false, true> >();
- g->predict = predict<false, true>;
- }
+ g.predict = predict<false, true>;
else
- {
- ret->set_predict<gd, predict<false, false> >();
- g->predict = predict<false, false>;
- }
-
+ g.predict = predict<false, false>;
+
uint32_t stride;
if (all.power_t == 0.5)
- stride = set_learn<true>(all, ret, feature_mask_off);
+ stride = set_learn<true>(all, feature_mask_off, g);
else
- stride = set_learn<false>(all, ret, feature_mask_off);
-
+ stride = set_learn<false>(all, feature_mask_off, g);
all.reg.stride_shift = ceil_log_2(stride-1);
- ret->increment = ((uint64_t)1 << all.reg.stride_shift);
- ret->set_save_load<gd,save_load>();
-
- ret->set_end_pass<gd, end_pass>();
- return ret;
+ learner<gd>& ret = init_learner(&g, g.learn, ((uint64_t)1 << all.reg.stride_shift));
+ ret.set_predict(g.predict);
+ ret.set_update(g.update);
+ ret.set_save_load(save_load);
+ ret.set_end_pass(end_pass);
+ return make_base(ret);
}
}
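The set_learn changes above keep GD's long-standing dispatch idiom, only redirecting its output into g.learn/g.update instead of member-template setters: each runtime flag (invariant, sqrt_rate, adaptive, normalized, feature mask) is peeled off and promoted to a template parameter, so the per-example hot loop compiles once per configuration with no branches inside it. Separately, normalized_sum_norm_x moves from struct gd into vw, which is what lets save_load_online_state take a vw& and be reused by the new FTRL code for --save_resume. A minimal, self-contained illustration of the dispatch idiom (toy names, not VW code):

    #include <cstdio>

    template <bool adaptive, bool normalized>
    void train()   // fully specialized hot path: flags are compile-time
    { printf("adaptive=%d normalized=%d\n", (int)adaptive, (int)normalized); }

    template <bool adaptive>
    void pick_normalized(bool normalized)
    { normalized ? train<adaptive, true>() : train<adaptive, false>(); }

    void pick(bool adaptive, bool normalized) // runtime flags enter here
    { adaptive ? pick_normalized<true>(normalized)
               : pick_normalized<false>(normalized); }

    int main() { pick(true, false); return 0; } // instantiates train<true,false>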
diff --git a/vowpalwabbit/gd.h b/vowpalwabbit/gd.h
index 05bd5b5d..de3964eb 100644
--- a/vowpalwabbit/gd.h
+++ b/vowpalwabbit/gd.h
@@ -24,8 +24,9 @@ namespace GD{
void compute_update(example* ec);
void offset_train(regressor &reg, example* &ec, float update, size_t offset);
void train_one_example_single_thread(regressor& r, example* ex);
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
void save_load_regressor(vw& all, io_buf& model_file, bool read, bool text);
+ void save_load_online_state(vw& all, io_buf& model_file, bool read, bool text);
void output_and_account_example(example* ec);
// iterate through one namespace (or its part), callback function T(some_data_R, feature_value_x, feature_weight)
diff --git a/vowpalwabbit/gd_mf.cc b/vowpalwabbit/gd_mf.cc
index df48c934..b57328f4 100644
--- a/vowpalwabbit/gd_mf.cc
+++ b/vowpalwabbit/gd_mf.cc
@@ -272,14 +272,14 @@ void mf_train(vw& all, example& ec)
all->current_pass++;
}
- void predict(gdmf& d, learner& base, example& ec)
+ void predict(gdmf& d, base_learner& base, example& ec)
{
vw* all = d.all;
mf_predict(*all,ec);
}
- void learn(gdmf& d, learner& base, example& ec)
+ void learn(gdmf& d, base_learner& base, example& ec)
{
vw* all = d.all;
@@ -288,10 +288,10 @@ void mf_train(vw& all, example& ec)
mf_train(*all, ec);
}
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{
- gdmf* data = (gdmf*)calloc_or_die(1,sizeof(gdmf));
- data->all = &all;
+ gdmf& data = calloc_or_die<gdmf>();
+ data.all = &all;
// store linear + 2*rank weights per index, round up to power of two
float temp = ceilf(logf((float)(all.rank*2+1)) / logf (2.f));
@@ -330,12 +330,11 @@ void mf_train(vw& all, example& ec)
}
all.eta *= powf((float)(all.sd->t), all.power_t);
- learner* l = new learner(data, 1 << all.reg.stride_shift);
- l->set_learn<gdmf, learn>();
- l->set_predict<gdmf, predict>();
- l->set_save_load<gdmf,save_load>();
- l->set_end_pass<gdmf,end_pass>();
+ learner<gdmf>& l = init_learner(&data, learn, 1 << all.reg.stride_shift);
+ l.set_predict(predict);
+ l.set_save_load(save_load);
+ l.set_end_pass(end_pass);
- return l;
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/gd_mf.h b/vowpalwabbit/gd_mf.h
index f574bc95..a0ce2f87 100644
--- a/vowpalwabbit/gd_mf.h
+++ b/vowpalwabbit/gd_mf.h
@@ -11,5 +11,5 @@ license as described in the file LICENSE.
#include "gd.h"
namespace GDMF{
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/global_data.cc b/vowpalwabbit/global_data.cc
index 12785bfa..825977d5 100644
--- a/vowpalwabbit/global_data.cc
+++ b/vowpalwabbit/global_data.cc
@@ -235,7 +235,7 @@ po::variables_map add_options(vw& all, po::options_description& opts)
vw::vw()
{
- sd = (shared_data *) calloc_or_die(1, sizeof(shared_data));
+ sd = &calloc_or_die<shared_data>();
sd->dump_interval = 1.; // next update progress dump
sd->contraction = 1.;
sd->max_label = 1.;
@@ -250,6 +250,8 @@ vw::vw()
data_filename = "";
+ file_options = new std::stringstream;
+
bfgs = false;
hessian_on = false;
active = false;
@@ -287,8 +289,6 @@ vw::vw()
per_feature_regularizer_output = "";
per_feature_regularizer_text = "";
- file_options = "";
-
#ifdef _WIN32
stdout_fileno = _fileno(stdout);
#else
diff --git a/vowpalwabbit/global_data.h b/vowpalwabbit/global_data.h
index dd8567a8..bdf76b46 100644
--- a/vowpalwabbit/global_data.h
+++ b/vowpalwabbit/global_data.h
@@ -154,7 +154,6 @@ struct shared_data {
double holdout_sum_loss_since_last_pass;
size_t holdout_best_pass;
- bool binary_label;
uint32_t k;
};
@@ -170,9 +169,9 @@ struct vw {
node_socks socks;
- LEARNER::learner* l;//the top level learner
- LEARNER::learner* scorer;//a scoring function
- LEARNER::learner* cost_sensitive;//a cost sensitive learning algorithm.
+ LEARNER::base_learner* l;//the top level learner
+ LEARNER::base_learner* scorer;//a scoring function
+ LEARNER::base_learner* cost_sensitive;//a cost sensitive learning algorithm.
void learn(example*);
@@ -197,9 +196,10 @@ struct vw {
int m;
bool save_resume;
+ double normalized_sum_norm_x;
po::options_description opts;
- std::string file_options;
+ std::stringstream* file_options;
vector<std::string> args;
void* /*Search::search*/ searchstr;
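global_data.h/cc convert file_options from a std::string to a heap-allocated std::stringstream, so reductions stream their saved flags directly (see the `*all.file_options << ...` call sites above) instead of building a temporary stringstream and calling append(). A toy model of the new usage, with vw_toy standing in for struct vw:

    #include <iostream>
    #include <sstream>

    struct vw_toy { std::stringstream* file_options; };

    int main()
    { vw_toy all;
      all.file_options = new std::stringstream;     // as in vw::vw() above
      int k = 10, errors = 0;
      *all.file_options << " --ect " << k << " --error " << errors;
      std::cout << all.file_options->str() << std::endl; // " --ect 10 --error 0"
      delete all.file_options;
      return 0;
    }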
diff --git a/vowpalwabbit/io_buf.cc b/vowpalwabbit/io_buf.cc
index 4dbf3cf1..ba220762 100644
--- a/vowpalwabbit/io_buf.cc
+++ b/vowpalwabbit/io_buf.cc
@@ -3,10 +3,7 @@ Copyright (c) by respective owners including Yahoo!, Microsoft, and
individual contributors. All rights reserved. Released under a BSD (revised)
license as described in the file LICENSE.
*/
-#include <string.h>
-
#include "io_buf.h"
-
#ifdef WIN32
#include <winsock2.h>
#endif
@@ -110,20 +107,17 @@ void buf_write(io_buf &o, char* &pointer, size_t n)
}
bool io_buf::is_socket(int f)
-{
- // this appears to work in practice, but could probably be done in a cleaner fashion
+{ // this appears to work in practice, but could probably be done in a cleaner fashion
const int _nhandle = 32;
return f >= _nhandle;
}
ssize_t io_buf::read_file_or_socket(int f, void* buf, size_t nbytes) {
#ifdef _WIN32
- if (is_socket(f)) {
+ if (is_socket(f))
return recv(f, reinterpret_cast<char*>(buf), static_cast<int>(nbytes), 0);
- }
- else {
+ else
return _read(f, buf, (unsigned int)nbytes);
- }
#else
return read(f, buf, (unsigned int)nbytes);
#endif
@@ -132,12 +126,10 @@ ssize_t io_buf::read_file_or_socket(int f, void* buf, size_t nbytes) {
ssize_t io_buf::write_file_or_socket(int f, const void* buf, size_t nbytes)
{
#ifdef _WIN32
- if (is_socket(f)) {
+ if (is_socket(f))
return send(f, reinterpret_cast<const char*>(buf), static_cast<int>(nbytes), 0);
- }
- else {
+ else
return _write(f, buf, (unsigned int)nbytes);
- }
#else
return write(f, buf, (unsigned int)nbytes);
#endif
@@ -146,12 +138,10 @@ ssize_t io_buf::write_file_or_socket(int f, const void* buf, size_t nbytes)
void io_buf::close_file_or_socket(int f)
{
#ifdef _WIN32
- if (io_buf::is_socket(f)) {
+ if (io_buf::is_socket(f))
closesocket(f);
- }
- else {
+ else
_close(f);
- }
#else
close(f);
#endif
diff --git a/vowpalwabbit/kernel_svm.cc b/vowpalwabbit/kernel_svm.cc
index 0a4eec04..9a88f17e 100644
--- a/vowpalwabbit/kernel_svm.cc
+++ b/vowpalwabbit/kernel_svm.cc
@@ -106,7 +106,7 @@ namespace KSVM
{
krow.delete_v();
// free flatten example contents
- flat_example *fec = (flat_example*)calloc_or_die(1, sizeof(flat_example));
+ flat_example *fec = &calloc_or_die<flat_example>();
*fec = ex;
free_flatten_example(fec); // free contents of flat example and frees fec.
}
@@ -222,17 +222,17 @@ namespace KSVM
int save_load_flat_example(io_buf& model_file, bool read, flat_example*& fec) {
size_t brw = 1;
if(read) {
- fec = (flat_example*) calloc_or_die(1, sizeof(flat_example));
+ fec = &calloc_or_die<flat_example>();
brw = bin_read_fixed(model_file, (char*) fec, sizeof(flat_example), "");
if(brw > 0) {
if(fec->tag_len > 0) {
- fec->tag = (char*) calloc_or_die(fec->tag_len, sizeof(char));
+ fec->tag = calloc_or_die<char>(fec->tag_len);
brw = bin_read_fixed(model_file, (char*) fec->tag, fec->tag_len*sizeof(char), "");
if(!brw) return 2;
}
if(fec->feature_map_len > 0) {
- fec->feature_map = (feature*) calloc_or_die(fec->feature_map_len, sizeof(feature));
+ fec->feature_map = calloc_or_die<feature>(fec->feature_map_len);
brw = bin_read_fixed(model_file, (char*) fec->feature_map, fec->feature_map_len*sizeof(feature), ""); if(!brw) return 3;
}
}
@@ -277,7 +277,7 @@ namespace KSVM
for(uint32_t i = 0;i < model->num_support;i++) {
if(read) {
save_load_flat_example(model_file, read, fec);
- svm_example* tmp= (svm_example*)calloc_or_die(1,sizeof(svm_example));
+ svm_example* tmp= &calloc_or_die<svm_example>();
tmp->init_svm_example(fec);
model->support_vec.push_back(tmp);
}
@@ -395,10 +395,10 @@ namespace KSVM
}
}
- void predict(svm_params& params, learner &base, example& ec) {
+ void predict(svm_params& params, base_learner &base, example& ec) {
flat_example* fec = flatten_sort_example(*(params.all),&ec);
if(fec) {
- svm_example* sec = (svm_example*)calloc_or_die(1, sizeof(svm_example));
+ svm_example* sec = &calloc_or_die<svm_example>();
sec->init_svm_example(fec);
float score;
predict(params, &sec, &score, 1);
@@ -556,7 +556,7 @@ namespace KSVM
}
- size_t* sizes = (size_t*) calloc_or_die(all.total, sizeof(size_t));
+ size_t* sizes = calloc_or_die<size_t>(all.total);
sizes[all.node] = b->space.end - b->space.begin;
//cerr<<"Sizes = "<<sizes[all.node]<<" ";
all_reduce<size_t, add_size_t>(sizes, all.total, all.span_server, all.unique_id, all.total, all.node, all.socks);
@@ -570,7 +570,7 @@ namespace KSVM
//cerr<<total_sum<<" "<<prev_sum<<endl;
if(total_sum > 0) {
- queries = (char*) calloc_or_die(total_sum, sizeof(char));
+ queries = calloc_or_die<char>(total_sum);
memcpy(queries + prev_sum, b->space.begin, b->space.end - b->space.begin);
b->space.delete_v();
all_reduce<char, copy_char>(queries, total_sum, all.span_server, all.unique_id, all.total, all.node, all.socks);
@@ -584,7 +584,7 @@ namespace KSVM
for(size_t i = 0;i < params.pool_size; i++) {
if(!save_load_flat_example(*b, true, fec)) {
- params.pool[i] = (svm_example*)calloc_or_die(1,sizeof(svm_example));
+ params.pool[i] = &calloc_or_die<svm_example>();
params.pool[i]->init_svm_example(fec);
train_pool[i] = true;
params.pool_pos++;
@@ -617,11 +617,11 @@ namespace KSVM
//cerr<<"In train "<<params.all->training<<endl;
- bool* train_pool = (bool*)calloc_or_die(params.pool_size, sizeof(bool));
+ bool* train_pool = calloc_or_die<bool>(params.pool_size);
for(size_t i = 0;i < params.pool_size;i++)
train_pool[i] = false;
- float* scores = (float*)calloc_or_die(params.pool_pos, sizeof(float));
+ float* scores = calloc_or_die<float>(params.pool_pos);
predict(params, params.pool, scores, params.pool_pos);
//cout<<scores[0]<<endl;
@@ -690,7 +690,7 @@ namespace KSVM
bool overshoot = update(params, model_pos);
//cerr<<model_pos<<":alpha = "<<model->alpha[model_pos]<<endl;
- double* subopt = (double*)calloc_or_die(model->num_support,sizeof(double));
+ double* subopt = calloc_or_die<double>(model->num_support);
for(size_t j = 0;j < params.reprocess;j++) {
if(model->num_support == 0) break;
//cerr<<"reprocess: ";
@@ -733,13 +733,13 @@ namespace KSVM
//cerr<<params.model->support_vec[0]->example_counter<<endl;
}
- void learn(svm_params& params, learner& base, example& ec) {
+ void learn(svm_params& params, base_learner& base, example& ec) {
flat_example* fec = flatten_sort_example(*(params.all),&ec);
// for(int i = 0;i < fec->feature_map_len;i++)
// cout<<i<<":"<<fec->feature_map[i].x<<" "<<fec->feature_map[i].weight_index<<" ";
// cout<<endl;
if(fec) {
- svm_example* sec = (svm_example*)calloc_or_die(1, sizeof(svm_example));
+ svm_example* sec = &calloc_or_die<svm_example>();
sec->init_svm_example(fec);
float score = 0;
predict(params, &sec, &score, 1);
@@ -791,7 +791,7 @@ namespace KSVM
}
- LEARNER::learner* setup(vw &all, po::variables_map& vm) {
+ LEARNER::base_learner* setup(vw &all, po::variables_map& vm) {
po::options_description desc("KSVM options");
desc.add_options()
("reprocess", po::value<size_t>(), "number of reprocess steps for LASVM")
@@ -810,56 +810,54 @@ namespace KSVM
string loss_function = "hinge";
float loss_parameter = 0.0;
delete all.loss;
- all.loss = getLossFunction(&all, loss_function, (float)loss_parameter);
-
- svm_params* params = (svm_params*) calloc_or_die(1,sizeof(svm_params));
- params->model = (svm_model*) calloc_or_die(1,sizeof(svm_model));
- params->model->num_support = 0;
- //params->curcache = 0;
- params->maxcache = 1024*1024*1024;
- params->loss_sum = 0.;
- params->all = &all;
+ all.loss = getLossFunction(all, loss_function, (float)loss_parameter);
+
+ svm_params& params = calloc_or_die<svm_params>();
+ params.model = &calloc_or_die<svm_model>();
+ params.model->num_support = 0;
+ //params.curcache = 0;
+ params.maxcache = 1024*1024*1024;
+ params.loss_sum = 0.;
+ params.all = &all;
if(vm.count("reprocess"))
- params->reprocess = vm["reprocess"].as<std::size_t>();
+ params.reprocess = vm["reprocess"].as<std::size_t>();
else
- params->reprocess = 1;
+ params.reprocess = 1;
if(vm.count("active"))
- params->active = true;
- if(params->active) {
+ params.active = true;
+ if(params.active) {
if(vm.count("active_c"))
- params->active_c = vm["active_c"].as<double>();
+ params.active_c = vm["active_c"].as<double>();
else
- params->active_c = 1.;
+ params.active_c = 1.;
if(vm.count("pool_greedy"))
- params->active_pool_greedy = 1;
+ params.active_pool_greedy = 1;
/*if(vm.count("para_active"))
- params->para_active = 1;*/
+ params.para_active = 1;*/
}
if(vm.count("pool_size"))
- params->pool_size = vm["pool_size"].as<std::size_t>();
+ params.pool_size = vm["pool_size"].as<std::size_t>();
else
- params->pool_size = 1;
+ params.pool_size = 1;
- params->pool = (svm_example**)calloc_or_die(params->pool_size, sizeof(svm_example*));
- params->pool_pos = 0;
+ params.pool = calloc_or_die<svm_example*>(params.pool_size);
+ params.pool_pos = 0;
if(vm.count("subsample"))
- params->subsample = vm["subsample"].as<std::size_t>();
- else if(params->para_active)
- params->subsample = (size_t)ceil(params->pool_size / all.total);
+ params.subsample = vm["subsample"].as<std::size_t>();
+ else if(params.para_active)
+ params.subsample = (size_t)ceil(params.pool_size / all.total);
else
- params->subsample = 1;
+ params.subsample = 1;
- params->lambda = all.l2_lambda;
+ params.lambda = all.l2_lambda;
- std::stringstream ss1, ss2;
- ss1 <<" --lambda "<< params->lambda;
- all.file_options.append(ss1.str());
+ *all.file_options <<" --lambda "<< params.lambda;
- cerr<<"Lambda = "<<params->lambda<<endl;
+ cerr<<"Lambda = "<<params.lambda<<endl;
std::string kernel_type;
@@ -868,48 +866,42 @@ namespace KSVM
else
kernel_type = string("linear");
- ss2 <<" --kernel "<< kernel_type;
- all.file_options.append(ss2.str());
+ *all.file_options <<" --kernel "<< kernel_type;
cerr<<"Kernel = "<<kernel_type<<endl;
if(kernel_type.compare("rbf") == 0) {
- params->kernel_type = SVM_KER_RBF;
+ params.kernel_type = SVM_KER_RBF;
float bandwidth = 1.;
if(vm.count("bandwidth")) {
- std::stringstream ss;
bandwidth = vm["bandwidth"].as<float>();
- ss<<" --bandwidth "<<bandwidth;
- all.file_options.append(ss.str());
+ *all.file_options <<" --bandwidth "<<bandwidth;
}
cerr<<"bandwidth = "<<bandwidth<<endl;
- params->kernel_params = calloc_or_die(1,sizeof(double*));
- *((float*)params->kernel_params) = bandwidth;
+ params.kernel_params = &calloc_or_die<double>();
+ *((float*)params.kernel_params) = bandwidth;
}
else if(kernel_type.compare("poly") == 0) {
- params->kernel_type = SVM_KER_POLY;
+ params.kernel_type = SVM_KER_POLY;
int degree = 2;
if(vm.count("degree")) {
- std::stringstream ss;
degree = vm["degree"].as<int>();
- ss<<" --degree "<<degree;
- all.file_options.append(ss.str());
+ *all.file_options <<" --degree "<<degree;
}
cerr<<"degree = "<<degree<<endl;
- params->kernel_params = calloc_or_die(1,sizeof(int*));
- *((int*)params->kernel_params) = degree;
+ params.kernel_params = &calloc_or_die<int>();
+ *((int*)params.kernel_params) = degree;
}
else
- params->kernel_type = SVM_KER_LIN;
+ params.kernel_type = SVM_KER_LIN;
- params->all->reg.weight_mask = (uint32_t)LONG_MAX;
- params->all->reg.stride_shift = 0;
+ params.all->reg.weight_mask = (uint32_t)LONG_MAX;
+ params.all->reg.stride_shift = 0;
- learner* l = new learner(params, 1);
- l->set_learn<svm_params, learn>();
- l->set_predict<svm_params, predict>();
- l->set_save_load<svm_params, save_load>();
- l->set_finish<svm_params, finish>();
- return l;
+ learner<svm_params>& l = init_learner(&params, learn, 1);
+ l.set_predict(predict);
+ l.set_save_load(save_load);
+ l.set_finish(finish);
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/kernel_svm.h b/vowpalwabbit/kernel_svm.h
index d195c7ca..7a65a051 100644
--- a/vowpalwabbit/kernel_svm.h
+++ b/vowpalwabbit/kernel_svm.h
@@ -6,5 +6,5 @@ license as described in the file LICENSE.
#pragma once
namespace KSVM
{
-LEARNER::learner* setup(vw &all, po::variables_map& vm);
+LEARNER::base_learner* setup(vw &all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/lda_core.cc b/vowpalwabbit/lda_core.cc
index 90f9e171..8da81a4d 100644
--- a/vowpalwabbit/lda_core.cc
+++ b/vowpalwabbit/lda_core.cc
@@ -691,7 +691,7 @@ void save_load(lda& l, io_buf& model_file, bool read, bool text)
l.doc_lengths.erase();
}
- void learn(lda& l, learner& base, example& ec)
+ void learn(lda& l, base_learner& base, example& ec)
{
size_t num_ex = l.examples.size();
l.examples.push_back(&ec);
@@ -709,7 +709,7 @@ void save_load(lda& l, io_buf& model_file, bool read, bool text)
}
// placeholder
- void predict(lda& l, learner& base, example& ec)
+ void predict(lda& l, base_learner& base, example& ec)
{
learn(l, base, ec);
}
@@ -746,13 +746,13 @@ void end_examples(lda& l)
ld.v.delete_v();
}
-learner* setup(vw&all, po::variables_map& vm)
+base_learner* setup(vw&all, po::variables_map& vm)
{
- lda* ld = (lda*)calloc_or_die(1,sizeof(lda));
- ld->sorted_features = vector<index_feature>();
- ld->total_lambda_init = 0;
- ld->all = &all;
- ld->example_t = all.initial_t;
+ lda& ld = calloc_or_die<lda>();
+ ld.sorted_features = vector<index_feature>();
+ ld.total_lambda_init = 0;
+ ld.all = &all;
+ ld.example_t = all.initial_t;
po::options_description lda_opts("LDA options");
lda_opts.add_options()
@@ -769,9 +769,7 @@ learner* setup(vw&all, po::variables_map& vm)
all.random_weights = true;
all.add_constant = false;
- std::stringstream ss;
- ss << " --lda " << all.lda;
- all.file_options.append(ss.str());
+ *all.file_options << " --lda " << all.lda;
if (all.eta > 1.)
{
@@ -784,19 +782,18 @@ learner* setup(vw&all, po::variables_map& vm)
all.p->ring_size = all.p->ring_size > minibatch2 ? all.p->ring_size : minibatch2;
}
- ld->v.resize(all.lda*all.minibatch);
+ ld.v.resize(all.lda*all.minibatch);
- ld->decay_levels.push_back(0.f);
-
- learner* l = new learner(ld, 1 << all.reg.stride_shift);
- l->set_learn<lda,learn>();
- l->set_predict<lda,predict>();
- l->set_save_load<lda,save_load>();
- l->set_finish_example<lda,finish_example>();
- l->set_end_examples<lda,end_examples>();
- l->set_end_pass<lda,end_pass>();
- l->set_finish<lda,finish>();
+ ld.decay_levels.push_back(0.f);
+
+ learner<lda>& l = init_learner(&ld, learn, 1 << all.reg.stride_shift);
+ l.set_predict(predict);
+ l.set_save_load(save_load);
+ l.set_finish_example(finish_example);
+ l.set_end_examples(end_examples);
+ l.set_end_pass(end_pass);
+ l.set_finish(finish);
- return l;
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/lda_core.h b/vowpalwabbit/lda_core.h
index 3a377d9f..2a065783 100644
--- a/vowpalwabbit/lda_core.h
+++ b/vowpalwabbit/lda_core.h
@@ -5,5 +5,5 @@ license as described in the file LICENSE.
*/
#pragma once
namespace LDA{
- LEARNER::learner* setup(vw&, po::variables_map&);
+ LEARNER::base_learner* setup(vw&, po::variables_map&);
}
diff --git a/vowpalwabbit/learner.cc b/vowpalwabbit/learner.cc
index e718d309..0376a00c 100644
--- a/vowpalwabbit/learner.cc
+++ b/vowpalwabbit/learner.cc
@@ -2,8 +2,7 @@
#include "parser.h"
#include "learner.h"
#include "vw.h"
-
-void save_predictor(vw& all, string reg_name, size_t current_pass);
+#include "parse_regressor.h"
void dispatch_example(vw& all, example& ec)
{
@@ -16,44 +15,55 @@ void dispatch_example(vw& all, example& ec)
namespace LEARNER
{
- void generic_driver(vw* all)
+ void generic_driver(vw& all)
{
example* ec = NULL;
- all->l->init_driver();
- while ( true )
+ all.l->init_driver();
+ while ( all.early_terminate == false )
{
- if ((ec = VW::get_example(all->p)) != NULL)//semiblocking operation.
+ if ((ec = VW::get_example(all.p)) != NULL)//semiblocking operation.
{
if (ec->indices.size() > 1) // 1+ nonconstant feature. (most common case first)
- dispatch_example(*all, *ec);
+ dispatch_example(all, *ec);
else if (ec->end_pass)
{
- all->l->end_pass();
- VW::finish_example(*all,ec);
+ all.l->end_pass();
+ VW::finish_example(all, ec);
}
else if (ec->tag.size() >= 4 && !strncmp((const char*) ec->tag.begin, "save", 4))
{// save state command
- string final_regressor_name = all->final_regressor_name;
+ string final_regressor_name = all.final_regressor_name;
if ((ec->tag).size() >= 6 && (ec->tag)[4] == '_')
final_regressor_name = string(ec->tag.begin+5, (ec->tag).size()-5);
- if (!all->quiet)
+ if (!all.quiet)
cerr << "saving regressor to " << final_regressor_name << endl;
- save_predictor(*all, final_regressor_name, 0);
+ save_predictor(all, final_regressor_name, 0);
- VW::finish_example(*all,ec);
+ VW::finish_example(all,ec);
}
else // empty example
- dispatch_example(*all, *ec);
+ dispatch_example(all, *ec);
}
- else if (parser_done(all->p))
+ else if (parser_done(all.p))
{
- all->l->end_examples();
+ all.l->end_examples();
return;
}
}
+ if (all.early_terminate) //drain any extra examples from parser and call end_examples
+ while ( true ) // loop exits via the parser_done return below
+ {
+ if ((ec = VW::get_example(all.p)) != NULL)//semiblocking operation.
+ VW::finish_example(all, ec);
+ else if (parser_done(all.p))
+ {
+ all.l->end_examples();
+ return;
+ }
+ }
}
}
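generic_driver now takes vw& and supports cooperative early termination: the dispatch loop exits once all.early_terminate is set, and a drain loop then hands any already-parsed examples straight back so the parser thread can finish cleanly and end_examples still runs. A toy model of that control flow (the queue and function names below are simplifications, not VW's parser):

    #include <cstdio>
    #include <queue>

    static std::queue<int> parsed;          // stands in for the parser's queue
    static bool early_terminate = false;

    static bool parser_done() { return parsed.empty(); }
    static bool get_example(int& ec)
    { if (parsed.empty()) return false;
      ec = parsed.front(); parsed.pop(); return true; }
    static void finish_example(int ec) { printf("returned %d\n", ec); }

    int main()
    { for (int i = 0; i < 5; i++) parsed.push(i);
      int ec;
      while (!early_terminate)              // main dispatch loop
      { if (get_example(ec))
        { printf("dispatched %d\n", ec);
          if (ec == 2) early_terminate = true; // e.g. early stopping fired
        }
        else if (parser_done()) return 0;
      }
      while (true)                          // drain remaining examples
      { if (get_example(ec)) finish_example(ec);
        else if (parser_done()) return 0;   // then end_examples would run
      }
    }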
diff --git a/vowpalwabbit/learner.h b/vowpalwabbit/learner.h
index 7deeb4d2..2649429c 100644
--- a/vowpalwabbit/learner.h
+++ b/vowpalwabbit/learner.h
@@ -6,6 +6,7 @@ license as described in the file LICENSE.
#pragma once
// This is the interface for a learning algorithm
#include<iostream>
+#include"memory.h"
using namespace std;
struct vw;
@@ -13,15 +14,16 @@ void return_simple_example(vw& all, void*, example& ec);
namespace LEARNER
{
- struct learner;
+ template<class T> struct learner;
+ typedef learner<char> base_learner;
struct func_data {
void* data;
- learner* base;
+ base_learner* base;
void (*func)(void* data);
};
- inline func_data tuple_dbf(void* data, learner* base, void (*func)(void* data))
+ inline func_data tuple_dbf(void* data, base_learner* base, void (*func)(void* data))
{
func_data foo;
foo.data = data;
@@ -32,202 +34,177 @@ namespace LEARNER
struct learn_data {
void* data;
- learner* base;
- void (*learn_f)(void* data, learner& base, example&);
- void (*predict_f)(void* data, learner& base, example&);
- void (*update_f)(void* data, learner& base, example&);
+ base_learner* base;
+ void (*learn_f)(void* data, base_learner& base, example&);
+ void (*predict_f)(void* data, base_learner& base, example&);
+ void (*update_f)(void* data, base_learner& base, example&);
};
struct save_load_data{
void* data;
- learner* base;
+ base_learner* base;
void (*save_load_f)(void*, io_buf&, bool read, bool text);
};
struct finish_example_data{
void* data;
- learner* base;
+ base_learner* base;
void (*finish_example_f)(vw&, void* data, example&);
};
- void generic_driver(vw* all);
+ void generic_driver(vw& all);
- inline void generic_sl(void*, io_buf&, bool, bool) {}
- inline void generic_learner(void* data, learner& base, example&) {}
- inline void generic_func(void* data) {}
-
- const save_load_data generic_save_load_fd = {NULL, NULL, generic_sl};
- const learn_data generic_learn_fd = {NULL, NULL, generic_learner, generic_learner, NULL};
- const func_data generic_func_fd = {NULL, NULL, generic_func};
+ inline void noop_sl(void*, io_buf&, bool, bool) {}
+ inline void noop(void* data) {}
+
+ typedef void (*tlearn)(void* d, base_learner& base, example& ec);
+ typedef void (*tsl)(void* d, io_buf& io, bool read, bool text);
+ typedef void (*tfunc)(void*d);
+ typedef void (*tend_example)(vw& all, void* d, example& ec);
+
+ template<class T> learner<T>& init_learner(T*, void (*)(T&, base_learner&, example&), size_t);
+ template<class T>
+ learner<T>& init_learner(T*, base_learner*, void (*learn)(T&, base_learner&, example&),
+ void (*predict)(T&, base_learner&, example&), size_t ws = 1);
- template<class R, void (*T)(R&, learner& base, example& ec)>
- inline void tlearn(void* d, learner& base, example& ec)
- { T(*(R*)d, base, ec); }
-
- template<class R, void (*T)(R&, io_buf& io, bool read, bool text)>
- inline void tsl(void* d, io_buf& io, bool read, bool text)
- { T(*(R*)d, io, read, text); }
-
- template<class R, void (*T)(R&)>
- inline void tfunc(void* d) { T(*(R*)d); }
-
- template<class R, void (*T)(vw& all, R&, example&)>
- inline void tend_example(vw& all, void* d, example& ec)
- { T(all, *(R*)d, ec); }
-
- template <class T, void (*learn)(T* data, learner& base, example&), void (*predict)(T* data, learner& base, example&)>
- struct learn_helper {
- void (*learn_f)(void* data, learner& base, example&);
- void (*predict_f)(void* data, learner& base, example&);
+ template<class T>
+ struct learner {
+ private:
+ func_data init_fd;
+ learn_data learn_fd;
+ finish_example_data finish_example_fd;
+ save_load_data save_load_fd;
+ func_data end_pass_fd;
+ func_data end_examples_fd;
+ func_data finisher_fd;
+
+ public:
+ size_t weights; //this stores the number of "weight vectors" required by the learner.
+ size_t increment;
- learn_helper()
- { learn_f = tlearn<T,learn>;
- predict_f = tlearn<T,predict>;
+ //called once for each example. Must work under reduction.
+ inline void learn(example& ec, size_t i=0)
+ {
+ ec.ft_offset += (uint32_t)(increment*i);
+ learn_fd.learn_f(learn_fd.data, *learn_fd.base, ec);
+ ec.ft_offset -= (uint32_t)(increment*i);
}
+ inline void predict(example& ec, size_t i=0)
+ {
+ ec.ft_offset += (uint32_t)(increment*i);
+ learn_fd.predict_f(learn_fd.data, *learn_fd.base, ec);
+ ec.ft_offset -= (uint32_t)(increment*i);
+ }
+ inline void set_predict(void (*u)(T& data, base_learner& base, example&))
+ { learn_fd.predict_f = (tlearn)u; }
+
+ inline void update(example& ec, size_t i=0)
+ {
+ ec.ft_offset += (uint32_t)(increment*i);
+ learn_fd.update_f(learn_fd.data, *learn_fd.base, ec);
+ ec.ft_offset -= (uint32_t)(increment*i);
+ }
+ inline void set_update(void (*u)(T& data, base_learner& base, example&))
+ { learn_fd.update_f = (tlearn)u; }
+
+ //called anytime saving or loading needs to happen. Autorecursive.
+ inline void save_load(io_buf& io, bool read, bool text)
+ { save_load_fd.save_load_f(save_load_fd.data, io, read, text);
+ if (save_load_fd.base) save_load_fd.base->save_load(io, read, text); }
+ inline void set_save_load(void (*sl)(T&, io_buf&, bool, bool))
+ { save_load_fd.save_load_f = (tsl)sl;
+ save_load_fd.data = learn_fd.data;
+ save_load_fd.base = learn_fd.base;}
+
+ //called to clean up state. Autorecursive.
+ void set_finish(void (*f)(T&))
+ { finisher_fd = tuple_dbf(learn_fd.data,learn_fd.base, (tfunc)f); }
+ inline void finish()
+ {
+ if (finisher_fd.data)
+ {finisher_fd.func(finisher_fd.data); free(finisher_fd.data); }
+ if (finisher_fd.base) {
+ finisher_fd.base->finish();
+ free(finisher_fd.base);
+ }
+ }
+
+ void end_pass(){
+ end_pass_fd.func(end_pass_fd.data);
+ if (end_pass_fd.base) end_pass_fd.base->end_pass(); }//autorecursive
+ void set_end_pass(void (*f)(T&))
+ {end_pass_fd = tuple_dbf(learn_fd.data, learn_fd.base, (tfunc)f);}
+
+ //called after parsing of examples is complete. Autorecursive.
+ void end_examples()
+ { end_examples_fd.func(end_examples_fd.data);
+ if (end_examples_fd.base) end_examples_fd.base->end_examples(); }
+ void set_end_examples(void (*f)(T&))
+ {end_examples_fd = tuple_dbf(learn_fd.data,learn_fd.base, (tfunc)f);}
+
+ //Called at the beginning by the driver. Explicitly not recursive.
+ void init_driver() { init_fd.func(init_fd.data);}
+ void set_init_driver(void (*f)(T&))
+ { init_fd = tuple_dbf(learn_fd.data,learn_fd.base, (tfunc)f); }
+
+ //called after learn example for each example. Explicitly not recursive.
+ inline void finish_example(vw& all, example& ec)
+ { finish_example_fd.finish_example_f(all, finish_example_fd.data, ec);}
+ void set_finish_example(void (*f)(vw& all, T&, example&))
+ {finish_example_fd.data = learn_fd.data;
+ finish_example_fd.finish_example_f = (tend_example)f;}
+
+ friend learner<T>& init_learner<>(T*, void (*learn)(T&, base_learner&, example&), size_t);
+ friend learner<T>& init_learner<>(T*, base_learner*, void (*l)(T&, base_learner&, example&),
+ void (*pred)(T&, base_learner&, example&), size_t);
};
-
-struct learner {
-private:
- func_data init_fd;
- learn_data learn_fd;
- finish_example_data finish_example_fd;
- save_load_data save_load_fd;
- func_data end_pass_fd;
- func_data end_examples_fd;
- func_data finisher_fd;
-public:
- size_t weights; //this stores the number of "weight vectors" required by the learner.
- size_t increment;
-
- //called once for each example. Must work under reduction.
- inline void learn(example& ec, size_t i=0)
- {
- ec.ft_offset += (uint32_t)(increment*i);
- learn_fd.learn_f(learn_fd.data, *learn_fd.base, ec);
- ec.ft_offset -= (uint32_t)(increment*i);
- }
- template <class T, void (*u)(T& data, learner& base, example&)>
- inline void set_learn()
- {
- learn_fd.learn_f = tlearn<T,u>;
- learn_fd.update_f = tlearn<T,u>;
- }
-
- inline void predict(example& ec, size_t i=0)
- {
- ec.ft_offset += (uint32_t)(increment*i);
- learn_fd.predict_f(learn_fd.data, *learn_fd.base, ec);
- ec.ft_offset -= (uint32_t)(increment*i);
- }
- template <class T, void (*u)(T& data, learner& base, example&)>
- inline void set_predict()
- {
- learn_fd.predict_f = tlearn<T,u>;
- }
-
- inline void update(example& ec, size_t i=0)
- {
- ec.ft_offset += (uint32_t)(increment*i);
- learn_fd.update_f(learn_fd.data, *learn_fd.base, ec);
- ec.ft_offset -= (uint32_t)(increment*i);
- }
- template <class T, void (*u)(T& data, learner& base, example&)>
- inline void set_update()
- {
- learn_fd.update_f = tlearn<T,u>;
- }
-
- //called anytime saving or loading needs to happen. Autorecursive.
- inline void save_load(io_buf& io, bool read, bool text) { save_load_fd.save_load_f(save_load_fd.data, io, read, text); if (save_load_fd.base) save_load_fd.base->save_load(io, read, text); }
- template <class T, void (*sl)(T&, io_buf&, bool, bool)>
- inline void set_save_load()
- { save_load_fd.save_load_f = tsl<T,sl>;
- save_load_fd.data = learn_fd.data;
- save_load_fd.base = learn_fd.base;}
-
- //called to clean up state. Autorecursive.
- template <class T, void (*f)(T&)>
- void set_finish() { finisher_fd = tuple_dbf(learn_fd.data,learn_fd.base, tfunc<T, f>); }
- inline void finish()
- {
- if (finisher_fd.data)
- {finisher_fd.func(finisher_fd.data); free(finisher_fd.data); }
- if (finisher_fd.base) {
- finisher_fd.base->finish();
- delete finisher_fd.base;
+ template<class T>
+ learner<T>& init_learner(T* dat, void (*learn)(T&, base_learner&, example&),
+ size_t params_per_weight)
+ { // the constructor for all learning algorithms.
+ learner<T>& ret = calloc_or_die<learner<T> >();
+ ret.weights = 1;
+ ret.increment = params_per_weight;
+ ret.end_pass_fd.func = noop;
+ ret.end_examples_fd.func = noop;
+ ret.init_fd.func = noop;
+ ret.save_load_fd.save_load_f = noop_sl;
+ ret.finisher_fd.data = dat;
+ ret.finisher_fd.func = noop;
+
+ ret.learn_fd.data = dat;
+ ret.learn_fd.learn_f = (tlearn)learn;
+ ret.learn_fd.update_f = (tlearn)learn;
+ ret.learn_fd.predict_f = (tlearn)learn;
+ ret.finish_example_fd.data = dat;
+ ret.finish_example_fd.finish_example_f = return_simple_example;
+
+ return ret;
}
- }
-
- void end_pass(){
- end_pass_fd.func(end_pass_fd.data);
- if (end_pass_fd.base) end_pass_fd.base->end_pass(); }//autorecursive
- template <class T, void (*f)(T&)>
- void set_end_pass() {end_pass_fd = tuple_dbf(learn_fd.data, learn_fd.base, tfunc<T,f>);}
-
- //called after parsing of examples is complete. Autorecursive.
- void end_examples()
- { end_examples_fd.func(end_examples_fd.data);
- if (end_examples_fd.base) end_examples_fd.base->end_examples(); }
- template <class T, void (*f)(T&)>
- void set_end_examples() {end_examples_fd = tuple_dbf(learn_fd.data,learn_fd.base, tfunc<T,f>);}
-
- //Called at the beginning by the driver. Explicitly not recursive.
- void init_driver() { init_fd.func(init_fd.data);}
- template <class T, void (*f)(T&)>
- void set_init_driver() { init_fd = tuple_dbf(learn_fd.data,learn_fd.base, tfunc<T,f>); }
-
- //called after learn example for each example. Explicitly not recursive.
- inline void finish_example(vw& all, example& ec) { finish_example_fd.finish_example_f(all, finish_example_fd.data, ec);}
- template<class T, void (*f)(vw& all, T&, example&)>
- void set_finish_example()
- {finish_example_fd.data = learn_fd.data;
- finish_example_fd.finish_example_f = tend_example<T,f>;}
-
- void driver(vw* all) {LEARNER::generic_driver(all);}
-
- inline learner()
- {
- weights = 1;
- increment = 1;
-
- learn_fd = LEARNER::generic_learn_fd;
- finish_example_fd.data = NULL;
- finish_example_fd.finish_example_f = return_simple_example;
- end_pass_fd = LEARNER::generic_func_fd;
- end_examples_fd = LEARNER::generic_func_fd;
- init_fd = LEARNER::generic_func_fd;
- finisher_fd = LEARNER::generic_func_fd;
- save_load_fd = LEARNER::generic_save_load_fd;
- }
-
- inline learner(void* dat, size_t params_per_weight)
- { // the constructor for all learning algorithms.
- *this = learner();
-
- learn_fd.data = dat;
-
- finisher_fd.data = dat;
- finisher_fd.base = NULL;
- finisher_fd.func = LEARNER::generic_func;
-
- increment = params_per_weight;
- }
-
- inline learner(void *dat, learner* base, size_t ws = 1)
- { //the reduction constructor, with separate learn and predict functions
- *this = *base;
-
- learn_fd.data = dat;
- learn_fd.base = base;
-
- finisher_fd.data = dat;
- finisher_fd.base = base;
- finisher_fd.func = LEARNER::generic_func;
-
- weights = ws;
- increment = base->increment * weights;
- }
-};
-
+
+ template<class T>
+ learner<T>& init_learner(T* dat, base_learner* base,
+ void (*learn)(T&, base_learner&, example&),
+ void (*predict)(T&, base_learner&, example&), size_t ws = 1)
+ { //the reduction constructor, with separate learn and predict functions
+ learner<T>& ret = calloc_or_die<learner<T> >();
+ ret = *(learner<T>*)base;
+
+ ret.learn_fd.data = dat;
+ ret.learn_fd.learn_f = (tlearn)learn;
+ ret.learn_fd.update_f = (tlearn)learn;
+ ret.learn_fd.predict_f = (tlearn)predict;
+ ret.learn_fd.base = base;
+
+ ret.finisher_fd.data = dat;
+ ret.finisher_fd.base = base;
+ ret.finisher_fd.func = noop;
+
+ ret.weights = ws;
+ ret.increment = base->increment * ret.weights;
+ return ret;
+ }
+
+ template<class T> base_learner* make_base(learner<T>& base) { return (base_learner*)&base; }
}
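The heart of the refactor: learner becomes a template learner<T>, base_learner is the erased alias learner<char>, typed callbacks are stored through C-style casts to the void*-based typedefs (tlearn, tsl, tfunc, tend_example), and make_base performs the final pointer cast. A stripped-down model of the trick (tiny_learner is illustrative only; like learner.h it relies on T& and void* parameters sharing a calling convention, which holds on mainstream ABIs though it is formally undefined behavior):

    #include <cstdio>
    #include <cstdlib>

    typedef void (*erased_fn)(void* data);

    template <class T>
    struct tiny_learner {
      void* data;
      erased_fn learn_f;
      void learn() { learn_f(data); }       // call through the erased pointer
      void set_learn(void (*f)(T&)) { learn_f = (erased_fn)f; }
    };
    typedef tiny_learner<char> tiny_base;   // the base_learner analogue

    struct counter { int n; };
    void bump(counter& c) { c.n++; }

    int main()
    { counter c = {0};
      tiny_learner<counter>& l =
        *(tiny_learner<counter>*)calloc(1, sizeof(tiny_learner<counter>));
      l.data = &c;
      l.set_learn(bump);
      tiny_base* b = (tiny_base*)&l;        // make_base analogue
      b->learn();                           // dispatches to bump
      printf("%d\n", c.n);                  // 1
      free(&l);
      return 0;
    }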
diff --git a/vowpalwabbit/log_multi.cc b/vowpalwabbit/log_multi.cc
index e2859972..226376bd 100644
--- a/vowpalwabbit/log_multi.cc
+++ b/vowpalwabbit/log_multi.cc
@@ -110,6 +110,7 @@ namespace LOG_MULTI
node.parent = 0;
node.min_count = 0;
+ node.preds = v_init<node_pred>();
init_leaf(node);
return node;
@@ -245,7 +246,7 @@ namespace LOG_MULTI
return b.nodes[current].internal;
}
- void train_node(log_multi& b, learner& base, example& ec, uint32_t& current, uint32_t& class_index)
+ void train_node(log_multi& b, base_learner& base, example& ec, uint32_t& current, uint32_t& class_index)
{
if(b.nodes[current].norm_Eh > b.nodes[current].preds[class_index].norm_Ehk)
ec.l.simple.label = -1.f;
@@ -296,7 +297,7 @@ namespace LOG_MULTI
return n.right;
}
- void predict(log_multi& b, learner& base, example& ec)
+ void predict(log_multi& b, base_learner& base, example& ec)
{
MULTICLASS::multiclass mc = ec.l.multi;
@@ -315,7 +316,7 @@ namespace LOG_MULTI
ec.l.multi = mc;
}
- void learn(log_multi& b, learner& base, example& ec)
+ void learn(log_multi& b, base_learner& base, example& ec)
{
// verify_min_dfs(b, b.nodes[0]);
@@ -495,54 +496,44 @@ namespace LOG_MULTI
}
}
- void finish_example(vw& all, log_multi&, example& ec)
- {
- MULTICLASS::output_example(all, ec);
- VW::finish_example(all, &ec);
- }
+ void finish_example(vw& all, log_multi&, example& ec) { MULTICLASS::finish_example(all, ec); }
- learner* setup(vw& all, po::variables_map& vm) //learner setup
+ base_learner* setup(vw& all, po::variables_map& vm) //learner setup
{
- log_multi* data = (log_multi*)calloc(1, sizeof(log_multi));
+ log_multi& data = calloc_or_die<log_multi>();
po::options_description opts("TXM Online options");
opts.add_options()
("no_progress", "disable progressive validation")
- ("swap_resistance", po::value<uint32_t>(&(data->swap_resist))->default_value(4), "higher = more resistance to swap, default=4");
+ ("swap_resistance", po::value<uint32_t>(&(data.swap_resist))->default_value(4), "higher = more resistance to swap, default=4");
vm = add_options(all, opts);
- data->k = (uint32_t)vm["log_multi"].as<size_t>();
-
- //append log_multi with nb_actions to options_from_file so it is saved to regressor later
- std::stringstream ss;
- ss << " --log_multi " << data->k;
- all.file_options.append(ss.str());
+ data.k = (uint32_t)vm["log_multi"].as<size_t>();
+ *all.file_options << " --log_multi " << data.k;
if (vm.count("no_progress"))
- data->progress = false;
+ data.progress = false;
else
- data->progress = true;
+ data.progress = true;
- data->all = &all;
+ data.all = &all;
(all.p->lp) = MULTICLASS::mc_label;
string loss_function = "quantile";
float loss_parameter = 0.5;
delete(all.loss);
- all.loss = getLossFunction(&all, loss_function, loss_parameter);
+ all.loss = getLossFunction(all, loss_function, loss_parameter);
- data->max_predictors = data->k - 1;
+ data.max_predictors = data.k - 1;
- learner* l = new learner(data, all.l, data->max_predictors);
- l->set_save_load<log_multi,save_load_tree>();
- l->set_learn<log_multi,learn>();
- l->set_predict<log_multi,predict>();
- l->set_finish_example<log_multi,finish_example>();
- l->set_finish<log_multi,finish>();
+ learner<log_multi>& l = init_learner(&data, all.l, learn, predict, data.max_predictors);
+ l.set_save_load(save_load_tree);
+ l.set_finish_example(finish_example);
+ l.set_finish(finish);
- init_tree(*data);
+ init_tree(data);
- return l;
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/log_multi.h b/vowpalwabbit/log_multi.h
index 26ecb435..5e1ee3bf 100644
--- a/vowpalwabbit/log_multi.h
+++ b/vowpalwabbit/log_multi.h
@@ -6,5 +6,5 @@ license as described in the file LICENSE.
#pragma once
namespace LOG_MULTI
{
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/loss_functions.cc b/vowpalwabbit/loss_functions.cc
index 0cdc5fa7..6badc633 100644
--- a/vowpalwabbit/loss_functions.cc
+++ b/vowpalwabbit/loss_functions.cc
@@ -127,7 +127,8 @@ public:
}
float getLoss(shared_data*, float prediction, float label) {
- assert(label == -1.f || label == 1.f);
+ if (label != -1.f && label != 1.f)
+ cout << "You are using a label not -1 or 1 with a loss function expecting that!" << endl;
float e = 1 - label*prediction;
return (e > 0) ? e : 0;
}
@@ -170,7 +171,8 @@ public:
}
float getLoss(shared_data*, float prediction, float label) {
- assert(label == -1.f || label == 1.f || label == FLT_MAX);
+      if (label != -1.f && label != 1.f && label != FLT_MAX)
+      cout << "You are using a label not -1 or 1 with a loss function expecting that!" << endl;
return log(1 + exp(-label * prediction));
}
@@ -295,21 +297,18 @@ public:
float tau;
};
-loss_function* getLossFunction(void* a, string funcName, float function_parameter) {
- vw* all=(vw*)a;
- if(funcName.compare("squared") == 0 || funcName.compare("Huber") == 0) {
+loss_function* getLossFunction(vw& all, string funcName, float function_parameter) {
+ if(funcName.compare("squared") == 0 || funcName.compare("Huber") == 0)
return new squaredloss();
- } else if(funcName.compare("classic") == 0){
+ else if(funcName.compare("classic") == 0)
return new classic_squaredloss();
- } else if(funcName.compare("hinge") == 0) {
- all->sd->binary_label = true;
+ else if(funcName.compare("hinge") == 0)
return new hingeloss();
- } else if(funcName.compare("logistic") == 0) {
- if (all->set_minmax != noop_mm)
+ else if(funcName.compare("logistic") == 0) {
+ if (all.set_minmax != noop_mm)
{
- all->sd->min_label = -50;
- all->sd->max_label = 50;
- all->sd->binary_label = true;
+ all.sd->min_label = -50;
+ all.sd->max_label = 50;
}
return new logloss();
} else if(funcName.compare("quantile") == 0 || funcName.compare("pinball") == 0 || funcName.compare("absolute") == 0) {
@@ -318,5 +317,4 @@ loss_function* getLossFunction(void* a, string funcName, float function_paramete
cout << "Invalid loss function name: \'" << funcName << "\' Bailing!" << endl;
throw exception();
}
- cout << "end getLossFunction" << endl;
}
diff --git a/vowpalwabbit/loss_functions.h b/vowpalwabbit/loss_functions.h
index 421b3bfe..35b6f24b 100644
--- a/vowpalwabbit/loss_functions.h
+++ b/vowpalwabbit/loss_functions.h
@@ -8,8 +8,7 @@ license as described in the file LICENSE.
#include "parse_primitives.h"
struct shared_data;
-
-using namespace std;
+struct vw;
class loss_function {
@@ -34,4 +33,4 @@ public :
virtual ~loss_function() {};
};
-loss_function* getLossFunction(void*, string funcName, float function_parameter = 0);
+loss_function* getLossFunction(vw&, std::string funcName, float function_parameter = 0);
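
For reference, the two losses whose label checks changed above compute hinge(p, y) = max(0, 1 - y*p) and logistic(p, y) = log(1 + exp(-y*p)) for y in {-1, +1}. A self-contained sketch of just the formulas (not VW code):

    #include <cmath>
    #include <cstdio>

    static float hinge_loss(float prediction, float label)
    { float e = 1 - label * prediction; return e > 0 ? e : 0; }

    static float log_loss(float prediction, float label)
    { return std::log(1 + std::exp(-label * prediction)); }

    int main()
    { // a confident correct prediction costs little under either loss
      std::printf("hinge: %f\n", hinge_loss(2.f, 1.f));  // 0
      std::printf("log:   %f\n", log_loss(2.f, 1.f));    // ~0.127
    }
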
diff --git a/vowpalwabbit/lrq.cc b/vowpalwabbit/lrq.cc
index 22dadf2d..268815d5 100644
--- a/vowpalwabbit/lrq.cc
+++ b/vowpalwabbit/lrq.cc
@@ -62,7 +62,7 @@ namespace {
namespace LRQ {
template <bool is_learn>
- void predict_or_learn(LRQstate& lrq, learner& base, example& ec)
+ void predict_or_learn(LRQstate& lrq, base_learner& base, example& ec)
{
vw& all = *lrq.all;
@@ -135,11 +135,11 @@ namespace LRQ {
if (iter == 0 && (all.audit || all.hash_inv))
{
- char* new_space = (char*)calloc_or_die(4, sizeof(char));
+ char* new_space = calloc_or_die<char>(4);
strcpy(new_space, "lrq");
size_t n_len = strlen(i->c_str () + 4);
size_t len = strlen(ra->feature) + n_len + 2;
- char* new_feature = (char*)calloc_or_die(len, sizeof(char));
+ char* new_feature = calloc_or_die<char>(len);
new_feature[0] = right;
new_feature[1] = '^';
strcat(new_feature, ra->feature);
@@ -187,43 +187,39 @@ namespace LRQ {
}
}
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{//parse and set arguments
- LRQstate* lrq = (LRQstate*) calloc (1, sizeof (LRQstate));
- unsigned int maxk = 0;
- lrq->all = &all;
-
+ LRQstate& lrq = calloc_or_die<LRQstate>();
+ size_t maxk = 0;
+ lrq.all = &all;
+
size_t random_seed = 0;
if (vm.count("random_seed")) random_seed = vm["random_seed"].as<size_t> ();
-
- lrq->initial_seed = lrq->seed = random_seed | 8675309;
- if (vm.count("lrqdropout"))
- lrq->dropout = true;
- else
- lrq->dropout = false;
-
- all.file_options.append(" --lrqdropout");
- lrq->lrpairs = vm["lrq"].as<vector<string> > ();
+ lrq.initial_seed = lrq.seed = random_seed | 8675309;
+ if (vm.count("lrqdropout"))
+ lrq.dropout = true;
+ else
+ lrq.dropout = false;
- stringstream ss;
- for (vector<string>::iterator i = lrq->lrpairs.begin ();
- i != lrq->lrpairs.end ();
- ++i)
- ss << " --lrq " << *i;
+  if (lrq.dropout)
+    *all.file_options << " --lrqdropout ";
- all.file_options.append(ss.str());
+ lrq.lrpairs = vm["lrq"].as<vector<string> > ();
+
+ for (vector<string>::iterator i = lrq.lrpairs.begin ();
+ i != lrq.lrpairs.end ();
+ ++i)
+ *all.file_options << " --lrq " << *i;
-
if (! all.quiet)
{
cerr << "creating low rank quadratic features for pairs: ";
- if (lrq->dropout)
+ if (lrq.dropout)
cerr << "(using dropout) ";
}
- for (vector<string>::iterator i = lrq->lrpairs.begin ();
- i != lrq->lrpairs.end ();
+ for (vector<string>::iterator i = lrq.lrpairs.begin ();
+ i != lrq.lrpairs.end ();
++i)
{
if(!all.quiet){
@@ -237,8 +233,8 @@ namespace LRQ {
unsigned int k = atoi (i->c_str () + 2);
- lrq->lrindices[(int) (*i)[0]] = 1;
- lrq->lrindices[(int) (*i)[1]] = 1;
+ lrq.lrindices[(int) (*i)[0]] = 1;
+ lrq.lrindices[(int) (*i)[1]] = 1;
maxk = max (maxk, k);
}
@@ -247,12 +243,11 @@ namespace LRQ {
cerr<<endl;
all.wpp = all.wpp * (1 + maxk);
- learner* l = new learner(lrq, all.l, 1 + maxk);
- l->set_learn<LRQstate, predict_or_learn<true> >();
- l->set_predict<LRQstate, predict_or_learn<false> >();
- l->set_end_pass<LRQstate,reset_seed>();
+ learner<LRQstate>& l = init_learner(&lrq, all.l, predict_or_learn<true>,
+ predict_or_learn<false>, 1 + maxk);
+ l.set_end_pass(reset_seed);
// TODO: leaks memory ?
- return l;
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/lrq.h b/vowpalwabbit/lrq.h
index af0aae77..376bd6e5 100644
--- a/vowpalwabbit/lrq.h
+++ b/vowpalwabbit/lrq.h
@@ -1,4 +1,4 @@
#pragma once
namespace LRQ {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
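
An --lrq argument names two namespaces and a rank in one token, e.g. "ab8"; setup above reads the first two characters as the namespace pair and atoi(i->c_str() + 2) as the rank. Standalone illustration of that decomposition (the values are hypothetical):

    #include <cstdlib>
    #include <cstdio>
    #include <string>

    int main()
    {
      std::string pair = "ab8";                  // two namespace letters + rank
      char left  = pair[0];                      // namespace 'a'
      char right = pair[1];                      // namespace 'b'
      unsigned int k = atoi(pair.c_str() + 2);   // rank, as in LRQ::setup
      std::printf("left=%c right=%c rank=%u\n", left, right, k);
    }
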
diff --git a/vowpalwabbit/main.cc b/vowpalwabbit/main.cc
index 96e08aa1..c7f40326 100644
--- a/vowpalwabbit/main.cc
+++ b/vowpalwabbit/main.cc
@@ -3,12 +3,6 @@ Copyright (c) by respective owners including Yahoo!, Microsoft, and
individual contributors. All rights reserved. Released under a BSD
license as described in the file LICENSE.
*/
-
-#include <math.h>
-#include <iostream>
-#include <fstream>
-#include <float.h>
-#include <time.h>
#ifdef _WIN32
#include <WinSock2.h>
#else
@@ -17,7 +11,6 @@ license as described in the file LICENSE.
#endif
#include <sys/timeb.h>
#include "global_data.h"
-#include "parse_example.h"
#include "parse_args.h"
#include "accumulate.h"
#include "best_constant.h"
@@ -44,9 +37,7 @@ int main(int argc, char *argv[])
}
VW::start_parser(*all);
-
- all->l->driver(all);
-
+ LEARNER::generic_driver(*all);
VW::end_parser(*all);
ftime(&t_end);
diff --git a/vowpalwabbit/memory.cc b/vowpalwabbit/memory.cc
index aa3772d3..7e40bf71 100644
--- a/vowpalwabbit/memory.cc
+++ b/vowpalwabbit/memory.cc
@@ -1,20 +1,6 @@
#include <stdlib.h>
-#include <iostream>
-void* calloc_or_die(size_t nmemb, size_t size)
-{
- if (nmemb == 0 || size == 0)
- return NULL;
-
- void* data = calloc(nmemb, size);
- if (data == NULL) {
- std::cerr << "internal error: memory allocation failed; dying!" << std::endl;
- throw std::exception();
- }
- return data;
-}
-
-void free_it(void*ptr)
+void free_it(void* ptr)
{
if (ptr != NULL)
free(ptr);
diff --git a/vowpalwabbit/memory.h b/vowpalwabbit/memory.h
index 266290cd..6d67d51e 100644
--- a/vowpalwabbit/memory.h
+++ b/vowpalwabbit/memory.h
@@ -1,4 +1,26 @@
#pragma once
-void* calloc_or_die(size_t nmemb, size_t size);
+
+#include <stdlib.h>
+#include <iostream>
+
+template<class T>
+T* calloc_or_die(size_t nmemb)
+{
+ if (nmemb == 0)
+ return NULL;
+
+ void* data = calloc(nmemb, sizeof(T));
+ if (data == NULL) {
+ std::cerr << "internal error: memory allocation failed; dying!" << std::endl;
+ throw std::exception();
+ }
+ return (T*)data;
+}
+
+template<class T> T& calloc_or_die()
+{
+ return *calloc_or_die<T>(1);
+}
+
void free_it(void* ptr);
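
Usage sketch for the templated allocator introduced above: the array form returns a zeroed T*, the zero-argument form a zeroed T&, and both throw on allocation failure. The counters struct is a placeholder:

    #include "memory.h"

    struct counters { size_t hits; size_t misses; };

    void demo()
    {
      counters& one  = calloc_or_die<counters>();   // single zeroed object
      counters* many = calloc_or_die<counters>(8);  // zeroed array of eight
      one.hits++;                                   // fields start at zero
      free_it(many);
      free_it(&one);
    }
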
diff --git a/vowpalwabbit/mf.cc b/vowpalwabbit/mf.cc
index 2cce6272..4e00be8d 100644
--- a/vowpalwabbit/mf.cc
+++ b/vowpalwabbit/mf.cc
@@ -43,7 +43,7 @@ struct mf {
};
template <bool cache_sub_predictions>
-void predict(mf& data, learner& base, example& ec) {
+void predict(mf& data, base_learner& base, example& ec) {
float prediction = 0;
if (cache_sub_predictions)
data.sub_predictions.resize(2*data.rank+1, true);
@@ -102,7 +102,7 @@ void predict(mf& data, learner& base, example& ec) {
ec.pred.scalar = GD::finalize_prediction(data.all->sd, ec.partial_prediction);
}
-void learn(mf& data, learner& base, example& ec) {
+void learn(mf& data, base_learner& base, example& ec) {
// predict with current weights
predict<true>(data, base, ec);
float predicted = ec.pred.scalar;
@@ -189,7 +189,7 @@ void finish(mf& o) {
}
-learner* setup(vw& all, po::variables_map& vm) {
+base_learner* setup(vw& all, po::variables_map& vm) {
mf* data = new mf;
// copy global data locally
@@ -203,10 +203,8 @@ learner* setup(vw& all, po::variables_map& vm) {
all.random_positive_weights = true;
- learner* l = new learner(data, all.l, 2*data->rank+1);
- l->set_learn<mf, learn>();
- l->set_predict<mf, predict<false> >();
- l->set_finish<mf,finish>();
- return l;
+ learner<mf>& l = init_learner(data, all.l, learn, predict<false>, 2*data->rank+1);
+ l.set_finish(finish);
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/mf.h b/vowpalwabbit/mf.h
index 16d28e03..99643601 100644
--- a/vowpalwabbit/mf.h
+++ b/vowpalwabbit/mf.h
@@ -11,5 +11,5 @@ license as described in the file LICENSE.
#include "gd.h"
namespace MF{
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/multiclass.cc b/vowpalwabbit/multiclass.cc
index 8736bc6e..4aad3a73 100644
--- a/vowpalwabbit/multiclass.cc
+++ b/vowpalwabbit/multiclass.cc
@@ -56,9 +56,7 @@ namespace MULTICLASS {
ld->weight = 1.;
}
- void delete_label(void* v)
- {
- }
+ void delete_label(void* v) {}
void parse_label(parser* p, shared_data*, void* v, v_array<substring>& words)
{
@@ -142,35 +140,36 @@ namespace MULTICLASS {
}
}
- void output_example(vw& all, example& ec)
+ void finish_example(vw& all, example& ec)
{
multiclass ld = ec.l.multi;
-
+
size_t loss = 1;
if (ld.label == (uint32_t)ec.pred.multiclass)
loss = 0;
-
+
if(ec.test_only)
- {
- all.sd->weighted_holdout_examples += ld.weight;//test weight seen
- all.sd->weighted_holdout_examples_since_last_dump += ld.weight;
- all.sd->weighted_holdout_examples_since_last_pass += ld.weight;
- all.sd->holdout_sum_loss += loss;
- all.sd->holdout_sum_loss_since_last_dump += loss;
- all.sd->holdout_sum_loss_since_last_pass += loss;//since last pass
- }
+ {
+ all.sd->weighted_holdout_examples += ld.weight;//test weight seen
+ all.sd->weighted_holdout_examples_since_last_dump += ld.weight;
+ all.sd->weighted_holdout_examples_since_last_pass += ld.weight;
+ all.sd->holdout_sum_loss += loss;
+ all.sd->holdout_sum_loss_since_last_dump += loss;
+ all.sd->holdout_sum_loss_since_last_pass += loss;//since last pass
+ }
else
- {
- all.sd->weighted_examples += ld.weight;
- all.sd->total_features += ec.num_features;
- all.sd->sum_loss += loss;
- all.sd->sum_loss_since_last_dump += loss;
- all.sd->example_number++;
- }
-
+ {
+ all.sd->weighted_examples += ld.weight;
+ all.sd->total_features += ec.num_features;
+ all.sd->sum_loss += loss;
+ all.sd->sum_loss_since_last_dump += loss;
+ all.sd->example_number++;
+ }
+
for (int* sink = all.final_prediction_sink.begin; sink != all.final_prediction_sink.end; sink++)
all.print(*sink, (float)ec.pred.multiclass, 0, ec.tag);
-
+
MULTICLASS::print_update(all, ec);
+ VW::finish_example(all, &ec);
}
}
diff --git a/vowpalwabbit/multiclass.h b/vowpalwabbit/multiclass.h
index aca34075..f48efbfc 100644
--- a/vowpalwabbit/multiclass.h
+++ b/vowpalwabbit/multiclass.h
@@ -18,7 +18,7 @@ namespace MULTICLASS
extern label_parser mc_label;
- void output_example(vw& all, example& ec);
+ void finish_example(vw& all, example& ec);
inline int label_is_test(multiclass* ld)
{ return ld->label == (uint32_t)-1; }
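
The loss tracked by MULTICLASS::finish_example above is plain 0/1 loss, with (uint32_t)-1 as the unlabeled sentinel checked by label_is_test. In isolation:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
      uint32_t label = 3, predicted = 3;
      size_t loss = (label == predicted) ? 0 : 1;   // 0/1 loss per example
      uint32_t unlabeled = (uint32_t)-1;            // test-only sentinel
      std::printf("loss=%zu  is_test=%d\n", loss, (int)(unlabeled == (uint32_t)-1));
    }
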
diff --git a/vowpalwabbit/network.cc b/vowpalwabbit/network.cc
index 7e39e879..b2922063 100644
--- a/vowpalwabbit/network.cc
+++ b/vowpalwabbit/network.cc
@@ -18,7 +18,6 @@ license as described in the file LICENSE.
#include <netdb.h>
#include <strings.h>
#endif
-#include <stdlib.h>
#include <string.h>
#include <string>
diff --git a/vowpalwabbit/nn.cc b/vowpalwabbit/nn.cc
index b6d3a292..bfab6009 100644
--- a/vowpalwabbit/nn.cc
+++ b/vowpalwabbit/nn.cc
@@ -96,7 +96,7 @@ namespace NN {
}
template <bool is_learn>
- void predict_or_learn(nn& n, learner& base, example& ec)
+ void predict_or_learn(nn& n, base_learner& base, example& ec)
{
bool shouldOutput = n.all->raw_prediction > 0;
@@ -308,10 +308,10 @@ CONVERSE: // That's right, I'm using goto. So sue me.
free (n.output_layer.atomics[nn_output_namespace].begin);
}
- learner* setup(vw& all, po::variables_map& vm)
+ base_learner* setup(vw& all, po::variables_map& vm)
{
- nn* n = (nn*)calloc_or_die(1,sizeof(nn));
- n->all = &all;
+ nn& n = calloc_or_die<nn>();
+ n.all = &all;
po::options_description nn_opts("NN options");
nn_opts.add_options()
@@ -322,64 +322,55 @@ CONVERSE: // That's right, I'm using goto. So sue me.
vm = add_options(all, nn_opts);
//first parse for number of hidden units
- n->k = (uint32_t)vm["nn"].as<size_t>();
-
- std::stringstream ss;
- ss << " --nn " << n->k;
- all.file_options.append(ss.str());
+ n.k = (uint32_t)vm["nn"].as<size_t>();
+ *all.file_options << " --nn " << n.k;
if ( vm.count("dropout") ) {
- n->dropout = true;
-
- std::stringstream ss;
- ss << " --dropout ";
- all.file_options.append(ss.str());
+ n.dropout = true;
+ *all.file_options << " --dropout ";
}
if ( vm.count("meanfield") ) {
- n->dropout = false;
+ n.dropout = false;
if (! all.quiet)
std::cerr << "using mean field for neural network "
<< (all.training ? "training" : "testing")
<< std::endl;
}
- if (n->dropout)
+ if (n.dropout)
if (! all.quiet)
std::cerr << "using dropout for neural network "
<< (all.training ? "training" : "testing")
<< std::endl;
if (vm.count ("inpass")) {
- n->inpass = true;
+ n.inpass = true;
+ *all.file_options << " --inpass";
- std::stringstream ss;
- ss << " --inpass";
- all.file_options.append(ss.str());
}
- if (n->inpass && ! all.quiet)
+ if (n.inpass && ! all.quiet)
std::cerr << "using input passthrough for neural network "
<< (all.training ? "training" : "testing")
<< std::endl;
- n->finished_setup = false;
- n->squared_loss = getLossFunction (0, "squared", 0);
+ n.finished_setup = false;
+ n.squared_loss = getLossFunction (all, "squared", 0);
- n->xsubi = 0;
+ n.xsubi = 0;
if (vm.count("random_seed"))
- n->xsubi = vm["random_seed"].as<size_t>();
-
- n->save_xsubi = n->xsubi;
- n->increment = all.l->increment;//Indexing of output layer is odd.
- learner* l = new learner(n, all.l, n->k+1);
- l->set_learn<nn, predict_or_learn<true> >();
- l->set_predict<nn, predict_or_learn<false> >();
- l->set_finish<nn, finish>();
- l->set_finish_example<nn, finish_example>();
- l->set_end_pass<nn,end_pass>();
-
- return l;
+ n.xsubi = vm["random_seed"].as<size_t>();
+
+ n.save_xsubi = n.xsubi;
+ n.increment = all.l->increment;//Indexing of output layer is odd.
+ learner<nn>& l = init_learner(&n, all.l, predict_or_learn<true>,
+ predict_or_learn<false>, n.k+1);
+ l.set_finish(finish);
+ l.set_finish_example(finish_example);
+ l.set_end_pass(end_pass);
+
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/nn.h b/vowpalwabbit/nn.h
index de41be46..52e08f46 100644
--- a/vowpalwabbit/nn.h
+++ b/vowpalwabbit/nn.h
@@ -9,5 +9,5 @@ license as described in the file LICENSE.
namespace NN
{
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/noop.cc b/vowpalwabbit/noop.cc
index b500d657..0c883a8c 100644
--- a/vowpalwabbit/noop.cc
+++ b/vowpalwabbit/noop.cc
@@ -7,11 +7,9 @@ license as described in the file LICENSE.
#include "reductions.h"
-using namespace LEARNER;
-
namespace NOOP {
- learner* setup(vw& all)
- {
- return new learner();
- }
+ void learn(char&, LEARNER::base_learner&, example&) {}
+
+ LEARNER::base_learner* setup(vw& all)
+ { return &LEARNER::init_learner<char>(NULL, learn, 1); }
}
diff --git a/vowpalwabbit/noop.h b/vowpalwabbit/noop.h
index f9fe8267..5220e1ee 100644
--- a/vowpalwabbit/noop.h
+++ b/vowpalwabbit/noop.h
@@ -5,5 +5,5 @@ license as described in the file LICENSE.
*/
#pragma once
namespace NOOP {
- LEARNER::learner* setup(vw&);
+ LEARNER::base_learner* setup(vw&);
}
diff --git a/vowpalwabbit/oaa.cc b/vowpalwabbit/oaa.cc
index c1a763ad..2328b00d 100644
--- a/vowpalwabbit/oaa.cc
+++ b/vowpalwabbit/oaa.cc
@@ -3,38 +3,28 @@ Copyright (c) by respective owners including Yahoo!, Microsoft, and
individual contributors. All rights reserved. Released under a BSD (revised)
license as described in the file LICENSE.
*/
-#include <float.h>
-#include <limits.h>
-#include <math.h>
-#include <stdio.h>
#include <sstream>
-
#include "multiclass.h"
#include "simple_label.h"
#include "reductions.h"
#include "vw.h"
-using namespace std;
-using namespace LEARNER;
-using namespace MULTICLASS;
-
namespace OAA {
struct oaa{
- uint32_t k;
+ size_t k;
bool shouldOutput;
vw* all;
};
template <bool is_learn>
- void predict_or_learn(oaa& o, learner& base, example& ec) {
- multiclass mc_label_data = ec.l.multi;
+ void predict_or_learn(oaa& o, LEARNER::base_learner& base, example& ec) {
+ MULTICLASS::multiclass mc_label_data = ec.l.multi;
if (mc_label_data.label == 0 || (mc_label_data.label > o.k && mc_label_data.label != (uint32_t)-1))
cout << "label " << mc_label_data.label << " is not in {1,"<< o.k << "} This won't work right." << endl;
ec.l.simple = {0.f, mc_label_data.weight, 0.f};
- string outputString;
- stringstream outputStringStream(outputString);
+ stringstream outputStringStream;
uint32_t prediction = 1;
float score = INT_MIN;
@@ -70,33 +60,21 @@ namespace OAA {
o.all->print_text(o.all->raw_prediction, outputStringStream.str(), ec.tag);
}
- void finish_example(vw& all, oaa&, example& ec)
- {
- MULTICLASS::output_example(all, ec);
- VW::finish_example(all, &ec);
- }
+ void finish_example(vw& all, oaa&, example& ec) { MULTICLASS::finish_example(all, ec); }
- learner* setup(vw& all, po::variables_map& vm)
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm)
{
- oaa* data = (oaa*)calloc_or_die(1, sizeof(oaa));
- //first parse for number of actions
-
- data->k = (uint32_t)vm["oaa"].as<size_t>();
-
- //append oaa with nb_actions to options_from_file so it is saved to regressor later
- std::stringstream ss;
- ss << " --oaa " << data->k;
- all.file_options.append(ss.str());
-
- data->shouldOutput = all.raw_prediction > 0;
- data->all = &all;
- all.p->lp = mc_label;
+ oaa& data = calloc_or_die<oaa>();
+ data.k = vm["oaa"].as<size_t>();
+ data.shouldOutput = all.raw_prediction > 0;
+ data.all = &all;
- learner* l = new learner(data, all.l, data->k);
- l->set_learn<oaa, predict_or_learn<true> >();
- l->set_predict<oaa, predict_or_learn<false> >();
- l->set_finish_example<oaa, finish_example>();
+ *all.file_options << " --oaa " << data.k;
+ all.p->lp = MULTICLASS::mc_label;
- return l;
+ LEARNER::learner<oaa>& l = init_learner(&data, all.l, predict_or_learn<true>,
+ predict_or_learn<false>, data.k);
+ l.set_finish_example(finish_example);
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/oaa.h b/vowpalwabbit/oaa.h
index 6b47127f..de1b08ab 100644
--- a/vowpalwabbit/oaa.h
+++ b/vowpalwabbit/oaa.h
@@ -5,6 +5,4 @@ license as described in the file LICENSE.
*/
#pragma once
namespace OAA
-{
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
-}
+{ LEARNER::base_learner* setup(vw& all, po::variables_map& vm); }
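
OAA::predict_or_learn above reduces a k-class problem to k binary ones: class i trains with label +1 iff it is the true label, and the multiclass prediction is the argmax over the k per-class scores. A standalone sketch of the prediction side (score_class stands in for the base learner and is an assumption):

    #include <cstdio>

    float score_class(size_t i)        // stand-in for base.predict at offset i
    { return i == 2 ? 0.9f : -0.5f; }

    int main()
    {
      const size_t k = 3;
      size_t prediction = 1;
      float best = -1e30f;             // cf. score = INT_MIN above
      for (size_t i = 1; i <= k; i++)
        if (score_class(i) > best) { best = score_class(i); prediction = i; }
      std::printf("predicted class %zu\n", prediction);
    }
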
diff --git a/vowpalwabbit/parse_args.cc b/vowpalwabbit/parse_args.cc
index efa0961e..a760261f 100644
--- a/vowpalwabbit/parse_args.cc
+++ b/vowpalwabbit/parse_args.cc
@@ -35,6 +35,7 @@ license as described in the file LICENSE.
#include "gd_mf.h"
#include "mf.h"
#include "vw.h"
+#include "ftrl_proximal.h"
#include "rand48.h"
#include "parse_args.h"
#include "binary.h"
@@ -98,13 +99,13 @@ void parse_dictionary_argument(vw&all, string str) {
ifstream infile(s);
size_t def = (size_t)' ';
for (string line; getline(infile, line);) {
- char*c = (char*)line.c_str(); // we're throwing away const, which is dangerous...
+ char* c = (char*)line.c_str(); // we're throwing away const, which is dangerous...
while (*c == ' ' || *c == '\t') ++c; // skip initial whitespace
- char*d = c;
+ char* d = c;
while (*d != ' ' && *d != '\t' && *d != '\n' && *d != '\0') ++d; // gobble up initial word
if (d == c) continue; // no word
if (*d != ' ' && *d != '\t') continue; // reached end of line
- char*word = (char*)calloc(d-c, sizeof(char));
+ char* word = calloc_or_die<char>(d-c);
memcpy(word, c, d-c);
substring ss = { word, word + (d - c) };
uint32_t hash = uniform_hash( ss.begin, ss.end-ss.begin, quadratic_constant);
@@ -131,14 +132,14 @@ void parse_dictionary_argument(vw&all, string str) {
cerr << "dictionary " << s << " contains " << map->size() << " item" << (map->size() == 1 ? "\n" : "s\n");
all.namespace_dictionaries[(size_t)ns].push_back(map);
- dictionary_info info = { (char*)calloc(strlen(s)+1, sizeof(char)), map };
+ dictionary_info info = { calloc_or_die<char>(strlen(s)+1), map };
strcpy(info.name, s);
all.read_dictionaries.push_back(info);
}
void parse_affix_argument(vw&all, string str) {
if (str.length() == 0) return;
- char* cstr = (char*)calloc_or_die(str.length()+1, sizeof(char));
+ char* cstr = calloc_or_die<char>(str.length()+1);
strcpy(cstr, str.c_str());
char*p = strtok(cstr, ",");
@@ -352,9 +353,7 @@ void parse_feature_tweaks(vw& all, po::variables_map& vm)
if (vm.count("affix")) {
parse_affix_argument(all, vm["affix"].as<string>());
- stringstream ss;
- ss << " --affix " << vm["affix"].as<string>();
- all.file_options.append(ss.str());
+ *all.file_options << " --affix " << vm["affix"].as<string>();
}
if(vm.count("ngram")){
@@ -601,7 +600,7 @@ void parse_example_tweaks(vw& all, po::variables_map& vm)
if(vm.count("quantile_tau"))
loss_parameter = vm["quantile_tau"].as<float>();
- all.loss = getLossFunction(&all, loss_function, (float)loss_parameter);
+ all.loss = getLossFunction(all, loss_function, (float)loss_parameter);
if (all.l1_lambda < 0.) {
cerr << "l1_lambda should be nonnegative: resetting from " << all.l1_lambda << " to 0" << endl;
@@ -722,6 +721,7 @@ void parse_base_algorithm(vw& all, po::variables_map& vm)
base_opt.add_options()
("sgd", "use regular stochastic gradient descent update.")
+ ("ftrl", "use ftrl-proximal optimization")
("adaptive", "use adaptive, individual learning rates.")
("invariant", "use safe/importance aware updates.")
("normalized", "use per feature normalized updates")
@@ -740,6 +740,8 @@ void parse_base_algorithm(vw& all, po::variables_map& vm)
all.l = BFGS::setup(all, vm);
else if (vm.count("lda"))
all.l = LDA::setup(all, vm);
+ else if (vm.count("ftrl"))
+ all.l = FTRL::setup(all, vm);
else if (vm.count("noop"))
all.l = NOOP::setup(all);
else if (vm.count("print"))
@@ -817,7 +819,7 @@ void parse_scorer_reductions(vw& all, po::variables_map& vm)
all.l = Scorer::setup(all, vm);
}
-LEARNER::learner* exclusive_setup(vw& all, po::variables_map& vm, bool& score_consumer, LEARNER::learner* (*setup)(vw&, po::variables_map&))
+LEARNER::base_learner* exclusive_setup(vw& all, po::variables_map& vm, bool& score_consumer, LEARNER::base_learner* (*setup)(vw&, po::variables_map&))
{
if (score_consumer) { cerr << "error: cannot specify multiple direct score consumers" << endl; throw exception(); }
score_consumer = true;
@@ -1016,7 +1018,7 @@ vw* parse_args(int argc, char *argv[])
parse_regressor_args(*all, vm, io_temp);
int temp_argc = 0;
- char** temp_argv = VW::get_argv_from_string(all->file_options, temp_argc);
+ char** temp_argv = VW::get_argv_from_string(all->file_options->str(), temp_argc);
add_to_args(*all, temp_argc, temp_argv);
for (int i = 0; i < temp_argc; i++)
free(temp_argv[i]);
@@ -1030,7 +1032,7 @@ vw* parse_args(int argc, char *argv[])
po::store(pos, vm);
po::notify(vm);
- all->file_options = "";
+ all->file_options->str("");
parse_feature_tweaks(*all, vm); //feature tweaks
@@ -1093,16 +1095,14 @@ vw* parse_args(int argc, char *argv[])
}
namespace VW {
- void cmd_string_replace_value( string& cmd, string flag_to_replace, string new_value )
+ void cmd_string_replace_value( std::stringstream*& ss, string flag_to_replace, string new_value )
{
flag_to_replace.append(" "); //add a space to make sure we obtain the right flag in case 2 flags start with the same set of characters
+ string cmd = ss->str();
size_t pos = cmd.find(flag_to_replace);
- if( pos == string::npos ) {
+ if( pos == string::npos )
//flag currently not present in command string, so just append it to command string
- cmd.append(" ");
- cmd.append(flag_to_replace);
- cmd.append(new_value);
- }
+ *ss << " " << flag_to_replace << new_value;
else {
//flag is present, need to replace old value with new value
@@ -1112,20 +1112,19 @@ namespace VW {
//now pos is position where value starts
//find position of next space
size_t pos_after_value = cmd.find(" ",pos);
- if(pos_after_value == string::npos) {
+ if(pos_after_value == string::npos)
//we reach the end of the string, so replace the all characters after pos by new_value
cmd.replace(pos,cmd.size()-pos,new_value);
- }
- else {
+ else
//replace characters between pos and pos_after_value by new_value
cmd.replace(pos,pos_after_value-pos,new_value);
- }
+ ss->str(cmd);
}
}
char** get_argv_from_string(string s, int& argc)
{
- char* c = (char*)calloc_or_die(s.length()+3, sizeof(char));
+ char* c = calloc_or_die<char>(s.length()+3);
c[0] = 'b';
c[1] = ' ';
strcpy(c+2, s.c_str());
@@ -1133,11 +1132,11 @@ namespace VW {
v_array<substring> foo = v_init<substring>();
tokenize(' ', ss, foo);
- char** argv = (char**)calloc_or_die(foo.size(), sizeof(char*));
+ char** argv = calloc_or_die<char*>(foo.size());
for (size_t i = 0; i < foo.size(); i++)
{
*(foo[i].end) = '\0';
- argv[i] = (char*)calloc_or_die(foo[i].end-foo[i].begin+1, sizeof(char));
+ argv[i] = calloc_or_die<char>(foo[i].end-foo[i].begin+1);
sprintf(argv[i],"%s",foo[i].begin);
}
@@ -1174,7 +1173,7 @@ namespace VW {
{
finalize_regressor(all, all.final_regressor_name);
all.l->finish();
- delete all.l;
+ free_it(all.l);
if (all.reg.weight_vector != NULL)
free(all.reg.weight_vector);
free_parser(all);
@@ -1183,6 +1182,7 @@ namespace VW {
all.p->parse_name.delete_v();
free(all.p);
free(all.sd);
+ delete all.file_options;
for (size_t i = 0; i < all.final_prediction_sink.size(); i++)
if (all.final_prediction_sink[i] != 1)
io_buf::close_file_or_socket(all.final_prediction_sink[i]);
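
The rewritten VW::cmd_string_replace_value keeps its original logic: if the flag is absent, append flag and value; otherwise overwrite the characters between the flag and the next space. String-only sketch of that logic (replace_flag is a hypothetical standalone name, without the stringstream plumbing):

    #include <iostream>
    #include <string>

    void replace_flag(std::string& cmd, std::string flag, const std::string& val)
    {
      flag.append(" ");                  // avoid matching a longer flag
      size_t pos = cmd.find(flag);
      if (pos == std::string::npos)
        cmd += " " + flag + val;         // absent: append flag and value
      else
      {
        pos += flag.size();              // value starts right after the flag
        size_t end = cmd.find(' ', pos); // value ends at next space or EOS
        cmd.replace(pos, (end == std::string::npos ? cmd.size() : end) - pos, val);
      }
    }

    int main()
    {
      std::string cmd = "--nn 5 --dropout";
      replace_flag(cmd, "--nn", "10");
      std::cout << cmd << std::endl;     // --nn 10 --dropout
    }
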
diff --git a/vowpalwabbit/parse_example.cc b/vowpalwabbit/parse_example.cc
index 31ee3e1f..63bcb227 100644
--- a/vowpalwabbit/parse_example.cc
+++ b/vowpalwabbit/parse_example.cc
@@ -20,7 +20,7 @@ char* copy(char* base)
{
size_t len = 0;
while (base[len++] != '\0');
- char* ret = (char *)calloc_or_die(len,sizeof(char));
+ char* ret = calloc_or_die<char>(len);
memcpy(ret,base,len);
return ret;
}
@@ -182,7 +182,7 @@ public:
for (feature*f = feats->begin; f != feats->end; ++f) {
uint32_t id = f->weight_index;
size_t len = 2 + (feature_name.end-feature_name.begin) + 1 + (size_t)ceil(log10(id)) + 1;
- char* str = (char*)calloc(len, sizeof(char));
+ char* str = calloc_or_die<char>(len);
str[0] = index;
str[1] = '_';
char *c = str+2;
@@ -270,7 +270,7 @@ public:
{
if (base != NULL)
free(base);
- base = (char *) calloc_or_die(2,sizeof(char));
+ base = calloc_or_die<char>(2);
base[0] = ' ';
base[1] = '\0';
}
diff --git a/vowpalwabbit/parse_primitives.cc b/vowpalwabbit/parse_primitives.cc
index b08f05fb..4ed67313 100644
--- a/vowpalwabbit/parse_primitives.cc
+++ b/vowpalwabbit/parse_primitives.cc
@@ -11,8 +11,6 @@ license as described in the file LICENSE.
#include "parse_primitives.h"
#include "hash.h"
-using namespace std;
-
void tokenize(char delim, substring s, v_array<substring>& ret, bool allow_empty)
{
ret.erase();
@@ -53,17 +51,15 @@ size_t hashstring (substring s, uint32_t h)
}
size_t hashall (substring s, uint32_t h)
-{
- return uniform_hash((unsigned char *)s.begin, s.end - s.begin, h);
-}
+{ return uniform_hash((unsigned char *)s.begin, s.end - s.begin, h); }
-hash_func_t getHasher(const string& s){
+hash_func_t getHasher(const std::string& s){
if (s=="strings")
return hashstring;
else if(s=="all")
return hashall;
else{
- cerr << "Unknown hash function: " << s.c_str() << ". Exiting " << endl;
- throw exception();
+ std::cerr << "Unknown hash function: " << s.c_str() << ". Exiting " << std::endl;
+ throw std::exception();
}
}
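
tokenize splits a substring on a delimiter into a v_array<substring>. A usage sketch, assuming substring's char* begin/end members as used elsewhere in this patch and that parse_primitives.h pulls in v_array:

    #include "parse_primitives.h"

    int main()
    {
      char buf[] = "a b c";
      substring s = { buf, buf + 5 };
      v_array<substring> parts = v_init<substring>();
      tokenize(' ', s, parts);           // parts now holds "a", "b", "c"
      parts.delete_v();
    }
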
diff --git a/vowpalwabbit/parse_regressor.cc b/vowpalwabbit/parse_regressor.cc
index 6cd8c459..def3a8de 100644
--- a/vowpalwabbit/parse_regressor.cc
+++ b/vowpalwabbit/parse_regressor.cc
@@ -35,7 +35,7 @@ void initialize_regressor(vw& all)
size_t length = ((size_t)1) << all.num_bits;
all.reg.weight_mask = (length << all.reg.stride_shift) - 1;
- all.reg.weight_vector = (weight *)calloc_or_die(length << all.reg.stride_shift, sizeof(weight));
+ all.reg.weight_vector = calloc_or_die<weight>(length << all.reg.stride_shift);
if (all.reg.weight_vector == NULL)
{
cerr << all.program_name << ": Failed to allocate weight array with " << all.num_bits << " bits: try decreasing -b <bits>" << endl;
@@ -229,16 +229,16 @@ void save_load_header(vw& all, io_buf& model_file, bool read, bool text)
"", read,
"\n",1, text);
- text_len = sprintf(buff, "options:%s\n", all.file_options.c_str());
- uint32_t len = (uint32_t)all.file_options.length()+1;
- memcpy(buff2, all.file_options.c_str(),len);
+ text_len = sprintf(buff, "options:%s\n", all.file_options->str().c_str());
+ uint32_t len = (uint32_t)all.file_options->str().length()+1;
+ memcpy(buff2, all.file_options->str().c_str(),len);
if (read)
len = buf_size;
bin_text_read_write(model_file,buff2, len,
"", read,
buff, text_len, text);
if (read)
- all.file_options.assign(buff2);
+ all.file_options->str(buff2);
}
}
@@ -348,7 +348,7 @@ void parse_mask_regressor_args(vw& all, po::variables_map& vm){
}
} else {
// If no initial regressor, just clear out the options loaded from the header.
- all.file_options.assign("");
+ all.file_options->str("");
}
}
}
diff --git a/vowpalwabbit/parser.cc b/vowpalwabbit/parser.cc
index 4a230d99..44055bab 100644
--- a/vowpalwabbit/parser.cc
+++ b/vowpalwabbit/parser.cc
@@ -153,16 +153,16 @@ bool is_test_only(uint32_t counter, uint32_t period, uint32_t after, bool holdou
parser* new_parser()
{
- parser* ret = (parser*) calloc_or_die(1,sizeof(parser));
- ret->input = new io_buf;
- ret->output = new io_buf;
- ret->local_example_number = 0;
- ret->in_pass_counter = 0;
- ret->ring_size = 1 << 8;
- ret->done = false;
- ret->used_index = 0;
-
- return ret;
+ parser& ret = calloc_or_die<parser>();
+ ret.input = new io_buf;
+ ret.output = new io_buf;
+ ret.local_example_number = 0;
+ ret.in_pass_counter = 0;
+ ret.ring_size = 1 << 8;
+ ret.done = false;
+ ret.used_index = 0;
+
+ return &ret;
}
void set_compressed(parser* par){
@@ -1170,7 +1170,7 @@ void initialize_examples(vw& all)
all.p->end_parsed_examples = 0;
all.p->done = false;
- all.p->examples = (example*)calloc_or_die(all.p->ring_size, sizeof(example));
+ all.p->examples = calloc_or_die<example>(all.p->ring_size);
for (size_t i = 0; i < all.p->ring_size; i++)
{
diff --git a/vowpalwabbit/print.cc b/vowpalwabbit/print.cc
index 2b3d4887..d0dc2765 100644
--- a/vowpalwabbit/print.cc
+++ b/vowpalwabbit/print.cc
@@ -4,14 +4,9 @@
#include "float.h"
#include "reductions.h"
-using namespace LEARNER;
-
namespace PRINT
{
- struct print{
- vw* all;
-
- };
+ struct print{ vw* all; };
void print_feature(vw& all, float value, float& weight)
{
@@ -23,7 +18,7 @@ namespace PRINT
cout << " ";
}
- void learn(print& p, learner& base, example& ec)
+ void learn(print& p, LEARNER::base_learner& base, example& ec)
{
label_data& ld = ec.l.simple;
if (ld.label != FLT_MAX)
@@ -46,17 +41,15 @@ namespace PRINT
cout << endl;
}
- learner* setup(vw& all)
+ LEARNER::base_learner* setup(vw& all)
{
- print* p = (print*)calloc_or_die(1, sizeof(print));
- p->all = &all;
+ print& p = calloc_or_die<print>();
+ p.all = &all;
size_t length = ((size_t)1) << all.num_bits;
all.reg.weight_mask = (length << all.reg.stride_shift) - 1;
all.reg.stride_shift = 0;
- learner* ret = new learner(p, 1);
- ret->set_learn<print,learn>();
- ret->set_predict<print,learn>();
- return ret;
+ LEARNER::learner<print>& ret = init_learner(&p, learn, 1);
+ return make_base(ret);
}
}
diff --git a/vowpalwabbit/print.h b/vowpalwabbit/print.h
index 4096a306..b6a771ed 100644
--- a/vowpalwabbit/print.h
+++ b/vowpalwabbit/print.h
@@ -5,5 +5,5 @@ license as described in the file LICENSE.
*/
#pragma once
namespace PRINT {
- LEARNER::learner* setup(vw& all);
+ LEARNER::base_learner* setup(vw& all);
}
diff --git a/vowpalwabbit/rand48.cc b/vowpalwabbit/rand48.cc
index 4ea4e75e..4288e64d 100644
--- a/vowpalwabbit/rand48.cc
+++ b/vowpalwabbit/rand48.cc
@@ -1,8 +1,5 @@
//A quick implementation similar to drand48 for cross-platform compatibility
#include <stdint.h>
-#include <iostream>
-using namespace std;
-
//
// NB: the 'ULL' suffix is not part of the constant; it is there to
// prevent truncation of the constant to a (32-bit) long when compiling
@@ -25,15 +22,9 @@ float merand48(uint64_t& initial)
uint64_t v = c;
-void msrand48(uint64_t initial)
-{
- v = initial;
-}
+void msrand48(uint64_t initial) { v = initial; }
-float frand48()
-{
- return merand48(v);
-}
+float frand48() { return merand48(v); }
float frand48_noadvance()
{
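
merand48 above is a drand48-style 48-bit linear congruential generator returning floats in [0,1). A standalone equivalent using the stock drand48 constants (the patch's own multiplier and addend live outside this hunk, so these constants are an assumption):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    float my_rand48(uint64_t& state)
    {
      state = state * 0x5DEECE66DULL + 11;   // 48-bit LCG step
      // keep 23 random mantissa bits, fix the exponent to get [1,2),
      // then subtract 1 for a uniform float in [0,1)
      uint32_t bits = 0x3F800000U | (uint32_t)((state >> 25) & 0x7FFFFFU);
      float f;
      memcpy(&f, &bits, sizeof f);
      return f - 1.f;
    }

    int main()
    {
      uint64_t seed = 8675309;
      for (int i = 0; i < 3; i++) std::printf("%f\n", my_rand48(seed));
    }
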
diff --git a/vowpalwabbit/scorer.cc b/vowpalwabbit/scorer.cc
index fda5c271..50645ed8 100644
--- a/vowpalwabbit/scorer.cc
+++ b/vowpalwabbit/scorer.cc
@@ -1,16 +1,11 @@
#include <float.h>
-
#include "reductions.h"
-using namespace LEARNER;
-
namespace Scorer {
- struct scorer{
- vw* all;
- };
+ struct scorer{ vw* all; };
template <bool is_learn, float (*link)(float in)>
- void predict_or_learn(scorer& s, learner& base, example& ec)
+ void predict_or_learn(scorer& s, LEARNER::base_learner& base, example& ec)
{
s.all->set_minmax(s.all->sd, ec.l.simple.label);
@@ -26,29 +21,20 @@ namespace Scorer {
}
// y = f(x) -> [0, 1]
- float logistic(float in)
- {
- return 1.f / (1.f + exp(- in));
- }
+ float logistic(float in) { return 1.f / (1.f + exp(- in)); }
// http://en.wikipedia.org/wiki/Generalized_logistic_curve
// where the lower & upper asymptotes are -1 & 1 respectively
// 'glf1' stands for 'Generalized Logistic Function with [-1,1] range'
// y = f(x) -> [-1, 1]
- float glf1(float in)
- {
- return 2.f / (1.f + exp(- in)) - 1.f;
- }
+ float glf1(float in) { return 2.f / (1.f + exp(- in)) - 1.f; }
- float noop(float in)
- {
- return in;
- }
+ float id(float in) { return in; }
- learner* setup(vw& all, po::variables_map& vm)
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm)
{
- scorer* s = (scorer*)calloc_or_die(1, sizeof(scorer));
- s->all = &all;
+ scorer& s = calloc_or_die<scorer>();
+ s.all = &all;
po::options_description link_opts("Link options");
@@ -57,32 +43,28 @@ namespace Scorer {
vm = add_options(all, link_opts);
- learner* l = new learner(s, all.l);
+ LEARNER::learner<scorer>* l;
string link = vm["link"].as<string>();
if (!vm.count("link") || link.compare("identity") == 0)
- {
- l->set_learn<scorer, predict_or_learn<true, noop> >();
- l->set_predict<scorer, predict_or_learn<false, noop> >();
- }
+ l = &init_learner(&s, all.l, predict_or_learn<true, id>, predict_or_learn<false, id>);
else if (link.compare("logistic") == 0)
{
- all.file_options.append(" --link=logistic ");
- l->set_learn<scorer, predict_or_learn<true, logistic> >();
- l->set_predict<scorer, predict_or_learn<false, logistic> >();
+ *all.file_options << " --link=logistic ";
+ l = &init_learner(&s, all.l, predict_or_learn<true, logistic>,
+ predict_or_learn<false, logistic>);
}
else if (link.compare("glf1") == 0)
{
- all.file_options.append(" --link=glf1 ");
- l->set_learn<scorer, predict_or_learn<true, glf1> >();
- l->set_predict<scorer, predict_or_learn<false, glf1> >();
+ *all.file_options << " --link=glf1 ";
+ l = &init_learner(&s, all.l, predict_or_learn<true, glf1>,
+ predict_or_learn<false, glf1>);
}
else
{
cerr << "Unknown link function: " << link << endl;
throw exception();
}
-
- return l;
+ return make_base(*l);
}
}
diff --git a/vowpalwabbit/scorer.h b/vowpalwabbit/scorer.h
index 3405b2e9..2d0ec294 100644
--- a/vowpalwabbit/scorer.h
+++ b/vowpalwabbit/scorer.h
@@ -1,4 +1,4 @@
#pragma once
namespace Scorer {
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
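
The two non-identity links registered above, in isolation: logistic maps scores into [0,1] and glf1 into [-1,1]:

    #include <cmath>
    #include <cstdio>

    float logistic(float in) { return 1.f / (1.f + std::exp(-in)); }
    float glf1(float in)     { return 2.f / (1.f + std::exp(-in)) - 1.f; }

    int main()
    {
      for (float x = -2.f; x <= 2.f; x += 2.f)
        std::printf("x=%+.0f  logistic=%.3f  glf1=%+.3f\n",
                    x, logistic(x), glf1(x));
    }
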
diff --git a/vowpalwabbit/search.cc b/vowpalwabbit/search.cc
index 665e5fef..fb14e65c 100644
--- a/vowpalwabbit/search.cc
+++ b/vowpalwabbit/search.cc
@@ -175,7 +175,7 @@ namespace Search {
v_array<size_t> timesteps;
v_array<float> learn_losses;
- LEARNER::learner* base_learner;
+ LEARNER::base_learner* base_learner;
clock_t start_clock_time;
example*empty_example;
@@ -383,8 +383,8 @@ namespace Search {
priv.dat_new_feature_ec->sum_feat_sq[priv.dat_new_feature_namespace] += f.x * f.x;
if (priv.all->audit) {
audit_data a = { NULL, NULL, f.weight_index, f.x, true };
- a.space = (char*)calloc_or_die(priv.dat_new_feature_feature_space->length()+1, sizeof(char));
- a.feature = (char*)calloc_or_die(priv.dat_new_feature_audit_ss.str().length() + 32, sizeof(char));
+ a.space = calloc_or_die<char>(priv.dat_new_feature_feature_space->length()+1);
+ a.feature = calloc_or_die<char>(priv.dat_new_feature_audit_ss.str().length() + 32);
strcpy(a.space, priv.dat_new_feature_feature_space->c_str());
int num = sprintf(a.feature, "fid=%lu_", (idx & mask) >> ss);
strcpy(a.feature+num, priv.dat_new_feature_audit_ss.str().c_str());
@@ -855,7 +855,7 @@ namespace Search {
size_t sz = sizeof(size_t) + sizeof(ptag) + sizeof(int) + sizeof(size_t) + sizeof(size_t) + condition_on_cnt * (sizeof(ptag) + sizeof(action) + sizeof(char));
if (sz % 4 != 0) sz = 4 * (sz / 4 + 1); // make sure sz aligns to 4 so that uniform_hash does the right thing
- unsigned char* item = (unsigned char*)calloc(sz, 1);
+ unsigned char* item = calloc_or_die<unsigned char>(sz);
unsigned char* here = item;
*here = (unsigned char)sz; here += sizeof(size_t);
*here = mytag; here += sizeof(ptag);
@@ -1417,7 +1417,7 @@ namespace Search {
}
template <bool is_learn>
- void search_predict_or_learn(search& sch, learner& base, example& ec) {
+ void search_predict_or_learn(search& sch, base_learner& base, example& ec) {
search_private& priv = *sch.priv;
vw* all = priv.all;
priv.base_learner = &base;
@@ -1653,9 +1653,7 @@ namespace Search {
template<class T> void check_option(T& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, bool(*equal)(T,T), const char* mismatch_error_string, const char* required_error_string) {
if (vm.count(opt_name)) {
ret = vm[opt_name].as<T>();
- stringstream ss;
- ss << " --" << opt_name << " " << ret;
- all.file_options.append(ss.str());
+ *all.file_options << " --" << opt_name << " " << ret;
} else if (strlen(required_error_string)>0) {
std::cerr << required_error_string << endl;
if (! vm.count("help"))
@@ -1666,9 +1664,7 @@ namespace Search {
void check_option(bool& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, const char* mismatch_error_string) {
if (vm.count(opt_name)) {
ret = true;
- stringstream ss;
- ss << " --" << opt_name;
- all.file_options.append(ss.str());
+ *all.file_options << " --" << opt_name;
} else
ret = false;
}
@@ -1768,11 +1764,11 @@ namespace Search {
delete[] cstr;
}
- learner* setup(vw&all, po::variables_map& vm) {
- search* sch = (search*)calloc_or_die(1,sizeof(search));
- sch->priv = new search_private();
- search_initialize(&all, *sch);
- search_private& priv = *sch->priv;
+ base_learner* setup(vw&all, po::variables_map& vm) {
+ search& sch = calloc_or_die<search>();
+ sch.priv = new search_private();
+ search_initialize(&all, sch);
+ search_private& priv = *sch.priv;
po::options_description search_opts("Search Options");
search_opts.add_options()
@@ -1852,7 +1848,7 @@ namespace Search {
string neighbor_features_string;
check_option<string>(neighbor_features_string, all, vm, "search_neighbor_features", false, string_equal,
"warning: you specified a different feature structure with --search_neighbor_features than the one loaded from predictor. using loaded value of: ", "");
- parse_neighbor_features(neighbor_features_string, *sch);
+ parse_neighbor_features(neighbor_features_string, sch);
if (interpolation_string.compare("data") == 0) { // run as dagger
priv.adaptive_beta = true;
@@ -1894,7 +1890,7 @@ namespace Search {
"warning: you specified a different history length through --search_history_length than the one loaded from predictor. using loaded value of: ", "");
//check if the base learner is contextual bandit, in which case, we dont rollout all actions.
- priv.allowed_actions_cache = (polylabel*)calloc_or_die(1,sizeof(polylabel));
+ priv.allowed_actions_cache = &calloc_or_die<polylabel>();
if (vm.count("cb")) {
priv.cb_learner = true;
CB::cb_label.default_label(priv.allowed_actions_cache);
@@ -1948,7 +1944,7 @@ namespace Search {
for (search_task** mytask = all_tasks; *mytask != NULL; mytask++)
if (task_string.compare((*mytask)->task_name) == 0) {
priv.task = *mytask;
- sch->task_name = (*mytask)->task_name;
+ sch.task_name = (*mytask)->task_name;
break;
}
if (priv.task == NULL) {
@@ -1962,7 +1958,7 @@ namespace Search {
// default to OAA labels unless the task wants to override this (which they can do in initialize)
all.p->lp = MC::mc_label;
if (priv.task)
- priv.task->initialize(*sch, priv.A, vm);
+ priv.task->initialize(sch, priv.A, vm);
if (vm.count("search_allowed_transitions")) read_allowed_transitions((action)priv.A, vm["search_allowed_transitions"].as<string>().c_str());
@@ -1981,19 +1977,19 @@ namespace Search {
if (!priv.allow_current_policy) // if we're not dagger
all.check_holdout_every_n_passes = priv.passes_per_policy;
- all.searchstr = sch;
+ all.searchstr = &sch;
priv.start_clock_time = clock();
- learner* l = new learner(sch, all.l, priv.total_number_of_policies);
- l->set_learn<search, search_predict_or_learn<true> >();
- l->set_predict<search, search_predict_or_learn<false> >();
- l->set_finish_example<search,finish_example>();
- l->set_end_examples<search,end_examples>();
- l->set_finish<search,search_finish>();
- l->set_end_pass<search,end_pass>();
+ learner<search>& l = init_learner(&sch, all.l, search_predict_or_learn<true>,
+ search_predict_or_learn<false>,
+ priv.total_number_of_policies);
+ l.set_finish_example(finish_example);
+ l.set_end_examples(end_examples);
+ l.set_finish(search_finish);
+ l.set_end_pass(end_pass);
- return l;
+ return make_base(l);
}
float action_hamming_loss(action a, const action* A, size_t sz) {
@@ -2121,7 +2117,7 @@ namespace Search {
void predictor::set_input_length(size_t input_length) {
is_ldf = true;
if (ec_alloced) ec = (example*)realloc(ec, input_length * sizeof(example));
- else ec = (example*)calloc(input_length, sizeof(example));
+ else ec = calloc_or_die<example>(input_length);
ec_cnt = input_length;
ec_alloced = true;
}
@@ -2134,7 +2130,7 @@ namespace Search {
void predictor::make_new_pointer(v_array<action>& A, size_t new_size) {
size_t old_size = A.size();
action* old_pointer = A.begin;
- A.begin = (action*)calloc_or_die(new_size, sizeof(action));
+ A.begin = calloc_or_die<action>(new_size);
A.end = A.begin + new_size;
A.end_array = A.end;
memcpy(A.begin, old_pointer, old_size * sizeof(action));
diff --git a/vowpalwabbit/search.h b/vowpalwabbit/search.h
index 8c89a373..e129de25 100644
--- a/vowpalwabbit/search.h
+++ b/vowpalwabbit/search.h
@@ -241,7 +241,7 @@ namespace Search {
bool size_equal(size_t a, size_t b);
// our interface within VW
- LEARNER::learner* setup(vw&, po::variables_map&);
+ LEARNER::base_learner* setup(vw&, po::variables_map&);
void search_finish(void*);
void search_drive(void*);
void search_learn(void*,example*);
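
The cache-entry sizing in search.cc above rounds sz up to a multiple of 4 before hashing, since uniform_hash consumes 4-byte blocks. The same rounding in isolation:

    #include <cstddef>

    size_t align4(size_t sz) { return (sz % 4 != 0) ? 4 * (sz / 4 + 1) : sz; }
    // align4(9) == 12, align4(8) == 8
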
diff --git a/vowpalwabbit/search_dep_parser.cc b/vowpalwabbit/search_dep_parser.cc
index c29c754d..9629c30a 100644
--- a/vowpalwabbit/search_dep_parser.cc
+++ b/vowpalwabbit/search_dep_parser.cc
@@ -45,7 +45,6 @@ namespace DepParserTask {
void initialize(Search::search& srn, size_t& num_actions, po::variables_map& vm) {
task_data *data = new task_data();
data->my_init_flag = false;
- //data->ex = (example*)calloc_or_die(1, sizeof(example));
data->ec_buf.resize(12, true);
data->children = new v_array<uint32_t>[6];
diff --git a/vowpalwabbit/search_sequencetask.cc b/vowpalwabbit/search_sequencetask.cc
index 24f97ad5..d92013b4 100644
--- a/vowpalwabbit/search_sequencetask.cc
+++ b/vowpalwabbit/search_sequencetask.cc
@@ -264,7 +264,7 @@ namespace SequenceTask_DemoLDF { // this is just to debug/show off how to do LD
lab.costs.push_back(default_wclass);
}
- task_data* data = (task_data*)calloc(1, sizeof(task_data));
+ task_data* data = &calloc_or_die<task_data>();
data->ldf_examples = ldf_examples;
data->num_actions = num_actions;
diff --git a/vowpalwabbit/sender.cc b/vowpalwabbit/sender.cc
index 8bff73e7..a9ded7e4 100644
--- a/vowpalwabbit/sender.cc
+++ b/vowpalwabbit/sender.cc
@@ -21,9 +21,6 @@
#include "network.h"
#include "reductions.h"
-using namespace std;
-using namespace LEARNER;
-
namespace SENDER {
struct sender {
io_buf* buf;
@@ -69,7 +66,7 @@ void receive_result(sender& s)
return_simple_example(*(s.all), NULL, *ec);
}
- void learn(sender& s, learner& base, example& ec)
+ void learn(sender& s, LEARNER::base_learner& base, example& ec)
{
if (s.received_index + s.all->p->ring_size / 2 - 1 == s.sent_index)
receive_result(s);
@@ -81,8 +78,7 @@ void receive_result(sender& s)
s.delay_ring[s.sent_index++ % s.all->p->ring_size] = &ec;
}
- void finish_example(vw& all, sender&, example& ec)
-{}
+ void finish_example(vw& all, sender&, example& ec){}
void end_examples(sender& s)
{
@@ -100,26 +96,23 @@ void end_examples(sender& s)
delete s.buf;
}
- learner* setup(vw& all, po::variables_map& vm, vector<string> pairs)
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm, vector<string> pairs)
{
- sender* s = (sender*)calloc_or_die(1,sizeof(sender));
- s->sd = -1;
+ sender& s = calloc_or_die<sender>();
+ s.sd = -1;
if (vm.count("sendto"))
{
vector<string> hosts = vm["sendto"].as< vector<string> >();
- open_sockets(*s, hosts[0]);
+ open_sockets(s, hosts[0]);
}
- s->all = &all;
- s->delay_ring = (example**) calloc_or_die(all.p->ring_size, sizeof(example*));
+ s.all = &all;
+ s.delay_ring = calloc_or_die<example*>(all.p->ring_size);
- learner* l = new learner(s, 1);
- l->set_learn<sender, learn>();
- l->set_predict<sender, learn>();
- l->set_finish<sender, finish>();
- l->set_finish_example<sender, finish_example>();
- l->set_end_examples<sender, end_examples>();
- return l;
+ LEARNER::learner<sender>& l = init_learner(&s, learn, 1);
+ l.set_finish(finish);
+ l.set_finish_example(finish_example);
+ l.set_end_examples(end_examples);
+ return make_base(l);
}
-
}
diff --git a/vowpalwabbit/sender.h b/vowpalwabbit/sender.h
index bcffa349..9740f159 100644
--- a/vowpalwabbit/sender.h
+++ b/vowpalwabbit/sender.h
@@ -5,5 +5,5 @@ license as described in the file LICENSE.
*/
#pragma once
namespace SENDER{
- LEARNER::learner* setup(vw& all, po::variables_map& vm, vector<string> pairs);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm, vector<string> pairs);
}
diff --git a/vowpalwabbit/simple_label.cc b/vowpalwabbit/simple_label.cc
index 9e220cde..150f95e9 100644
--- a/vowpalwabbit/simple_label.cc
+++ b/vowpalwabbit/simple_label.cc
@@ -15,8 +15,6 @@ char* bufread_simple_label(shared_data* sd, label_data* ld, char* c)
{
ld->label = *(float *)c;
c += sizeof(ld->label);
- if (sd->binary_label && fabs(ld->label) != 1.f && ld->label != FLT_MAX)
- cout << "You are using a label not -1 or 1 with a loss function expecting that!" << endl;
ld->weight = *(float *)c;
c += sizeof(ld->weight);
ld->initial = *(float *)c;
@@ -98,9 +96,6 @@ void parse_simple_label(parser* p, shared_data* sd, void* v, v_array<substring>&
cerr << "malformed example!\n";
cerr << "words.size() = " << words.size() << endl;
}
- if (words.size() > 0 && sd->binary_label && fabs(ld->label) != 1.f)
- cout << "You are using a label not -1 or 1 with a loss function expecting that!" << endl;
-
count_label(ld->label);
}
diff --git a/vowpalwabbit/stagewise_poly.cc b/vowpalwabbit/stagewise_poly.cc
index c6352d26..b2e7e150 100644
--- a/vowpalwabbit/stagewise_poly.cc
+++ b/vowpalwabbit/stagewise_poly.cc
@@ -129,7 +129,7 @@ namespace StagewisePoly
void depthsbits_create(stagewise_poly &poly)
{
- poly.depthsbits = (uint8_t *) calloc_or_die(1, depthsbits_sizeof(poly));
+ poly.depthsbits = calloc_or_die<uint8_t>(2 * poly.all->length());
for (uint32_t i = 0; i < poly.all->length() * 2; i += 2) {
poly.depthsbits[i] = default_depth;
poly.depthsbits[i+1] = indicator_bit;
@@ -247,7 +247,7 @@ namespace StagewisePoly
cout << ", new size " << poly.sd_len << endl;
#endif //DEBUG
free(poly.sd); //okay for null.
- poly.sd = (sort_data *) calloc_or_die(poly.sd_len, sizeof(sort_data));
+ poly.sd = calloc_or_die<sort_data>(poly.sd_len);
}
assert(len <= poly.sd_len);
}
@@ -502,7 +502,7 @@ namespace StagewisePoly
}
}
- void predict(stagewise_poly &poly, learner &base, example &ec)
+ void predict(stagewise_poly &poly, base_learner &base, example &ec)
{
poly.original_ec = &ec;
synthetic_create(poly, ec, false);
@@ -511,7 +511,7 @@ namespace StagewisePoly
ec.updated_prediction = poly.synth_ec.updated_prediction;
}
- void learn(stagewise_poly &poly, learner &base, example &ec)
+ void learn(stagewise_poly &poly, base_learner &base, example &ec)
{
bool training = poly.all->training && ec.l.simple.label != FLT_MAX;
poly.original_ec = &ec;
@@ -657,13 +657,13 @@ namespace StagewisePoly
}
- learner *setup(vw &all, po::variables_map &vm)
+ base_learner *setup(vw &all, po::variables_map &vm)
{
- stagewise_poly *poly = (stagewise_poly *) calloc_or_die(1, sizeof(stagewise_poly));
- poly->all = &all;
+ stagewise_poly& poly = calloc_or_die<stagewise_poly>();
+ poly.all = &all;
- depthsbits_create(*poly);
- sort_data_create(*poly);
+ depthsbits_create(poly);
+ sort_data_create(poly);
po::options_description sp_opt("Stagewise poly options");
sp_opt.add_options()
@@ -676,36 +676,34 @@ namespace StagewisePoly
;
vm = add_options(all, sp_opt);
- poly->sched_exponent = vm.count("sched_exponent") ? vm["sched_exponent"].as<float>() : 1.f;
- poly->batch_sz = vm.count("batch_sz") ? vm["batch_sz"].as<uint32_t>() : 1000;
- poly->batch_sz_double = vm.count("batch_sz_no_doubling") ? false : true;
+ poly.sched_exponent = vm.count("sched_exponent") ? vm["sched_exponent"].as<float>() : 1.f;
+ poly.batch_sz = vm.count("batch_sz") ? vm["batch_sz"].as<uint32_t>() : 1000;
+ poly.batch_sz_double = vm.count("batch_sz_no_doubling") ? false : true;
#ifdef MAGIC_ARGUMENT
- poly->magic_argument = vm.count("magic_argument") ? vm["magic_argument"].as<float>() : 0.;
+ poly.magic_argument = vm.count("magic_argument") ? vm["magic_argument"].as<float>() : 0.;
#endif //MAGIC_ARGUMENT
- poly->sum_sparsity = 0;
- poly->sum_input_sparsity = 0;
- poly->num_examples = 0;
- poly->sum_sparsity_sync = 0;
- poly->sum_input_sparsity_sync = 0;
- poly->num_examples_sync = 0;
- poly->last_example_counter = -1;
- poly->numpasses = 1;
- poly->update_support = false;
- poly->original_ec = NULL;
- poly->next_batch_sz = poly->batch_sz;
+ poly.sum_sparsity = 0;
+ poly.sum_input_sparsity = 0;
+ poly.num_examples = 0;
+ poly.sum_sparsity_sync = 0;
+ poly.sum_input_sparsity_sync = 0;
+ poly.num_examples_sync = 0;
+ poly.last_example_counter = -1;
+ poly.numpasses = 1;
+ poly.update_support = false;
+ poly.original_ec = NULL;
+ poly.next_batch_sz = poly.batch_sz;
//following is so that saved models know to load us.
- all.file_options.append(" --stage_poly");
+ *all.file_options << " --stage_poly";
- learner *l = new learner(poly, all.l);
- l->set_learn<stagewise_poly, learn>();
- l->set_predict<stagewise_poly, predict>();
- l->set_finish<stagewise_poly, finish>();
- l->set_save_load<stagewise_poly, save_load>();
- l->set_finish_example<stagewise_poly,finish_example>();
- l->set_end_pass<stagewise_poly, end_pass>();
+ learner<stagewise_poly>& l = init_learner(&poly, all.l, learn, predict);
+ l.set_finish(finish);
+ l.set_save_load(save_load);
+ l.set_finish_example(finish_example);
+ l.set_end_pass(end_pass);
- return l;
+ return make_base(l);
}
}
diff --git a/vowpalwabbit/stagewise_poly.h b/vowpalwabbit/stagewise_poly.h
index 4f5ac1fa..983b4382 100644
--- a/vowpalwabbit/stagewise_poly.h
+++ b/vowpalwabbit/stagewise_poly.h
@@ -6,5 +6,5 @@ license as described in the file LICENSE.
#pragma once
namespace StagewisePoly
{
- LEARNER::learner *setup(vw &all, po::variables_map &vm);
+ LEARNER::base_learner *setup(vw &all, po::variables_map &vm);
}
diff --git a/vowpalwabbit/topk.cc b/vowpalwabbit/topk.cc
index f3e8be8e..445bdb23 100644
--- a/vowpalwabbit/topk.cc
+++ b/vowpalwabbit/topk.cc
@@ -4,31 +4,23 @@ individual contributors. All rights reserved. Released under a BSD (revised)
license as described in the file LICENSE.
*/
#include <float.h>
-#include <math.h>
-#include <stdio.h>
#include <sstream>
-#include <numeric>
-#include <vector>
#include <queue>
#include "reductions.h"
#include "vw.h"
-using namespace std;
-using namespace LEARNER;
-
-typedef pair<float, v_array<char> > scored_example;
-
-struct compare_scored_examples
-{
+namespace TOPK {
+ typedef pair<float, v_array<char> > scored_example;
+
+ struct compare_scored_examples
+ {
bool operator()(scored_example const& a, scored_example const& b) const
{
- return a.first > b.first;
+ return a.first > b.first;
}
-};
-
-namespace TOPK {
-
+ };
+
struct topk{
uint32_t B; //rec number
priority_queue<scored_example, vector<scored_example>, compare_scored_examples > pr_queue;
@@ -85,7 +77,7 @@ namespace TOPK {
}
template <bool is_learn>
- void predict_or_learn(topk& d, learner& base, example& ec)
+ void predict_or_learn(topk& d, LEARNER::base_learner& base, example& ec)
{
if (example_is_newline(ec)) return;//do not predict newline
@@ -102,7 +94,6 @@ namespace TOPK {
d.pr_queue.pop();
d.pr_queue.push(make_pair(ec.pred.scalar, ec.tag));
}
-
}
void finish_example(vw& all, topk& d, example& ec)
@@ -111,19 +102,16 @@ namespace TOPK {
VW::finish_example(all, &ec);
}
- learner* setup(vw& all, po::variables_map& vm)
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm)
{
- topk* data = (topk*)calloc_or_die(1, sizeof(topk));
-
- data->B = (uint32_t)vm["top"].as<size_t>();
-
- data->all = &all;
+ topk& data = calloc_or_die<topk>();
+ data.B = (uint32_t)vm["top"].as<size_t>();
+ data.all = &all;
- learner* l = new learner(data, all.l);
- l->set_learn<topk, predict_or_learn<true> >();
- l->set_predict<topk, predict_or_learn<false> >();
- l->set_finish_example<topk,finish_example>();
+ LEARNER::learner<topk>& l = init_learner(&data, all.l, predict_or_learn<true>,
+ predict_or_learn<false>);
+ l.set_finish_example(finish_example);
- return l;
+ return make_base(l);
}
}
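
[Editor's note] The compare_scored_examples functor above (`a.first > b.first`) turns std::priority_queue into a min-heap, so top() is the *smallest* retained score; popping it and pushing a better score, as predict_or_learn does, keeps exactly the K highest-scoring items seen so far. A standalone illustration of that retention loop (plain floats here instead of scored_example pairs):

    #include <iostream>
    #include <queue>
    #include <vector>

    int main()
    {
      // Greater-than comparator => min-heap: top() is the weakest kept score.
      auto cmp = [](float a, float b) { return a > b; };
      std::priority_queue<float, std::vector<float>, decltype(cmp)> pq(cmp);

      const size_t K = 3;
      float scores[] = {0.2f, 0.9f, 0.1f, 0.7f, 0.5f, 0.8f};
      for (float s : scores)
      {
        if (pq.size() < K)
          pq.push(s);
        else if (pq.top() < s)  // beat the weakest of the current top-K
        {
          pq.pop();
          pq.push(s);
        }
      }
      while (!pq.empty())  // prints 0.7 0.8 0.9 (ascending)
      {
        std::cout << pq.top() << " ";
        pq.pop();
      }
      std::cout << "\n";
    }
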
diff --git a/vowpalwabbit/topk.h b/vowpalwabbit/topk.h
index a24d0653..866d94c5 100644
--- a/vowpalwabbit/topk.h
+++ b/vowpalwabbit/topk.h
@@ -14,5 +14,5 @@ license as described in the file LICENSE.
namespace TOPK
{
- LEARNER::learner* setup(vw& all, po::variables_map& vm);
+ LEARNER::base_learner* setup(vw& all, po::variables_map& vm);
}
diff --git a/vowpalwabbit/unique_sort.cc b/vowpalwabbit/unique_sort.cc
index c682cf63..1a323d2d 100644
--- a/vowpalwabbit/unique_sort.cc
+++ b/vowpalwabbit/unique_sort.cc
@@ -6,9 +6,7 @@ license as described in the file LICENSE.
#include "global_data.h"
int order_features(const void* first, const void* second)
-{
- return ((feature*)first)->weight_index - ((feature*)second)->weight_index;
-}
+{ return ((feature*)first)->weight_index - ((feature*)second)->weight_index;}
int order_audit_features(const void* first, const void* second)
{
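
[Editor's note] order_features is a classic qsort-style comparator: negative, zero, or positive according to the ordering of the two weight indices, obtained here by plain subtraction. Subtraction can overflow or truncate when the index type is wider than int; an explicit comparison sidesteps that. A standalone sketch of the safer variant (this `feature` is a simplified stand-in, not VW's struct):

    #include <cstdlib>
    #include <cstdio>

    struct feature { float x; size_t weight_index; };  // simplified stand-in

    // Overflow-safe variant of the subtraction idiom: yields -1, 0, or 1.
    int order_features(const void* first, const void* second)
    {
      size_t a = ((const feature*)first)->weight_index;
      size_t b = ((const feature*)second)->weight_index;
      return (a > b) - (a < b);
    }

    int main()
    {
      feature fs[] = { {1.f, 42}, {1.f, 7}, {1.f, 19} };
      qsort(fs, 3, sizeof(feature), order_features);
      for (const feature& f : fs) printf("%zu ", f.weight_index);  // 7 19 42
      printf("\n");
    }
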
diff --git a/vowpalwabbit/vw.h b/vowpalwabbit/vw.h
index 3408da9b..0fa4ee77 100644
--- a/vowpalwabbit/vw.h
+++ b/vowpalwabbit/vw.h
@@ -18,7 +18,7 @@ namespace VW {
*/
vw* initialize(string s);
- void cmd_string_replace_value( string& cmd, string flag_to_replace, string new_value );
+ void cmd_string_replace_value( std::stringstream*& ss, string flag_to_replace, string new_value );
char** get_argv_from_string(string s, int& argc);
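
[Editor's note] Per its name and arguments, cmd_string_replace_value swaps the value that follows a given flag in a VW command line; this hunk moves it from mutating a std::string to mutating a std::stringstream. The following is a hypothetical standalone helper over a plain string, purely to illustrate the flag-replacement behavior; it is not VW's implementation, and replace_flag_value is an invented name:

    #include <iostream>
    #include <string>

    // Hypothetical illustration: replace the token following `flag` in `cmd`,
    // or append "flag value" if the flag is absent. Assumes a well-formed
    // command line where a present flag is followed by a value.
    void replace_flag_value(std::string& cmd, const std::string& flag,
                            const std::string& new_value)
    {
      size_t pos = cmd.find(flag);
      if (pos == std::string::npos)
      {
        cmd += " " + flag + " " + new_value;
        return;
      }
      size_t val_begin = pos + flag.size() + 1;   // skip "flag "
      size_t val_end = cmd.find(' ', val_begin);  // value runs to next space
      if (val_end == std::string::npos) val_end = cmd.size();
      cmd.replace(val_begin, val_end - val_begin, new_value);
    }

    int main()
    {
      std::string cmd = "--bit_precision 18 --quiet";
      replace_flag_value(cmd, "--bit_precision", "24");
      std::cout << cmd << "\n";  // --bit_precision 24 --quiet
    }
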