commit 99f9974e4002e0aaef22741f656e5bc9baf9dc40 (patch)
Author:    Hal Daume III <me@hal3.name>  2014-05-25 00:56:42 +0400
Committer: Hal Daume III <me@hal3.name>  2014-05-25 00:56:42 +0400
tree       2e26b0c90c94b852c229653ca302f10cd3716b67 /vowpalwabbit
parent     7703b4bf0cece9bdccc662e8680725d333d408c3 (diff)
merged john's changes
Diffstat (limited to 'vowpalwabbit')
56 files changed, 1149 insertions, 1599 deletions
diff --git a/vowpalwabbit/Makefile.am b/vowpalwabbit/Makefile.am
index 34b150f5..cc2e1a71 100644
--- a/vowpalwabbit/Makefile.am
+++ b/vowpalwabbit/Makefile.am
@@ -4,7 +4,7 @@
 liballreduce_la_SOURCES = allreduce.cc
 bin_PROGRAMS = vw active_interactor
-libvw_la_SOURCES = hash.cc memory.cc global_data.cc io_buf.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc rand48.cc simple_label.cc multiclass.cc oaa.cc ect.cc autolink.cc binary.cc lrq.cc cost_sensitive.cc csoaa.cc cb.cc cb_algs.cc wap.cc searn.cc searn_sequencetask.cc parse_example.cc scorer.cc sparse_dense.cc network.cc parse_args.cc accumulate.cc gd.cc learner.cc lda_core.cc gd_mf.cc mf.cc bfgs.cc noop.cc print.cc example.cc parser.cc loss_functions.cc sender.cc nn.cc bs.cc cbify.cc topk.cc
+libvw_la_SOURCES = hash.cc memory.cc global_data.cc io_buf.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc rand48.cc simple_label.cc multiclass.cc oaa.cc ect.cc autolink.cc binary.cc lrq.cc cost_sensitive.cc csoaa.cc cb.cc cb_algs.cc wap.cc searn.cc searn_sequencetask.cc parse_example.cc scorer.cc network.cc parse_args.cc accumulate.cc gd.cc learner.cc lda_core.cc gd_mf.cc mf.cc bfgs.cc noop.cc print.cc example.cc parser.cc loss_functions.cc sender.cc nn.cc bs.cc cbify.cc topk.cc
 # accumulate.cc uses all_reduce
 libvw_la_LIBADD = liballreduce.la
diff --git a/vowpalwabbit/accumulate.cc b/vowpalwabbit/accumulate.cc
index ed0b6136..daa6a75f 100644
--- a/vowpalwabbit/accumulate.cc
+++ b/vowpalwabbit/accumulate.cc
@@ -104,42 +104,8 @@ void accumulate_weighted_avg(vw& all, string master_location, regressor& reg) {
      weights[stride*i] = 0;
}
- if(!all.feature_mask_idx) //do in place all_reduce when the feature mask is absent
- all_reduce<float>(weights, length*stride, master_location, all.unique_id, all.total, all.node, all.socks);
+ all_reduce<float>(weights, length*stride, master_location, all.unique_id, all.total, all.node, all.socks);
- else {
-
- //Find weighted averaged weight
- all_reduce<float>(local_weights, length, master_location, all.unique_id, all.total, all.node, all.socks);
-
- for(uint32_t i = 0;i < length;i++)
- {
- weights[stride*i] = local_weights[i];
- local_weights[i] = weights[stride*i+1];
-
- }
-
- //Find weighted average for adaptation
- all_reduce(local_weights, length, master_location, all.unique_id, all.total, all.node, all.socks);
-
- for(uint32_t i = 0;i < length;i++)
- {
- weights[stride*i+1] = local_weights[i];
- if (all.normalized_updates)
- local_weights[i] = weights[stride*i+all.normalized_idx];
-
- }
-
- if (all.normalized_updates)
- {
- //Find weighted average for normalization
- all_reduce(local_weights, length, master_location, all.unique_id, all.total, all.node, all.socks);
- for(uint32_t i = 0;i < length;i++)
- weights[stride*i+all.normalized_idx] = local_weights[i];
- }
- }
-
-
delete[] local_weights;
}
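The accumulate.cc change above collapses the feature-mask special case into a single in-place all_reduce over the strided weight vector. For reference, the general weighted-averaging pattern it relies on looks like this — a minimal sketch with a hypothetical all_reduce_sum stand-in, not VW's actual accumulate API:

```cpp
#include <cstddef>

// Stand-in for VW's all_reduce<float>: on a real cluster this sums each
// element of buf across all nodes in place; this single-node stub is a
// no-op so the sketch compiles and runs locally.
static void all_reduce_sum(float* /*buf*/, size_t /*n*/) {}

// Weighted averaging with a single in-place allreduce: scale local
// weights by this node's importance mass, sum across nodes, then divide
// by the summed mass to get the weighted mean on every node.
void weighted_average(float* weights, size_t n, float local_mass) {
  for (size_t i = 0; i < n; i++) weights[i] *= local_mass;
  all_reduce_sum(weights, n);      // sum scaled vectors across nodes
  float total_mass = local_mass;
  all_reduce_sum(&total_mass, 1);  // sum the masses the same way
  for (size_t i = 0; i < n; i++) weights[i] /= total_mass;
}
```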
diff --git a/vowpalwabbit/allreduce.cc b/vowpalwabbit/allreduce.cc
index 782da0c1..28226901 100644
--- a/vowpalwabbit/allreduce.cc
+++ b/vowpalwabbit/allreduce.cc
@@ -36,18 +36,18 @@ socket_t sock_connect(const uint32_t ip, const int port) {
  sockaddr_in far_end;
  far_end.sin_family = AF_INET;
  far_end.sin_port = port;
-
+
  far_end.sin_addr = *(in_addr*)&ip;
  memset(&far_end.sin_zero, '\0',8);
-
+
  {
    char hostname[NI_MAXHOST];
    char servInfo[NI_MAXSERV];
    getnameinfo((sockaddr *) &far_end, sizeof(sockaddr), hostname, NI_MAXHOST, servInfo, NI_MAXSERV, NI_NUMERICSERV);
-
+
    cerr << "connecting to " << hostname << ':' << ntohs(port) << endl;
  }
-
+
  size_t count = 0;
  int ret;
  while ( (ret =connect(sock,(sockaddr*)&far_end, sizeof(far_end))) == -1 && count < 100)
@@ -59,7 +59,7 @@ socket_t sock_connect(const uint32_t ip, const int port) {
      cerr << "can't connect to: " ;
      uint32_t pip = ntohl(ip);
      unsigned char * pp = (unsigned char*)&pip;
-
+
      for (size_t i = 0; i < 4; i++)
        {
          cerr << static_cast<unsigned int>(static_cast<unsigned short>(pp[3-i])) << ".";
@@ -67,7 +67,7 @@ socket_t sock_connect(const uint32_t ip, const int port) {
      cerr << ':' << ntohs(port) << endl;
      perror(NULL);
      count++;
-#ifdef _WIN32 
+#ifdef _WIN32
      Sleep(1);
#else
      sleep(1);
@@ -90,7 +90,7 @@ socket_t getsock()
  // of VW on the same machine to potentially contact the wrong tree node.
#ifndef _WIN32
  int on = 1;
-  if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) < 0) 
+  if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) < 0)
    perror("setsockopt SO_REUSEADDR");
#endif
  return sock;
@@ -104,7 +104,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
  int lastError = WSAGetLastError();
#endif
-
+
  struct hostent* master = gethostbyname(master_location.c_str());
@@ -119,11 +119,11 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
  socket_t master_sock = sock_connect(master_ip, htons(port));
  if(send(master_sock, (const char*)&unique_id, sizeof(unique_id), 0) < (int)sizeof(unique_id))
-    cerr << "write failed!" << endl; 
+    cerr << "write failed!" << endl;
  if(send(master_sock, (const char*)&total, sizeof(total), 0) < (int)sizeof(total))
-    cerr << "write failed!" << endl; 
+    cerr << "write failed!" << endl;
  if(send(master_sock, (char*)&node, sizeof(node), 0) < (int)sizeof(node))
-    cerr << "write failed!" << endl; 
+    cerr << "write failed!" << endl;
  int ok;
  if (recv(master_sock, (char*)&ok, sizeof(ok), 0) < (int)sizeof(ok))
    cerr << "read 1 failed!" << endl;
@@ -173,7 +173,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
  if (listen(sock, kid_count) < 0)
    {
      perror("listen failed! ");
-      shutdown(sock, SHUT_RDWR);
+      CLOSESOCK(sock);
      sock = getsock();
    }
  else
@@ -192,7 +192,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
  if(recv(master_sock, (char*)&parent_port, sizeof(parent_port), 0) < (int)sizeof(parent_port))
    cerr << "read 4 failed!" << endl;
-  shutdown(master_sock, SHUT_RDWR);
+  CLOSESOCK(master_sock);
  if(parent_ip != (uint32_t)-1)
    {
      socks.parent = sock_connect(parent_ip, parent_port);
@@ -205,7 +205,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
    {
      sockaddr_in child_address;
      socklen_t size = sizeof(child_address);
-      socket_t f = accept(sock,(sockaddr*)&child_address,&size); 
+      socket_t f = accept(sock,(sockaddr*)&child_address,&size);
      if (f < 0)
        {
          cerr << "bad client socket!" << endl;
@@ -219,7 +219,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
    }
  if (kid_count > 0)
-    shutdown(sock, SHUT_RDWR);
+    CLOSESOCK(sock);
}
@@ -229,9 +229,9 @@ void pass_down(char* buffer, const size_t parent_read_pos, size_t& children_sent
  if(my_bufsize > 0)
    {
      //going to pass up this chunk of data to the children
-      if(child_sockets[0] != -1 && send(child_sockets[0], buffer+children_sent_pos, (int)my_bufsize, 0) < (int)my_bufsize) 
+      if(child_sockets[0] != -1 && send(child_sockets[0], buffer+children_sent_pos, (int)my_bufsize, 0) < (int)my_bufsize)
        cerr<<"Write to left child failed\n";
-      if(child_sockets[1] != -1 && send(child_sockets[1], buffer+children_sent_pos, (int)my_bufsize, 0) < (int)my_bufsize) 
+      if(child_sockets[1] != -1 && send(child_sockets[1], buffer+children_sent_pos, (int)my_bufsize, 0) < (int)my_bufsize)
        cerr<<"Write to right child failed\n";
      children_sent_pos += my_bufsize;
@@ -241,16 +241,16 @@ void pass_down(char* buffer, const size_t parent_read_pos, size_t& children_sent
void broadcast(char* buffer, const size_t n, const socket_t parent_sock, const socket_t * child_sockets)
{
-
+
  size_t parent_read_pos = 0; //First unread float from parent
  size_t children_sent_pos = 0; //First unsent float to children
  //parent_sent_pos <= left_read_pos
  //parent_sent_pos <= right_read_pos
-
+
  if(parent_sock == -1)
    {
-      parent_read_pos = n; 
+      parent_read_pos = n;
    }
-  if(child_sockets[0] == -1 && child_sockets[1] == -1) 
+  if(child_sockets[0] == -1 && child_sockets[1] == -1)
    children_sent_pos = n;
  while (parent_read_pos < n || children_sent_pos < n)
@@ -270,8 +270,7 @@ void broadcast(char* buffer, const size_t n, const socket_t parent_sock, const s
          cerr <<" Read from parent failed\n";
          perror(NULL);
        }
-          parent_read_pos += read_size; 
+          parent_read_pos += read_size;
        }
    }
}
-
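The recurring substitution in allreduce.cc is shutdown(sock, SHUT_RDWR) → CLOSESOCK(sock). shutdown() only disables further I/O on a connection; it does not release the descriptor, so the old code leaked sockets (and on Windows SHUT_RDWR had been redefined to SD_BOTH as a workaround). The portability macro introduced in allreduce.h, shown here as a self-contained sketch:

```cpp
// One macro that actually releases the descriptor on both platforms;
// this mirrors the CLOSESOCK definition added in allreduce.h.
#ifdef _WIN32
#include <winsock2.h>
typedef SOCKET socket_t;
#define CLOSESOCK closesocket
#else
#include <unistd.h>
typedef int socket_t;
#define CLOSESOCK close
#endif

void release(socket_t s) {
  CLOSESOCK(s);  // closesocket() on Windows, close() elsewhere
}
```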
diff --git a/vowpalwabbit/allreduce.h b/vowpalwabbit/allreduce.h
index 35185456..7c2bdfd1 100644
--- a/vowpalwabbit/allreduce.h
+++ b/vowpalwabbit/allreduce.h
@@ -3,7 +3,7 @@ Copyright (c) by respective owners including Yahoo!, Microsoft, and individual
contributors. All rights reserved.  Released under a BSD
license as described in the file LICENSE.
*/
-// This implements the allreduce function of MPI. 
+// This implements the allreduce function of MPI.
#ifndef ALLREDUCE_H
#define ALLREDUCE_H
@@ -15,7 +15,7 @@ typedef unsigned int uint32_t;
typedef unsigned short uint16_t;
typedef int socklen_t;
typedef SOCKET socket_t;
-#define SHUT_RDWR SD_BOTH
+#define CLOSESOCK closesocket
#else
#include <sys/socket.h>
#include <sys/socket.h>
@@ -25,6 +25,7 @@ typedef SOCKET socket_t;
#include <stdlib.h>
#include <stdio.h>
typedef int socket_t;
+#define CLOSESOCK close
#endif
using namespace std;
@@ -39,11 +40,11 @@ struct node_socks {
  {
    if(current_master != "") {
      if(parent != -1)
-        shutdown(this->parent, SHUT_RDWR);
-      if(children[0] != -1)
-        shutdown(this->children[0], SHUT_RDWR);
+        CLOSESOCK(this->parent);
+      if(children[0] != -1)
+        CLOSESOCK(this->children[0]);
      if(children[1] != -1)
-        shutdown(this->children[1], SHUT_RDWR);
+        CLOSESOCK(this->children[1]);
    }
  }
  node_socks ()
@@ -54,7 +55,7 @@ struct node_socks {
template <class T> void addbufs(T* buf1, const T* buf2, const size_t n) {
-  for(size_t i = 0;i < n;i++) 
+  for(size_t i = 0;i < n;i++)
    buf1[i] += buf2[i];
}
@@ -63,11 +64,11 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
template <class T> void pass_up(char* buffer, size_t left_read_pos, size_t right_read_pos, size_t& parent_sent_pos, socket_t parent_sock, size_t n) {
  size_t my_bufsize = min(ar_buf_size, ((int)(floor(left_read_pos/((float)sizeof(T)))*sizeof(T)) - parent_sent_pos));
  my_bufsize = min(my_bufsize, ((int)(floor(right_read_pos/((float)sizeof(T)))*sizeof(T)) - parent_sent_pos));
-
+
  if(my_bufsize > 0) {
    //going to pass up this chunk of data to the parent
    int write_size = send(parent_sock, buffer+parent_sent_pos, (int)my_bufsize, 0);
-    if(write_size < (int)my_bufsize) 
+    if(write_size < (int)my_bufsize)
      cerr<<"Write to parent failed "<<my_bufsize<<" "<<write_size<<" "<<parent_sent_pos<<" "<<left_read_pos<<" "<<right_read_pos<<endl ;
    parent_sent_pos += my_bufsize;
  }
@@ -90,12 +91,12 @@ template <class T>void reduce(char* buffer, const size_t n, const socket_t paren
  size_t parent_sent_pos = 0; //First unsent float to parent
  //parent_sent_pos <= left_read_pos
  //parent_sent_pos <= right_read_pos
-
+
  if(child_sockets[0] == -1) {
    child_read_pos[0] = n;
  }
  if(child_sockets[1] == -1) {
-    child_read_pos[1] = n; 
+    child_read_pos[1] = n;
  }
  while (parent_sent_pos < n || child_read_pos[0] < n || child_read_pos[1] < n)
@@ -112,7 +113,7 @@ template <class T>void reduce(char* buffer, const size_t n, const socket_t paren
      perror(NULL);
      throw exception();
    }
-
+
    for(int i = 0;i < 2;i++) {
      if(child_sockets[i] != -1 && FD_ISSET(child_sockets[i],&fds)) {
        //there is data to be left from left child
@@ -120,8 +121,8 @@ template <class T>void reduce(char* buffer, const size_t n, const socket_t paren
          cerr<<"I think child has no data to send but he thinks he has "<<FD_ISSET(child_sockets[0],&fds)<<" "<<FD_ISSET(child_sockets[1],&fds)<<endl;
          throw exception();
        }
-
-
+
+
        size_t count = min(ar_buf_size,n - child_read_pos[i]);
        int read_size = recv(child_sockets[i], child_read_buf[i] + child_unprocessed[i], (int)count, 0);
        if(read_size == -1) {
@@ -129,36 +130,36 @@ template <class T>void reduce(char* buffer, const size_t n, const socket_t paren
          perror(NULL);
          throw exception();
        }
-
+
        addbufs((T*)buffer + child_read_pos[i]/sizeof(T), (T*)child_read_buf[i], (child_read_pos[i] + read_size)/sizeof(T) - child_read_pos[i]/sizeof(T));
-
+
        child_read_pos[i] += read_size;
        int old_unprocessed = child_unprocessed[i];
        child_unprocessed[i] = child_read_pos[i] % (int)sizeof(T);
        for(int j = 0;j < child_unprocessed[i];j++) {
          child_read_buf[i][j] = child_read_buf[i][((old_unprocessed + read_size)/(int)sizeof(T))*sizeof(T)+j];
        }
-
+
        if(child_read_pos[i] == n) //Done reading parent
          FD_CLR(child_sockets[i],&fds);
      }
      else if(child_sockets[i] != -1 && child_read_pos[i] != n)
-        FD_SET(child_sockets[i],&fds); 
+        FD_SET(child_sockets[i],&fds);
    }
  }
-  if(parent_sock == -1 && child_read_pos[0] == n && child_read_pos[1] == n) 
+  if(parent_sock == -1 && child_read_pos[0] == n && child_read_pos[1] == n)
    parent_sent_pos = n;
-  }
-
+  }
+
}
void broadcast(char* buffer, const size_t n, const socket_t parent_sock, const socket_t * child_sockets);
-template <class T> void all_reduce(T* buffer, const size_t n, const std::string master_location, const size_t unique_id, const size_t total, const size_t node, node_socks& socks) 
+template <class T> void all_reduce(T* buffer, const size_t n, const std::string master_location, const size_t unique_id, const size_t total, const size_t node, node_socks& socks)
{
-  if(master_location != socks.current_master) 
+  if(master_location != socks.current_master)
    all_reduce_init(master_location, unique_id, total, node, socks);
  reduce<T>((char*)buffer, n*sizeof(T), socks.parent, socks.children);
  broadcast((char*)buffer, n*sizeof(T), socks.parent, socks.children);
diff --git a/vowpalwabbit/autolink.cc b/vowpalwabbit/autolink.cc
index 87110908..c8aacf5e 100644
--- a/vowpalwabbit/autolink.cc
+++ b/vowpalwabbit/autolink.cc
@@ -42,18 +42,15 @@ namespace ALINK {
    ec.total_sum_feat_sq -= sum_sq;
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    autolink* data = (autolink*)calloc_or_die(1,sizeof(autolink));
    data->d = (uint32_t)vm["autolink"].as<size_t>();
    data->stride_shift = all.reg.stride_shift;
-    if (!vm_file.count("autolink"))
-      {
-        std::stringstream ss;
-        ss << " --autolink " << data->d << " ";
-        all.options_from_file.append(ss.str());
-      }
+    std::stringstream ss;
+    ss << " --autolink " << data->d;
+    all.file_options = all.file_options+ss.str();
    learner* ret = new learner(data, all.l);
    ret->set_learn<autolink, predict_or_learn<true> >();
diff --git a/vowpalwabbit/autolink.h b/vowpalwabbit/autolink.h
index d7f8c6ce..28a3f341 100644
--- a/vowpalwabbit/autolink.h
+++ b/vowpalwabbit/autolink.h
@@ -2,6 +2,6 @@
#ifndef AUTOLINK_H
#define AUTOLINK_H
namespace ALINK {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
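The autolink change is the first instance of the commit's recurring refactor: setup() now takes only vm, and unconditionally appends its flags to all.file_options (replacing the old vm/vm_file double bookkeeping). A minimal sketch of the convention, using an illustrative vw_like stand-in rather than VW's real vw struct:

```cpp
#include <sstream>
#include <string>

// Every reduction's setup() appends its own flags to file_options so
// they are serialized into the regressor and replayed at load time.
struct vw_like { std::string file_options; };

void record_option(vw_like& all, const std::string& flag, size_t value) {
  std::stringstream ss;
  ss << " --" << flag << " " << value;  // e.g. " --autolink 2"
  all.file_options.append(ss.str());
}
```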
holdout set error"; } - - } + } if (b.final_pass == b.current_pass) { + finalize_regressor(*all, all->final_regressor_name); + set_done(*all); + } }else{//reaching convergence in the previous pass if(b.output_regularizer) @@ -968,7 +970,7 @@ void save_load(bfgs& b, io_buf& model_file, bool read, bool text) b.backstep_on = true; } -learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm) +learner* setup(vw& all, po::variables_map& vm) { bfgs* b = (bfgs*)calloc_or_die(1,sizeof(bfgs)); b->all = &all; @@ -982,6 +984,16 @@ learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm) b->no_win_counter = 0; b->early_stop_thres = 3; + po::options_description bfgs_opts("LBFGS options"); + + bfgs_opts.add_options() + ("hessian_on", "use second derivative in line search") + ("mem", po::value<int>(&(all.m)), "memory in bfgs") + ("conjugate_gradient", "use conjugate gradient based optimization") + ("termination", po::value<float>(&(all.rel_threshold)),"Termination threshold"); + + vm = add_options(all, bfgs_opts); + if(!all.holdout_set_off) { all.sd->holdout_best_loss = FLT_MAX; diff --git a/vowpalwabbit/bfgs.h b/vowpalwabbit/bfgs.h index ac9e471a..82699fa6 100644 --- a/vowpalwabbit/bfgs.h +++ b/vowpalwabbit/bfgs.h @@ -7,7 +7,7 @@ license as described in the file LICENSE. #define BFGS_H namespace BFGS { - LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/binary.cc b/vowpalwabbit/binary.cc index 585f185a..aa55d666 100644 --- a/vowpalwabbit/binary.cc +++ b/vowpalwabbit/binary.cc @@ -25,15 +25,8 @@ namespace BINARY { ec.loss = 1.; } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) {//parse and set arguments - if (!vm_file.count("binary")) - { - std::stringstream ss; - ss << " --binary "; - all.options_from_file.append(ss.str()); - } - all.sd->binary_label = true; //Create new learner learner* ret = new learner(NULL, all.l); diff --git a/vowpalwabbit/binary.h b/vowpalwabbit/binary.h index 0016f16b..0cc4ab1b 100644 --- a/vowpalwabbit/binary.h +++ b/vowpalwabbit/binary.h @@ -1,6 +1,6 @@ #ifndef BINARY_H #define BINARY_H namespace BINARY { - LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/bs.cc b/vowpalwabbit/bs.cc index c3cb5f25..e7321819 100644 --- a/vowpalwabbit/bs.cc +++ b/vowpalwabbit/bs.cc @@ -192,59 +192,31 @@ namespace BS { d.pred_vec.~vector(); } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { bs* data = (bs*)calloc_or_die(1, sizeof(bs)); data->ub = FLT_MAX; data->lb = -FLT_MAX; - po::options_description desc("BS options"); - desc.add_options() + po::options_description bs_options("Bootstrap options"); + bs_options.add_options() ("bs_type", po::value<string>(), "prediction type {mean,vote}"); + + vm = add_options(all, bs_options); - po::parsed_options parsed = po::command_line_parser(opts). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). 
diff --git a/vowpalwabbit/bs.cc b/vowpalwabbit/bs.cc
index c3cb5f25..e7321819 100644
--- a/vowpalwabbit/bs.cc
+++ b/vowpalwabbit/bs.cc
@@ -192,59 +192,31 @@ namespace BS {
    d.pred_vec.~vector();
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    bs* data = (bs*)calloc_or_die(1, sizeof(bs));
    data->ub = FLT_MAX;
    data->lb = -FLT_MAX;
-    po::options_description desc("BS options");
-    desc.add_options()
+    po::options_description bs_options("Bootstrap options");
+    bs_options.add_options()
      ("bs_type", po::value<string>(), "prediction type {mean,vote}");
+
+    vm = add_options(all, bs_options);
-    po::parsed_options parsed = po::command_line_parser(opts).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    opts = po::collect_unrecognized(parsed.options, po::include_positional);
-    po::store(parsed, vm);
-    po::notify(vm);
-
-    po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc,all.options_from_file_argv).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    po::store(parsed_file, vm_file);
-    po::notify(vm_file);
+    data->B = (uint32_t)vm["bootstrap"].as<size_t>();
-    if( vm_file.count("bs") ) {
-      data->B = (uint32_t)vm_file["bs"].as<size_t>();
-      if( vm.count("bs") && (uint32_t)vm["bs"].as<size_t>() != data->B )
-        std::cerr << "warning: you specified a different number of samples through --bs than the one loaded from predictor. Pursuing with loaded value of: " << data->B << endl;
-    }
-    else {
-      data->B = (uint32_t)vm["bs"].as<size_t>();
+    //append bs with number of samples to options_from_file so it is saved to regressor later
+    std::stringstream ss;
+    ss << " --bootstrap " << data->B;
+    all.file_options.append(ss.str());
-      //append bs with number of samples to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --bs " << data->B;
-      all.options_from_file.append(ss.str());
-    }
+    std::string type_string("mean");
-    if (vm.count("bs_type") || vm_file.count("bs_type"))
+    if (vm.count("bs_type"))
    {
-      std::string type_string;
-
-      if(vm_file.count("bs_type")) {
-        type_string = vm_file["bs_type"].as<std::string>();
-        if( vm.count("bs_type") && type_string.compare(vm["bs_type"].as<string>()) != 0)
-          cerr << "You specified a different --bs_type than the one loaded from regressor file. Pursuing with loaded value of: " << type_string << endl;
-      }
-      else {
-        type_string = vm["bs_type"].as<std::string>();
-
-        all.options_from_file.append(" --bs_type ");
-        all.options_from_file.append(type_string);
-      }
-
+      type_string = vm["bs_type"].as<std::string>();
+
      if (type_string.compare("mean") == 0) {
        data->bs_type = BS_TYPE_MEAN;
      }
@@ -256,11 +228,10 @@ namespace BS {
        data->bs_type = BS_TYPE_MEAN;
      }
    }
-    else {
-      //by default use mean
+    else //by default use mean
      data->bs_type = BS_TYPE_MEAN;
-      all.options_from_file.append(" --bs_type mean");
-    }
+    all.file_options.append(" --bs_type ");
+    all.file_options.append(type_string);
    data->pred_vec.reserve(data->B);
    data->all = &all;
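For context on the two --bs_type modes the bootstrap reduction supports, here is an illustrative sketch of the aggregation rules over the per-replicate predictions that bs.cc collects in pred_vec (function names are mine, and preds is assumed non-empty):

```cpp
#include <algorithm>
#include <map>
#include <numeric>
#include <vector>

// --bs_type mean: average the replicate predictions.
float bs_mean(const std::vector<float>& preds) {
  return std::accumulate(preds.begin(), preds.end(), 0.f) / preds.size();
}

// --bs_type vote: return the most frequent prediction (majority vote).
float bs_vote(const std::vector<float>& preds) {
  std::map<float, int> counts;
  for (float p : preds) counts[p]++;
  return std::max_element(counts.begin(), counts.end(),
                          [](const std::pair<const float, int>& a,
                             const std::pair<const float, int>& b) {
                            return a.second < b.second;
                          })->first;
}
```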
diff --git a/vowpalwabbit/bs.h b/vowpalwabbit/bs.h
index c3c1e578..9abbafea 100644
--- a/vowpalwabbit/bs.h
+++ b/vowpalwabbit/bs.h
@@ -13,7 +13,7 @@ license as described in the file LICENSE.
namespace BS
{
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
  void print_result(int f, float res, float weight, v_array<char> tag, float lb, float ub);
  void output_example(vw& all, example* ec, float lb, float ub);
diff --git a/vowpalwabbit/cb_algs.cc b/vowpalwabbit/cb_algs.cc
index 1873b29a..5c79a7fd 100644
--- a/vowpalwabbit/cb_algs.cc
+++ b/vowpalwabbit/cb_algs.cc
@@ -291,7 +291,7 @@ namespace CB_ALGS
    vw* all = c.all;
    CB::label* ld = (CB::label*)ec.ld;
-    //check if this is a test example where we just want a prediction 
+    //check if this is a test example where we just want a prediction
    if( is_test_label(ld) )
    {
      //if so just query base cost-sensitive learner
@@ -299,6 +299,8 @@ namespace CB_ALGS
      ec.ld = &c.cb_cs_ld;
      base.predict(ec);
+      ld->prediction = c.cb_cs_ld.prediction;
+      ec.ld = ld;
      for (size_t i=0; i<ld->costs.size(); i++)
        ld->costs[i].partial_prediction = c.cb_cs_ld.costs[i].partial_prediction;
@@ -477,60 +479,37 @@ namespace CB_ALGS
    VW::finish_example(all, &ec);
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    cb* c = (cb*)calloc_or_die(1, sizeof(cb));
    c->all = &all;
    c->min_cost = 0.;
    c->max_cost = 1.;
-    po::options_description desc("CB options");
-    desc.add_options()
+
+    uint32_t nb_actions = (uint32_t)vm["cb"].as<size_t>();
+    //append cb with nb_actions to file_options so it is saved to regressor later
+
+    po::options_description cb_opts("CB options");
+    cb_opts.add_options()
      ("cb_type", po::value<string>(), "contextual bandit method to use in {ips,dm,dr}");
-    po::parsed_options parsed = po::command_line_parser(opts).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    opts = po::collect_unrecognized(parsed.options, po::include_positional);
-    po::store(parsed, vm);
-    po::notify(vm);
-
-    po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc,all.options_from_file_argv).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    po::store(parsed_file, vm_file);
-    po::notify(vm_file);
-
-    uint32_t nb_actions = 0;
-    if( vm_file.count("cb") ) { //if loaded options from regressor file already
-      nb_actions = (uint32_t)vm_file["cb"].as<size_t>();
-      if( vm.count("cb") && (uint32_t)vm["cb"].as<size_t>() != nb_actions )
-        std::cerr << "warning: you specified a different number of actions through --cb than the one loaded from regressor. Pursuing with loaded value of: " << nb_actions << endl;
-    }
-    else {
-      nb_actions = (uint32_t)vm["cb"].as<size_t>();
-      //append cb with nb_actions to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --cb " << nb_actions;
-      all.options_from_file.append(ss.str());
-    }
+    vm = add_options(all, cb_opts);
+
+    std::stringstream ss;
+    ss << " --cb " << nb_actions;
+    all.file_options.append(ss.str());
+
+    all.sd->k = nb_actions;
    size_t problem_multiplier = 2;//default for DR
-    if (vm.count("cb_type") || vm_file.count("cb_type"))
+    if (vm.count("cb_type"))
    {
      std::string type_string;
-      if(vm_file.count("cb_type")) {
-        type_string = vm_file["cb_type"].as<std::string>();
-        if( vm.count("cb_type") && type_string.compare(vm["cb_type"].as<string>()) != 0)
-          cerr << "You specified a different --cb_type than the one loaded from regressor file. Pursuing with loaded value of: " << type_string << endl;
-      }
-      else {
-        type_string = vm["cb_type"].as<std::string>();
-
-        all.options_from_file.append(" --cb_type ");
-        all.options_from_file.append(type_string);
-      }
+      type_string = vm["cb_type"].as<std::string>();
+
+      all.file_options.append(" --cb_type ");
+      all.file_options.append(type_string);
      if (type_string.compare("dr") == 0)
        c->cb_type = CB_TYPE_DR;
@@ -552,7 +531,7 @@ namespace CB_ALGS
    else {
      //by default use doubly robust
      c->cb_type = CB_TYPE_DR;
-      all.options_from_file.append(" --cb_type dr");
+      all.file_options.append(" --cb_type dr");
    }
    all.p->lp = CB::cb_label;
diff --git a/vowpalwabbit/cb_algs.h b/vowpalwabbit/cb_algs.h
index 730a548a..8fffadbc 100644
--- a/vowpalwabbit/cb_algs.h
+++ b/vowpalwabbit/cb_algs.h
@@ -9,7 +9,7 @@ license as described in the file LICENSE.
//TODO: extend to handle CSOAA_LDF and WAP_LDF
namespace CB_ALGS {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
  template <bool is_learn>
  float get_cost_pred(vw& all, CB::cb_class* known_cost, example& ec, uint32_t index, uint32_t base)
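The cb_algs predict fix copies the base learner's prediction out of the temporary cost-sensitive label before restoring the example's original label pointer; previously the prediction was lost on the test path. The swap-and-restore pattern it tightens, as an illustrative sketch with made-up label types:

```cpp
// Illustrative types; the real code swaps ec.ld between a CB label and a
// cost-sensitive scratch label around base.predict().
struct cs_label { unsigned prediction; };
struct cb_label_t { unsigned prediction; };

void predict_via_base(void (*base_predict)(void* ld), void*& ec_ld,
                      cb_label_t* ld, cs_label& scratch) {
  void* saved = ec_ld;
  ec_ld = &scratch;                     // substitute the scratch label
  base_predict(ec_ld);                  // base fills scratch.prediction
  ld->prediction = scratch.prediction;  // copy out before restoring (the fix)
  ec_ld = saved;                        // restore the caller's label
}
```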
diff --git a/vowpalwabbit/cbify.cc b/vowpalwabbit/cbify.cc
index 270e0b08..4f317380 100644
--- a/vowpalwabbit/cbify.cc
+++ b/vowpalwabbit/cbify.cc
@@ -61,7 +61,7 @@ namespace CBIFY {
      ec.loss = loss(ld->label, ld->prediction);
      data.tau--;
      uint32_t action = ld->prediction;
-      CB::cb_class l = {ec.loss, action, 1.f / data.k};
+      CB::cb_class l = {ec.loss, action, 1.f / data.k, 0};
      data.cb_label.costs.erase();
      data.cb_label.costs.push_back(l);
      ec.ld = &(data.cb_label);
@@ -74,6 +74,7 @@ namespace CBIFY {
      data.cb_label.costs.erase();
      ec.ld = &(data.cb_label);
      base.predict(ec);
+      ld->prediction = data.cb_label.prediction;
      ec.loss = loss(ld->label, ld->prediction);
    }
    ec.ld = ld;
@@ -87,7 +88,7 @@ namespace CBIFY {
    data.cb_label.costs.erase();
    base.predict(ec);
-    uint32_t action = ld->prediction;
+    uint32_t action = data.cb_label.prediction;
    float base_prob = data.epsilon / data.k;
    if (frand48() < 1. - data.epsilon)
@@ -101,17 +102,19 @@ namespace CBIFY {
      action = do_uniform(data);
      CB::cb_class l = {loss(ld->label, action), action, base_prob};
-      if (action == ld->prediction)
+      if (action == data.cb_label.prediction)
        l.probability = 1.f - data.epsilon + base_prob;
      data.cb_label.costs.push_back(l);
    }
+
+    cout << data.cb_label.costs[0].probability << endl;
    if (is_learn)
      base.learn(ec);
    ld->prediction = action;
-    ec.loss = loss(ld->label, ld->prediction);
    ec.ld = ld;
+    ec.loss = loss(ld->label, action);
  }
  template <bool is_learn>
@@ -130,9 +133,9 @@ namespace CBIFY {
    for (size_t i = 0; i < data.bags; i++)
      {
        base.predict(ec,i);
-        data.count[ld->prediction]++;
+        data.count[data.cb_label.prediction]++;
        if (i == bag)
-          action = ld->prediction;
+          action = data.cb_label.prediction;
      }
    assert(action != 0);
    if (is_learn)
@@ -299,7 +302,7 @@ namespace CBIFY {
    VW::finish_example(all, &ec);
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {//parse and set arguments
    cbify* data = (cbify*)calloc_or_die(1, sizeof(cbify));
@@ -307,39 +310,21 @@ namespace CBIFY {
    data->counter = 0;
    data->tau = 1000;
    data->all = &all;
-    po::options_description desc("CBIFY options");
-    desc.add_options()
+    po::options_description cb_opts("CBIFY options");
+    cb_opts.add_options()
      ("first", po::value<size_t>(), "tau-first exploration")
      ("epsilon",po::value<float>() ,"epsilon-greedy exploration")
      ("bag",po::value<size_t>() ,"bagging-based exploration")
      ("cover",po::value<size_t>() ,"bagging-based exploration");
-    po::parsed_options parsed = po::command_line_parser(opts).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    opts = po::collect_unrecognized(parsed.options, po::include_positional);
-    po::store(parsed, vm);
-    po::notify(vm);
+    vm = add_options(all, cb_opts);
-    po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc,all.options_from_file_argv).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    po::store(parsed_file, vm_file);
-    po::notify(vm_file);
+    data->k = (uint32_t)vm["cbify"].as<size_t>();
-    if( vm_file.count("cbify") ) {
-      data->k = (uint32_t)vm_file["cbify"].as<size_t>();
-      if( vm.count("cbify") && (uint32_t)vm["cbify"].as<size_t>() != data->k )
-        std::cerr << "warning: you specified a different number of actions through --cbify than the one loaded from predictor. Pursuing with loaded value of: " << data->k << endl;
-    }
-    else {
-      data->k = (uint32_t)vm["cbify"].as<size_t>();
-
-      //appends nb_actions to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --cbify " << data->k;
-      all.options_from_file.append(ss.str());
-    }
+    //appends nb_actions to options_from_file so it is saved to regressor later
+    std::stringstream ss;
+    ss << " --cbify " << data->k;
+    all.file_options.append(ss.str());
    all.p->lp = MULTICLASS::mc_label;
    learner* l;
diff --git a/vowpalwabbit/cbify.h b/vowpalwabbit/cbify.h
index eb3ad92a..c347dead 100644
--- a/vowpalwabbit/cbify.h
+++ b/vowpalwabbit/cbify.h
@@ -7,7 +7,7 @@ license as described in the file LICENSE.
#define CBIFY_H
namespace CBIFY {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
diff --git a/vowpalwabbit/cost_sensitive.cc b/vowpalwabbit/cost_sensitive.cc
index 3e8834d9..733d5dff 100644
--- a/vowpalwabbit/cost_sensitive.cc
+++ b/vowpalwabbit/cost_sensitive.cc
@@ -1,5 +1,7 @@
#include "float.h"
#include "cost_sensitive.h"
+#include "parse_example.h"
+#include "gd.h"
namespace COST_SENSITIVE {
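The cbify epsilon-greedy path records, for the chosen action, the probability under which it was drawn: every one of k actions gets epsilon/k of the mass, and the greedy action additionally gets the remaining 1 - epsilon. A minimal sketch of that probability computation (frand48 here is a parameter standing in for VW's RNG and is assumed to return values in [0, 1)):

```cpp
// Epsilon-greedy over actions 1..k, returning the chosen action and
// writing out the probability with which it was selected.
unsigned choose(unsigned greedy_action, unsigned k, float epsilon,
                float (*frand48)(), float& probability) {
  float base_prob = epsilon / k;
  unsigned action = greedy_action;
  if (frand48() < epsilon)                    // explore uniformly
    action = 1 + (unsigned)(frand48() * k);
  probability = (action == greedy_action)
                    ? 1.f - epsilon + base_prob  // greedy arm
                    : base_prob;                 // any other arm
  return action;
}
```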
diff --git a/vowpalwabbit/csoaa.cc b/vowpalwabbit/csoaa.cc
index b90cbc6e..d3f7f396 100644
--- a/vowpalwabbit/csoaa.cc
+++ b/vowpalwabbit/csoaa.cc
@@ -66,25 +66,19 @@ namespace CSOAA {
    VW::finish_example(all, &ec);
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    csoaa* c=(csoaa*)calloc_or_die(1,sizeof(csoaa));
    c->all = &all;
    //first parse for number of actions
    uint32_t nb_actions = 0;
-    if( vm_file.count("csoaa") ) { //if loaded options from regressor
-      nb_actions = (uint32_t)vm_file["csoaa"].as<size_t>();
-      if( vm.count("csoaa") && (uint32_t)vm["csoaa"].as<size_t>() != nb_actions ) //if csoaa was also specified in commandline, warn user if its different
-        std::cerr << "warning: you specified a different number of actions through --csoaa than the one loaded from predictor. Pursuing with loaded value of: " << nb_actions << endl;
-    }
-    else {
-      nb_actions = (uint32_t)vm["csoaa"].as<size_t>();
-      //append csoaa with nb_actions to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --csoaa " << nb_actions;
-      all.options_from_file.append(ss.str());
-    }
+    nb_actions = (uint32_t)vm["csoaa"].as<size_t>();
+
+    //append csoaa with nb_actions to file_options so it is saved to regressor later
+    std::stringstream ss;
+    ss << " --csoaa " << nb_actions;
+    all.file_options.append(ss.str());
    all.p->lp = cs_label;
    all.sd->k = nb_actions;
@@ -785,7 +779,7 @@ namespace LabelDict {
      }
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    ldf* ld = (ldf*)calloc_or_die(1, sizeof(ldf));
@@ -794,33 +788,17 @@ namespace LabelDict {
    ld->first_pass = true;
    string ldf_arg;
-    if(vm_file.count("csoaa_ldf")) {
-      ldf_arg = vm_file["csoaa_ldf"].as<string>();
-
-      if(vm.count("csoaa_ldf") && ldf_arg.compare(vm["csoaa_ldf"].as<string>()) != 0) {
-        ldf_arg = vm["csoaa_ldf"].as<string>();
-        //std::cerr << "warning: you specified a different ldf argument through --csoaa_ldf than the one loaded from regressor. Proceeding with value of: " << ldf_arg << endl;
-      }
-    }
-    else if( vm.count("csoaa_ldf") ){
+
+    if( vm.count("csoaa_ldf") ){
      ldf_arg = vm["csoaa_ldf"].as<string>();
-      all.options_from_file.append(" --csoaa_ldf ");
-      all.options_from_file.append(ldf_arg);
-    }
-    else if( vm_file.count("wap_ldf") ) {
-      ldf_arg = vm_file["wap_ldf"].as<string>();
-      ld->is_wap = true;
-
-      if(vm.count("wap_ldf") && ldf_arg.compare(vm["wap_ldf"].as<string>()) != 0) {
-        ldf_arg = vm["csoaa_ldf"].as<string>();
-        //std::cerr << "warning: you specified a different value for --wap_ldf than the one loaded from regressor. Proceeding with value of: " << ldf_arg << endl;
-      }
+      all.file_options.append(" --csoaa_ldf ");
+      all.file_options.append(ldf_arg);
    }
    else {
      ldf_arg = vm["wap_ldf"].as<string>();
      ld->is_wap = true;
-      all.options_from_file.append(" --wap_ldf ");
-      all.options_from_file.append(ldf_arg);
+      all.file_options.append(" --wap_ldf ");
+      all.file_options.append(ldf_arg);
    }
    all.p->lp = COST_SENSITIVE::cs_label;
diff --git a/vowpalwabbit/csoaa.h b/vowpalwabbit/csoaa.h
index 41b8933a..c334f20d 100644
--- a/vowpalwabbit/csoaa.h
+++ b/vowpalwabbit/csoaa.h
@@ -7,11 +7,11 @@ license as described in the file LICENSE.
#define CSOAA_H
namespace CSOAA {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
namespace CSOAA_AND_WAP_LDF {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
diff --git a/vowpalwabbit/ect.cc b/vowpalwabbit/ect.cc
index 6763a010..76a344a4 100644
--- a/vowpalwabbit/ect.cc
+++ b/vowpalwabbit/ect.cc
@@ -370,59 +370,30 @@ namespace ECT
    VW::finish_example(all, &ec);
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    ect* data = (ect*)calloc_or_die(1, sizeof(ect));
-    po::options_description desc("ECT options");
-    desc.add_options()
+    po::options_description ect_opts("ECT options");
+    ect_opts.add_options()
      ("error", po::value<size_t>(), "error in ECT");
-    po::parsed_options parsed = po::command_line_parser(opts).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    opts = po::collect_unrecognized(parsed.options, po::include_positional);
-    po::store(parsed, vm);
-    po::notify(vm);
-
-    po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc, all.options_from_file_argv).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    po::store(parsed_file, vm_file);
-    po::notify(vm_file);
+
+    vm = add_options(all, ect_opts);
    //first parse for number of actions
-    data->k = 0;
-    if( vm_file.count("ect") ) {
-      data->k = (int)vm_file["ect"].as<size_t>();
-      if( vm.count("ect") && vm["ect"].as<size_t>() != data->k )
-        std::cerr << "warning: you specified a different number of actions through --ect than the one loaded from predictor. Pursuing with loaded value of: " << data->k << endl;
-    }
-    else {
-      data->k = (int)vm["ect"].as<size_t>();
-
-      //append ect with nb_actions to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --ect " << data->k;
-      all.options_from_file.append(ss.str());
-    }
+    data->k = (int)vm["ect"].as<size_t>();
+
+    //append ect with nb_actions to options_from_file so it is saved to regressor later
+    stringstream ss;
+    ss << " --ect " << data->k;
-    if(vm_file.count("error")) {
-      data->errors = (uint32_t)vm_file["error"].as<size_t>();
-      if (vm.count("error") && (uint32_t)vm["error"].as<size_t>() != data->errors) {
-        cerr << "warning: specified value for --error different than the one loaded from predictor file. Pursuing with loaded value of: " << data->errors << endl;
-      }
-    }
-    else if (vm.count("error")) {
+    if (vm.count("error")) {
      data->errors = (uint32_t)vm["error"].as<size_t>();
-
-      //append error flag to options_from_file so it is saved in regressor file later
-      stringstream ss;
-      ss << " --error " << data->errors;
-      all.options_from_file.append(ss.str());
-    } else {
+    } else
      data->errors = 0;
-    }
-
+    //append error flag to options_from_file so it is saved in regressor file later
+    ss << " --error " << data->errors;
+    all.file_options.append(ss.str());
+
    all.p->lp = MULTICLASS::mc_label;
    size_t wpp = create_circuit(all, *data, data->k, data->errors+1);
    data->all = &all;
diff --git a/vowpalwabbit/ect.h b/vowpalwabbit/ect.h
index 4d3a47fc..c26b4115 100644
--- a/vowpalwabbit/ect.h
+++ b/vowpalwabbit/ect.h
@@ -8,7 +8,7 @@ license as described in the file LICENSE.
namespace ECT
{
-  LEARNER::learner* setup(vw&, std::vector<std::string>&, po::variables_map&, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw&, po::variables_map&);
}
#endif
diff --git a/vowpalwabbit/example.cc b/vowpalwabbit/example.cc
index df344eb5..54e1b4f0 100644
--- a/vowpalwabbit/example.cc
+++ b/vowpalwabbit/example.cc
@@ -49,18 +49,83 @@ void vec_store(features_and_source& p, float fx, float& fw) {
  p.feature_map.push_back(f);
}
+audit_data copy_audit_data(audit_data &src) {
+  audit_data dst;
+  dst.space = (char*)calloc_or_die(strlen(src.space)+1, sizeof(char));
+  strcpy(dst.space, src.space);
+  dst.feature = (char*)calloc_or_die(strlen(src.feature)+1, sizeof(char));
+  strcpy(dst.feature, src.feature);
+  dst.weight_index = src.weight_index;
+  dst.x = src.x;
+  dst.alloced = src.alloced;
+  return dst;
+}
+
namespace VW {
+void copy_example_label(example* dst, example* src, size_t label_size, void(*copy_label)(void*&,void*)) {
+  if (!src->ld) {
+    if (dst->ld) free(dst->ld);  // TODO: this should be a delete_label, really
+    dst->ld = NULL;
+  } else {
+    if ((label_size == 0) && (copy_label == NULL)) {
+      if (dst->ld) free(dst->ld);  // TODO: this should be a delete_label, really
+      dst->ld = NULL;
+    } else if (copy_label) {
+      copy_label(dst->ld, src->ld);
+    } else {
+      //dst->ld = (void*)malloc(label_size);
+      memcpy(dst->ld, src->ld, label_size);
+    }
+  }
+}
+
+void copy_example_data(bool audit, example* dst, example* src)
+{
+  //std::cerr << "copy_example_data dst = " << dst << std::endl;
+  copy_array(dst->tag, src->tag);
+  dst->example_counter = src->example_counter;
+
+  copy_array(dst->indices, src->indices);
+  for (size_t i=0; i<256; i++)
+    copy_array(dst->atomics[i], src->atomics[i]);
+  dst->ft_offset = src->ft_offset;
+
+  if (audit)
+    for (size_t i=0; i<256; i++)
+      copy_array(dst->audit_features[i], src->audit_features[i], copy_audit_data);
+
+  dst->num_features = src->num_features;
+  dst->partial_prediction = src->partial_prediction;
+  copy_array(dst->topic_predictions, src->topic_predictions);
+  dst->loss = src->loss;
+  dst->eta_round = src->eta_round;
+  dst->eta_global = src->eta_global;
+  dst->example_t = src->example_t;
+  memcpy(dst->sum_feat_sq, src->sum_feat_sq, 256 * sizeof(float));
+  dst->total_sum_feat_sq = src->total_sum_feat_sq;
+  dst->revert_weight = src->revert_weight;
+  dst->test_only = src->test_only;
+  dst->end_pass = src->end_pass;
+  dst->sorted = src->sorted;
+  dst->in_use = src->in_use;}
+
+void copy_example_data(bool audit, example* dst, example* src, size_t label_size, void(*copy_label)(void*&,void*)) {
+  copy_example_data(audit, dst, src);
+  copy_example_label(dst, src, label_size, copy_label);
+}
flat_example* flatten_example(vw& all, example *ec)
{
  flat_example* fec = (flat_example*) calloc_or_die(1,sizeof(flat_example));
-  fec->ld = ec->ld;
+  fec->ld = (label_data*)calloc_or_die(1, sizeof(label_data));
+  memcpy(fec->ld, ec->ld, sizeof(label_data));
  fec->tag_len = ec->tag.size();
  if (fec->tag_len >0)
-    {
-      fec->tag = ec->tag.begin;
-    }
+    {
+      fec->tag = (char*)calloc_or_die(fec->tag_len+1, sizeof(char));
+      memcpy(fec->tag,ec->tag.begin, fec->tag_len);
+    }
  fec->example_counter = ec->example_counter;
  fec->ft_offset = ec->ft_offset;
@@ -82,9 +147,16 @@ flat_example* flatten_example(vw& all, example *ec)
}
void free_flatten_example(flat_example* fec)
-{
-  if (fec)
-    free(fec);
+{ //note: The label memory should be freed by by freeing the original example.
+  if (fec)
+    {
+      if (fec->feature_map_len > 0)
+        free(fec->feature_map);
+      if (fec->tag_len > 0)
+        free(fec->tag);
+      free(fec->ld);
+      free(fec);
+    }
}
}
@@ -136,68 +208,3 @@ void dealloc_example(void(*delete_label)(void*), example&ec)
  ec.indices.delete_v();
}
-audit_data copy_audit_data(audit_data &src) {
-  audit_data dst;
-  dst.space = (char*)calloc_or_die(strlen(src.space)+1, sizeof(char));
-  strcpy(dst.space, src.space);
-  dst.feature = (char*)calloc_or_die(strlen(src.feature)+1, sizeof(char));
-  strcpy(dst.feature, src.feature);
-  dst.weight_index = src.weight_index;
-  dst.x = src.x;
-  dst.alloced = src.alloced;
-  return dst;
-}
-
-namespace VW {
-void copy_example_label(example*dst, example*src, size_t label_size, void(*copy_label)(void*&,void*)) {
-  if (!src->ld) {
-    if (dst->ld) free(dst->ld);  // TODO: this should be a delete_label, really
-    dst->ld = NULL;
-  } else {
-    if ((label_size == 0) && (copy_label == NULL)) {
-      if (dst->ld) free(dst->ld);  // TODO: this should be a delete_label, really
-      dst->ld = NULL;
-    } else if (copy_label) {
-      copy_label(dst->ld, src->ld);
-    } else {
-      //dst->ld = (void*)malloc(label_size);
-      memcpy(dst->ld, src->ld, label_size);
-    }
-  }
-}
-
-void copy_example_data(bool audit, example* dst, example* src)
-{
-  //std::cerr << "copy_example_data dst = " << dst << std::endl;
-  copy_array(dst->tag, src->tag);
-  dst->example_counter = src->example_counter;
-
-  copy_array(dst->indices, src->indices);
-  for (size_t i=0; i<256; i++)
-    copy_array(dst->atomics[i], src->atomics[i]);
-  dst->ft_offset = src->ft_offset;
-
-  if (audit)
-    for (size_t i=0; i<256; i++)
-      copy_array(dst->audit_features[i], src->audit_features[i], copy_audit_data);
-
-  dst->num_features = src->num_features;
-  dst->partial_prediction = src->partial_prediction;
-  copy_array(dst->topic_predictions, src->topic_predictions);
-  dst->loss = src->loss;
-  dst->eta_round = src->eta_round;
-  dst->eta_global = src->eta_global;
-  dst->example_t = src->example_t;
-  memcpy(dst->sum_feat_sq, src->sum_feat_sq, 256 * sizeof(float));
-  dst->total_sum_feat_sq = src->total_sum_feat_sq;
-  dst->revert_weight = src->revert_weight;
-  dst->test_only = src->test_only;
-  dst->end_pass = src->end_pass;
-  dst->sorted = src->sorted;
-  dst->in_use = src->in_use;}
-
-void copy_example_data(bool audit, example* dst, example* src, size_t label_size, void(*copy_label)(void*&,void*)) {
-  copy_example_data(audit, dst, src);
-  copy_example_label(dst, src, label_size, copy_label);
-}
-}
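Beyond moving the copy helpers up the file, the example.cc change makes flatten_example deep-copy the label and tag instead of aliasing the source example's buffers, with free_flatten_example releasing exactly what was allocated. The ownership pairing, as a self-contained sketch with simplified types:

```cpp
#include <cstdlib>
#include <cstring>

// Whatever flatten allocates (label copy, tag copy), the matching free
// releases, so the flat example never aliases the source's buffers.
struct flat { char* tag; size_t tag_len; void* ld; };

flat* flatten(const char* tag, size_t tag_len, const void* ld, size_t ld_size) {
  flat* f = (flat*)calloc(1, sizeof(flat));
  f->ld = malloc(ld_size);
  memcpy(f->ld, ld, ld_size);            // deep-copy the label
  f->tag_len = tag_len;
  if (tag_len > 0) {
    f->tag = (char*)calloc(tag_len + 1, 1);
    memcpy(f->tag, tag, tag_len);        // deep-copy the tag
  }
  return f;
}

void free_flat(flat* f) {
  if (!f) return;
  if (f->tag_len > 0) free(f->tag);      // free exactly what was allocated
  free(f->ld);
  free(f);
}
```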
diff --git a/vowpalwabbit/gd.cc b/vowpalwabbit/gd.cc
index 9cb788d2..98d3d71f 100644
--- a/vowpalwabbit/gd.cc
+++ b/vowpalwabbit/gd.cc
@@ -19,7 +19,6 @@ license as described in the file LICENSE.
#include <xmmintrin.h>
#endif
-#include "sparse_dense.h"
#include "gd.h"
#include "simple_label.h"
#include "accumulate.h"
@@ -32,7 +31,6 @@ using namespace LEARNER;
namespace GD
{
  struct gd{
-    size_t current_pass;
    bool active;
    bool active_simulation;
    float normalized_sum_norm_x;
@@ -45,31 +43,17 @@ namespace GD
  };
  void sync_weights(vw& all);
+
+  struct power_data {
+    float minus_power_t;
+    float minus_power_t_norm;
+  };
  struct train_data {
-    float avg_norm;
    float update;
-    float power_t;
+    power_data pt;
  };
-  template <void (*T)(train_data&, float, float&)>
-  void generic_train(vw& all, example& ec, float update, bool sqrt_norm)
-  {
-    if (fabs(update) == 0.)
-      return;
-
-    float total_weight = ec.example_t;
-
-    if(!all.holdout_set_off)
-      total_weight -= (float)all.sd->weighted_holdout_examples; //exclude weights from test_only examples
-
-    float avg_norm = all.normalized_sum_norm_x / total_weight;
-    if (sqrt_norm) avg_norm = sqrt(avg_norm);
-
-    train_data d = {avg_norm, update, all.power_t};
-
-    foreach_feature<train_data,T>(all, ec, d);
-  }
float InvSqrt(float x){
  float xhalf = 0.5f * x;
@@ -80,45 +64,82 @@ float InvSqrt(float x){
  return x;
}
-  template<bool adaptive, bool normalized_updates, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
-  inline void general_update(train_data& s, float x, float& fw)
+  template<bool sqrt_rate, size_t adaptive, size_t normalized>
+  inline float compute_rate_decay(power_data& s, float& fw)
  {
    weight* w = &fw;
-    if(feature_mask_off || w[feature_mask_idx]==1.){
-      float t = 1.f;
-      if(adaptive) t = powf(w[1],-s.power_t);
-      if(normalized_updates) {
-        float norm = w[normalized_idx] * s.avg_norm;
-        float power_t_norm = 1.f - (adaptive ? s.power_t : 0.f);
-        t *= powf(norm*norm,-power_t_norm);
+    float rate_decay = 1.f;
+    if(adaptive) {
+      if (sqrt_rate)
+        {
+#if defined(__SSE2__) && !defined(VW_LDA_NO_SSE)
+          __m128 eta = _mm_load_ss(&w[adaptive]);
+          eta = _mm_rsqrt_ss(eta);
+          _mm_store_ss(&rate_decay, eta);
+#else
+          rate_decay = InvSqrt(w[adaptive]);
+#endif
+        }
+      else
+        rate_decay = powf(w[adaptive],s.minus_power_t);
+    }
+    if(normalized) {
+      if (sqrt_rate)
+        {
+          float inv_norm = 1.f / w[normalized];
+          if (adaptive)
+            rate_decay *= inv_norm;
+          else
+            rate_decay *= inv_norm*inv_norm;
+        }
+      else{
+        float norm = w[normalized];
+        rate_decay *= powf(norm*norm,s.minus_power_t_norm);
      }
-      w[0] += s.update * x * t;
    }
+    return rate_decay;
  }
-  template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
-  inline void specialized_update(train_data& s, float x, float& fw)
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
+  inline void update_feature(train_data& s, float x, float& fw)
  {
-    weight* w = &fw;
-    if(feature_mask_off || w[feature_mask_idx]==1.){
-      float t = 1.f;
-      float inv_norm = 1.f;
-      if(normalized) inv_norm /= (w[normalized_idx] * s.avg_norm);
-      if(adaptive) {
-#if defined(__SSE2__) && !defined(VW_LDA_NO_SSE)
-        __m128 eta = _mm_load_ss(&w[1]);
-        eta = _mm_rsqrt_ss(eta);
-        _mm_store_ss(&t, eta);
-        t *= inv_norm;
-#else
-        t = InvSqrt(w[1]) * inv_norm;
-#endif
-      } else {
-        t *= inv_norm*inv_norm; //if only using normalized updates but not adaptive, need to divide by feature norm squared
+    weight* w = &fw;
+    if(feature_mask_off || fw != 0.){
+      float rate_decay = compute_rate_decay<sqrt_rate, adaptive, normalized>(s.pt, fw);
+      w[0] += s.update * rate_decay * x;
    }
-      w[0] += s.update * x * t;
  }
-}
+
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
+  void train(vw& all, example& ec, float update)
+  {
+    if (fabsf(update) == 0.f)
+      return;
+
+    float total_weight = ec.example_t;
+
+    if(!all.holdout_set_off)
+      total_weight -= (float)all.sd->weighted_holdout_examples; //exclude weights from test_only examples
+
+    float avg_norm = all.normalized_sum_norm_x / total_weight;
+    if (sqrt_rate) avg_norm = sqrt(avg_norm);
+
+    float minus_power_t_norm = (adaptive ? all.power_t : 0.f) -1.f;
+
+    if (normalized) {
+      if (sqrt_rate)
+        if (adaptive)
+          update /= avg_norm;
+        else
+          update /= (avg_norm * avg_norm);
+      else
+        update *= powf(avg_norm * avg_norm, minus_power_t_norm);
+    }
+
+    train_data d = {update, {-all.power_t, minus_power_t_norm}};
+
+    foreach_feature<train_data,update_feature<sqrt_rate, feature_mask_off, adaptive, normalized> >(all, ec, d);
+  }
void end_pass(gd& g)
{
@@ -155,6 +176,12 @@ struct string_value {
  friend bool operator<(const string_value& first, const string_value& second);
};
+  inline float sign(float w){ if (w < 0.) return -1.; else return 1.;}
+
+  inline float trunc_weight(const float w, const float gravity){
+    return (gravity < fabsf(w)) ? w - sign(w) * gravity : 0.f;
+  }
+
bool operator<(const string_value& first, const string_value& second)
{
  return fabs(first.v) > fabs(second.v);
@@ -348,82 +375,35 @@ float finalize_prediction(vw& all, float ret)
  return ret;
}
-template<bool normalized_training, bool reg_mode_odd, bool power_t_half>
+  struct trunc_data {
+    float prediction;
+    float gravity;
+  };
+
+  inline void vec_add_trunc(trunc_data& p, const float fx, float& fw) {
+    p.prediction += trunc_weight(fw, p.gravity) * fx;
+  }
+
+  inline float trunc_predict(vw& all, example& ec, float gravity)
+  {
+    label_data* ld = (label_data*)ec.ld;
+    trunc_data temp = {ld->initial, gravity};
+    foreach_feature<trunc_data, vec_add_trunc>(all, ec, temp);
+    return temp.prediction;
+  }
+
+template<bool reg_mode_odd>
void predict(gd& g, learner& base, example& ec)
{
  vw& all = *g.all;
-  if (normalized_training) {
-    if(power_t_half) {
-      if (reg_mode_odd)
-        {
-          float gravity = (float)all.sd->gravity;
-          if (all.adaptive)
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<float, vec_add_trunc_rescale<true, 1> >(all, ec, gravity);
-            else
-              ec.partial_prediction = inline_predict<float, vec_add_trunc_rescale<true, 2> >(all, ec, gravity);
-          else
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<float, vec_add_trunc_rescale<false, 1> >(all, ec, gravity);
-            else
-              ec.partial_prediction = inline_predict<float, vec_add_trunc_rescale<false, 2> >(all, ec, gravity);
-        }
-      else
-        {
-          if (all.adaptive)
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<vec_add_rescale<true, 1> >(all, ec);
-            else
-              ec.partial_prediction = inline_predict<vec_add_rescale<true, 2> >(all, ec);
-          else
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<vec_add_rescale<false, 1> >(all, ec);
-            else
-              ec.partial_prediction = inline_predict<vec_add_rescale<false, 2> >(all, ec);
-        }
-    }
-    else {
-      if (reg_mode_odd)
-        {
-          gnp temp = {(float)all.sd->gravity, all.power_t};
-          if (all.adaptive)
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<gnp, vec_add_trunc_rescale_general<true, 1> >(all, ec, temp);
-            else
-              ec.partial_prediction = inline_predict<gnp, vec_add_trunc_rescale_general<true, 2> >(all, ec, temp);
-          else
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<gnp, vec_add_trunc_rescale_general<false, 1> >(all, ec, temp);
-            else
-              ec.partial_prediction = inline_predict<gnp, vec_add_trunc_rescale_general<false, 2> >(all, ec, temp);
-        }
-      else
-        {
-          float power_t = all.power_t;
-          if (all.adaptive)
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<float, vec_add_rescale_general<true, 1> >(all, ec, power_t);
-            else
-              ec.partial_prediction = inline_predict<float, vec_add_rescale_general<true, 2> >(all, ec, power_t);
-          else
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<float, vec_add_rescale_general<false, 1> >(all, ec, power_t);
-            else
-              ec.partial_prediction = inline_predict<float, vec_add_rescale_general<false, 2> >(all, ec, power_t);
-        }
+  if (reg_mode_odd)
+    {
+      float gravity = (float)all.sd->gravity;
+      ec.partial_prediction = trunc_predict(all, ec, gravity);
    }
-  }
-  else {
-    // no rescaling
-    if (reg_mode_odd)
-      {
-        float gravity = (float)all.sd->gravity;
-        ec.partial_prediction = inline_predict<float, vec_add_trunc>(all, ec, gravity);
-      }
-    else
-      ec.partial_prediction = inline_predict<vec_add>(all, ec);
-  }
+  else
+    ec.partial_prediction = inline_predict(all, ec);
  label_data& ld = *(label_data*)ec.ld;
  ld.prediction = finalize_prediction(all, ec.partial_prediction * (float)all.sd->contraction);
@@ -434,97 +414,70 @@ void predict(gd& g, learner& base, example& ec)
struct norm_data {
  float g;
-  float norm;
+  float pred_per_update;
  float norm_x;
-  float power_t;
+  power_data pd;
};
-template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
-inline void simple_norm_compute(norm_data& nd, float x, float& fw)
-{
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
+inline void pred_per_update_feature(norm_data& nd, float x, float& fw) {
  weight* w = &fw;
-  if(feature_mask_off || w[feature_mask_idx]==1.){
+  if(feature_mask_off || fw != 0.){
    float x2 = x * x;
-    float t = 1.f;
-    float inv_norm = 1.f;
-    float inv_norm2 = 1.f;
+    if(adaptive)
+      w[adaptive] += nd.g * x2;
    if(normalized) {
-      inv_norm /= w[normalized_idx];
-      inv_norm2 = inv_norm*inv_norm;
-      nd.norm_x += x2 * inv_norm2;
-    }
-    if(adaptive){
-      w[1] += nd.g * x2;
-
-#if defined(__SSE2__) && !defined(VW_LDA_NO_SSE)
-      __m128 eta = _mm_load_ss(&w[1]);
-      eta = _mm_rsqrt_ss(eta);
-      _mm_store_ss(&t, eta);
-      t *= inv_norm;
-#else
-      t = InvSqrt(w[1]) * inv_norm;
-#endif
-    } else {
-      t *= inv_norm2; //if only using normalized but not adaptive, we're dividing update by feature norm squared
+      float x_abs = fabsf(x);
+      if( x_abs > w[normalized] ) {// new scale discovered
+        if( w[normalized] > 0. ) {//If the normalizer is > 0 then rescale the weight so it's as if the new scale was the old scale.
+          float rescale = (w[normalized]/x_abs);
+          if (sqrt_rate)
+            w[0] *= (adaptive ? rescale : rescale*rescale);
+          else
+            w[0] *= powf(rescale*rescale, -nd.pd.minus_power_t_norm);
+        }
+        w[normalized] = x_abs;
+      }
+      nd.norm_x += x2 / (w[normalized] * w[normalized]);
    }
-    nd.norm += x2 * t;
-  }
-}
-
-template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
-inline void powert_norm_compute(norm_data& nd, float x, float& fw) {
-  weight* w = &fw;
-  if(feature_mask_off || w[feature_mask_idx]==1.){
-    float power_t_norm = 1.f - (adaptive ? nd.power_t : 0.f);
-
-    float x2 = x * x;
-    float t = 1.f;
-    if(adaptive){
-      w[1] += nd.g * x2;
-      t = powf(w[1], -nd.power_t);
-    }
-    if(normalized) {
-      float range2 = w[normalized_idx] * w[normalized_idx];
-      t *= powf(range2, -power_t_norm);
-      nd.norm_x += x2 / range2;
-    }
-    nd.norm += x2 * t;
+    float rate_decay = compute_rate_decay<sqrt_rate, adaptive, normalized>(nd.pd, fw);
+    nd.pred_per_update += x2 * rate_decay;
  }
}
-  template <void (*T)(norm_data&,float,float&)>
-float compute_norm(vw& all, example& ec)
-{//We must traverse the features in _precisely_ the same order as during training.
-  label_data* ld = (label_data*)ec.ld;
-  float g = all.loss->getSquareGrad(ld->prediction, ld->label) * ld->weight;
-  if (g==0) return 1.;
-
-  norm_data nd = {g, 0., 0., all.power_t};
-
-  foreach_feature<norm_data,T>(all, ec, nd);
-
-  if(all.normalized_updates) {
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
+  float get_pred_per_update(vw& all, example& ec)
+  {//We must traverse the features in _precisely_ the same order as during training.
+    label_data* ld = (label_data*)ec.ld;
+    float g = all.loss->getSquareGrad(ld->prediction, ld->label) * ld->weight;
+    if (g==0) return 1.;
+
+    float minus_power_t_norm = (adaptive ? all.power_t : 0.f) - 1.f;
+    norm_data nd = {g, 0., 0., {-all.power_t, minus_power_t_norm}};
+
+    foreach_feature<norm_data,pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized> >(all, ec, nd);
+
+    if(normalized) {
    float total_weight = ec.example_t;
-
+
    if(!all.holdout_set_off)
      total_weight -= (float)all.sd->weighted_holdout_examples; //exclude weights from test_only examples
    all.normalized_sum_norm_x += ld->weight * nd.norm_x;
-
-    float avg_sq_norm = all.normalized_sum_norm_x / total_weight;
-    if(all.power_t == 0.5) {
-      if(all.adaptive) nd.norm /= sqrt(avg_sq_norm);
-      else nd.norm /= avg_sq_norm;
-    } else {
-      float power_t_norm = 1.f - (all.adaptive ? all.power_t : 0.f);
-      nd.norm *= powf(avg_sq_norm,-power_t_norm);
-    }
+
+    float avg_norm = all.normalized_sum_norm_x / total_weight;
+    if(sqrt_rate) {
+      if(adaptive) nd.pred_per_update /= sqrt(avg_norm);
+      else nd.pred_per_update /= avg_norm;
+    } else
+      nd.pred_per_update *= powf(avg_norm,minus_power_t_norm);
  }
-  return nd.norm;
+  return nd.pred_per_update;
}
-template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
void compute_update(vw& all, gd& g, example& ec)
{
  label_data* ld = (label_data*)ec.ld;
@@ -556,34 +509,28 @@ void compute_update(vw& all, gd& g, example& ec)
{
  if (all.training && ec.loss > 0.)
    {
-      float eta_t;
-      float norm;
+      float pred_per_update;
      if(adaptive || normalized)
-        if(all.power_t == 0.5)
-          norm = compute_norm<simple_norm_compute<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx> >(all,ec);
-        else
-          norm = compute_norm<powert_norm_compute<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx> >(all,ec);
+        pred_per_update = get_pred_per_update<sqrt_rate, feature_mask_off, adaptive, normalized>(all,ec);
      else
-        norm = ec.total_sum_feat_sq;
+        pred_per_update = ec.total_sum_feat_sq;
-      eta_t = all.eta * norm * ld->weight;
-      if(!adaptive && all.power_t != 0) eta_t *= powf(t,-all.power_t);
+      float delta_pred = pred_per_update * all.eta * ld->weight;
+      if(!adaptive && all.power_t != 0) delta_pred *= powf(t,-all.power_t);
      float update = 0.f;
      if( all.invariant_updates )
-        update = all.loss->getUpdate(ld->prediction, ld->label, eta_t, norm);
+        update = all.loss->getUpdate(ld->prediction, ld->label, delta_pred, pred_per_update);
      else
-        update = all.loss->getUnsafeUpdate(ld->prediction, ld->label, eta_t, norm);
+        update = all.loss->getUnsafeUpdate(ld->prediction, ld->label, delta_pred, pred_per_update);
      ec.eta_round = (float) (update / all.sd->contraction);
-
      if (all.reg_mode && fabs(ec.eta_round) > 1e-8) {
        double dev1 = all.loss->first_derivative(all.sd, ld->prediction, ld->label);
        double eta_bar = (fabs(dev1) > 1e-8) ? (-ec.eta_round / dev1) : 0.0;
        if (fabs(dev1) > 1e-8)
-          all.sd->contraction *= (1. - all.l2_lambda * eta_bar * norm);
-          //all.sd->contraction /= (1. + all.l2_lambda * eta_bar * norm);
-        all.sd->gravity += eta_bar * sqrt(norm) * all.l1_lambda;
+          all.sd->contraction *= (1. - all.l2_lambda * eta_bar);
+        all.sd->gravity += eta_bar * all.l1_lambda;
      }
    }
}
@@ -592,26 +539,23 @@ void compute_update(vw& all, gd& g, example& ec)
}
-  template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
void update(gd& g, learner& base, example& ec)
{
  vw* all = g.all;
-  compute_update<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx > (*all, g, ec);
+  compute_update<sqrt_rate, feature_mask_off, adaptive, normalized> (*all, g, ec);
  if (ec.eta_round != 0.)
    {
-      if(all->power_t == 0.5)
-        generic_train<specialized_update<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx> > (*all,ec,(float)ec.eta_round,true);
-      else
-        generic_train<general_update<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx> >(*all,ec,(float)ec.eta_round,false);
+      train<sqrt_rate, feature_mask_off, adaptive, normalized>(*all,ec,(float)ec.eta_round);
      if (all->sd->contraction < 1e-10)  // updating weights now to avoid numerical instability
        sync_weights(*all);
    }
}
-template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
+template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
void learn(gd& g, learner& base, example& ec)
{
  vw* all = g.all;
@@ -622,7 +566,7 @@ void learn(gd& g, learner& base, example& ec)
  g.predict(g,base,ec);
  if ((all->holdout_set_off || !ec.test_only) && ld->weight > 0)
-    update<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx>(g,base,ec);
+    update<sqrt_rate, feature_mask_off, adaptive, normalized>(g,base,ec);
  else if(ld->weight > 0)
    ec.loss = all->loss->getLoss(all->sd, ld->prediction, ld->label) * ld->weight;
}
@@ -883,6 +827,51 @@ void save_load(gd& g, io_buf& model_file, bool read, bool text)
    }
}
+template<bool sqrt_rate, size_t adaptive, size_t normalized, size_t next>
+size_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+{
+  all.normalized_idx = normalized;
+  if (feature_mask_off)
+    {
+      ret->set_learn<gd, learn<sqrt_rate, true, adaptive,normalized> >();
+      ret->set_update<gd, update<sqrt_rate, true, adaptive,normalized> >();
+      return next;
+    }
+  else
+    {
+      ret->set_learn<gd, learn<sqrt_rate, false, adaptive,normalized> >();
+      ret->set_update<gd, update<sqrt_rate, false, adaptive,normalized> >();
+      return next;
+    }
+}
+
+template<bool sqrt_rate, size_t adaptive>
+size_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+{
+  // select the appropriate learn function based on adaptive, normalization, and feature mask
+  if (all.normalized_updates)
+    return set_learn<sqrt_rate, adaptive, adaptive+1, adaptive+2>(all, ret, feature_mask_off);
+  else
+    return set_learn<sqrt_rate, adaptive, 0, adaptive+1>(all, ret, feature_mask_off);
+}
+
+template<bool sqrt_rate>
+size_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+{
+  if (all.adaptive)
+    return set_learn<sqrt_rate, 1>(all, ret, feature_mask_off);
+  else
+    return set_learn<sqrt_rate, 0>(all, ret, feature_mask_off);
+}
+
+size_t ceil_log_2(size_t v)
+{
+  if (v==0)
+    return 0;
+  else
+    return 1 + ceil_log_2(v >> 1);
+}
+
learner* setup(vw& all, po::variables_map& vm)
{
  gd* g = (gd*)calloc_or_die(1, sizeof(gd));
@@ -908,146 +897,52 @@ learner* setup(vw& all, po::variables_map& vm)
    g->initial_constant = vm["constant"].as<float>();
  }
-  all.reg.stride_shift = 2; //use stride of 4 for default invariant normalized adaptive updates
-
  //if the user specified anything in sgd,adaptive,invariant,normalized, we turn off default update rules and use whatever user specified
  if( !all.training || ( ( vm.count("sgd") || vm.count("adaptive") || vm.count("invariant") || vm.count("normalized") ) && !vm.count("exact_adaptive_norm")) )
-  {
-    all.adaptive = all.training && vm.count("adaptive");
-    all.invariant_updates = all.training && vm.count("invariant");
-    all.normalized_updates = all.training && vm.count("normalized");
-
-    all.reg.stride_shift = 0;
-
-    if( all.adaptive ) all.reg.stride_shift += 1;
-    else all.normalized_idx = 1; //store per feature norm at 
1 index offset from weight value instead of 2 - - if( all.normalized_updates ) all.reg.stride_shift += 1; - - if(!vm.count("learning_rate") && !vm.count("l") && !(all.adaptive && all.normalized_updates)) - if (all.lda == 0) - all.eta = 10; //default learning rate to 10 for non default update rule - - //if not using normalized or adaptive, default initial_t to 1 instead of 0 - if(!all.adaptive && !all.normalized_updates){ - if (!vm.count("initial_t")) { - all.sd->t = 1.f; - all.sd->weighted_unlabeled_examples = 1.f; - all.initial_t = 1.f; - } - all.eta *= powf((float)(all.sd->t), all.power_t); - } - if (vm.count("feature_mask")){ - if(all.reg.stride_shift == 0){ - all.reg.stride_shift += 1;//if --sgd, stride->2 and use the second position as mask - all.feature_mask_idx = 1; - } - else if(all.reg.stride_shift == 1){ - all.reg.stride_shift += 1;//if either normalized or adaptive, stride->4, mask_idx is still 3 + {//nondefault + all.adaptive = all.training && vm.count("adaptive"); + all.invariant_updates = all.training && vm.count("invariant"); + all.normalized_updates = all.training && vm.count("normalized"); + + if(!vm.count("learning_rate") && !vm.count("l") && !(all.adaptive && all.normalized_updates)) + all.eta = 10; //default learning rate to 10 for non default update rule + + //if not using normalized or adaptive, default initial_t to 1 instead of 0 + if(!all.adaptive && !all.normalized_updates){ + if (!vm.count("initial_t")) { + all.sd->t = 1.f; + all.sd->weighted_unlabeled_examples = 1.f; + all.initial_t = 1.f; + } + all.eta *= powf((float)(all.sd->t), all.power_t); } } - } - + if (pow((double)all.eta_decay_rate, (double)all.numpasses) < 0.0001 ) cerr << "Warning: the learning rate for the last pass is multiplied by: " << pow((double)all.eta_decay_rate, (double)all.numpasses) << " adjust --decay_learning_rate larger to avoid this." 
<< endl;
- learner* ret = new learner(g, 1 << all.reg.stride_shift);
+ learner* ret = new learner(g, 1);
- // select the appropriate predict function based on normalization, regularization, and power_t
- if (all.normalized_updates && all.training)
- if (all.reg_mode % 2)
- if (all.power_t == 0.5)
- {
- g->predict = predict<true, true, true>;
- ret->set_predict<gd, predict<true, true, true> >();
- }
- else
- {
- g->predict = predict<true, true, false>;
- ret->set_predict<gd, predict<true, true, false> >();
- }
- else
- if (all.power_t == 0.5)
- {
- g->predict = predict<true, false, true>;
- ret->set_predict<gd, predict<true, false, true> >();
- }
- else
- {
- g->predict = predict<true, false, false>;
- ret->set_predict<gd, predict<true, false, false> >();
- }
+ if (all.reg_mode % 2)
+ {
+ ret->set_predict<gd, predict<true> >();
+ g->predict = predict<true>;
+ }
else
- if (all.reg_mode % 2)
- if (all.power_t == 0.5)
- {
- g->predict = predict<false, true, true>;
- ret->set_predict<gd, predict<false, true, true> >();
- }
- else
- {
- g->predict = predict<false, true, false>;
- ret->set_predict<gd, predict<false, true, false> >();
- }
- else
- if (all.power_t == 0.5)
- {
- g->predict = predict<false, false, true>;
- ret->set_predict<gd, predict<false, false, true> >();
- }
- else
- {
- g->predict = predict<false, false, true>;
- ret->set_predict<gd, predict<false, false, true> >();
- }
-
- // select the appropriate learn function based on adaptive, normalization, and feature mask
- if (all.adaptive)
- if (all.normalized_updates)
- if (feature_mask_off)
- {
- ret->set_learn<gd, learn<true,true,true, 2, 0> >();
- ret->set_update<gd, update<true,true,true, 2, 0> >();
- }
- else
- {
- ret->set_learn<gd, learn<true,true,false, 2, 3> >();
- ret->set_update<gd, update<true,true,false, 2, 3> >();
- }
- else
- if (feature_mask_off)
- {
- ret->set_learn<gd, learn<true,false,true, 0, 0> >();
- ret->set_update<gd, update<true,false,true, 0, 0> >();
- }
- else
- {
- ret->set_learn<gd, learn<true,false,false, 0, 2> >();
- ret->set_update<gd, update<true,false,false, 0, 2> >();
- }
+ {
+ ret->set_predict<gd, predict<false> >();
+ g->predict = predict<false>;
+ }
+
+ size_t stride;
+ if (all.power_t == 0.5)
+ stride = set_learn<true>(all, ret, feature_mask_off);
else
- if (all.normalized_updates)
- if (feature_mask_off)
- {
- ret->set_learn<gd, learn<false,true,true, 1, 0> >();
- ret->set_update<gd, update<false,true,true, 1, 0> >();
- }
- else
- {
- ret->set_learn<gd, learn<false,true,false, 1, 2> >();
- ret->set_update<gd, update<false,true,false, 1, 2> >();
- }
- else
- if (feature_mask_off)
- {
- ret->set_learn<gd, learn<false,false,true, 0, 0> >();
- ret->set_update<gd, update<false, false, true, 0, 0> >();
- }
- else
- {
- ret->set_learn<gd, learn<false,false,false, 0, 1> >();
- ret->set_update<gd, update<false, false, false, 0, 1> >();
- }
+ stride = set_learn<false>(all, ret, feature_mask_off);
+
+ all.reg.stride_shift = ceil_log_2(stride-1);
+ ret->increment = (1 << all.reg.stride_shift);
+
ret->set_save_load<gd,save_load>();
ret->set_end_pass<gd, end_pass>();
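(Editorial note, not part of the commit: a quick worked example of the stride that set_learn() and ceil_log_2() produce. With --adaptive --normalized and the default power_t = 0.5, set_learn<true> expands to set_learn<true, 1> and then set_learn<true, 1, 2, 3>: the adaptive gradient sum sits at offset 1, the per-feature normalizer at offset 2, and next = 3 is returned as the stride, so

    ceil_log_2(3 - 1) == 2   // stride_shift 2: weights packed 4 floats apart
    ceil_log_2(2 - 1) == 1   // adaptive only: stride 2
    ceil_log_2(1 - 1) == 0   // plain --sgd: stride 1, a dense weight vector

which reproduces the old default stride of 4 for the invariant normalized adaptive update while letting simpler update rules pack tighter.)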
diff --git a/vowpalwabbit/gd.h b/vowpalwabbit/gd.h
index 5c46e008..e8de0629 100644
--- a/vowpalwabbit/gd.h
+++ b/vowpalwabbit/gd.h
@@ -10,12 +10,11 @@ license as described in the file LICENSE.
#include <sys/socket.h>
#endif
-#include <math.h>
#include "example.h"
#include "parse_regressor.h"
#include "parser.h"
-#include "sparse_dense.h"
#include "v_array.h"
+#include "constant.h"
namespace GD{
void print_result(int f, float res, v_array<char> tag);
@@ -74,23 +73,17 @@ void output_and_account_example(example* ec);
}
}
- template <class R, void (*T)(predict_data<R>&, const float, float&)>
- inline float inline_predict(vw& all, example& ec, R extra)
- {
- label_data* ld = (label_data*)ec.ld;
- predict_data<R> temp = {ld->initial, extra};
- foreach_feature<predict_data<R>, T>(all, ec, temp);
- return temp.prediction;
- }
+ inline void vec_add(float& p, const float fx, float& fw) {
+ p += fw * fx;
+ }
- template <void (*T)(float&, const float, float&)>
- inline float inline_predict(vw& all, example& ec)
- {
- label_data* ld = (label_data*)ec.ld;
- float temp = ld->initial;
- foreach_feature<float, T>(all, ec, temp);
- return temp;
- }
+ inline float inline_predict(vw& all, example& ec)
+ {
+ label_data* ld = (label_data*)ec.ld;
+ float temp = ld->initial;
+ foreach_feature<float, vec_add>(all, ec, temp);
+ return temp;
+ }
}
#endif
diff --git a/vowpalwabbit/gd_mf.cc b/vowpalwabbit/gd_mf.cc
index 3f1ca331..635d34b5 100644
--- a/vowpalwabbit/gd_mf.cc
+++ b/vowpalwabbit/gd_mf.cc
@@ -14,7 +14,6 @@ license as described in the file LICENSE.
#endif
#include "constant.h"
-#include "sparse_dense.h"
#include "gd.h"
#include "simple_label.h"
#include "rand48.h"
@@ -48,7 +47,7 @@ void mf_print_offset_features(vw& all, example& ec, size_t offset)
cout << "\tConstant:";
cout << ((index >> all.reg.stride_shift) & all.parse_mask) << ':' << f->x;
- cout << ':' << trunc_weight(weights[index], (float)all.sd->gravity) * (float)all.sd->contraction;
+ cout << ':' << weights[index];
for (vector<string>::iterator i = all.pairs.begin(); i != all.pairs.end();i++)
if (ec.atomics[(int)(*i)[0]].size() > 0 && ec.atomics[(int)(*i)[1]].size() > 0)
@@ -96,7 +95,7 @@ float mf_predict(vw& all, example& ec)
float linear_prediction = 0.;
// linear terms
for (unsigned char* i = ec.indices.begin; i != ec.indices.end; i++)
- GD::foreach_feature<float, vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[*i].begin, ec.atomics[*i].end, linear_prediction);
+ GD::foreach_feature<float, GD::vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[*i].begin, ec.atomics[*i].end, linear_prediction);
// store constant + linear prediction
// note: constant is now automatically added
@@ -115,12 +114,12 @@ float mf_predict(vw& all, example& ec)
// l^k is from index+1 to index+all.rank
//float x_dot_l = sd_offset_add(weights, mask, ec.atomics[(int)(*i)[0]].begin, ec.atomics[(int)(*i)[0]].end, k);
float x_dot_l = 0.;
- GD::foreach_feature<float, vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[(int)(*i)[0]].begin, ec.atomics[(int)(*i)[0]].end, x_dot_l, k);
+ GD::foreach_feature<float, GD::vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[(int)(*i)[0]].begin, ec.atomics[(int)(*i)[0]].end, x_dot_l, k);
// x_r * r^k
// r^k is from index+all.rank+1 to index+2*all.rank
//float x_dot_r = sd_offset_add(weights, mask, ec.atomics[(int)(*i)[1]].begin, ec.atomics[(int)(*i)[1]].end, k+all.rank);
float x_dot_r = 0.;
- GD::foreach_feature<float,vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[(int)(*i)[1]].begin, ec.atomics[(int)(*i)[1]].end, x_dot_r, k+all.rank);
+ GD::foreach_feature<float,GD::vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[(int)(*i)[1]].begin,
ec.atomics[(int)(*i)[1]].end, x_dot_r, k+all.rank); prediction += x_dot_l * x_dot_r; @@ -156,6 +155,12 @@ float mf_predict(vw& all, example& ec) } +void sd_offset_update(weight* weights, size_t mask, feature* begin, feature* end, size_t offset, float update, float regularization) +{ + for (feature* f = begin; f!= end; f++) + weights[(f->weight_index + offset) & mask] += update * f->x - regularization * weights[(f->weight_index + offset) & mask]; +} + void mf_train(vw& all, example& ec, float update) { weight* weights = all.reg.weight_vector; diff --git a/vowpalwabbit/global_data.cc b/vowpalwabbit/global_data.cc index 0f122ec6..1b939a36 100644 --- a/vowpalwabbit/global_data.cc +++ b/vowpalwabbit/global_data.cc @@ -215,6 +215,25 @@ void compile_gram(vector<string> grams, uint32_t* dest, char* descriptor, bool q } } +po::variables_map add_options(vw& all, po::options_description& opts) +{ + all.opts.add(opts); + po::variables_map new_vm; + + //parse local opts once for notifications. + po::parsed_options parsed = po::command_line_parser(all.args). + style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). + options(opts).allow_unregistered().run(); + po::store(parsed, new_vm); + po::notify(new_vm); + //parse all opts for a complete variable map. + parsed = po::command_line_parser(all.args). + style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). + options(all.opts).allow_unregistered().run(); + po::store(parsed, new_vm); + return new_vm; +} + vw::vw() { sd = (shared_data *) calloc_or_die(1, sizeof(shared_data)); @@ -266,7 +285,7 @@ vw::vw() per_feature_regularizer_output = ""; per_feature_regularizer_text = ""; - options_from_file = ""; + file_options = ""; #ifdef _WIN32 stdout_fileno = _fileno(stdout); @@ -304,7 +323,6 @@ vw::vw() normalized_sum_norm_x = 0.; normalized_idx = 2; - feature_mask_idx = 3;//by default use the 4th position as mask add_constant = true; audit = false; diff --git a/vowpalwabbit/global_data.h b/vowpalwabbit/global_data.h index 1d44924a..1df07f93 100644 --- a/vowpalwabbit/global_data.h +++ b/vowpalwabbit/global_data.h @@ -9,6 +9,9 @@ license as described in the file LICENSE. #include <map> #include <stdint.h> #include <cstdio> +#include <boost/program_options.hpp> +namespace po = boost::program_options; + #include "v_array.h" #include "parse_primitives.h" #include "loss_functions.h" @@ -156,9 +159,9 @@ struct vw { bool save_resume; - std::string options_from_file; - char** options_from_file_argv; - int options_from_file_argc; + po::options_description opts; + std::string file_options; + vector<std::string> args; bool searn; void* /*Searn::searn*/ searnstr; @@ -215,7 +218,6 @@ struct vw { float normalized_sum_norm_x; size_t normalized_idx; //offset idx where the norm is stored (1 or 2 depending on whether adaptive is true) - size_t feature_mask_idx; //offset idx where mask is stored uint32_t lda; float lda_alpha; @@ -277,6 +279,7 @@ void print_lda_result(vw& all, int f, float* res, float weight, v_array<char> ta void get_prediction(int sock, float& res, float& weight); void compile_gram(vector<string> grams, uint32_t* dest, char* descriptor, bool quiet); int print_tag(std::stringstream& ss, v_array<char> tag); +po::variables_map add_options(vw& all, po::options_description& opts); #endif diff --git a/vowpalwabbit/lda_core.cc b/vowpalwabbit/lda_core.cc index e07783c0..612253ff 100644 --- a/vowpalwabbit/lda_core.cc +++ b/vowpalwabbit/lda_core.cc @@ -15,7 +15,6 @@ license as described in the file LICENSE. 
#include <stdio.h>
#include <assert.h>
#include "constant.h"
-#include "sparse_dense.h"
#include "gd.h"
#include "simple_label.h"
#include "rand48.h"
@@ -750,7 +749,7 @@ void end_examples(lda& l) ld.v.delete_v();
}
-learner* setup(vw&all, vector<string>&opts, po::variables_map& vm)
+learner* setup(vw&all, po::variables_map& vm)
{
lda* ld = (lda*)calloc_or_die(1,sizeof(lda));
ld->sorted_features = vector<index_feature>();
@@ -758,20 +757,15 @@ learner* setup(vw&all, vector<string>&opts, po::variables_map& vm) ld->all = &all;
ld->example_t = all.initial_t;
- po::options_description desc("LDA options");
- desc.add_options()
+ po::options_description lda_opts("LDA options");
+ lda_opts.add_options()
("lda_alpha", po::value<float>(&all.lda_alpha), "Prior on sparsity of per-document topic weights")
("lda_rho", po::value<float>(&all.lda_rho), "Prior on sparsity of topic distributions")
("lda_D", po::value<float>(&all.lda_D), "Number of documents")
("lda_epsilon", po::value<float>(&all.lda_epsilon), "Loop convergence threshold")
("minibatch", po::value<size_t>(&all.minibatch), "Minibatch size, for LDA");
- po::parsed_options parsed = po::command_line_parser(opts).
- style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
- options(desc).allow_unregistered().run();
- opts = po::collect_unrecognized(parsed.options, po::include_positional);
- po::store(parsed, vm);
- po::notify(vm);
+ vm = add_options(all, lda_opts);
all.p->sort_features = true;
float temp = ceilf(logf((float)(all.lda*2+1)) / logf (2.f));
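(Editorial sketch, not part of the commit: the lda_core.cc hunk above is the template every reduction now follows. add_options() is the helper added to global_data.cc in this commit; `my_opts` and `--my_param` below are illustrative placeholders, not real VW options.)

    learner* setup(vw& all, po::variables_map& vm)
    {
      po::options_description my_opts("My reduction options");
      my_opts.add_options()
        ("my_param", po::value<size_t>(), "an illustrative parameter");

      // merges my_opts into the global all.opts, reparses all.args,
      // and returns the resulting variables_map
      vm = add_options(all, my_opts);

      if (vm.count("my_param")) {
        std::stringstream ss;                // persist the flag with the model,
        ss << " --my_param " << vm["my_param"].as<size_t>();
        all.file_options.append(ss.str());   // as oaa.cc and nn.cc do below
      }
      return NULL; // a real reduction constructs and returns its learner here
    }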
diff --git a/vowpalwabbit/lda_core.h b/vowpalwabbit/lda_core.h index b934548f..3dc42be5 100644 --- a/vowpalwabbit/lda_core.h +++ b/vowpalwabbit/lda_core.h @@ -7,7 +7,7 @@ license as described in the file LICENSE. #define LDA_CORE_H namespace LDA{ - LEARNER::learner* setup(vw&, std::vector<std::string>&, po::variables_map&); + LEARNER::learner* setup(vw&, po::variables_map&); } #endif diff --git a/vowpalwabbit/loss_functions.cc b/vowpalwabbit/loss_functions.cc index 52a619f2..0cdc5fa7 100644 --- a/vowpalwabbit/loss_functions.cc +++ b/vowpalwabbit/loss_functions.cc @@ -39,20 +39,20 @@ public: + 2. * (sd->max_label - label) * (prediction - sd->max_label)); } - float getUpdate(float prediction, float label,float eta_t, float norm) { - + float getUpdate(float prediction, float label, float eta_t, float pred_per_update) + { if (eta_t < 1e-6){ /* When exp(-eta_t)~= 1 we replace 1-exp(-eta_t) * with its first order Taylor expansion around 0 * to avoid catastrophic cancellation. */ - return 2.f*(label - prediction)*eta_t/norm; + return 2.f*(label - prediction)*eta_t/pred_per_update; } - return (label - prediction)*(1.f-exp(-2.f*eta_t))/norm; + return (label - prediction)*(1.f-exp(-2.f*eta_t))/pred_per_update; } - float getUnsafeUpdate(float prediction, float label,float eta_t,float norm) { - return 2.f*(label - prediction)*eta_t/norm; + float getUnsafeUpdate(float prediction, float label, float eta_t, float pred_per_update) { + return 2.f*(label - prediction)*eta_t/pred_per_update; } float getRevertingWeight(shared_data* sd, float prediction, float eta_t){ @@ -92,12 +92,12 @@ public: return example_loss; } - float getUpdate(float prediction, float label,float eta_t, float norm) { - return 2.f*eta_t*(label - prediction)/norm; + float getUpdate(float prediction, float label,float eta_t, float pred_per_update) { + return 2.f*eta_t*(label - prediction)/pred_per_update; } - float getUnsafeUpdate(float prediction, float label,float eta_t,float norm) { - return 2.f*(label - prediction)*eta_t/norm; + float getUnsafeUpdate(float prediction, float label,float eta_t,float pred_per_update) { + return 2.f*(label - prediction)*eta_t/pred_per_update; } float getRevertingWeight(shared_data* sd, float prediction, float eta_t){ @@ -132,16 +132,15 @@ public: return (e > 0) ? e : 0; } - float getUpdate(float prediction, float label,float eta_t, float norm) { + float getUpdate(float prediction, float label,float eta_t, float pred_per_update) { if(label*prediction >= 1) return 0; float err = 1 - label*prediction; - float normal= eta_t; - return label * (normal < err ? normal : err)/norm; + return label * (eta_t < err ? 
eta_t : err)/pred_per_update; } - float getUnsafeUpdate(float prediction, float label,float eta_t, float norm) { + float getUnsafeUpdate(float prediction, float label,float eta_t, float pred_per_update) { if(label*prediction >= 1) return 0; - return label * eta_t/norm; + return label * eta_t/pred_per_update; } float getRevertingWeight(shared_data*, float prediction, float eta_t){ @@ -175,23 +174,23 @@ public: return log(1 + exp(-label * prediction)); } - float getUpdate(float prediction, float label, float eta_t, float norm) { + float getUpdate(float prediction, float label, float eta_t, float pred_per_update) { float w,x; float d = exp(label * prediction); if(eta_t < 1e-6){ /* As with squared loss, for small eta_t we replace the update * with its first order Taylor expansion to avoid numerical problems */ - return label*eta_t/((1+d)*norm); + return label*eta_t/((1+d)*pred_per_update); } x = eta_t + label*prediction + d; w = wexpmx(x); - return -(label*w+prediction)/norm; + return -(label*w+prediction)/pred_per_update; } - float getUnsafeUpdate(float prediction, float label, float eta_t, float norm) { + float getUnsafeUpdate(float prediction, float label, float eta_t, float pred_per_update) { float d = exp(label * prediction); - return label*eta_t/((1+d)*norm); + return label*eta_t/((1+d)*pred_per_update); } inline float wexpmx(float x){ @@ -246,24 +245,24 @@ public: } - float getUpdate(float prediction, float label, float eta_t, float norm) { + float getUpdate(float prediction, float label, float eta_t, float pred_per_update) { float err = label - prediction; if(err == 0) return 0; float normal = eta_t;//base update size if(err > 0) { normal = tau*normal; - return (normal < err ? normal : err) / norm; + return (normal < err ? normal : err) / pred_per_update; } else { normal = -(1-tau) * normal; - return ( normal > err ? normal : err) / norm; + return ( normal > err ? 
normal : err) / pred_per_update;
}
}
- float getUnsafeUpdate(float prediction, float label, float eta_t, float norm) {
+ float getUnsafeUpdate(float prediction, float label, float eta_t, float pred_per_update) {
float err = label - prediction;
if(err == 0) return 0;
- if(err > 0) return tau*eta_t/norm;
- return -(1-tau)*eta_t/norm;
+ if(err > 0) return tau*eta_t/pred_per_update;
+ return -(1-tau)*eta_t/pred_per_update;
}
float getRevertingWeight(shared_data* sd, float prediction, float eta_t){
diff --git a/vowpalwabbit/lrq.cc b/vowpalwabbit/lrq.cc
index f7e9e466..8e71f85b 100644
--- a/vowpalwabbit/lrq.cc
+++ b/vowpalwabbit/lrq.cc
@@ -178,7 +178,7 @@ namespace LRQ {
}
}
- learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+ learner* setup(vw& all, po::variables_map& vm)
{//parse and set arguments
LRQstate* lrq = (LRQstate*) calloc (1, sizeof (LRQstate));
unsigned int maxk = 0;
@@ -186,32 +186,22 @@ namespace LRQ {
size_t random_seed = 0;
if (vm.count("random_seed")) random_seed = vm["random_seed"].as<size_t> ();
- if (vm_file.count("random_seed")) random_seed = vm_file["random_seed"].as<size_t> ();
lrq->initial_seed = lrq->seed = random_seed | 8675309;
- lrq->dropout = vm.count("lrqdropout") || vm_file.count("lrqdropout");
+ lrq->dropout = vm.count("lrqdropout");
- if (lrq->dropout && !vm_file.count("lrqdropout"))
- all.options_from_file.append(" --lrqdropout");
-
- if (!vm_file.count("lrq"))
- {
- lrq->lrpairs = vm["lrq"].as<vector<string> > ();
-
- // TODO: doesn't work for non-printable stuff
-
- stringstream ss;
- for (vector<string>::iterator i = lrq->lrpairs.begin ();
- i != lrq->lrpairs.end ();
- ++i)
- {
- ss << " --lrq " << *i;
- }
-
- all.options_from_file.append(ss.str());
- }
- else
- lrq->lrpairs = vm_file["lrq"].as<vector<string> > ();
+ if (lrq->dropout)
+ all.file_options.append(" --lrqdropout");
+
+ lrq->lrpairs = vm["lrq"].as<vector<string> > ();
+
+ stringstream ss;
+ for (vector<string>::iterator i = lrq->lrpairs.begin ();
+ i != lrq->lrpairs.end ();
+ ++i)
+ ss << " --lrq " << *i;
+
+ all.file_options.append(ss.str());
+
if (! all.quiet)
{
diff --git a/vowpalwabbit/lrq.h b/vowpalwabbit/lrq.h
index 6c4254e8..13015cda 100644
--- a/vowpalwabbit/lrq.h
+++ b/vowpalwabbit/lrq.h
@@ -2,6 +2,6 @@
#ifndef LRQ_HEADER
#define LRQ_HEADER
namespace LRQ {
- LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file);
+ LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
diff --git a/vowpalwabbit/nn.cc b/vowpalwabbit/nn.cc
index 278b6e55..fe22eec9 100644
--- a/vowpalwabbit/nn.cc
+++ b/vowpalwabbit/nn.cc
@@ -304,59 +304,34 @@ CONVERSE: // That's right, I'm using goto. So sue me.
free (n.output_layer.atomics[nn_output_namespace].begin);
}
- learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+ learner* setup(vw& all, po::variables_map& vm)
{
nn* n = (nn*)calloc_or_die(1,sizeof(nn));
n->all = &all;
- po::options_description desc("NN options");
- desc.add_options()
+ po::options_description nn_opts("NN options");
+ nn_opts.add_options()
("inpass", "Train or test sigmoidal feedforward network with input passthrough.")
("dropout", "Train or test sigmoidal feedforward network using dropout.")
("meanfield", "Train or test sigmoidal feedforward network using mean field.");
- po::parsed_options parsed = po::command_line_parser(opts).
- style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
- options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); - - po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc,all.options_from_file_argv). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - po::store(parsed_file, vm_file); - po::notify(vm_file); + vm = add_options(all, nn_opts); //first parse for number of hidden units - n->k = 0; - if( vm_file.count("nn") ) { - n->k = (uint32_t)vm_file["nn"].as<size_t>(); - if( vm.count("nn") && (uint32_t)vm["nn"].as<size_t>() != n->k ) - std::cerr << "warning: you specified a different number of hidden units through --nn than the one loaded from predictor. Pursuing with loaded value of: " << n->k << endl; - } - else { - n->k = (uint32_t)vm["nn"].as<size_t>(); - - std::stringstream ss; - ss << " --nn " << n->k; - all.options_from_file.append(ss.str()); - } + n->k = (uint32_t)vm["nn"].as<size_t>(); + + std::stringstream ss; + ss << " --nn " << n->k; + all.file_options.append(ss.str()); - if( vm_file.count("dropout") ) { - n->dropout = all.training || vm.count("dropout"); - - if (! n->dropout && ! vm.count("meanfield") && ! all.quiet) - std::cerr << "using mean field for testing, specify --dropout explicitly to override" << std::endl; - } - else if ( vm.count("dropout") ) { + if ( vm.count("dropout") ) { n->dropout = true; - + std::stringstream ss; ss << " --dropout "; - all.options_from_file.append(ss.str()); + all.file_options.append(ss.str()); } - + if ( vm.count("meanfield") ) { n->dropout = false; if (! all.quiet) @@ -371,15 +346,12 @@ CONVERSE: // That's right, I'm using goto. So sue me. << (all.training ? "training" : "testing") << std::endl; - if( vm_file.count("inpass") ) { - n->inpass = true; - } - else if (vm.count ("inpass")) { + if (vm.count ("inpass")) { n->inpass = true; std::stringstream ss; ss << " --inpass"; - all.options_from_file.append(ss.str()); + all.file_options.append(ss.str()); } if (n->inpass && ! all.quiet) diff --git a/vowpalwabbit/nn.h b/vowpalwabbit/nn.h index 9e3237cc..a0eaa7c9 100644 --- a/vowpalwabbit/nn.h +++ b/vowpalwabbit/nn.h @@ -11,7 +11,7 @@ license as described in the file LICENSE. namespace NN { - LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/oaa.cc b/vowpalwabbit/oaa.cc index 836b4d8a..cdd97062 100644 --- a/vowpalwabbit/oaa.cc +++ b/vowpalwabbit/oaa.cc @@ -82,24 +82,17 @@ namespace OAA { VW::finish_example(all, &ec); } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { oaa* data = (oaa*)calloc_or_die(1, sizeof(oaa)); //first parse for number of actions - if( vm_file.count("oaa") ) { - data->k = (uint32_t)vm_file["oaa"].as<size_t>(); - if( vm.count("oaa") && (uint32_t)vm["oaa"].as<size_t>() != data->k ) - std::cerr << "warning: you specified a different number of actions through --oaa than the one loaded from predictor. 
Pursuing with loaded value of: " << data->k << endl;
- }
- else {
- data->k = (uint32_t)vm["oaa"].as<size_t>();
-
- //append oaa with nb_actions to options_from_file so it is saved to regressor later
- std::stringstream ss;
- ss << " --oaa " << data->k;
- all.options_from_file.append(ss.str());
- }
+ data->k = (uint32_t)vm["oaa"].as<size_t>();
+
+ //append oaa with nb_actions to file_options so it is saved to regressor later
+ std::stringstream ss;
+ ss << " --oaa " << data->k;
+ all.file_options.append(ss.str());
data->shouldOutput = all.raw_prediction > 0;
data->all = &all;
diff --git a/vowpalwabbit/oaa.h b/vowpalwabbit/oaa.h
index 74ec99b0..302d15dd 100644
--- a/vowpalwabbit/oaa.h
+++ b/vowpalwabbit/oaa.h
@@ -8,7 +8,7 @@ license as described in the file LICENSE.
namespace OAA
{
- LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+ LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
diff --git a/vowpalwabbit/parse_args.cc b/vowpalwabbit/parse_args.cc
index 3edc34f5..47925673 100644
--- a/vowpalwabbit/parse_args.cc
+++ b/vowpalwabbit/parse_args.cc
@@ -103,14 +103,18 @@ void parse_affix_argument(vw&all, string str) {
free(cstr);
}
-void parse_diagnostics(vw& all, po::variables_map& vm, po::options_description& desc, int argc)
+void parse_diagnostics(vw& all, po::variables_map& vm, int argc)
{
- // Begin diagnostic options
- if (vm.count("help") || argc == 1) {
- /* upon direct query for help -- spit it out to stdout */
- cout << "\n" << desc << "\n";
- exit(0);
- }
+ po::options_description diag_opt("Diagnostic options");
+
+ diag_opt.add_options()
+ ("version","Version information")
+ ("audit,a", "print weights of features")
+ ("progress,P", po::value< string >(), "Progress update frequency. int: additive, float: multiplicative")
+ ("quiet", "Don't output diagnostics and progress updates")
+ ("help,h","Look here: http://hunch.net/~vw/ and click on Tutorial.");
+
+ vm = add_options(all, diag_opt);
if (vm.count("version")) {
/* upon direct query for version -- spit it out to stdout */
@@ -122,6 +126,9 @@ void parse_diagnostics(vw& all, po::variables_map& vm, po::options_description& desc, int argc)
all.quiet = true; // --quiet wins over --progress
}
else
{
+ if (argc == 1)
+ cerr << "For more information use: vw --help" << endl;
+
all.quiet = false;
if (vm.count("progress")) {
@@ -166,6 +173,34 @@ void parse_diagnostics(vw& all, po::variables_map& vm, po::options_description& desc, int argc)
void parse_source(vw& all, po::variables_map& vm)
{
+ po::options_description in_opt("Input options");
+
+ in_opt.add_options()
+ ("data,d", po::value< string >(), "Example Set")
+ ("daemon", "persistent daemon mode on port 26542")
+ ("port", po::value<size_t>(),"port to listen on; use 0 to pick unused port")
+ ("num_children", po::value<size_t>(&(all.num_children)), "number of children for persistent daemon mode")
+ ("pid_file", po::value< string >(), "Write pid file in persistent daemon mode")
+ ("port_file", po::value< string >(), "Write port used in persistent daemon mode")
+ ("cache,c", "Use a cache. The default is <data>.cache")
+ ("cache_file", po::value< vector<string> >(), "The location(s) of cache_file.")
+ ("kill_cache,k", "do not reuse existing cache: create a new one always")
+ ("compressed", "use gzip format whenever possible. If a cache file is being created, this option creates a compressed cache file. A mixture of raw-text & compressed inputs is supported with autodetection.")
+ ("no_stdin", "do not default to reading from stdin");
+
+ vm = add_options(all, in_opt);
+
+ // Be friendly: if -d was left out, treat positional param as data file
+ po::positional_options_description p;
+ p.add("data", -1);
+
+ po::parsed_options pos = po::command_line_parser(all.args).
+ style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
+ options(all.opts).positional(p).run();
+ vm = po::variables_map();
+ po::store(pos, vm);
+
//begin input source
if (vm.count("no_stdin"))
all.stdin_off = true;
@@ -192,10 +227,41 @@ void parse_source(vw& all, po::variables_map& vm)
set_compressed(all.p);
} else
all.data_filename = "";
+
+ if ((vm.count("cache") || vm.count("cache_file")) && vm.count("invert_hash"))
+ {
+ cout << "invert_hash is incompatible with a cache file. Use it in single pass mode only." << endl;
+ throw exception();
+ }
+
+ if(!all.holdout_set_off && (vm.count("output_feature_regularizer_binary") || vm.count("output_feature_regularizer_text")))
+ {
+ all.holdout_set_off = true;
+ cerr<<"Making holdout_set_off=true since output regularizer specified\n";
+ }
}
-void parse_feature_tweaks(vw& all, po::variables_map& vm, po::variables_map& vm_file)
+void parse_feature_tweaks(vw& all, po::variables_map& vm)
{
+ po::options_description feature_opt("Feature options");
+ feature_opt.add_options()
+ ("hash", po::value< string > (), "how to hash the features. Available options: strings, all")
+ ("ignore", po::value< vector<unsigned char> >(), "ignore namespaces beginning with character <arg>")
+ ("keep", po::value< vector<unsigned char> >(), "keep namespaces beginning with character <arg>")
+ ("bit_precision,b", po::value<size_t>(), "number of bits in the feature table")
+ ("noconstant", "Don't add a constant feature")
+ ("constant,C", po::value<float>(&(all.initial_constant)), "Set initial value of constant")
+ ("ngram", po::value< vector<string> >(), "Generate N grams. To generate N grams for a single namespace 'foo', arg should be fN.")
+ ("skips", po::value< vector<string> >(), "Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace 'foo', arg should be fn.")
+ ("affix", po::value<string>(), "generate prefixes/suffixes of features; argument '+2a,-3b,+1' means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace")
+ ("spelling", po::value< vector<string> >(), "compute spelling features for a given namespace (use '_' for default namespace)")
+ ("quadratic,q", po::value< vector<string> > (), "Create and use quadratic features")
+ ("q:", po::value< string >(), ": corresponds to a wildcard for all printable characters")
+ ("cubic", po::value< vector<string> > (),
+ "Create and use cubic features");
+
+ vm = add_options(all, feature_opt);
+
//feature manipulation
string hash_function("strings");
if(vm.count("hash"))
@@ -209,17 +275,11 @@ void parse_feature_tweaks(vw& all, po::variables_map& vm, po::variables_map& vm_file)
else
all.spelling_features[(size_t)spelling_ns[id][0]] = true;
}
- if (vm_file.count("affix") && vm.count("affix")) {
- cerr << "should not specify --affix when loading a model trained with affix features (they're turned on by default)" << endl;
- throw exception();
- }
- if (vm_file.count("affix"))
- parse_affix_argument(all, vm_file["affix"].as<string>());
if (vm.count("affix")) {
parse_affix_argument(all, vm["affix"].as<string>());
stringstream ss;
ss << " --affix " << vm["affix"].as<string>();
- all.options_from_file.append(ss.str());
+ all.file_options.append(ss.str());
}
if(vm.count("ngram")){
@@ -301,6 +361,9 @@ void parse_feature_tweaks(vw& all, po::variables_map& vm, po::variables_map& vm_file)
else if((*i)[0]==':'&&(*i)[1]==':'){
cout << "in pair creation" << endl;
newpairs.reserve(newpairs.size() + valid_ns_size*valid_ns_size);
+ stringstream ss;
+ ss << ' ' << ' ';
+ newpairs.push_back(ss.str());
for (char j=printable_start; j<=printable_end; j++){
if(valid_ns(j)){
for (char k=printable_start; k<=printable_end; k++){
@@ -392,6 +455,27 @@ void parse_feature_tweaks(vw& all, po::variables_map& vm, po::variables_map& vm_file)
void parse_example_tweaks(vw& all, po::variables_map& vm)
{
+ po::options_description example_opts("Example options");
+
+ example_opts.add_options()
+ ("testonly,t", "Ignore label information and just test")
+ ("holdout_off", "no holdout data in multiple passes")
+ ("holdout_period", po::value<uint32_t>(&(all.holdout_period)), "holdout period for test only, default 10")
+ ("holdout_after", po::value<uint32_t>(&(all.holdout_after)), "holdout after n training examples, default off (disables holdout_period)")
+ ("early_terminate", po::value<size_t>(), "Specify the number of passes tolerated when holdout loss doesn't decrease before early termination, default is 3")
+ ("passes", po::value<size_t>(&(all.numpasses)),"Number of Training Passes")
+ ("initial_pass_length", po::value<size_t>(&(all.pass_length)), "initial number of examples per pass")
+ ("examples", po::value<size_t>(&(all.max_examples)), "number of examples to parse")
+ ("min_prediction", po::value<float>(&(all.sd->min_label)), "Smallest prediction to output")
+ ("max_prediction", po::value<float>(&(all.sd->max_label)), "Largest prediction to output")
+ ("sort_features", "turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes")
+ ("loss_function", po::value<string>()->default_value("squared"), "Specify the loss function to be used, uses squared by default.
Currently available ones are squared, classic, hinge, logistic and quantile.") + ("quantile_tau", po::value<float>()->default_value(0.5), "Parameter \\tau associated with Quantile loss. Defaults to 0.5") + ("l1", po::value<float>(&(all.l1_lambda)), "l_1 lambda") + ("l2", po::value<float>(&(all.l2_lambda)), "l_2 lambda"); + + vm = add_options(all, example_opts); + if (vm.count("testonly") || all.eta == 0.) { if (!all.quiet) @@ -409,12 +493,6 @@ void parse_example_tweaks(vw& all, po::variables_map& vm) if(vm.count("holdout_off")) all.holdout_set_off = true; - if(!all.holdout_set_off && (vm.count("output_feature_regularizer_binary") || vm.count("output_feature_regularizer_text"))) - { - all.holdout_set_off = true; - cerr<<"Making holdout_set_off=true since output regularizer specified\n"; - } - if(vm.count("sort_features")) all.p->sort_features = true; @@ -455,8 +533,17 @@ void parse_example_tweaks(vw& all, po::variables_map& vm) } } -void parse_output_preds(vw& all, po::variables_map& vm, po::variables_map& vm_file) +void parse_output_preds(vw& all, po::variables_map& vm) { + po::options_description out_opt("Output options"); + + out_opt.add_options() + ("predictions,p", po::value< string >(), "File to output predictions to") + ("raw_predictions,r", po::value< string >(), "File to output unnormalized predictions to") + ; + + vm = add_options(all, out_opt); + if (vm.count("predictions")) { if (!all.quiet) cerr << "predictions = " << vm["predictions"].as< string >() << endl; @@ -482,7 +569,7 @@ void parse_output_preds(vw& all, po::variables_map& vm, po::variables_map& vm_fi if (vm.count("raw_predictions")) { if (!all.quiet) { cerr << "raw predictions = " << vm["raw_predictions"].as< string >() << endl; - if (vm.count("binary") || vm_file.count("binary")) + if (vm.count("binary")) cerr << "Warning: --raw has no defined value when --binary specified, expect no output" << endl; } if (strcmp(vm["raw_predictions"].as< string >().c_str(), "stdout") == 0) @@ -503,6 +590,19 @@ void parse_output_preds(vw& all, po::variables_map& vm, po::variables_map& vm_fi void parse_output_model(vw& all, po::variables_map& vm) { + po::options_description output_model("Output model"); + + output_model.add_options() + ("final_regressor,f", po::value< string >(), "Final regressor") + ("readable_model", po::value< string >(), "Output human-readable final regressor with numeric features") + ("invert_hash", po::value< string >(), "Output human-readable final regressor with feature names. Computationally expensive.") + ("save_resume", "save extra state so learning can be resumed later with new data") + ("save_per_pass", "Save the model after every pass over data") + ("output_feature_regularizer_binary", po::value< string >(&(all.per_feature_regularizer_output)), "Per feature regularization output file") + ("output_feature_regularizer_text", po::value< string >(&(all.per_feature_regularizer_text)), "Per feature regularization output file, in text"); + + vm = add_options(all, output_model); + if (vm.count("final_regressor")) { all.final_regressor_name = vm["final_regressor"].as<string>(); if (!all.quiet) @@ -526,13 +626,30 @@ void parse_output_model(vw& all, po::variables_map& vm) all.save_resume = true; } -void parse_base_algorithm(vw& all, vector<string>& to_pass_further, po::variables_map& vm) +void parse_base_algorithm(vw& all, po::variables_map& vm) { //base learning algorithm. 
+ po::options_description base_opt("base algorithms (these are exclusive)"); + + base_opt.add_options() + ("sgd", "use regular stochastic gradient descent update.") + ("adaptive", "use adaptive, individual learning rates.") + ("invariant", "use safe/importance aware updates.") + ("normalized", "use per feature normalized updates") + ("exact_adaptive_norm", "use current default invariant normalized adaptive update rule") + ("bfgs", "use bfgs optimization") + ("lda", po::value<uint32_t>(&(all.lda)), "Run lda with <int> topics") + ("rank", po::value<uint32_t>(&(all.rank)), "rank for matrix factorization.") + ("noop","do no learning") + ("print","print examples") + ("sendto", po::value< vector<string> >(), "send examples to <host>"); + + vm = add_options(all, base_opt); + if (vm.count("bfgs") || vm.count("conjugate_gradient")) - all.l = BFGS::setup(all, to_pass_further, vm); + all.l = BFGS::setup(all, vm); else if (vm.count("lda")) - all.l = LDA::setup(all, to_pass_further, vm); + all.l = LDA::setup(all, vm); else if (vm.count("noop")) all.l = NOOP::setup(all); else if (vm.count("print")) @@ -571,192 +688,191 @@ void load_input_model(vw& all, po::variables_map& vm, io_buf& io_temp) } } -void parse_scorer_reductions(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file) +void parse_scorer_reductions(vw& all, po::variables_map& vm) { - if(vm.count("nn") || vm_file.count("nn") ) - all.l = NN::setup(all, to_pass_further, vm, vm_file); + po::options_description score_mod_opt("Score modifying options (can be combined)"); + + score_mod_opt.add_options() + ("nn", po::value<size_t>(), "Use sigmoidal feedforward network with <k> hidden units") + ("new_mf", "use new, reduction-based matrix factorization") + ("autolink", po::value<size_t>(), "create link function with polynomial d") + ("lrq", po::value<vector<string> > (), "use low rank quadratic features") + ("lrqdropout", "use dropout training for low rank quadratic features"); + + vm = add_options(all, score_mod_opt); + + if(vm.count("nn")) + all.l = NN::setup(all, vm); if (vm.count("new_mf") && all.rank > 0) all.l = MF::setup(all, vm); - if(vm.count("autolink") || vm_file.count("autolink") ) - all.l = ALINK::setup(all, to_pass_further, vm, vm_file); + if(vm.count("autolink")) + all.l = ALINK::setup(all, vm); - if (vm.count("lrq") || vm_file.count("lrq")) - all.l = LRQ::setup(all, to_pass_further, vm, vm_file); + if (vm.count("lrq")) + all.l = LRQ::setup(all, vm); - all.l = Scorer::setup(all, to_pass_further, vm, vm_file); + all.l = Scorer::setup(all, vm); } -LEARNER::learner* exclusive_setup(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file, bool& score_consumer, LEARNER::learner* (*setup)(vw&, vector<string>&, po::variables_map&, po::variables_map&)) +LEARNER::learner* exclusive_setup(vw& all, po::variables_map& vm, bool& score_consumer, LEARNER::learner* (*setup)(vw&, po::variables_map&)) { if (score_consumer) { cerr << "error: cannot specify multiple direct score consumers" << endl; throw exception(); } score_consumer = true; - return setup(all, to_pass_further, vm, vm_file); + return setup(all, vm); } -void parse_score_users(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file, bool& got_cs) +void parse_score_users(vw& all, po::variables_map& vm, bool& got_cs) { + po::options_description multiclass_opt("Score user options (these are exclusive)"); + multiclass_opt.add_options() + ("top", po::value<size_t>(), "top k recommendation") + 
("binary", "report loss as binary classification on -1,1") + ("oaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> labels") + ("ect", po::value<size_t>(), "Use error correcting tournament with <k> labels") + ("csoaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> costs") + ("wap", po::value<size_t>(), "Use weighted all-pairs multiclass learning with <k> costs") + ("csoaa_ldf", po::value<string>(), "Use one-against-all multiclass learning with label dependent features. Specify singleline or multiline.") + ("wap_ldf", po::value<string>(), "Use weighted all-pairs multiclass learning with label dependent features. Specify singleline or multiline.") + ; + + vm = add_options(all, multiclass_opt); bool score_consumer = false; - if(vm.count("top") || vm_file.count("top") ) - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, TOPK::setup); + if(vm.count("top")) + all.l = exclusive_setup(all, vm, score_consumer, TOPK::setup); - if (vm.count("binary") || vm_file.count("binary")) - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, BINARY::setup); + if (vm.count("binary")) + all.l = exclusive_setup(all, vm, score_consumer, BINARY::setup); - if (vm.count("oaa") || vm_file.count("oaa") ) - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, OAA::setup); + if (vm.count("oaa")) + all.l = exclusive_setup(all, vm, score_consumer, OAA::setup); - if (vm.count("ect") || vm_file.count("ect") ) - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, ECT::setup); + if (vm.count("ect")) + all.l = exclusive_setup(all, vm, score_consumer, ECT::setup); - if(vm.count("csoaa") || vm_file.count("csoaa") ) { - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, CSOAA::setup); + if(vm.count("csoaa")) { + all.l = exclusive_setup(all, vm, score_consumer, CSOAA::setup); all.cost_sensitive = all.l; got_cs = true; } - if(vm.count("wap") || vm_file.count("wap") ) { - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, WAP::setup); + if(vm.count("wap")) { + all.l = exclusive_setup(all, vm, score_consumer, WAP::setup); all.cost_sensitive = all.l; got_cs = true; } - if(vm.count("csoaa_ldf") || vm_file.count("csoaa_ldf")) { - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, CSOAA_AND_WAP_LDF::setup); + if(vm.count("csoaa_ldf") || vm.count("csoaa_ldf")) { + all.l = exclusive_setup(all, vm, score_consumer, CSOAA_AND_WAP_LDF::setup); all.cost_sensitive = all.l; got_cs = true; } - if(vm.count("wap_ldf") || vm_file.count("wap_ldf") ) { - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, CSOAA_AND_WAP_LDF::setup); + if(vm.count("wap_ldf") || vm.count("wap_ldf") ) { + all.l = exclusive_setup(all, vm, score_consumer, CSOAA_AND_WAP_LDF::setup); all.cost_sensitive = all.l; got_cs = true; } } -void parse_cb(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file, bool& got_cs, bool& got_cb) +void parse_cb(vw& all, po::variables_map& vm, bool& got_cs, bool& got_cb) { - if( vm.count("cb") || vm_file.count("cb") ) + po::options_description cb_opts("Contextual Bandit options"); + + cb_opts.add_options() + ("cb", po::value<size_t>(), "Use contextual bandit learning with <k> costs") + ("cbify", po::value<size_t>(), "Convert multiclass on <k> classes into a contextual bandit problem and solve"); + + vm = add_options(all,cb_opts); + + if( vm.count("cb")) { if(!got_cs) { - if( 
vm_file.count("cb") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm_file["cb"])); + if( vm.count("cb") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["cb"])); else vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["cb"])); - all.l = CSOAA::setup(all, to_pass_further, vm, vm_file); // default to CSOAA unless wap is specified + all.l = CSOAA::setup(all, vm); // default to CSOAA unless wap is specified all.cost_sensitive = all.l; got_cs = true; } - all.l = CB_ALGS::setup(all, to_pass_further, vm, vm_file); + all.l = CB_ALGS::setup(all, vm); got_cb = true; } - if (vm.count("cbify") || vm_file.count("cbify")) + if (vm.count("cbify")) { if(!got_cs) { - if( vm_file.count("cbify") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm_file["cbify"])); - else vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["cbify"])); + vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["cbify"])); - all.l = CSOAA::setup(all, to_pass_further, vm, vm_file); // default to CSOAA unless wap is specified + all.l = CSOAA::setup(all, vm); // default to CSOAA unless wap is specified all.cost_sensitive = all.l; got_cs = true; } - + if (!got_cb) { - if( vm_file.count("cbify") ) vm.insert(pair<string,po::variable_value>(string("cb"),vm_file["cbify"])); - else vm.insert(pair<string,po::variable_value>(string("cb"),vm["cbify"])); - all.l = CB_ALGS::setup(all, to_pass_further, vm, vm_file); + vm.insert(pair<string,po::variable_value>(string("cb"),vm["cbify"])); + all.l = CB_ALGS::setup(all, vm); got_cb = true; } - all.l = CBIFY::setup(all, to_pass_further, vm, vm_file); + all.l = CBIFY::setup(all, vm); } } -void parse_search(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file, bool& got_cs, bool& got_cb) +void parse_search(vw& all, po::variables_map& vm, bool& got_cs, bool& got_cb) { - if (vm.count("search") || vm_file.count("search") ) { + po::options_description search_opts("Search"); + + search_opts.add_options() + ("search", po::value<size_t>(), "use search-based structured prediction, argument=maximum action id or 0 for LDF"); + + vm = add_options(all,search_opts); + + if (vm.count("search")) { if (!got_cs && !got_cb) { - if( vm_file.count("search") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm_file["search"])); + if( vm.count("search") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["search"])); else vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["search"])); - all.l = CSOAA::setup(all, to_pass_further, vm, vm_file); // default to CSOAA unless others have been specified + all.l = CSOAA::setup(all, vm); // default to CSOAA unless others have been specified all.cost_sensitive = all.l; got_cs = true; } //all.searnstr = (Searn::searn*)calloc_or_die(1, sizeof(Searn::searn)); - all.l = Searn::setup(all, to_pass_further, vm, vm_file); + all.l = Searn::setup(all, vm); } } -vw* parse_args(int argc, char *argv[]) +void add_to_args(vw& all, int argc, char* argv[]) { - po::options_description desc("VW options"); + for (int i = 1; i < argc; i++) + all.args.push_back(string(argv[i])); +} +vw* parse_args(int argc, char *argv[]) +{ vw* all = new vw(); + add_to_args(*all, argc, argv); + size_t random_seed = 0; all->program_name = argv[0]; - po::options_description in_opt("Input options"); - - in_opt.add_options() - ("data,d", po::value< string >(), "Example Set") - ("ring_size", po::value<size_t>(&(all->p->ring_size)), "size of example ring") - ("examples", 
po::value<size_t>(&(all->max_examples)), "number of examples to parse") - ("testonly,t", "Ignore label information and just test") - ("daemon", "persistent daemon mode on port 26542") - ("port", po::value<size_t>(),"port to listen on; use 0 to pick unused port") - ("num_children", po::value<size_t>(&(all->num_children)), "number of children for persistent daemon mode") - ("pid_file", po::value< string >(), "Write pid file in persistent daemon mode") - ("port_file", po::value< string >(), "Write port used in persistent daemon mode") - ("passes", po::value<size_t>(&(all->numpasses)),"Number of Training Passes") - ("cache,c", "Use a cache. The default is <data>.cache") - ("cache_file", po::value< vector<string> >(), "The location(s) of cache_file.") - ("kill_cache,k", "do not reuse existing cache: create a new one always") - ("compressed", "use gzip format whenever possible. If a cache file is being created, this option creates a compressed cache file. A mixture of raw-text & compressed inputs are supported with autodetection.") - ("no_stdin", "do not default to reading from stdin") - ("save_resume", "save extra state so learning can be resumed later with new data") - ; - - po::options_description out_opt("Output options"); + po::options_description desc("VW options"); - out_opt.add_options() - ("audit,a", "print weights of features") - ("predictions,p", po::value< string >(), "File to output predictions to") - ("raw_predictions,r", po::value< string >(), "File to output unnormalized predictions to") - ("sendto", po::value< vector<string> >(), "send examples to <host>") - ("quiet", "Don't output disgnostics and progress updates") - ("progress,P", po::value< string >(), "Progress update frequency. int: additive, float: multiplicative") - ("binary", "report loss as binary classification on -1,1") - ("min_prediction", po::value<float>(&(all->sd->min_label)), "Smallest prediction to output") - ("max_prediction", po::value<float>(&(all->sd->max_label)), "Largest prediction to output") - ; + desc.add_options() + ("random_seed", po::value<size_t>(&random_seed), "seed random number generator") + ("ring_size", po::value<size_t>(&(all->p->ring_size)), "size of example ring"); po::options_description update_opt("Update options"); update_opt.add_options() - ("sgd", "use regular stochastic gradient descent update.") - ("hessian_on", "use second derivative in line search") - ("bfgs", "use bfgs optimization") - ("mem", po::value<int>(&(all->m)), "memory in bfgs") - ("termination", po::value<float>(&(all->rel_threshold)),"Termination threshold") - ("adaptive", "use adaptive, individual learning rates.") - ("invariant", "use safe/importance aware updates.") - ("normalized", "use per feature normalized updates") - ("exact_adaptive_norm", "use current default invariant normalized adaptive update rule") - ("conjugate_gradient", "use conjugate gradient based optimization") - ("l1", po::value<float>(&(all->l1_lambda)), "l_1 lambda") - ("l2", po::value<float>(&(all->l2_lambda)), "l_2 lambda") - ("learning_rate,l", po::value<float>(&(all->eta)), "Set Learning Rate") - ("loss_function", po::value<string>()->default_value("squared"), "Specify the loss function to be used, uses squared by default. Currently available ones are squared, classic, hinge, logistic and quantile.") - ("quantile_tau", po::value<float>()->default_value(0.5), "Parameter \\tau associated with Quantile loss. 
Defaults to 0.5") + ("learning_rate,l", po::value<float>(&(all->eta)), "Set learning rate") ("power_t", po::value<float>(&(all->power_t)), "t power value") ("decay_learning_rate", po::value<float>(&(all->eta_decay_rate)), "Set Decay factor for learning_rate between passes") - ("initial_pass_length", po::value<size_t>(&(all->pass_length)), "initial number of examples per pass") ("initial_t", po::value<double>(&((all->sd->t))), "initial t value") ("feature_mask", po::value< string >(), "Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.") ; @@ -764,66 +880,10 @@ vw* parse_args(int argc, char *argv[]) po::options_description weight_opt("Weight options"); weight_opt.add_options() - ("bit_precision,b", po::value<size_t>(), "number of bits in the feature table") ("initial_regressor,i", po::value< vector<string> >(), "Initial regressor(s)") - ("final_regressor,f", po::value< string >(), "Final regressor") ("initial_weight", po::value<float>(&(all->initial_weight)), "Set all weights to an initial value of 1.") ("random_weights", po::value<bool>(&(all->random_weights)), "make initial weights random") - ("readable_model", po::value< string >(), "Output human-readable final regressor with numeric features") - ("invert_hash", po::value< string >(), "Output human-readable final regressor with feature names") - ("save_per_pass", "Save the model after every pass over data") ("input_feature_regularizer", po::value< string >(&(all->per_feature_regularizer_input)), "Per feature regularization input file") - ("output_feature_regularizer_binary", po::value< string >(&(all->per_feature_regularizer_output)), "Per feature regularization output file") - ("output_feature_regularizer_text", po::value< string >(&(all->per_feature_regularizer_text)), "Per feature regularization output file, in text") - ; - - po::options_description holdout_opt("Holdout options"); - holdout_opt.add_options() - ("holdout_off", "no holdout data in multiple passes") - ("holdout_period", po::value<uint32_t>(&(all->holdout_period)), "holdout period for test only, default 10") - ("holdout_after", po::value<uint32_t>(&(all->holdout_after)), "holdout after n training examples, default off (disables holdout_period)") - ("early_terminate", po::value<size_t>(), "Specify the number of passes tolerated when holdout loss doesn't decrease before early termination, default is 3") - ; - - po::options_description namespace_opt("Feature namespace options"); - namespace_opt.add_options() - ("hash", po::value< string > (), "how to hash the features. Available options: strings, all") - ("ignore", po::value< vector<unsigned char> >(), "ignore namespaces beginning with character <arg>") - ("keep", po::value< vector<unsigned char> >(), "keep namespaces beginning with character <arg>") - ("noconstant", "Don't add a constant feature") - ("constant,C", po::value<float>(&(all->initial_constant)), "Set initial value of constant") - ("sort_features", "turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes") - ("ngram", po::value< vector<string> >(), "Generate N grams. To generate N grams for a single namespace 'foo', arg should be fN.") - ("skips", po::value< vector<string> >(), "Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. 
To generate n-skips for a single namespace 'foo', arg should be fn.") - ("affix", po::value<string>(), "generate prefixes/suffixes of features; argument '+2a,-3b,+1' means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace") - ("spelling", po::value< vector<string> >(), "compute spelling features for a give namespace (use '_' for default namespace)"); - ; - - po::options_description mf_opt("Matrix factorization options"); - mf_opt.add_options() - ("quadratic,q", po::value< vector<string> > (), - "Create and use quadratic features") - ("q:", po::value< string >(), ": corresponds to a wildcard for all printable characters") - ("cubic", po::value< vector<string> > (), - "Create and use cubic features") - ("rank", po::value<uint32_t>(&(all->rank)), "rank for matrix factorization.") - ("new_mf", "use new, reduction-based matrix factorization") - ; - - po::options_description lrq_opt("Low Rank Quadratic options"); - lrq_opt.add_options() - ("lrq", po::value<vector<string> > (), "use low rank quadratic features") - ("lrqdropout", "use dropout training for low rank quadratic features") - ; - - po::options_description multiclass_opt("Multiclass options"); - multiclass_opt.add_options() - ("oaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> labels") - ("ect", po::value<size_t>(), "Use error correcting tournament with <k> labels") - ("csoaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> costs") - ("wap", po::value<size_t>(), "Use weighted all-pairs multiclass learning with <k> costs") - ("csoaa_ldf", po::value<string>(), "Use one-against-all multiclass learning with label dependent features. Specify singleline or multiline.") - ("wap_ldf", po::value<string>(), "Use weighted all-pairs multiclass learning with label dependent features. Specify singleline or multiline.") ; po::options_description active_opt("Active Learning options"); @@ -843,60 +903,20 @@ vw* parse_args(int argc, char *argv[]) po::options_description other_opt("Other options"); other_opt.add_options() - ("bs", po::value<size_t>(), "bootstrap mode with k rounds by online importance resampling") - ("top", po::value<size_t>(), "top k recommendation") - ("bs_type", po::value<string>(), "bootstrap mode - currently 'mean' or 'vote'") - ("autolink", po::value<size_t>(), "create link function with polynomial d") - ("cb", po::value<size_t>(), "Use contextual bandit learning with <k> costs") - ("lda", po::value<uint32_t>(&(all->lda)), "Run lda with <int> topics") - ("nn", po::value<size_t>(), "Use sigmoidal feedforward network with <k> hidden units") - ("cbify", po::value<size_t>(), "Convert multiclass on <k> classes into a contextual bandit problem and solve") - ("search", po::value<size_t>(), "use search-based structured prediction, argument=maximum action id or 0 for LDF") + ("bootstrap,B", po::value<size_t>(), "bootstrap mode with k rounds by online importance resampling") ; - // Declare the supported options. 
- desc.add_options() - ("help,h","Look here: http://hunch.net/~vw/ and click on Tutorial.") - ("version","Version information") - ("random_seed", po::value<size_t>(&random_seed), "seed random number generator") - ("noop","do no learning") - ("print","print examples"); - - //po::positional_options_description p; - // Be friendly: if -d was left out, treat positional param as data file - //p.add("data", -1); - - desc.add(in_opt) - .add(out_opt) - .add(update_opt) + desc.add(update_opt) .add(weight_opt) - .add(holdout_opt) - .add(namespace_opt) - .add(mf_opt) - .add(lrq_opt) - .add(multiclass_opt) .add(active_opt) .add(cluster_opt) .add(other_opt); - po::variables_map vm = po::variables_map(); - po::variables_map vm_file = po::variables_map(); //separate variable map for storing flags in regressor file - - po::parsed_options parsed = po::command_line_parser(argc, argv). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); // got rid of ".positional(p)" because it doesn't work well with unrecognized options - vector<string> to_pass_further = po::collect_unrecognized(parsed.options, po::include_positional); - string last_unrec_arg = - (to_pass_further.size() > 0) - ? string(to_pass_further[to_pass_further.size()-1]) // we want to write this down in case it's a data argument ala the positional option we got rid of - : ""; - - po::store(parsed, vm); - po::notify(vm); + po::variables_map vm = add_options(*all, desc); msrand48(random_seed); - parse_diagnostics(*all, vm, desc, argc); + parse_diagnostics(*all, vm, argc); if (vm.count("active_simulation")) all->active_simulation = true; @@ -904,8 +924,6 @@ vw* parse_args(int argc, char *argv[]) if (vm.count("active_learning") && !all->active_simulation) all->active = true; - parse_source(*all, vm); - all->sd->weighted_unlabeled_examples = all->sd->t; all->initial_t = (float)all->sd->t; @@ -915,21 +933,29 @@ vw* parse_args(int argc, char *argv[]) //Input regressor header io_buf io_temp; parse_regressor_args(*all, vm, io_temp); - - all->options_from_file_argv = VW::get_argv_from_string(all->options_from_file,all->options_from_file_argc); - - po::parsed_options parsed_file = po::command_line_parser(all->options_from_file_argc, all->options_from_file_argv). + + int temp_argc = 0; + char** temp_argv = VW::get_argv_from_string(all->file_options, temp_argc); + add_to_args(*all, temp_argc, temp_argv); + for (int i = 0; i < temp_argc; i++) + free(temp_argv[i]); + free(temp_argv); + + po::parsed_options pos = po::command_line_parser(all->args). style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). 
- options(desc).allow_unregistered().run(); + options(all->opts).allow_unregistered().run(); - po::store(parsed_file, vm_file); - po::notify(vm_file); + vm = po::variables_map(); + + po::store(pos, vm); + po::notify(vm); + all->file_options = ""; - parse_feature_tweaks(*all, vm, vm_file); //feature tweaks + parse_feature_tweaks(*all, vm); //feature tweaks parse_example_tweaks(*all, vm); //example manipulation - parse_base_algorithm(*all, to_pass_further, vm); + parse_base_algorithm(*all, vm); if (!all->quiet) { @@ -945,53 +971,26 @@ vw* parse_args(int argc, char *argv[]) parse_output_model(*all, vm); - parse_output_preds(*all, vm, vm_file); + parse_output_preds(*all, vm); - load_input_model(*all, vm, io_temp); - - parse_scorer_reductions(*all, to_pass_further, vm, vm_file); + parse_scorer_reductions(*all, vm); bool got_cs = false; - parse_score_users(*all, to_pass_further, vm, vm_file, got_cs); + parse_score_users(*all, vm, got_cs); bool got_cb = false; - parse_cb(*all, to_pass_further, vm, vm_file, got_cs, got_cb); - - parse_search(*all, to_pass_further, vm, vm_file, got_cs, got_cb); + parse_cb(*all, vm, got_cs, got_cb); - if(vm.count("bs") || vm_file.count("bs") ) - all->l = BS::setup(*all, to_pass_further, vm, vm_file); + parse_search(*all, vm, got_cs, got_cb); - if (to_pass_further.size() > 0) { - bool is_actually_okay = false; + if(vm.count("bootstrap")) + all->l = BS::setup(*all, vm); - // special case to try to emulate the missing -d - if ((to_pass_further.size() == 1) && - (to_pass_further[to_pass_further.size()-1] == last_unrec_arg)) { - int f = io_buf().open_file(last_unrec_arg.c_str(), all->stdin_off, io_buf::READ); - if (f != -1) { -#ifdef _WIN32 - _close(f); -#else - close(f); -#endif - all->data_filename = last_unrec_arg; - if (ends_with(last_unrec_arg, ".gz")) - set_compressed(all->p); - is_actually_okay = true; - } - } + load_input_model(*all, vm, io_temp); - if (!is_actually_okay) { - cerr << "unrecognized options:"; - for (size_t i=0; i<to_pass_further.size(); i++) - cerr << " " << to_pass_further[i]; - cerr << endl; - throw exception(); - } - } + parse_source(*all, vm); enable_sources(*all, vm, all->quiet,all->numpasses); @@ -1002,6 +1001,12 @@ vw* parse_args(int argc, char *argv[]) i++; all->wpp = (1 << i) >> all->reg.stride_shift; + if (vm.count("help")) { + /* upon direct query for help -- spit it out to stdout */ + cout << "\n" << all->opts << "\n"; + exit(0); + } + return all; } @@ -1091,9 +1096,6 @@ namespace VW { all.p->parse_name.delete_v(); free(all.p); free(all.sd); - for (int i = 0; i < all.options_from_file_argc; i++) - free(all.options_from_file_argv[i]); - free(all.options_from_file_argv); for (size_t i = 0; i < all.final_prediction_sink.size(); i++) if (all.final_prediction_sink[i] != 1) io_buf::close_file_or_socket(all.final_prediction_sink[i]); diff --git a/vowpalwabbit/parse_args.h b/vowpalwabbit/parse_args.h index 744fece9..df713017 100644 --- a/vowpalwabbit/parse_args.h +++ b/vowpalwabbit/parse_args.h @@ -6,10 +6,6 @@ license as described in the file LICENSE. 
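A note on the option-handling rewrite in parse_args.cc above: the raw command line is now stashed in all->args (via add_to_args), each reduction registers its options incrementally, and the whole argument vector is re-parsed on every registration. The add_options helper itself is defined elsewhere in this commit; the following is only a sketch of what it presumably does, inferred from its call sites (vm = add_options(*all, desc) together with options(all->opts) above) -- all.opts is assumed here to be the running po::options_description:

po::variables_map add_options(vw& all, po::options_description& opts)
{
  all.opts.add(opts);                  // accumulate the newly registered options
  po::variables_map new_vm;
  po::parsed_options parsed = po::command_line_parser(all.args).
    style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
    options(all.opts).allow_unregistered().run();
  po::store(parsed, new_vm);           // re-parse the saved argument vector
  po::notify(new_vm);
  return new_vm;
}

Under this scheme vm_file, to_pass_further, and the second pass over options_from_file_argv all become unnecessary: options recovered from a model header are appended to all->args (the temp_argv block above) and flow through the same parse.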
#ifndef PARSE_ARGS_H #define PARSE_ARGS_H -#include <boost/program_options.hpp> -#include <boost/program_options/parsers.hpp> -namespace po = boost::program_options; -#include "gd.h" #include "global_data.h" vw* parse_args(int argc, char *argv[]); diff --git a/vowpalwabbit/parse_primitives.h b/vowpalwabbit/parse_primitives.h index 0fde7236..89f4c0f2 100644 --- a/vowpalwabbit/parse_primitives.h +++ b/vowpalwabbit/parse_primitives.h @@ -92,7 +92,8 @@ struct parser { bool sorted_cache; size_t ring_size; - uint64_t parsed_examples; // The index of the parsed example. + uint64_t begin_parsed_examples; // The index of the beginning parsed example. + uint64_t end_parsed_examples; // The index of the fully parsed example. uint64_t local_example_number; uint32_t in_pass_counter; example* examples; diff --git a/vowpalwabbit/parse_regressor.cc b/vowpalwabbit/parse_regressor.cc index cd57ee0e..757e4051 100644 --- a/vowpalwabbit/parse_regressor.cc +++ b/vowpalwabbit/parse_regressor.cc @@ -224,16 +224,16 @@ void save_load_header(vw& all, io_buf& model_file, bool read, bool text) "", read, "\n",1, text); - text_len = sprintf(buff, "options:%s\n", all.options_from_file.c_str()); - uint32_t len = (uint32_t)all.options_from_file.length()+1; - memcpy(buff2, all.options_from_file.c_str(),len); + text_len = sprintf(buff, "options:%s\n", all.file_options.c_str()); + uint32_t len = (uint32_t)all.file_options.length()+1; + memcpy(buff2, all.file_options.c_str(),len); if (read) len = buf_size; bin_text_read_write(model_file,buff2, len, "", read, buff, text_len, text); if (read) - all.options_from_file.assign(buff2); + all.file_options.assign(buff2); } } @@ -316,10 +316,6 @@ void parse_mask_regressor_args(vw& all, po::variables_map& vm){ vector<string> init_filename = vm["initial_regressor"].as< vector<string> >(); if(mask_filename == init_filename[0]){//-i and -mask are from same file, just generate mask - for (size_t j = 0; j < length; j++){ - if(all.reg.weight_vector[j << all.reg.stride_shift] != 0.) - all.reg.weight_vector[(j << all.reg.stride_shift) + all.feature_mask_idx] = 1.; - } return; } } @@ -330,10 +326,6 @@ void parse_mask_regressor_args(vw& all, po::variables_map& vm){ save_load_header(all, io_temp_mask, true, false); all.l->save_load(io_temp_mask, true, false); io_temp_mask.close_file(); - for (size_t j = 0; j < length; j++){ - if(all.reg.weight_vector[j << all.reg.stride_shift] != 0.) - all.reg.weight_vector[(j << all.reg.stride_shift) + all.feature_mask_idx] = 1.; - } // Deal with the over-written header from initial regressor if (vm.count("initial_regressor")) { @@ -351,7 +343,7 @@ void parse_mask_regressor_args(vw& all, po::variables_map& vm){ } } else { // If no initial regressor, just clear out the options loaded from the header. 
- all.options_from_file.assign(""); + all.file_options.assign(""); } } } diff --git a/vowpalwabbit/parser.cc b/vowpalwabbit/parser.cc index d42179c3..cb1397eb 100644 --- a/vowpalwabbit/parser.cc +++ b/vowpalwabbit/parser.cc @@ -251,7 +251,7 @@ void reset_source(vw& all, size_t numbits) { // wait for all predictions to be sent back to client mutex_lock(&all.p->output_lock); - while (all.p->local_example_number != all.p->parsed_examples) + while (all.p->local_example_number != all.p->end_parsed_examples) condition_variable_wait(&all.p->output_done, &all.p->output_lock); mutex_unlock(&all.p->output_lock); @@ -364,8 +364,6 @@ void parse_cache(vw& all, po::variables_map &vm, string source, make_write_cache(all, caches[i], quiet); else { uint32_t c = cache_numbits(all.p->input, f); - if (all.default_bits) - all.num_bits = c; if (c < all.num_bits) { all.p->input->close_file(); make_write_cache(all, caches[i], quiet); @@ -624,11 +622,19 @@ void enable_sources(vw& all, po::variables_map& vm, bool quiet, size_t passes) cerr << "num sources = " << all.p->input->files.size() << endl; } +/*Race condition hypothesis: + + the parser gets an unused example, discovers the source is done, creates an end-of-pass example, and sets done=true; + the learner finishes the preceding example and calls get_example(); no examples remain but done is set, so NULL is returned; + parser_done() returns true and the learner thread exits; + the parser thread then increments end_parsed_examples and exits. + + */ bool parser_done(parser* p) { if (p->done) { - if (p->used_index != p->parsed_examples) + if (p->used_index != p->begin_parsed_examples) return false; return true; } @@ -716,11 +722,12 @@ example* get_unused_example(vw& all) while (true) { mutex_lock(&all.p->examples_lock); - if (all.p->examples[all.p->parsed_examples % all.p->ring_size].in_use == false) + if (all.p->examples[all.p->begin_parsed_examples % all.p->ring_size].in_use == false) { - all.p->examples[all.p->parsed_examples % all.p->ring_size].in_use = true; + example& ret = all.p->examples[all.p->begin_parsed_examples++ % all.p->ring_size]; + ret.in_use = true; mutex_unlock(&all.p->examples_lock); - return all.p->examples + (all.p->parsed_examples % all.p->ring_size); + return &ret; } else condition_variable_wait(&all.p->example_unused, &all.p->examples_lock); @@ -734,14 +741,13 @@ bool parse_atomic_example(vw& all, example* ae, bool do_read = true) return false; if(all.p->sort_features && ae->sorted == false) - unique_sort_features(all.audit, ae); + unique_sort_features(all.audit, all.parse_mask, ae); if (all.p->write_cache) { all.p->lp.cache_label(ae->ld,*(all.p->output)); cache_features(*(all.p->output), ae, (uint32_t)all.parse_mask); } - return true; } @@ -759,7 +765,7 @@ void setup_example(vw& all, example* ae) ae->total_sum_feat_sq = 0; ae->loss = 0.; - ae->example_counter = (size_t)(all.p->parsed_examples + 1); + ae->example_counter = (size_t)(all.p->end_parsed_examples); if ((!all.p->emptylines_separate_examples) || example_is_newline(*ae)) all.p->in_pass_counter++; @@ -855,8 +861,8 @@ namespace VW{ example* new_unused_example(vw& all) { example* ec = get_unused_example(all); all.p->lp.default_label(ec->ld); - all.p->parsed_examples++; - ec->example_counter = all.p->parsed_examples; + all.p->begin_parsed_examples++; + ec->example_counter = all.p->begin_parsed_examples; return ec; } example* read_example(vw& all, char* example_line) @@ -866,7 +872,7 @@ namespace VW{ read_line(all, ret, example_line); parse_atomic_example(all,ret,false); setup_example(all, ret); - all.p->parsed_examples++; 
+ all.p->end_parsed_examples++; return ret; } @@ -904,7 +910,7 @@ namespace VW{ } parse_atomic_example(all,ret,false); setup_example(all, ret); - all.p->parsed_examples++; + all.p->end_parsed_examples++; return ret; } @@ -1059,7 +1065,7 @@ void *main_parse_loop(void *in) example_number = 0; } mutex_lock(&all->p->examples_lock); - all->p->parsed_examples++; + all->p->end_parsed_examples++; condition_variable_signal_all(&all->p->example_available); mutex_unlock(&all->p->examples_lock); @@ -1071,10 +1077,10 @@ namespace VW{ example* get_example(parser* p) { mutex_lock(&p->examples_lock); - if (p->parsed_examples != p->used_index) { + if (p->end_parsed_examples != p->used_index) { size_t ring_index = p->used_index++ % p->ring_size; if (!(p->examples+ring_index)->in_use) - cout << p->used_index << " " << p->parsed_examples << " " << ring_index << endl; + cout << p->used_index << " " << p->end_parsed_examples << " " << ring_index << endl; assert((p->examples+ring_index)->in_use); mutex_unlock(&p->examples_lock); @@ -1103,7 +1109,8 @@ label_data* get_label(example* ec) void initialize_examples(vw& all) { all.p->used_index = 0; - all.p->parsed_examples = 0; + all.p->begin_parsed_examples = 0; + all.p->end_parsed_examples = 0; all.p->done = false; all.p->examples = (example*)calloc_or_die(all.p->ring_size, sizeof(example)); @@ -1117,7 +1124,7 @@ void initialize_examples(vw& all) void adjust_used_index(vw& all) { - all.p->used_index=all.p->parsed_examples; + all.p->used_index=all.p->begin_parsed_examples; } void initialize_parser_datastructures(vw& all) diff --git a/vowpalwabbit/scorer.cc b/vowpalwabbit/scorer.cc index 7d3fcfab..bdef00f3 100644 --- a/vowpalwabbit/scorer.cc +++ b/vowpalwabbit/scorer.cc @@ -19,7 +19,7 @@ namespace Scorer { base.predict(ec); } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { scorer* s = (scorer*)calloc_or_die(1, sizeof(scorer)); s->all = &all; diff --git a/vowpalwabbit/scorer.h b/vowpalwabbit/scorer.h index 18c0bef5..328b0ad7 100644 --- a/vowpalwabbit/scorer.h +++ b/vowpalwabbit/scorer.h @@ -2,6 +2,6 @@ #define SCORER_H namespace Scorer { - LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/searn.cc b/vowpalwabbit/searn.cc index 1c93a2b4..618e728b 100644 --- a/vowpalwabbit/searn.cc +++ b/vowpalwabbit/searn.cc @@ -38,7 +38,6 @@ namespace Searn { using namespace std; - const bool PRINT_DEBUG_INFO =0; const bool PRINT_UPDATE_EVERY_EXAMPLE =0; const bool PRINT_UPDATE_EVERY_PASS =0; const bool PRINT_CLOCK_TIME =0; @@ -114,7 +113,7 @@ namespace Searn bool auto_hamming_loss; // if you're just optimizing hamming loss, we can do it for you! 
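Stepping back to the parser.cc hunks above: the single parsed_examples counter is split into begin_parsed_examples, bumped when a ring slot is claimed, and end_parsed_examples, bumped only once the example is fully parsed, so the learner thread can no longer pick up a slot that has been claimed but not yet filled. A condensed sketch of the resulting handoff, with locking elided and the function names here (reserve_slot, publish_example, learner_has_work) purely illustrative:

example* reserve_slot(parser* p)
{ // producer: claim the next ring slot (cf. get_unused_example above)
  example& e = p->examples[p->begin_parsed_examples++ % p->ring_size];
  e.in_use = true;
  return &e;
}

void publish_example(parser* p)
{ // producer: the example is now fully parsed (cf. main_parse_loop above)
  p->end_parsed_examples++;
}

bool learner_has_work(parser* p)
{ // consumer: only published examples are visible (cf. get_example above)
  return p->used_index != p->end_parsed_examples;
}

parser_done() correspondingly compares used_index against begin_parsed_examples, since at shutdown every claimed slot must also have been consumed.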
bool examples_dont_change; // set to true if you don't do any internal example munging bool is_ldf; // set to true if you'll generate LDF data - + size_t A; // total number of actions, [1..A]; 0 means ldf SearnState state; // current state of learning size_t learn_t; // when LEARN, this is the t at which we're varying a @@ -136,7 +135,7 @@ namespace Searn v_array<snapshot_item> beam_restore_to_end; v_array<uint32_t> beam_final_action_sequence; bool beam_is_training; - + size_t most_recent_snapshot_begin; size_t most_recent_snapshot_end; uint32_t most_recent_snapshot_hash; @@ -172,7 +171,7 @@ namespace Searn short rollout_method; // 0=policy, 1=oracle, 2=none bool trajectory_oracle; // if true, only construct trajectories using the oracle - + bool allow_current_policy; // should the current policy be used for training? true for dagger //bool rollout_oracle; //if true then rollout are performed using oracle instead (optimal approximation discussed in searn's paper). this should be set to true for dagger bool adaptive_beta; //used to implement dagger through searn. if true, beta = 1-(1-alpha)^n after n updates, and policy is mixed with oracle as \pi' = (1-beta)\pi^* + beta \pi @@ -182,7 +181,7 @@ namespace Searn size_t beam_size; size_t kbest; bool allow_unsafe_fast_forward; - + size_t num_features; uint32_t total_number_of_policies; bool do_snapshot; @@ -196,7 +195,7 @@ namespace Searn size_t total_predictions_made; bool hit_new_pass; - + size_t passes_per_policy; vector<example*> ec_seq; @@ -204,7 +203,7 @@ namespace Searn LEARNER::learner* base_learner; void* valid_labels; clock_t start_clock_time; - + example*empty_example; }; @@ -214,7 +213,7 @@ namespace Searn uint32_t hash_example(example&ec, uint32_t seed) { uint32_t hash = seed; - + for (unsigned char* i=ec.indices.begin; i != ec.indices.end; i++) hash = uniform_hash((unsigned char*) ec.atomics[*i].begin, sizeof(feature) * (ec.atomics[*i].end - ec.atomics[*i].begin), @@ -223,7 +222,7 @@ namespace Searn hash = uniform_hash( (unsigned char*) &ec.ft_offset, sizeof(uint32_t), hash ); - + return hash; } @@ -248,10 +247,10 @@ namespace Searn std::cerr << "internal error (bug): no valid policies to choose from! defaulting to current" << std::endl; return (int)current_policy; } - + int num_valid_policies = (int)current_policy + allow_optimal + allow_current_policy; int pid = -1; - + if (num_valid_policies == 0) { std::cerr << "internal error (bug): no valid policies to choose from! 
defaulting to current" << std::endl; return (int)current_policy; @@ -260,7 +259,7 @@ namespace Searn } else { float r = frand48(); pid = 0; - + if (r > beta) { r -= beta; while ((r > 0) && (pid < num_valid_policies-1)) { @@ -272,7 +271,7 @@ namespace Searn // figure out which policy pid refers to if (allow_optimal && (pid == num_valid_policies-1)) return -1; // this is the optimal policy - + pid = (int)current_policy - pid; if (!allow_current_policy) pid--; @@ -348,7 +347,7 @@ namespace Searn char buf[32]; sprintf(buf, "{%d}", f->weight_index); fstring = string(buf); - } else + } else fstring = string(ec->audit_features[*i][feature_index].feature); feature_index++; } @@ -357,7 +356,7 @@ namespace Searn for (uint32_t t=1; t<=hinfo.features; t++) { v0 = ((h[hinfo.length-t]+1) * quadratic_constant * (additional_offset+1) + t) * history_constant; - + // add the history/feature pair feature temp = {history_value, (uint32_t) ( ((v0 + v)*wpp) & all.reg.weight_mask )}; ec->atomics[history_namespace].push_back(temp); @@ -533,7 +532,7 @@ namespace Searn searn *srn = (searn*)all.searnstr; COST_SENSITIVE::label test_label; COST_SENSITIVE::cs_label.default_label(&test_label); - + // TODO: modify this to handle contextual bandit base learner with ldf float best_prediction = 0; uint32_t best_action = 0; @@ -553,14 +552,14 @@ namespace Searn cdbg << "predict: partial_prediction[" << action << "] = " << ecs[action].partial_prediction << endl; valid_labels->costs[action].partial_prediction = ecs[action].partial_prediction; - if ((action == 0) || + if ((action == 0) || (ecs[action].partial_prediction < best_prediction)) { cdbg << "best action = " << action << endl; best_prediction = ecs[action].partial_prediction; best_action = action; // ((COST_SENSITIVE::label*)ecs[action].ld)->costs[0].weight_index; } } - + if ((srn->priv->state == INIT_TEST) && (all.raw_prediction > 0)) { string outputString; stringstream outputStringStream(outputString); @@ -573,7 +572,7 @@ namespace Searn //if (allow_exploration && (srn->priv->exploration_temperature > 0.)) // best_action = sample_with_temperature_partial_prediction(ecs, num_ec, srn->priv->exploration_temperature); - + return best_action; } @@ -631,7 +630,7 @@ namespace Searn // else // final_prediction = sample_with_temperature_cb(ld, srn.priv->exploration_temperature); // } - + if ((srn.priv->state == INIT_TEST) && (all.raw_prediction > 0) && (srn.priv->rollout_all_actions)) { // srn.priv->rollout_all_actions ==> this is not CB, so we have COST_SENSITIVE::labels string outputString; stringstream outputStringStream(outputString); @@ -642,7 +641,7 @@ namespace Searn } all.print_text(all.raw_prediction, outputStringStream.str(), ec.tag); } - + ec.ld = old_label; return final_prediction; @@ -806,7 +805,7 @@ namespace Searn } } } - + // if not LDF: // *ecs should be a pointer to THE example // num_ec == 0 @@ -826,7 +825,7 @@ namespace Searn uint32_t searn_predict_without_loss(vw& all, learner& base, example* ecs, size_t num_ec, v_array<uint32_t> *yallowed, v_array<uint32_t> *ystar, bool ystar_is_uint32t) // num_ec == 0 means normal example, >0 means ldf, yallowed==NULL means all allowed, ystar==NULL means don't know; ystar_is_uint32t means that the ystar ref is really just a uint32_t { searn* srn=(searn*)all.searnstr; - + // check ldf sanity if (!srn->priv->is_ldf) { assert(num_ec == 0); // searntask is trying to define an ldf example in a non-ldf problem @@ -902,7 +901,7 @@ namespace Searn return srn->priv->learn_a; } else { // t > learn_t size_t this_a = 0; - + if 
(srn->priv->rollout_method == 1) { // rollout by oracle assert(ystar_is_uint32t); this_a = *(uint32_t*)ystar; @@ -1033,9 +1032,9 @@ namespace Searn throw exception(); } */ - + srn->priv->loss_declared = true; - + srn->priv->loss_last_step = srn->priv->t; cdbg<<"new loss_last_step="<<srn->priv->t<<" incr_loss=" << incr_loss <<endl; if (srn->priv->state == INIT_TEST) @@ -1082,8 +1081,8 @@ namespace Searn } return a; - } - + } + bool snapshot_binary_search_lt(v_array<snapshot_item> a, size_t desired_t, size_t tag, size_t &pos, size_t last_found_pos) { size_t hi = a.size(); @@ -1109,7 +1108,7 @@ namespace Searn ((a[pos].pred_step == desired_t) && (a[pos].tag > tag))) hi = pos; else break; - + pos = hi >> 1; } @@ -1129,7 +1128,7 @@ namespace Searn if (last_found_pos + 1 < hi) { pos = last_found_pos+1; if ((a[pos].index == desired_index) && (a[pos].tag == tag)) return true; - } else + } else pos = (lo + hi) >> 1; while (true) { if (lo == hi) return false; @@ -1195,7 +1194,7 @@ namespace Searn priv->snapshot_data.push_back(item); //cerr << "priv->snapshot_data.push_back(item);" << endl; return; - } + } if (priv->state == FAST_FORWARD) { // go to the end! snapshot_item &me = priv->snapshot_data[priv->fast_forward_position + tag]; // TODO: generalize or ensure that tags are +=1 each time, also this is broken if they don't auto-history @@ -1212,7 +1211,7 @@ namespace Searn if ((cur_size > 0) && // only need to keep around the NEWEST set of snapshots (priv->snapshot_data[cur_size - 1].pred_step < priv->t)) clear_snapshot(*all, *srn, true); - + void* new_data = malloc(sizeof_data); memcpy(new_data, data_ptr, sizeof_data); snapshot_item item = { index, tag, new_data, sizeof_data, priv->t }; @@ -1269,7 +1268,7 @@ namespace Searn return; } - + cerr << "yikes, how did I get here? 
state = " << priv->state << endl; throw exception(); } @@ -1305,7 +1304,7 @@ namespace Searn if (priv->state == INIT_TRAIN) priv->final_snapshot_end = priv->most_recent_snapshot_end; } - + inline bool cmp_size_t(const size_t a, const size_t b) { return a < b; } @@ -1332,7 +1331,7 @@ namespace Searn } std::sort(timesteps.begin, timesteps.end, cmp_size_t); } - + return timesteps; } @@ -1445,9 +1444,9 @@ namespace Searn srn.priv->loss_declared = false; if (srn.priv->auto_history) clear_rollout_actions(srn); - // srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; - // srn.priv->snapshot_could_match = false; - // srn.priv->snapshot_last_found_pos = (size_t)-1; + srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; + srn.priv->snapshot_could_match = false; + srn.priv->snapshot_last_found_pos = (size_t)-1; } void mark_hyp_pruned(void*data) { ((beam_hyp*)data)->pruned = true; } @@ -1458,7 +1457,7 @@ namespace Searn beam* cur_beam = new beam(srn.priv->beam_size); beam* next_beam = new beam(srn.priv->beam_size); - + // initialize first beam { // in this call to structured_predict, we do the following: @@ -1466,7 +1465,7 @@ namespace Searn // 2) collect the initial snapshot // 3) store the final snapshot so we can fast-foward to the end at will beam_hyp *hyp = hyp_pool.begin; - + hyp->t = 0; hyp->parent = NULL; hyp->action_taken = 0; // irrelevant because parent==NULL @@ -1482,7 +1481,7 @@ namespace Searn srn.priv->state = BEAM_INIT; srn.priv->cur_beam_hyp = hyp; srn.task->structured_predict(srn, ec); - + assert(hyp->filled_in_prediction); // TODO: handle the case that structured_predict just returns or something else weird happens // collect the costs @@ -1497,7 +1496,7 @@ namespace Searn } cdbg << endl; } - + // collect the final snapshot copy_array(srn.priv->beam_restore_to_end, srn.priv->snapshot_data); @@ -1574,7 +1573,7 @@ namespace Searn beam_hyp* hyp = (beam_hyp*) be->data; /*UNDOME*/cdbg << "\t{ cost=" << be->cost << " t=" << hyp->t << " action_taken=" << hyp->action_taken << " incr_cost=" << hyp->incr_cost << " num_actions=" << hyp->num_actions << " parent=" << hyp->parent << " }" << endl; } - } + } // debug print the final beam /*UNDOME*/cdbg << "FINAL BEAM =" << endl; @@ -1614,9 +1613,17 @@ namespace Searn void beam_predict(vw&all, searn&srn, vector<example*>ec, v_array<beam_hyp> &hyp_pool, size_t &hyp_pool_id, bool is_learn) { using namespace Beam; - + + if (might_print_update(all)) { + reset_searn_structure(srn); + srn.priv->state = GET_TRUTH_STRING; + srn.priv->should_produce_string = true; + srn.priv->truth_string->str(""); + srn.task->structured_predict(srn, ec); + } + beam* final_beam = new beam(max(1, min(srn.priv->beam_size, srn.priv->kbest))); // at least 1, but otherwise the min of beam_size and kbest - + compute_full_beam(all, srn, ec, hyp_pool, hyp_pool_id, final_beam); if (srn.priv->should_produce_string && !is_learn) { // TODO: check if this is going to be used at all!!! 
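One more annotation on the beam_predict change above: progress printing under beam search previously had no truth string to show, so the patch now runs the task once in GET_TRUTH_STRING mode (a truth-only pass that records the oracle's output string and makes no real predictions) before the beam is computed. Condensed restatement of the added guard:

// condensed from the beam_predict hunk above
if (might_print_update(all)) {         // a progress line may be printed soon
  reset_searn_structure(srn);
  srn.priv->state = GET_TRUTH_STRING;  // truth-only pass
  srn.priv->should_produce_string = true;
  srn.priv->truth_string->str("");     // clear any stale truth string
  srn.task->structured_predict(srn, ec);
}

The non-beam path already performs an equivalent truth pass in do_actual_learning further down.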
@@ -1660,7 +1667,7 @@ namespace Searn for (int* sink = all.final_prediction_sink.begin; sink != all.final_prediction_sink.end; ++sink) all.print_text(*sink, "", ec[0]->tag); } - + final_beam->erase(); delete final_beam; } @@ -1688,7 +1695,6 @@ namespace Searn if (must_run_test(all, ec)) { srn.priv->should_produce_string = might_print_update(all) || (all.final_prediction_sink.size() > 0) || (all.raw_prediction > 0); srn.priv->pred_string->str(""); - assert(srn.priv->truth_string != NULL); srn.task->structured_predict(srn, ec); srn.priv->should_produce_string = false; @@ -1715,12 +1721,12 @@ namespace Searn srn.priv->loss_last_step = 0; clear_snapshot(all, srn, true); - // srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; - // srn.priv->snapshot_last_found_pos = (size_t)-1; - // srn.priv->snapshot_could_match = false; + srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; + srn.priv->snapshot_last_found_pos = (size_t)-1; + srn.priv->snapshot_could_match = false; srn.priv->loss_declared = false; srn.priv->should_produce_string = false; - + srn.task->structured_predict(srn, ec); if ( (! srn.priv->loss_declared) && // no loss was declared @@ -1728,7 +1734,7 @@ namespace Searn (all.training) && // in training mode (! ec[0]->test_only) ) // and not a test example cerr << "warning: no loss declared by task on something that looks like a training example!" << endl; - + if (srn.priv->t == 0) { clear_snapshot(all, srn, true); return; // there was no data @@ -1769,7 +1775,6 @@ namespace Searn srn.priv->learn_loss = 0.f; srn.priv->learn_example_len = 0; cdbg << "learn_example_len = 0" << endl; - cdbg << "learn_t = " << srn.priv->learn_t << " || learn_a = " << srn.priv->learn_a << endl; // srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; // srn.priv->snapshot_could_match = true; @@ -1806,7 +1811,7 @@ namespace Searn if (!srn.priv->examples_dont_change) { cdbg << "deleting labels for " << srn.priv->learn_example_len << " learn_example_copy items" << endl; - for (size_t n=0; n<srn.priv->learn_example_len; n++) + for (size_t n=0; n<srn.priv->learn_example_len; n++) //cdbg << "free_example_data[" << n << "]: "; GD::print_audit_features(all, &srn.priv->learn_example_copy[n]); if (srn.priv->is_ldf) COST_SENSITIVE::cs_label.delete_label(srn.priv->learn_example_copy[n].ld); else MULTICLASS::mc_label.delete_label(srn.priv->learn_example_copy[n].ld); @@ -1819,7 +1824,7 @@ namespace Searn tset.erase(); tset.delete_v(); cdbg << endl; } - + clear_snapshot(all, srn, true); srn.priv->train_action.delete_v(); srn.priv->train_action_ids.delete_v(); @@ -1837,7 +1842,7 @@ namespace Searn } srn.priv->train_labels.erase(); srn.priv->train_labels.delete_v(); - + cdbg << "======================================== DONE (" << srn.priv->current_policy << "," << srn.priv->read_example_last_pass << ") ========================================" << endl; } @@ -1863,9 +1868,9 @@ namespace Searn out[i] = in[i]; } - if (in.length() > max_len) { - out[max_len-2] = '.'; - out[max_len-1] = '.'; + if (in.length() > max_len) { + out[max_len-2] = '.'; + out[max_len-1] = '.'; } out[max_len] = 0; } @@ -1923,7 +1928,7 @@ void print_update(vw& all, searn& srn) if (!all.holdout_set_off && all.current_pass >= 1) fprintf(stderr, " h"); - + fprintf(stderr, "\n"); all.sd->sum_loss_since_last_dump = 0.0; @@ -1947,7 +1952,7 @@ void print_update(vw& all, searn& srn) size_t enc_offset = wpp * ((2 * (size_t)(*enc)) + ((*enc < 0) ? 
1 : 0)); cdbg << "old_ns = " << old_ns << endl; - + if ((n + offset >= 0) && (n + offset < (int32_t)srn.priv->ec_seq.size())) { // we're okay on position example*you = srn.priv->ec_seq[n+offset]; size_t you_size = you->atomics[old_ns].size(); @@ -1988,7 +1993,7 @@ void print_update(vw& all, searn& srn) //cdbg << "copying " << you_size << " features" << endl; me->sum_feat_sq[neighbor_namespace] += you->sum_feat_sq[old_ns]; me->total_sum_feat_sq += you->sum_feat_sq[old_ns]; - me->num_features += you_size; + me->num_features += you_size; } } else if ((n + offset == -1) || (n + offset == (int32_t)srn.priv->ec_seq.size())) { // handle <s> and </s> size_t bias = constant * ((n + offset < 0) ? 2 : 3); @@ -2036,7 +2041,7 @@ void print_update(vw& all, searn& srn) cdbg << "del n=" << me->num_features; size_t total_size = 0; float total_sfs = 0.; - + for (int32_t*enc=srn.priv->neighbor_features.begin; enc!=srn.priv->neighbor_features.end; ++enc) { int32_t offset = (*enc) >> 24; size_t old_ns = (*enc) & 0xFF; @@ -2075,10 +2080,10 @@ void print_update(vw& all, searn& srn) free(ad->space); free(ad->feature); } - + me->audit_features[neighbor_namespace].end -= total_size; } - + me->sum_feat_sq[neighbor_namespace] -= total_sfs; me->total_sum_feat_sq -= total_sfs; me->num_features -= total_size; @@ -2107,13 +2112,13 @@ void print_update(vw& all, searn& srn) priv->learn_losses.erase(); COST_SENSITIVE::label aset; - + // TODO: prune training steps ala get_training_timesteps cdbg << "t=" << me.t << ", labelset_size=" << me.num_actions << endl; - for (size_t aid=0; aid<me.num_actions; aid++) { + for (uint32_t aid=0; aid<me.num_actions; aid++) { COST_SENSITIVE::wclass my_class = { 0., aid+1, 0., 0. }; // TODO: make this valid for LDF aset.costs.push_back( my_class ); - + if (srn.priv->auto_history) clear_rollout_actions(srn); @@ -2122,8 +2127,8 @@ void print_update(vw& all, searn& srn) beam_hyp *h = &me; for (size_t t=0; t<me.t; t++) { assert(h != NULL); - priv->train_action[me.t - t - 1] = h->action_taken+1; - priv->train_action_ids[me.t - t - 1] = h->action_taken+1; // TODO: make this valid for ldf + priv->train_action[me.t - t - 1] = (uint32_t)h->action_taken+1; + priv->train_action_ids[me.t - t - 1] = (uint32_t)h->action_taken+1; // TODO: make this valid for ldf cdbg << "set train_action[" << (me.t-t-1) << "] = " << h->action_taken+1 << endl; h = h->parent; } @@ -2133,7 +2138,7 @@ void print_update(vw& all, searn& srn) cdbg << "train_action.size = " << priv->train_action.size() << endl; for (size_t t=0; t<me.t; t++) priv->rollout_action.push_back(priv->train_action[t]); - + priv->t = 0; priv->learn_a = (uint32_t)aid+1; // TODO: make this LDF compatible (by remembering what the actions were!) 
priv->loss_last_step = 0; @@ -2157,7 +2162,7 @@ void print_update(vw& all, searn& srn) if (!priv->examples_dont_change) { cdbg << "deleting labels for " << priv->learn_example_len << " learn_example_copy items" << endl; - for (size_t n=0; n<priv->learn_example_len; n++) + for (size_t n=0; n<priv->learn_example_len; n++) //cdbg << "free_example_data[" << n << "]: "; GD::print_audit_features(all, &priv->learn_example_copy[n]); if (priv->is_ldf) COST_SENSITIVE::cs_label.delete_label(priv->learn_example_copy[n].ld); else MULTICLASS::mc_label.delete_label(priv->learn_example_copy[n].ld); @@ -2185,8 +2190,7 @@ void print_update(vw& all, searn& srn) srn.priv->truth_string->str(""); srn.task->structured_predict(srn, srn.priv->ec_seq); } - - + if (srn.priv->beam_size == 0) train_single_example<is_learn>(all, srn, srn.priv->ec_seq); else { @@ -2221,7 +2225,7 @@ void print_update(vw& all, searn& srn) hyp_pool.delete_v(); srn.priv->test_loss = cached_test_loss; } - + del_neighbor_features(srn); if (srn.priv->ec_seq[0]->test_only) { @@ -2246,7 +2250,7 @@ void print_update(vw& all, searn& srn) srn.priv->base_learner = &base; bool is_real_example = true; - if (example_is_newline(ec) || srn.priv->ec_seq.size() >= all->p->ring_size - 2) { + if (example_is_newline(ec) || srn.priv->ec_seq.size() >= all->p->ring_size - 2) { if (srn.priv->ec_seq.size() >= all->p->ring_size - 2) { // give some wiggle room std::cerr << "warning: length of sequence at " << ec.example_counter << " exceeds ring size; breaking apart" << std::endl; } @@ -2254,13 +2258,13 @@ void print_update(vw& all, searn& srn) do_actual_learning<is_learn>(*all, srn); clear_seq(*all, srn); srn.priv->hit_new_pass = false; - + //VW::finish_example(*all, ec); is_real_example = false; } else { srn.priv->ec_seq.push_back(&ec); } - + if (is_real_example) { srn.priv->read_example_last_id = ec.example_counter; } @@ -2283,7 +2287,7 @@ void print_update(vw& all, searn& srn) //reset searn_trained_nb_policies in options_from_file so it is saved to regressor file later std::stringstream ss; ss << srn.priv->current_policy; - VW::cmd_string_replace_value(all->options_from_file,"--search_trained_nb_policies", ss.str()); + VW::cmd_string_replace_value(all->file_options,"--search_trained_nb_policies", ss.str()); } } @@ -2304,10 +2308,10 @@ void print_update(vw& all, searn& srn) std::stringstream ss2; ss1 << ((srn.priv->passes_since_new_policy == 0) ? 
srn.priv->current_policy : (srn.priv->current_policy+1)); //use cmd_string_replace_value in case we already loaded a predictor which had a value stored for --search_trained_nb_policies - VW::cmd_string_replace_value(all->options_from_file,"--search_trained_nb_policies", ss1.str()); + VW::cmd_string_replace_value(all->file_options,"--search_trained_nb_policies", ss1.str()); ss2 << srn.priv->total_number_of_policies; //use cmd_string_replace_value in case we already loaded a predictor which had a value stored for --search_total_nb_policies - VW::cmd_string_replace_value(all->options_from_file,"--search_total_nb_policies", ss2.str()); + VW::cmd_string_replace_value(all->file_options,"--search_total_nb_policies", ss2.str()); } } @@ -2371,7 +2375,7 @@ void print_update(vw& all, searn& srn) void searn_initialize(vw& all, searn& srn) { srn.priv->examples_dont_change = false; - + srn.priv->beta = 0.5; srn.priv->alpha = 1e-10f; srn.priv->allow_current_policy = false; @@ -2388,14 +2392,13 @@ void print_update(vw& all, searn& srn) srn.priv->beam_size = 0; // 0 ==> no beam srn.priv->kbest = 0; // 0 or 1 means just 1 best srn.priv->allow_unsafe_fast_forward = true; - srn.priv->neighbor_features_string = new string(); - + srn.priv->passes_per_policy = 1; //this should be set to the same value as --passes for dagger srn.task = NULL; srn.task_data = NULL; - + srn.priv->read_example_last_id = 0; srn.priv->passes_since_new_policy = 0; srn.priv->read_example_last_pass = 0; @@ -2403,14 +2406,14 @@ void print_update(vw& all, searn& srn) srn.priv->total_predictions_made = 0; srn.priv->hit_new_pass = false; srn.priv->subsample_timesteps = 0.; - + srn.priv->total_number_of_policies = 1; srn.priv->truth_string = new stringstream(); srn.priv->pred_string = new stringstream(); srn.priv->bad_string_stream = new stringstream(); srn.priv->bad_string_stream->clear(srn.priv->bad_string_stream->badbit); - + srn.priv->should_produce_string = false; srn.priv->printed_output_header = false; @@ -2422,7 +2425,7 @@ void print_update(vw& all, searn& srn) snapshot_item_result def_snapshot_result = { 0, -1.f }; srn.priv->snapshot_map = new snapmap(102341, def_snapshot_result, snapshot_item_ptr_eq, &srn.priv->snapshot_data); - + srn.priv->empty_example = alloc_examples(sizeof(COST_SENSITIVE::label), 1); COST_SENSITIVE::cs_label.default_label(srn.priv->empty_example->ld); srn.priv->empty_example->in_use = true; @@ -2439,7 +2442,7 @@ void print_update(vw& all, searn& srn) delete srn.priv->neighbor_features_string; srn.priv->neighbor_features.erase(); srn.priv->neighbor_features.delete_v(); - + if (srn.priv->rollout_all_actions) { // dst should be a COST_SENSITIVE::label* ((COST_SENSITIVE::label*)srn.priv->valid_labels)->costs.erase(); ((COST_SENSITIVE::label*)srn.priv->valid_labels)->costs.delete_v(); @@ -2447,7 +2450,7 @@ void print_update(vw& all, searn& srn) ((CB::label*)srn.priv->valid_labels)->costs.erase(); ((CB::label*)srn.priv->valid_labels)->costs.delete_v(); } - + if (srn.priv->rollout_all_actions) // labels are COST_SENSITIVE delete (COST_SENSITIVE::label*)srn.priv->valid_labels; else // labels are CB @@ -2460,7 +2463,7 @@ void print_update(vw& all, searn& srn) clear_snapshot(*all, srn, true); srn.priv->snapshot_data.delete_v(); - + for (size_t i=0; i<srn.priv->train_labels.size(); i++) { if (srn.priv->rollout_all_actions) { ((COST_SENSITIVE::label*)srn.priv->train_labels[i])->costs.erase(); @@ -2481,11 +2484,11 @@ void print_update(vw& all, searn& srn) dealloc_example(delete_label, srn.priv->learn_example_copy[n]); } - + 
if (srn.task->finish != NULL) { srn.task->finish(srn); } - + srn.priv->train_labels.delete_v(); srn.priv->train_action.delete_v(); srn.priv->train_action_ids.delete_v(); @@ -2511,79 +2514,48 @@ void print_update(vw& all, searn& srn) bool uint32_equal(uint32_t a, uint32_t b) { return a==b; } bool size_equal(size_t a, size_t b) { return a==b; } - template<class T> void check_option(T& ret, vw&all, po::variables_map& vm, po::variables_map& vm_file, const char* opt_name, bool default_to_cmdline, bool(*equal)(T,T), const char* mismatch_error_string, const char* required_error_string) { - if (vm_file.count(opt_name)) { // loaded from regressor file - ret = vm_file[opt_name].as<T>(); - if (vm.count(opt_name) && !equal(ret, vm[opt_name].as<T>())) { - if (default_to_cmdline) - ret = vm[opt_name].as<T>(); - std::cerr << mismatch_error_string << ret << endl; - } - } else if (vm.count(opt_name)) { + template<class T> void check_option(T& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, bool(*equal)(T,T), const char* mismatch_error_string, const char* required_error_string) { + if (vm.count(opt_name)) { ret = vm[opt_name].as<T>(); stringstream ss; ss << " --" << opt_name << " " << ret; - all.options_from_file.append(ss.str()); + all.file_options.append(ss.str()); } else if (strlen(required_error_string)>0) { std::cerr << required_error_string << endl; throw exception(); } - } + } - void check_option(bool& ret, vw&all, po::variables_map& vm, po::variables_map& vm_file, const char* opt_name, bool default_to_cmdline, const char* mismatch_error_string) { - if (vm_file.count(opt_name)) { // loaded from regressor file - ret = true; - if (!vm.count(opt_name)) { - if (default_to_cmdline) - ret = false; - std::cerr << mismatch_error_string << ret << endl; - } - } else if (vm.count(opt_name)) { + void check_option(bool& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, const char* mismatch_error_string) { + if (vm.count(opt_name)) { ret = true; stringstream ss; ss << " " << opt_name; - all.options_from_file.append(ss.str()); - } else { + all.file_options.append(ss.str()); + } else ret = false; - } - } - - void setup_searn_options(po::options_description& desc, vw&vw, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { - po::parsed_options parsed = po::command_line_parser(opts). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); - - po::parsed_options parsed_file = po::command_line_parser(vw.options_from_file_argc, vw.options_from_file_argv). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). 
- options(desc).allow_unregistered().run(); - po::store(parsed_file, vm_file); - po::notify(vm_file); } - - void handle_history_options(vw& vw, history_info &hinfo, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { - po::options_description desc("search options"); - desc.add_options() + void handle_history_options(vw& vw, history_info &hinfo, po::variables_map& vm) { + po::options_description history_options("history options"); + history_options.add_options() ("search_history", po::value<size_t>(), "length of history to use") ("search_features", po::value<size_t>(), "length of history to pair with observed features") ("search_bigrams", "use bigrams from history") ("search_bigram_features", "use bigrams from history paired with observed features"); - setup_searn_options(desc, vw, opts, vm, vm_file); - - check_option<size_t>(hinfo.length, vw, vm, vm_file, "search_history", false, size_equal, + vm = add_options(vw, history_options); + + check_option<size_t>(hinfo.length, vw, vm, "search_history", false, size_equal, "warning: you specified a different value for --search_history than the one loaded from regressor. proceeding with loaded value: ", ""); - - check_option<size_t>(hinfo.features, vw, vm, vm_file, "search_features", false, size_equal, + + check_option<size_t>(hinfo.features, vw, vm, "search_features", false, size_equal, "warning: you specified a different value for --search_features than the one loaded from regressor. proceeding with loaded value: ", ""); - - check_option (hinfo.bigrams, vw, vm, vm_file, "search_bigrams", false, + + check_option (hinfo.bigrams, vw, vm, "search_bigrams", false, "warning: you specified --search_bigrams but that wasn't loaded from regressor. proceeding with loaded value: "); - - check_option (hinfo.bigram_features, vw, vm, vm_file, "search_bigram_features", false, + + check_option (hinfo.bigram_features, vw, vm, "search_bigram_features", false, "warning: you specified --search_bigram_features but that wasn't loaded from regressor. proceeding with loaded value: "); } @@ -2603,12 +2575,12 @@ void print_update(vw& all, searn& srn) count++; } fclose(f); - + v_array<COST_SENSITIVE::label> allowed; for (size_t from=0; from<A; from++) { v_array<COST_SENSITIVE::wclass> costs; - + for (size_t to=0; to<A; to++) if (bg[from * (A+1) + to]) { COST_SENSITIVE::wclass c = { FLT_MAX, (uint32_t)to, 0., 0. }; @@ -2621,7 +2593,7 @@ void print_update(vw& all, searn& srn) free(bg); cerr << "read " << count << " allowed transitions from " << filename << endl; - + return allowed; } @@ -2654,25 +2626,26 @@ void print_update(vw& all, searn& srn) } int32_t enc = (posn << 24) | (ns & 0xFF); srn.priv->neighbor_features.push_back(enc); - + p = strtok(NULL, ","); } cmd.erase(); cmd.delete_v(); - + delete cstr; } - learner* setup(vw&all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw&all, po::variables_map& vm) { searn* srn = (searn*)calloc_or_die(1,sizeof(searn)); srn->priv = new searn_private(); srn->priv->all = &all; + srn->all = &all; searn_initialize(all, *srn); - po::options_description desc("Searn options"); - desc.add_options() + po::options_description searn_opts("Searn options"); + searn_opts.add_options() ("search_task", po::value<string>(), "the search task") ("search_interpolation", po::value<string>(), "at what level should interpolation happen? [*data|policy]") ("search_rollout", po::value<string>(), "how should rollouts be executed? 
[*policy|oracle|none]") @@ -2684,7 +2657,9 @@ void print_update(vw& all, searn& srn) ("search_alpha", po::value<float>(), "annealed beta = 1-(1-alpha)^t (only valid for search_interpolation=data) [def=1e-10]") ("search_total_nb_policies", po::value<size_t>(), "if we are going to train the policies through multiple separate calls to vw, we need to specify this parameter and tell vw how many policies are eventually going to be trained") - + + ("search_trained_nb_policies", po::value<size_t>(), "the number of trained policies in a file") + ("search_allowed_transitions",po::value<string>(),"read file of allowed transitions [def: all transitions are allowed]") ("search_subsample_time", po::value<float>(), "instead of training at all timesteps, use a subset. if value in (0,1), train on a random v%. if v>=1, train on precisely v steps per example") ("search_neighbor_features", po::value<string>(), "copy features from neighboring lines. argument looks like: '-1:a,+2' meaning copy previous line namespace a and next next line from namespace _unnamed_, where ',' separates them") @@ -2695,55 +2670,31 @@ void print_update(vw& all, searn& srn) ("search_no_snapshot", "turn off snapshotting capabilities") ("search_no_fastforward", "turn off fastforwarding (note: fastforwarding requires snapshotting)"); - - // removed options: - //("search_allow_current_policy", "allow searn labeling to use the current policy") - //("search_rollout_oracle", "allow searn/dagger to do rollouts with the oracle when estimating cost-to-go") - //("search_as_dagger", po::value<float>(), "sets options to make searn operate as dagger. parameter is the sliding autonomy rate (rate at which beta tends to 1).") - //("search_exploration_temperature", po::value<float>(), "if <0, always choose policy action (default); if T>=0, choose according to e^{-prediction / T} -- done to avoid overfitting") - - po::options_description add_desc_file("Searn options only available in regressor file"); - add_desc_file.add_options()("search_trained_nb_policies", po::value<size_t>(), "the number of trained policies in the regressor file"); - - po::options_description desc_file; - desc_file.add(desc).add(add_desc_file); - - po::parsed_options parsed = po::command_line_parser(opts). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); - - po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc, all.options_from_file_argv). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc_file).allow_unregistered().run(); - po::store(parsed_file, vm_file); - po::notify(vm_file); - + vm = add_options(all, searn_opts); + std::string task_string; std::string interpolation_string = "data"; std::string rollout_string = "policy"; std::string trajectory_string = "policy"; - check_option<string>(task_string, all, vm, vm_file, "search_task", false, string_equal, + check_option<string>(task_string, all, vm, "search_task", false, string_equal, "warning: specified --search_task different than the one loaded from regressor. 
using loaded value of: ", "error: you must specify a task using --search_task"); - check_option<string>(interpolation_string, all, vm, vm_file, "search_interpolation", false, string_equal, + check_option<string>(interpolation_string, all, vm, "search_interpolation", false, string_equal, "warning: specified --search_interpolation different than the one loaded from regressor. using loaded value of: ", ""); - check_option<string>(rollout_string, all, vm, vm_file, "search_rollout", false, string_equal, + check_option<string>(rollout_string, all, vm, "search_rollout", false, string_equal, "warning: specified --search_rollout different than the one loaded from regressor. using loaded value of: ", ""); - check_option<string>(trajectory_string, all, vm, vm_file, "search_trajectory", false, string_equal, + check_option<string>(trajectory_string, all, vm, "search_trajectory", false, string_equal, "warning: specified --search_trajectory different than the one loaded from regressor. using loaded value of: ", ""); - + if (vm.count("search_passes_per_policy")) srn->priv->passes_per_policy = vm["search_passes_per_policy"].as<size_t>(); if (vm.count("search_beta")) srn->priv->beta = vm["search_beta" ].as<float>(); if (vm.count("search_alpha")) srn->priv->alpha = vm["search_alpha" ].as<float>(); if (vm.count("search_subsample_time")) srn->priv->subsample_timesteps = vm["search_subsample_time"].as<float>(); - - check_option<string>(*srn->priv->neighbor_features_string, all, vm, vm_file, "search_neighbor_features", false, string_equal, + + check_option<string>(*srn->priv->neighbor_features_string, all, vm, "search_neighbor_features", false, string_equal, "warning: you specified a different feature structure with --search_neighbor_features than the one loaded from predictor. using loaded value of: ", ""); parse_neighbor_features(*srn); @@ -2785,11 +2736,11 @@ void print_update(vw& all, searn& srn) throw exception(); } - //check_option<float >(srn->priv->exploration_temperature, all, vm, vm_file, "search_exploration_temperature", false, float_equal, + //check_option<float >(srn->priv->exploration_temperature, all, vm, "search_exploration_temperature", false, float_equal, // "warning: you specified a different value through --search_exploration_temperature than the one loaded from predictor. using loaded value of: ", ""); - check_option<size_t>(srn->priv->A, all, vm, vm_file, "search", false, size_equal, + check_option<size_t>(srn->priv->A, all, vm, "search", false, size_equal, "warning: you specified a different number of actions through --search than the one loaded from predictor. using loaded value of: ", ""); - + //if (vm.count("search_allow_current_policy")) srn->priv->allow_current_policy = true; //if (vm.count("search_rollout_oracle")) srn->priv->rollout_oracle = true; @@ -2807,24 +2758,19 @@ void print_update(vw& all, searn& srn) cerr << "error: kbest must be at least equal to beam_size" << endl; throw exception(); } - + //check if the base learner is contextual bandit, in which case, we dont rollout all actions. 
- if (vm.count("cb") || vm_file.count("cb")) { + if (vm.count("cb")) { srn->priv->rollout_all_actions = false; srn->priv->valid_labels = new CB::label(); } else { srn->priv->rollout_all_actions = true; srn->priv->valid_labels = new COST_SENSITIVE::label(); } - + //if we loaded a regressor with -i option, --search_trained_nb_policies contains the number of trained policies in the file // and --search_total_nb_policies contains the total number of policies in the file - if ( vm_file.count("search_total_nb_policies") ) { - srn->priv->current_policy = (uint32_t)vm_file["search_trained_nb_policies"].as<size_t>(); - srn->priv->total_number_of_policies = (uint32_t)vm_file["search_total_nb_policies"].as<size_t>(); - if (vm.count("search_total_nb_policies") && (uint32_t)vm["search_total_nb_policies"].as<size_t>() != srn->priv->total_number_of_policies) - std::cerr << "warning: --search_total_nb_policies doesn't match the total number of policies stored in initial predictor. Using loaded value of: " << srn->priv->total_number_of_policies << endl; - } else if (vm.count("search_total_nb_policies")) + if (vm.count("search_total_nb_policies")) srn->priv->total_number_of_policies = (uint32_t)vm["search_total_nb_policies"].as<size_t>(); ensure_param(srn->priv->beta , 0.0, 1.0, 0.5, "warning: search_beta must be in (0,1); resetting to 0.5"); @@ -2832,11 +2778,11 @@ void print_update(vw& all, searn& srn) //compute total number of policies we will have at end of training // we add current_policy for cases where we start from an initial set of policies loaded through -i option - uint32_t tmp_number_of_policies = srn->priv->current_policy; + uint32_t tmp_number_of_policies = srn->priv->current_policy; if( all.training ) tmp_number_of_policies += (int)ceil(((float)all.numpasses) / ((float)srn->priv->passes_per_policy)); - //the user might have specified the number of policies that will eventually be trained through multiple vw calls, + //the user might have specified the number of policies that will eventually be trained through multiple vw calls, //so only set total_number_of_policies to computed value if it is larger cdbg << "current_policy=" << srn->priv->current_policy << " tmp_number_of_policies=" << tmp_number_of_policies << " total_number_of_policies=" << srn->priv->total_number_of_policies << endl; if( tmp_number_of_policies > srn->priv->total_number_of_policies ) { @@ -2852,8 +2798,8 @@ void print_update(vw& all, searn& srn) srn->priv->current_policy--; std::stringstream ss1, ss2; - ss1 << srn->priv->current_policy; VW::cmd_string_replace_value(all.options_from_file,"--search_trained_nb_policies", ss1.str()); - ss2 << srn->priv->total_number_of_policies; VW::cmd_string_replace_value(all.options_from_file,"--search_total_nb_policies", ss2.str()); + ss1 << srn->priv->current_policy; VW::cmd_string_replace_value(all.file_options,"--search_trained_nb_policies", ss1.str()); + ss2 << srn->priv->total_number_of_policies; VW::cmd_string_replace_value(all.file_options,"--search_total_nb_policies", ss2.str()); cdbg << "search current_policy = " << srn->priv->current_policy << " total_number_of_policies = " << srn->priv->total_number_of_policies << endl; @@ -2870,20 +2816,20 @@ void print_update(vw& all, searn& srn) all.p->emptylines_separate_examples = true; // default to OAA labels unless the task wants to override this! 
- all.p->lp = MULTICLASS::mc_label; - srn->task->initialize(*srn, srn->priv->A, opts, vm, vm_file); + all.p->lp = MULTICLASS::mc_label; + srn->task->initialize(*srn, srn->priv->A, vm); if (vm.count("search_allowed_transitions")) read_allowed_transitions((uint32_t)srn->priv->A, vm["search_allowed_transitions"].as<string>().c_str()); - + // set up auto-history if they want it if (srn->priv->auto_history) { default_info(&srn->priv->hinfo); - handle_history_options(all, srn->priv->hinfo, opts, vm, vm_file); - + handle_history_options(all, srn->priv->hinfo, vm); + if (srn->priv->hinfo.length < srn->priv->hinfo.features) srn->priv->hinfo.length = srn->priv->hinfo.features; - + if (srn->priv->hinfo.length == 0) srn->priv->auto_history = false; } else { @@ -2914,38 +2860,38 @@ void print_update(vw& all, searn& srn) l->set_end_examples<searn,end_examples>(); l->set_finish<searn,searn_finish>(); l->set_end_pass<searn,end_pass>(); - + return l; } // the interface: - uint32_t searn::predict(example* ecs, size_t ec_len, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed) // for LDF + uint32_t searn::predictLDF(example* ecs, size_t ec_len, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed) // for LDF { return searn_predict(this->priv, ecs, ec_len, yallowed, ystar, false); } - uint32_t searn::predict(example* ecs, size_t ec_len, uint32_t one_ystar, v_array<uint32_t>* yallowed) // for LDF + uint32_t searn::predictLDF(example* ecs, size_t ec_len, uint32_t one_ystar, v_array<uint32_t>* yallowed) // for LDF { if (one_ystar == (uint32_t)-1) // test example return searn_predict(this->priv, ecs, ec_len, yallowed, NULL, false); else return searn_predict(this->priv, ecs, ec_len, yallowed, (v_array<uint32_t>*)&one_ystar, true); } - + uint32_t searn::predict(example* ec, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed) // for not LDF { return searn_predict(this->priv, ec, 0, yallowed, ystar, false); } - + uint32_t searn::predict(example* ec, uint32_t one_ystar, v_array<uint32_t>* yallowed) // for not LDF { if (one_ystar == (uint32_t)-1) // test example return searn_predict(this->priv, ec, 0, yallowed, NULL, false); else return searn_predict(this->priv, ec, 0, yallowed, (v_array<uint32_t>*)&one_ystar, true); } - + void searn::loss(float incr_loss, size_t predictions_since_last) { searn_declare_loss(this->priv, predictions_since_last, incr_loss); } - + void searn::snapshot(size_t index, size_t tag, void* data_ptr, size_t sizeof_data, bool used_for_prediction) { searn_snapshot(this->priv, index, tag, data_ptr, sizeof_data, used_for_prediction); } - + stringstream& searn::output() { return searn_output_streamstream(this->priv); } @@ -2975,26 +2921,26 @@ void print_update(vw& all, searn& srn) NON-LDF BEAM 100 ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task sequence --search 5 -i z2.model -p out --search_beam 100 --search_kbest 100 - - + + LDF TRAIN ./vw -k -c -d z2 --passes 50 --invariant --search_as_dagger 1e-8 --search_task sequence_demoldf --search 5 --holdout_off -f z2.model --audit --csoaa_ldf m LDF NO BEAM - ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task sequence_demoldf --search 5 -i z2.model --audit --csoaa_ldf m -p out + ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task sequence_demoldf --search 5 -i z2.model --audit --csoaa_ldf m -p out LDF BEAM 1 - + ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task sequence_demoldf --search 5 -i z2.model --audit --csoaa_ldf m -p out --search_beam 1 LDF BEAM 100 - + ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task 
sequence_demoldf --search 5 -i z2.model --audit --csoaa_ldf m -p out --search_beam 100 --search_kbest 100 === SPAN === - + TRAIN ./vw -k -c -d seqtest --passes 50 --invariant --search_as_dagger 1e-8 --search_task sequencespan --search 7 --holdout_off -f seqtest.model @@ -3004,11 +2950,11 @@ void print_update(vw& all, searn& srn) ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out BEAM 1 - + ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_beam 1 BEAM 100 - + ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_beam 100 --search_kbest 100 @@ -3023,14 +2969,14 @@ void print_update(vw& all, searn& srn) ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_bilou BEAM 1 - + ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_bilou --search_beam 1 BEAM 100 - + ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_beam 100 --search_kbest 100 --search_bilou - - + + */ @@ -3075,4 +3021,3 @@ next slow commit ab38c2fedee0ba7ca86bd4cf4d145a27d4a6f5d5 */ - diff --git a/vowpalwabbit/searn.h b/vowpalwabbit/searn.h index 1caa8f7e..3c6031c2 100644 --- a/vowpalwabbit/searn.h +++ b/vowpalwabbit/searn.h @@ -48,20 +48,21 @@ namespace Searn { uint32_t predict(example* ec, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed=NULL); // if there are multiple oracle actions // for making predictions in LDF mode: - uint32_t predict(example* ecs, size_t ec_len, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed=NULL); // if there are multiple oracle actions - uint32_t predict(example* ecs, size_t ec_len, uint32_t one_ystar, v_array<uint32_t>* yallowed=NULL); // if there is a single oracle action + uint32_t predictLDF(example* ecs, size_t ec_len, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed=NULL); // if there are multiple oracle actions + uint32_t predictLDF(example* ecs, size_t ec_len, uint32_t one_ystar, v_array<uint32_t>* yallowed=NULL); // if there is a single oracle action // for generating output (check to see if output().good() before attempting to write!)
stringstream& output(); // internal data searn_task* task; + vw* all; searn_private* priv; void* task_data; }; - template<class T> void check_option(T& ret, vw&all, po::variables_map& vm, po::variables_map& vm_file, const char* opt_name, bool default_to_cmdline, bool(*equal)(T,T), const char* mismatch_error_string, const char* required_error_string); - void check_option(bool& ret, vw&all, po::variables_map& vm, po::variables_map& vm_file, const char* opt_name, bool default_to_cmdline, const char* mismatch_error_string); + template<class T> void check_option(T& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, bool(*equal)(T,T), const char* mismatch_error_string, const char* required_error_string); + void check_option(bool& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, const char* mismatch_error_string); bool string_equal(string a, string b); bool float_equal(float a, float b); bool uint32_equal(uint32_t a, uint32_t b); @@ -69,12 +70,12 @@ namespace Searn { struct searn_task { const char* task_name; - void (*initialize)(searn&,size_t&,std::vector<std::string>&, po::variables_map&, po::variables_map&); + void (*initialize)(searn&,size_t&, po::variables_map&); void (*finish)(searn&); void (*structured_predict)(searn&, std::vector<example*>); }; - LEARNER::learner* setup(vw&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + LEARNER::learner* setup(vw&, po::variables_map&); void searn_finish(void*); void searn_drive(void*); void searn_learn(void*,example*); diff --git a/vowpalwabbit/searn_sequencetask.cc b/vowpalwabbit/searn_sequencetask.cc index 7c761fb5..df962943 100644 --- a/vowpalwabbit/searn_sequencetask.cc +++ b/vowpalwabbit/searn_sequencetask.cc @@ -18,7 +18,7 @@ namespace SequenceSpanTask { Searn::searn_task task = { "sequencespan", namespace SequenceTask { using namespace Searn; - void initialize(searn& srn, size_t& num_actions, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { + void initialize(searn& srn, size_t& num_actions, po::variables_map& vm) { srn.set_options( AUTO_HISTORY | // automatically add history features to our examples, please AUTO_HAMMING_LOSS | // please just use hamming loss on individual predictions -- we won't declare loss EXAMPLES_DONT_CHANGE ); // we don't do any internal example munging @@ -44,25 +44,28 @@ namespace ArgmaxTask { struct task_data { float false_negative_cost; + float negative_weight; bool predict_max; }; - void initialize(searn& srn, size_t& num_actions, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + void initialize(searn& srn, size_t& num_actions, po::variables_map& vm) { task_data* my_task_data = new task_data(); - po::options_description desc("search sequencespan options"); - desc.add_options()("cost", po::value<float>(&(my_task_data->false_negative_cost))->default_value(10.0), "False Negative Cost"); - desc.add_options()("max", po::value<bool>(&(my_task_data->predict_max))->default_value(false), "Disable structure: just predict the max"); + po::options_description argmax_opts("argmax options"); + argmax_opts.add_options() + ("cost", po::value<float>(&(my_task_data->false_negative_cost))->default_value(10.0), "False Negative Cost") + ("negative_weight", po::value<float>(&(my_task_data->negative_weight))->default_value(1), "Relative weight of negative examples") + ("max", "Disable structure: just predict the max"); - po::parsed_options parsed = po::command_line_parser(opts). 
- style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); + vm = add_options(*srn.all, argmax_opts); + if (vm.count("max")) + my_task_data->predict_max = true; + else + my_task_data->predict_max = false; + srn.set_task_data(my_task_data); if (my_task_data->predict_max) @@ -97,9 +100,9 @@ namespace ArgmaxTask { } float loss = 0.; if (max_label > max_prediction) - loss = my_task_data->false_negative_cost; + loss = my_task_data->false_negative_cost / my_task_data->negative_weight; else if (max_prediction > max_label) - loss = 1.; + loss = 1.; srn.loss(loss); if (srn.output().good()) @@ -167,20 +170,15 @@ namespace SequenceSpanTask { v_array<uint32_t> only_two_allowed; // used for BILOU encoding }; - void initialize(searn& srn, size_t& num_actions, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { + void initialize(searn& srn, size_t& num_actions, po::variables_map& vm) { task_data * my_task_data = new task_data(); - po::options_description desc("search sequencespan options"); - desc.add_options()("search_span_bilou", "switch to (internal) BILOU encoding instead of BIO encoding"); + po::options_description sspan_opts("search sequencespan options"); + sspan_opts.add_options()("search_span_bilou", "switch to (internal) BILOU encoding instead of BIO encoding"); - po::parsed_options parsed = po::command_line_parser(opts). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); + vm = add_options(*srn.all, sspan_opts); - if (vm.count("search_span_bilou") || vm_file.count("search_span_bilou")) { + if (vm.count("search_span_bilou")) { cerr << "switching to BILOU encoding for sequence span labeling" << endl; my_task_data->encoding = BILOU; num_actions = num_actions * 2 - 1; @@ -268,7 +266,7 @@ namespace SequenceTask_DemoLDF { // this is just to debug/show off how to do LD size_t num_actions; }; - void initialize(searn& srn, size_t& num_actions, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { + void initialize(searn& srn, size_t& num_actions, po::variables_map& vm) { COST_SENSITIVE::wclass default_wclass = { 0., 0, 0., 0. }; example* ldf_examples = alloc_examples(sizeof(COST_SENSITIVE::label), num_actions); @@ -315,7 +313,7 @@ namespace SequenceTask_DemoLDF { // this is just to debug/show off how to do LD lab->costs[0].wap_value = 0.; } - size_t pred_id = srn.predict(data->ldf_examples, data->num_actions, MULTICLASS::get_example_label(ec[i]) - 1); + size_t pred_id = srn.predictLDF(data->ldf_examples, data->num_actions, MULTICLASS::get_example_label(ec[i]) - 1); size_t prediction = pred_id + 1; // or ldf_examples[pred_id]->ld.costs[0].weight_index if (srn.output().good()) diff --git a/vowpalwabbit/searn_sequencetask.h b/vowpalwabbit/searn_sequencetask.h index e7a8a703..92621e68 100644 --- a/vowpalwabbit/searn_sequencetask.h +++ b/vowpalwabbit/searn_sequencetask.h @@ -9,28 +9,28 @@ license as described in the file LICENSE. 
#include "searn.h" namespace SequenceTask { - void initialize(Searn::searn&, size_t&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + void initialize(Searn::searn&, size_t&, po::variables_map&); void finish(Searn::searn&); void structured_predict(Searn::searn&, vector<example*>); extern Searn::searn_task task; } namespace ArgmaxTask { - void initialize(Searn::searn&, size_t&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + void initialize(Searn::searn&, size_t&, po::variables_map&); void finish(Searn::searn&); void structured_predict(Searn::searn&, vector<example*>); extern Searn::searn_task task; } namespace SequenceSpanTask { - void initialize(Searn::searn&, size_t&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + void initialize(Searn::searn&, size_t&, po::variables_map&); void finish(Searn::searn&); void structured_predict(Searn::searn&, vector<example*>); extern Searn::searn_task task; } namespace SequenceTask_DemoLDF { - void initialize(Searn::searn&, size_t&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + void initialize(Searn::searn&, size_t&, po::variables_map&); void finish(Searn::searn&); void structured_predict(Searn::searn&, vector<example*>); void update_example_indicies(bool audit, example* ec, uint32_t mult_amount, uint32_t plus_amount); diff --git a/vowpalwabbit/sparse_dense.cc b/vowpalwabbit/sparse_dense.cc deleted file mode 100644 index 417cae28..00000000 --- a/vowpalwabbit/sparse_dense.cc +++ /dev/null @@ -1,17 +0,0 @@ -/* -Copyright (c) by respective owners including Yahoo!, Microsoft, and -individual contributors. All rights reserved. Released under a BSD (revised) -license as described in the file LICENSE. - */ - -#include "sparse_dense.h" -#include "constant.h" -#include <math.h> - - -void sd_offset_update(weight* weights, size_t mask, feature* begin, feature* end, size_t offset, float update, float regularization) -{ - for (feature* f = begin; f!= end; f++) - weights[(f->weight_index + offset) & mask] += update * f->x - regularization * weights[(f->weight_index + offset) & mask]; -} - diff --git a/vowpalwabbit/sparse_dense.h b/vowpalwabbit/sparse_dense.h deleted file mode 100644 index 1d6140e0..00000000 --- a/vowpalwabbit/sparse_dense.h +++ /dev/null @@ -1,98 +0,0 @@ -/* -Copyright (c) by respective owners including Yahoo!, Microsoft, and -individual contributors. All rights reserved. Released under a BSD -license as described in the file LICENSE. - */ -#ifndef SPARSE_DENSE_H -#define SPARSE_DENSE_H - -#include <math.h> -#include "parse_example.h" -#include "constant.h" - -template<class R> -struct predict_data { - float prediction; - R extra; -}; - -inline float sign(float w){ if (w < 0.) return -1.; else return 1.;} - -inline float trunc_weight(const float w, const float gravity){ - return (gravity < fabsf(w)) ? w - sign(w) * gravity : 0.f; -} - -inline void vec_add(float& p, const float fx, float& fw) { - p += fw * fx; -} - -inline void vec_add_trunc(predict_data<float>& p, const float fx, float& fw) { - p.prediction += trunc_weight(fw, p.extra) * fx; -} - -template <bool adaptive, size_t normalized_idx> -inline void vec_add_rescale(float& p, const float fx, float& fw) { - weight* w = &fw; - float x_abs = fabs(fx); - if( x_abs > w[normalized_idx] ) {// new scale discovered - if( w[normalized_idx] > 0. ) {//If the normalizer is > 0 then rescale the weight so it's as if the new scale was the old scale. - float rescale = (w[normalized_idx]/x_abs); - w[0] *= (adaptive ? 
rescale : rescale*rescale); - } - w[normalized_idx] = x_abs; - } - p += fw * fx; -} - -template <bool adaptive, size_t normalized_idx> -inline void vec_add_trunc_rescale(predict_data<float>& p, const float fx, float& fw) { - weight* w = &fw; - float x_abs = fabs(fx); - if( x_abs > w[normalized_idx] ) { - if( w[normalized_idx] > 0. ) { - float rescale = (w[normalized_idx]/x_abs); - w[0] *= (adaptive ? rescale : rescale*rescale); - } - w[normalized_idx] = x_abs; - } - p.prediction += trunc_weight(fw, p.extra) * fx; -} - -template <bool adaptive, size_t normalized_idx> -inline void vec_add_rescale_general(predict_data<float>& p, const float fx, float& fw) { - weight* w = &fw; - float x_abs = fabs(fx); - float power_t_norm = 1.f - (adaptive ? p.extra : 0.f); - if( x_abs > w[normalized_idx] ) { - if( w[normalized_idx] > 0. ) { - float rescale = (w[normalized_idx]/x_abs); - w[0] *= powf(rescale*rescale,power_t_norm); - } - w[normalized_idx] = x_abs; - } - p.prediction += fw * fx; -} - -struct gnp { - float gravity; - float power_t; -}; - -template <bool adaptive, size_t normalized_idx> -inline void vec_add_trunc_rescale_general(predict_data<gnp>& p, const float fx, float& fw) { - weight* w = &fw; - float x_abs = fabs(fx); - float power_t_norm = 1.f - (adaptive ? p.extra.power_t : 0.f); - if( x_abs > w[normalized_idx] ) { - if( w[normalized_idx] > 0. ) { - float rescale = (w[normalized_idx]/x_abs); - w[0] *= powf(rescale*rescale,power_t_norm); - } - w[normalized_idx] = x_abs; - } - p.prediction += trunc_weight(fw, p.extra.gravity) * fx; -} - -void sd_offset_update(weight* weights, size_t mask, feature* begin, feature* end, size_t offset, float update, float regularization); - -#endif diff --git a/vowpalwabbit/topk.cc b/vowpalwabbit/topk.cc index e0a50393..89432fa2 100644 --- a/vowpalwabbit/topk.cc +++ b/vowpalwabbit/topk.cc @@ -109,7 +109,7 @@ namespace TOPK { VW::finish_example(all, &ec); } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { topk* data = (topk*)calloc_or_die(1, sizeof(topk)); diff --git a/vowpalwabbit/topk.h b/vowpalwabbit/topk.h index b1b8adf3..e29e4a09 100644 --- a/vowpalwabbit/topk.h +++ b/vowpalwabbit/topk.h @@ -16,7 +16,7 @@ license as described in the file LICENSE. 
namespace TOPK { - LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/unique_sort.cc b/vowpalwabbit/unique_sort.cc index da88b226..8022d1ec 100644 --- a/vowpalwabbit/unique_sort.cc +++ b/vowpalwabbit/unique_sort.cc @@ -41,21 +41,29 @@ void unique_audit_features(v_array<audit_data> &features) features.end = ++last; } -void unique_sort_features(bool audit, example* ae) +void unique_sort_features(bool audit, uint32_t parse_mask, example* ae) { ae->sorted=true; for (unsigned char* b = ae->indices.begin; b != ae->indices.end; b++) { - qsort(ae->atomics[*b].begin, ae->atomics[*b].size(), sizeof(feature), + v_array<feature> features = ae->atomics[*b]; + + for (size_t i = 0; i < features.size(); i++) + features[i].weight_index &= parse_mask; + qsort(features.begin, features.size(), sizeof(feature), order_features); unique_features(ae->atomics[*b]); if (audit) { - qsort(ae->audit_features[*b].begin, ae->audit_features[*b].size(), sizeof(audit_data), + v_array<audit_data> afeatures = ae->audit_features[*b]; + + for (size_t i = 0; i < ae->atomics[*b].size(); i++) + afeatures[i].weight_index &= parse_mask; + + qsort(afeatures.begin, afeatures.size(), sizeof(audit_data), order_audit_features); - unique_audit_features(ae->audit_features[*b]); + unique_audit_features(afeatures); } } } - diff --git a/vowpalwabbit/unique_sort.h b/vowpalwabbit/unique_sort.h index 5b3b91b7..3e5e0d05 100644 --- a/vowpalwabbit/unique_sort.h +++ b/vowpalwabbit/unique_sort.h @@ -8,6 +8,6 @@ license as described in the file LICENSE. #include "parser.h" #include "example.h" -void unique_sort_features(bool audit, example* ae); +void unique_sort_features(bool audit, uint32_t parse_mask, example* ae); #endif diff --git a/vowpalwabbit/vw_static.vcxproj b/vowpalwabbit/vw_static.vcxproj index 9ea2895b..42656934 100644 --- a/vowpalwabbit/vw_static.vcxproj +++ b/vowpalwabbit/vw_static.vcxproj @@ -281,7 +281,6 @@ <ClInclude Include="searn_sequencetask.h" />
<ClInclude Include="sender.h" />
<ClInclude Include="simple_label.h" />
- <ClInclude Include="sparse_dense.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="topk.h" />
<ClInclude Include="unique_sort.h" />
@@ -333,7 +332,6 @@ <ClCompile Include="searn_sequencetask.cc" />
<ClCompile Include="sender.cc" />
<ClCompile Include="simple_label.cc" />
- <ClCompile Include="sparse_dense.cc" />
<ClCompile Include="topk.cc" />
<ClCompile Include="unique_sort.cc" />
<ClCompile Include="wap.cc" />
diff --git a/vowpalwabbit/wap.cc b/vowpalwabbit/wap.cc index 35d796c5..1a0cef0a 100644 --- a/vowpalwabbit/wap.cc +++ b/vowpalwabbit/wap.cc @@ -234,25 +234,19 @@ namespace WAP { VW::finish_example(all, &ec); } - learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { wap* w=(wap*)calloc_or_die(1,sizeof(wap)); w->all = &all; uint32_t nb_actions = 0; - if( vm_file.count("wap") ) { //if loaded options from regressor - nb_actions = (uint32_t)vm_file["wap"].as<size_t>(); - if( vm.count("wap") && (uint32_t)vm["wap"].as<size_t>() != nb_actions ) - std::cerr << "warning: you specified a different number of actions through --wap than the one loaded from regressor. Pursuing with loaded value of: " << nb_actions << endl; - } - else { - nb_actions = (uint32_t)vm["wap"].as<size_t>(); - //append wap with nb_actions to options_from_file so it is saved to regressor later - std::stringstream ss; - ss << " --wap " << nb_actions; - all.options_from_file.append(ss.str()); - } + nb_actions = (uint32_t)vm["wap"].as<size_t>(); + + //append wap with nb_actions to file_options so it is saved to regressor later + std::stringstream ss; + ss << " --wap " << nb_actions; + all.file_options.append(ss.str()); all.p->lp = COST_SENSITIVE::cs_label; all.sd->k = (uint32_t)nb_actions; diff --git a/vowpalwabbit/wap.h b/vowpalwabbit/wap.h index f1254858..decd9e03 100644 --- a/vowpalwabbit/wap.h +++ b/vowpalwabbit/wap.h @@ -10,7 +10,7 @@ license as described in the file LICENSE. #include "parse_args.h" namespace WAP { - LEARNER::learner* setup(vw&, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw&, po::variables_map& vm); } #endif
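The pattern running through this whole commit is visible in miniature in the wap.cc hunk just above: instead of consulting a second variables_map (vm_file) populated from the saved regressor, setup code appends any option that must survive a save/load round trip to all.file_options as a plain string ("--wap <k>"), and a single add_options call re-parses both sources into one vm. The real helper lives in VW's parse_args machinery; the sketch below only illustrates the round-trip idea with boost::program_options and a hypothetical fake_vw struct, not the actual implementation:

    #include <boost/program_options.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    namespace po = boost::program_options;

    // Hypothetical stand-in for the relevant slice of vw's global state.
    struct fake_vw { std::string file_options; };

    // Parse the live command line and the option string recovered from the
    // model file against ONE description, so callers consult a single vm.
    // boost::program_options keeps the first value stored for an option,
    // so here the live command line takes precedence over the file copy.
    po::variables_map parse_both(fake_vw& all, int argc, const char* const argv[],
                                 const po::options_description& desc) {
      po::variables_map vm;
      po::store(po::command_line_parser(argc, argv)
                    .options(desc).allow_unregistered().run(), vm);
      std::vector<std::string> from_file = po::split_unix(all.file_options);
      po::store(po::command_line_parser(from_file)
                    .options(desc).allow_unregistered().run(), vm);
      po::notify(vm);
      return vm;
    }

    int main(int argc, char* argv[]) {
      fake_vw all;
      all.file_options = " --wap 4";   // as WAP::setup appends at save time

      po::options_description desc("wap options");
      desc.add_options()("wap", po::value<size_t>(), "number of actions");

      po::variables_map vm = parse_both(all, argc, argv, desc);
      if (vm.count("wap"))
        std::cout << "wap actions: " << vm["wap"].as<size_t>() << "\n";
      return 0;
    }

Note that the precedence warnings ("using loaded value of: ...") are handled one level up by check_option, whose new single-map signature appears in the searn.h hunk earlier in this diff.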