commit 99f9974e4002e0aaef22741f656e5bc9baf9dc40 (patch)
Author:    Hal Daume III <me@hal3.name>  2014-05-25 00:56:42 +0400
Committer: Hal Daume III <me@hal3.name>  2014-05-25 00:56:42 +0400
tree       2e26b0c90c94b852c229653ca302f10cd3716b67 /vowpalwabbit
parent     7703b4bf0cece9bdccc662e8680725d333d408c3 (diff)
merged john's changes
Diffstat (limited to 'vowpalwabbit')
56 files changed, 1149 insertions, 1599 deletions
diff --git a/vowpalwabbit/Makefile.am b/vowpalwabbit/Makefile.am
index 34b150f5..cc2e1a71 100644
--- a/vowpalwabbit/Makefile.am
+++ b/vowpalwabbit/Makefile.am
@@ -4,7 +4,7 @@
 liballreduce_la_SOURCES = allreduce.cc
 bin_PROGRAMS = vw active_interactor
-libvw_la_SOURCES = hash.cc memory.cc global_data.cc io_buf.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc rand48.cc simple_label.cc multiclass.cc oaa.cc ect.cc autolink.cc binary.cc lrq.cc cost_sensitive.cc csoaa.cc cb.cc cb_algs.cc wap.cc searn.cc searn_sequencetask.cc parse_example.cc scorer.cc sparse_dense.cc network.cc parse_args.cc accumulate.cc gd.cc learner.cc lda_core.cc gd_mf.cc mf.cc bfgs.cc noop.cc print.cc example.cc parser.cc loss_functions.cc sender.cc nn.cc bs.cc cbify.cc topk.cc
+libvw_la_SOURCES = hash.cc memory.cc global_data.cc io_buf.cc parse_regressor.cc parse_primitives.cc unique_sort.cc cache.cc rand48.cc simple_label.cc multiclass.cc oaa.cc ect.cc autolink.cc binary.cc lrq.cc cost_sensitive.cc csoaa.cc cb.cc cb_algs.cc wap.cc searn.cc searn_sequencetask.cc parse_example.cc scorer.cc network.cc parse_args.cc accumulate.cc gd.cc learner.cc lda_core.cc gd_mf.cc mf.cc bfgs.cc noop.cc print.cc example.cc parser.cc loss_functions.cc sender.cc nn.cc bs.cc cbify.cc topk.cc
 # accumulate.cc uses all_reduce
 libvw_la_LIBADD = liballreduce.la
diff --git a/vowpalwabbit/accumulate.cc b/vowpalwabbit/accumulate.cc
index ed0b6136..daa6a75f 100644
--- a/vowpalwabbit/accumulate.cc
+++ b/vowpalwabbit/accumulate.cc
@@ -104,42 +104,8 @@ void accumulate_weighted_avg(vw& all, string master_location, regressor& reg) {
      weights[stride*i] = 0;
}
- if(!all.feature_mask_idx) //do in place all_reduce when the feature mask is absent
- all_reduce<float>(weights, length*stride, master_location, all.unique_id, all.total, all.node, all.socks);
+ all_reduce<float>(weights, length*stride, master_location, all.unique_id, all.total, all.node, all.socks);
- else {
-
- //Find weighted averaged weight
- all_reduce<float>(local_weights, length, master_location, all.unique_id, all.total, all.node, all.socks);
-
- for(uint32_t i = 0;i < length;i++)
- {
- weights[stride*i] = local_weights[i];
- local_weights[i] = weights[stride*i+1];
-
- }
-
- //Find weighted average for adaptation
- all_reduce(local_weights, length, master_location, all.unique_id, all.total, all.node, all.socks);
-
- for(uint32_t i = 0;i < length;i++)
- {
- weights[stride*i+1] = local_weights[i];
- if (all.normalized_updates)
- local_weights[i] = weights[stride*i+all.normalized_idx];
-
- }
-
- if (all.normalized_updates)
- {
- //Find weighted average for normalization
- all_reduce(local_weights, length, master_location, all.unique_id, all.total, all.node, all.socks);
- for(uint32_t i = 0;i < length;i++)
- weights[stride*i+all.normalized_idx] = local_weights[i];
- }
- }
-
-
delete[] local_weights;
}
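The accumulate.cc change above collapses the feature-mask special case into a single in-place all_reduce over the strided weight vector. For reference, the general weighted-averaging pattern it relies on looks like this — a minimal sketch with a hypothetical all_reduce_sum stand-in, not VW's actual accumulate API:

```cpp
#include <cstddef>

// Stand-in for VW's all_reduce<float>: on a real cluster this sums each
// element of buf across all nodes in place; this single-node stub is a
// no-op so the sketch compiles and runs locally.
static void all_reduce_sum(float* /*buf*/, size_t /*n*/) {}

// Weighted averaging with a single in-place allreduce: scale local
// weights by this node's importance mass, sum across nodes, then divide
// by the summed mass to get the weighted mean on every node.
void weighted_average(float* weights, size_t n, float local_mass) {
  for (size_t i = 0; i < n; i++) weights[i] *= local_mass;
  all_reduce_sum(weights, n);      // sum scaled vectors across nodes
  float total_mass = local_mass;
  all_reduce_sum(&total_mass, 1);  // sum the masses the same way
  for (size_t i = 0; i < n; i++) weights[i] /= total_mass;
}
```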
diff --git a/vowpalwabbit/allreduce.cc b/vowpalwabbit/allreduce.cc
index 782da0c1..28226901 100644
--- a/vowpalwabbit/allreduce.cc
+++ b/vowpalwabbit/allreduce.cc
@@ -36,18 +36,18 @@ socket_t sock_connect(const uint32_t ip, const int port) {
  sockaddr_in far_end;
  far_end.sin_family = AF_INET;
  far_end.sin_port = port;
-
+
  far_end.sin_addr = *(in_addr*)&ip;
  memset(&far_end.sin_zero, '\0',8);
-
+
  {
    char hostname[NI_MAXHOST];
    char servInfo[NI_MAXSERV];
    getnameinfo((sockaddr *) &far_end, sizeof(sockaddr), hostname, NI_MAXHOST, servInfo, NI_MAXSERV, NI_NUMERICSERV);
-
+
    cerr << "connecting to " << hostname << ':' << ntohs(port) << endl;
  }
-
+
  size_t count = 0;
  int ret;
  while ( (ret =connect(sock,(sockaddr*)&far_end, sizeof(far_end))) == -1 && count < 100)
@@ -59,7 +59,7 @@ socket_t sock_connect(const uint32_t ip, const int port) {
      cerr << "can't connect to: " ;
      uint32_t pip = ntohl(ip);
      unsigned char * pp = (unsigned char*)&pip;
-
+
      for (size_t i = 0; i < 4; i++)
        {
          cerr << static_cast<unsigned int>(static_cast<unsigned short>(pp[3-i])) << ".";
@@ -67,7 +67,7 @@ socket_t sock_connect(const uint32_t ip, const int port) {
      cerr << ':' << ntohs(port) << endl;
      perror(NULL);
      count++;
-#ifdef _WIN32 
+#ifdef _WIN32
      Sleep(1);
#else
      sleep(1);
@@ -90,7 +90,7 @@ socket_t getsock()
  // of VW on the same machine to potentially contact the wrong tree node.
#ifndef _WIN32
  int on = 1;
-  if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) < 0) 
+  if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) < 0)
    perror("setsockopt SO_REUSEADDR");
#endif
  return sock;
@@ -104,7 +104,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
  int lastError = WSAGetLastError();
#endif
-
+
  struct hostent* master = gethostbyname(master_location.c_str());
@@ -119,11 +119,11 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
  socket_t master_sock = sock_connect(master_ip, htons(port));
  if(send(master_sock, (const char*)&unique_id, sizeof(unique_id), 0) < (int)sizeof(unique_id))
-    cerr << "write failed!" << endl; 
+    cerr << "write failed!" << endl;
  if(send(master_sock, (const char*)&total, sizeof(total), 0) < (int)sizeof(total))
-    cerr << "write failed!" << endl; 
+    cerr << "write failed!" << endl;
  if(send(master_sock, (char*)&node, sizeof(node), 0) < (int)sizeof(node))
-    cerr << "write failed!" << endl; 
+    cerr << "write failed!" << endl;
  int ok;
  if (recv(master_sock, (char*)&ok, sizeof(ok), 0) < (int)sizeof(ok))
    cerr << "read 1 failed!" << endl;
@@ -173,7 +173,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
  if (listen(sock, kid_count) < 0)
    {
      perror("listen failed! ");
-      shutdown(sock, SHUT_RDWR);
+      CLOSESOCK(sock);
      sock = getsock();
    }
  else
@@ -192,7 +192,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
  if(recv(master_sock, (char*)&parent_port, sizeof(parent_port), 0) < (int)sizeof(parent_port))
    cerr << "read 4 failed!" << endl;
-  shutdown(master_sock, SHUT_RDWR);
+  CLOSESOCK(master_sock);
  if(parent_ip != (uint32_t)-1)
    {
      socks.parent = sock_connect(parent_ip, parent_port);
@@ -205,7 +205,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
    {
      sockaddr_in child_address;
      socklen_t size = sizeof(child_address);
-      socket_t f = accept(sock,(sockaddr*)&child_address,&size); 
+      socket_t f = accept(sock,(sockaddr*)&child_address,&size);
      if (f < 0)
        {
          cerr << "bad client socket!" << endl;
@@ -219,7 +219,7 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
    }
  if (kid_count > 0)
-    shutdown(sock, SHUT_RDWR);
+    CLOSESOCK(sock);
}
@@ -229,9 +229,9 @@ void pass_down(char* buffer, const size_t parent_read_pos, size_t& children_sent
  if(my_bufsize > 0)
    {
      //going to pass up this chunk of data to the children
-      if(child_sockets[0] != -1 && send(child_sockets[0], buffer+children_sent_pos, (int)my_bufsize, 0) < (int)my_bufsize) 
+      if(child_sockets[0] != -1 && send(child_sockets[0], buffer+children_sent_pos, (int)my_bufsize, 0) < (int)my_bufsize)
        cerr<<"Write to left child failed\n";
-      if(child_sockets[1] != -1 && send(child_sockets[1], buffer+children_sent_pos, (int)my_bufsize, 0) < (int)my_bufsize) 
+      if(child_sockets[1] != -1 && send(child_sockets[1], buffer+children_sent_pos, (int)my_bufsize, 0) < (int)my_bufsize)
        cerr<<"Write to right child failed\n";
      children_sent_pos += my_bufsize;
@@ -241,16 +241,16 @@ void pass_down(char* buffer, const size_t parent_read_pos, size_t& children_sent
void broadcast(char* buffer, const size_t n, const socket_t parent_sock, const socket_t * child_sockets)
{
-
+
  size_t parent_read_pos = 0; //First unread float from parent
  size_t children_sent_pos = 0; //First unsent float to children
  //parent_sent_pos <= left_read_pos
  //parent_sent_pos <= right_read_pos
-
+
  if(parent_sock == -1)
    {
-      parent_read_pos = n; 
+      parent_read_pos = n;
    }
-  if(child_sockets[0] == -1 && child_sockets[1] == -1) 
+  if(child_sockets[0] == -1 && child_sockets[1] == -1)
    children_sent_pos = n;
  while (parent_read_pos < n || children_sent_pos < n)
@@ -270,8 +270,7 @@ void broadcast(char* buffer, const size_t n, const socket_t parent_sock, const s
          cerr <<" Read from parent failed\n";
          perror(NULL);
        }
-          parent_read_pos += read_size; 
+          parent_read_pos += read_size;
        }
    }
}
-
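The recurring substitution in allreduce.cc is shutdown(sock, SHUT_RDWR) → CLOSESOCK(sock). shutdown() only disables further I/O on a connection; it does not release the descriptor, so the old code leaked sockets (and on Windows SHUT_RDWR had been redefined to SD_BOTH as a workaround). The portability macro introduced in allreduce.h, shown here as a self-contained sketch:

```cpp
// One macro that actually releases the descriptor on both platforms;
// this mirrors the CLOSESOCK definition added in allreduce.h.
#ifdef _WIN32
#include <winsock2.h>
typedef SOCKET socket_t;
#define CLOSESOCK closesocket
#else
#include <unistd.h>
typedef int socket_t;
#define CLOSESOCK close
#endif

void release(socket_t s) {
  CLOSESOCK(s);  // closesocket() on Windows, close() elsewhere
}
```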
diff --git a/vowpalwabbit/allreduce.h b/vowpalwabbit/allreduce.h
index 35185456..7c2bdfd1 100644
--- a/vowpalwabbit/allreduce.h
+++ b/vowpalwabbit/allreduce.h
@@ -3,7 +3,7 @@ Copyright (c) by respective owners including Yahoo!, Microsoft, and individual
contributors. All rights reserved.  Released under a BSD
license as described in the file LICENSE.
*/
-// This implements the allreduce function of MPI. 
+// This implements the allreduce function of MPI.
#ifndef ALLREDUCE_H
#define ALLREDUCE_H
@@ -15,7 +15,7 @@ typedef unsigned int uint32_t;
typedef unsigned short uint16_t;
typedef int socklen_t;
typedef SOCKET socket_t;
-#define SHUT_RDWR SD_BOTH
+#define CLOSESOCK closesocket
#else
#include <sys/socket.h>
#include <sys/socket.h>
@@ -25,6 +25,7 @@ typedef SOCKET socket_t;
#include <stdlib.h>
#include <stdio.h>
typedef int socket_t;
+#define CLOSESOCK close
#endif
using namespace std;
@@ -39,11 +40,11 @@ struct node_socks {
  {
    if(current_master != "") {
      if(parent != -1)
-        shutdown(this->parent, SHUT_RDWR);
-      if(children[0] != -1)
-        shutdown(this->children[0], SHUT_RDWR);
+        CLOSESOCK(this->parent);
+      if(children[0] != -1)
+        CLOSESOCK(this->children[0]);
      if(children[1] != -1)
-        shutdown(this->children[1], SHUT_RDWR);
+        CLOSESOCK(this->children[1]);
    }
  }
  node_socks ()
@@ -54,7 +55,7 @@ struct node_socks {
template <class T> void addbufs(T* buf1, const T* buf2, const size_t n) {
-  for(size_t i = 0;i < n;i++) 
+  for(size_t i = 0;i < n;i++)
    buf1[i] += buf2[i];
}
@@ -63,11 +64,11 @@ void all_reduce_init(const string master_location, const size_t unique_id, const
template <class T> void pass_up(char* buffer, size_t left_read_pos, size_t right_read_pos, size_t& parent_sent_pos, socket_t parent_sock, size_t n) {
  size_t my_bufsize = min(ar_buf_size, ((int)(floor(left_read_pos/((float)sizeof(T)))*sizeof(T)) - parent_sent_pos));
  my_bufsize = min(my_bufsize, ((int)(floor(right_read_pos/((float)sizeof(T)))*sizeof(T)) - parent_sent_pos));
-
+
  if(my_bufsize > 0) {
    //going to pass up this chunk of data to the parent
    int write_size = send(parent_sock, buffer+parent_sent_pos, (int)my_bufsize, 0);
-    if(write_size < (int)my_bufsize) 
+    if(write_size < (int)my_bufsize)
      cerr<<"Write to parent failed "<<my_bufsize<<" "<<write_size<<" "<<parent_sent_pos<<" "<<left_read_pos<<" "<<right_read_pos<<endl ;
    parent_sent_pos += my_bufsize;
  }
@@ -90,12 +91,12 @@ template <class T>void reduce(char* buffer, const size_t n, const socket_t paren
  size_t parent_sent_pos = 0; //First unsent float to parent
  //parent_sent_pos <= left_read_pos
  //parent_sent_pos <= right_read_pos
-
+
  if(child_sockets[0] == -1) {
    child_read_pos[0] = n;
  }
  if(child_sockets[1] == -1) {
-    child_read_pos[1] = n; 
+    child_read_pos[1] = n;
  }
  while (parent_sent_pos < n || child_read_pos[0] < n || child_read_pos[1] < n)
@@ -112,7 +113,7 @@ template <class T>void reduce(char* buffer, const size_t n, const socket_t paren
      perror(NULL);
      throw exception();
    }
-
+
    for(int i = 0;i < 2;i++) {
      if(child_sockets[i] != -1 && FD_ISSET(child_sockets[i],&fds)) {
        //there is data to be left from left child
@@ -120,8 +121,8 @@ template <class T>void reduce(char* buffer, const size_t n, const socket_t paren
          cerr<<"I think child has no data to send but he thinks he has "<<FD_ISSET(child_sockets[0],&fds)<<" "<<FD_ISSET(child_sockets[1],&fds)<<endl;
          throw exception();
        }
-
-
+
+
        size_t count = min(ar_buf_size,n - child_read_pos[i]);
        int read_size = recv(child_sockets[i], child_read_buf[i] + child_unprocessed[i], (int)count, 0);
        if(read_size == -1) {
@@ -129,36 +130,36 @@ template <class T>void reduce(char* buffer, const size_t n, const socket_t paren
          perror(NULL);
          throw exception();
        }
-
+
        addbufs((T*)buffer + child_read_pos[i]/sizeof(T), (T*)child_read_buf[i], (child_read_pos[i] + read_size)/sizeof(T) - child_read_pos[i]/sizeof(T));
-
+
        child_read_pos[i] += read_size;
        int old_unprocessed = child_unprocessed[i];
        child_unprocessed[i] = child_read_pos[i] % (int)sizeof(T);
        for(int j = 0;j < child_unprocessed[i];j++) {
          child_read_buf[i][j] = child_read_buf[i][((old_unprocessed + read_size)/(int)sizeof(T))*sizeof(T)+j];
        }
-
+
        if(child_read_pos[i] == n) //Done reading parent
          FD_CLR(child_sockets[i],&fds);
      }
      else if(child_sockets[i] != -1 && child_read_pos[i] != n)
-        FD_SET(child_sockets[i],&fds); 
+        FD_SET(child_sockets[i],&fds);
    }
  }
-  if(parent_sock == -1 && child_read_pos[0] == n && child_read_pos[1] == n) 
+  if(parent_sock == -1 && child_read_pos[0] == n && child_read_pos[1] == n)
    parent_sent_pos = n;
-  }
-
+  }
+
}
void broadcast(char* buffer, const size_t n, const socket_t parent_sock, const socket_t * child_sockets);
-template <class T> void all_reduce(T* buffer, const size_t n, const std::string master_location, const size_t unique_id, const size_t total, const size_t node, node_socks& socks) 
+template <class T> void all_reduce(T* buffer, const size_t n, const std::string master_location, const size_t unique_id, const size_t total, const size_t node, node_socks& socks)
{
-  if(master_location != socks.current_master) 
+  if(master_location != socks.current_master)
    all_reduce_init(master_location, unique_id, total, node, socks);
  reduce<T>((char*)buffer, n*sizeof(T), socks.parent, socks.children);
  broadcast((char*)buffer, n*sizeof(T), socks.parent, socks.children);
diff --git a/vowpalwabbit/autolink.cc b/vowpalwabbit/autolink.cc
index 87110908..c8aacf5e 100644
--- a/vowpalwabbit/autolink.cc
+++ b/vowpalwabbit/autolink.cc
@@ -42,18 +42,15 @@ namespace ALINK {
    ec.total_sum_feat_sq -= sum_sq;
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    autolink* data = (autolink*)calloc_or_die(1,sizeof(autolink));
    data->d = (uint32_t)vm["autolink"].as<size_t>();
    data->stride_shift = all.reg.stride_shift;
-    if (!vm_file.count("autolink"))
-      {
-        std::stringstream ss;
-        ss << " --autolink " << data->d << " ";
-        all.options_from_file.append(ss.str());
-      }
+    std::stringstream ss;
+    ss << " --autolink " << data->d;
+    all.file_options = all.file_options+ss.str();
    learner* ret = new learner(data, all.l);
    ret->set_learn<autolink, predict_or_learn<true> >();
diff --git a/vowpalwabbit/autolink.h b/vowpalwabbit/autolink.h
index d7f8c6ce..28a3f341 100644
--- a/vowpalwabbit/autolink.h
+++ b/vowpalwabbit/autolink.h
@@ -2,6 +2,6 @@
#ifndef AUTOLINK_H
#define AUTOLINK_H
namespace ALINK {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
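The autolink change is the first instance of the commit's recurring refactor: setup() now takes only vm, and unconditionally appends its flags to all.file_options (replacing the old vm/vm_file double bookkeeping). A minimal sketch of the convention, using an illustrative vw_like stand-in rather than VW's real vw struct:

```cpp
#include <sstream>
#include <string>

// Every reduction's setup() appends its own flags to file_options so
// they are serialized into the regressor and replayed at load time.
struct vw_like { std::string file_options; };

void record_option(vw_like& all, const std::string& flag, size_t value) {
  std::stringstream ss;
  ss << " --" << flag << " " << value;  // e.g. " --autolink 2"
  all.file_options.append(ss.str());
}
```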
holdout set error"; } - - } + } if (b.final_pass == b.current_pass) { + finalize_regressor(*all, all->final_regressor_name); + set_done(*all); + } }else{//reaching convergence in the previous pass if(b.output_regularizer) @@ -968,7 +970,7 @@ void save_load(bfgs& b, io_buf& model_file, bool read, bool text) b.backstep_on = true; } -learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm) +learner* setup(vw& all, po::variables_map& vm) { bfgs* b = (bfgs*)calloc_or_die(1,sizeof(bfgs)); b->all = &all; @@ -982,6 +984,16 @@ learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm) b->no_win_counter = 0; b->early_stop_thres = 3; + po::options_description bfgs_opts("LBFGS options"); + + bfgs_opts.add_options() + ("hessian_on", "use second derivative in line search") + ("mem", po::value<int>(&(all.m)), "memory in bfgs") + ("conjugate_gradient", "use conjugate gradient based optimization") + ("termination", po::value<float>(&(all.rel_threshold)),"Termination threshold"); + + vm = add_options(all, bfgs_opts); + if(!all.holdout_set_off) { all.sd->holdout_best_loss = FLT_MAX; diff --git a/vowpalwabbit/bfgs.h b/vowpalwabbit/bfgs.h index ac9e471a..82699fa6 100644 --- a/vowpalwabbit/bfgs.h +++ b/vowpalwabbit/bfgs.h @@ -7,7 +7,7 @@ license as described in the file LICENSE. #define BFGS_H namespace BFGS { - LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/binary.cc b/vowpalwabbit/binary.cc index 585f185a..aa55d666 100644 --- a/vowpalwabbit/binary.cc +++ b/vowpalwabbit/binary.cc @@ -25,15 +25,8 @@ namespace BINARY { ec.loss = 1.; } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) {//parse and set arguments - if (!vm_file.count("binary")) - { - std::stringstream ss; - ss << " --binary "; - all.options_from_file.append(ss.str()); - } - all.sd->binary_label = true; //Create new learner learner* ret = new learner(NULL, all.l); diff --git a/vowpalwabbit/binary.h b/vowpalwabbit/binary.h index 0016f16b..0cc4ab1b 100644 --- a/vowpalwabbit/binary.h +++ b/vowpalwabbit/binary.h @@ -1,6 +1,6 @@ #ifndef BINARY_H #define BINARY_H namespace BINARY { - LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/bs.cc b/vowpalwabbit/bs.cc index c3cb5f25..e7321819 100644 --- a/vowpalwabbit/bs.cc +++ b/vowpalwabbit/bs.cc @@ -192,59 +192,31 @@ namespace BS { d.pred_vec.~vector(); } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { bs* data = (bs*)calloc_or_die(1, sizeof(bs)); data->ub = FLT_MAX; data->lb = -FLT_MAX; - po::options_description desc("BS options"); - desc.add_options() + po::options_description bs_options("Bootstrap options"); + bs_options.add_options() ("bs_type", po::value<string>(), "prediction type {mean,vote}"); + + vm = add_options(all, bs_options); - po::parsed_options parsed = po::command_line_parser(opts). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). 
diff --git a/vowpalwabbit/bs.cc b/vowpalwabbit/bs.cc
index c3cb5f25..e7321819 100644
--- a/vowpalwabbit/bs.cc
+++ b/vowpalwabbit/bs.cc
@@ -192,59 +192,31 @@ namespace BS {
    d.pred_vec.~vector();
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    bs* data = (bs*)calloc_or_die(1, sizeof(bs));
    data->ub = FLT_MAX;
    data->lb = -FLT_MAX;
-    po::options_description desc("BS options");
-    desc.add_options()
+    po::options_description bs_options("Bootstrap options");
+    bs_options.add_options()
      ("bs_type", po::value<string>(), "prediction type {mean,vote}");
+
+    vm = add_options(all, bs_options);
-    po::parsed_options parsed = po::command_line_parser(opts).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    opts = po::collect_unrecognized(parsed.options, po::include_positional);
-    po::store(parsed, vm);
-    po::notify(vm);
-
-    po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc,all.options_from_file_argv).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    po::store(parsed_file, vm_file);
-    po::notify(vm_file);
+    data->B = (uint32_t)vm["bootstrap"].as<size_t>();
-    if( vm_file.count("bs") ) {
-      data->B = (uint32_t)vm_file["bs"].as<size_t>();
-      if( vm.count("bs") && (uint32_t)vm["bs"].as<size_t>() != data->B )
-        std::cerr << "warning: you specified a different number of samples through --bs than the one loaded from predictor. Pursuing with loaded value of: " << data->B << endl;
-    }
-    else {
-      data->B = (uint32_t)vm["bs"].as<size_t>();
+    //append bs with number of samples to options_from_file so it is saved to regressor later
+    std::stringstream ss;
+    ss << " --bootstrap " << data->B;
+    all.file_options.append(ss.str());
-      //append bs with number of samples to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --bs " << data->B;
-      all.options_from_file.append(ss.str());
-    }
+    std::string type_string("mean");
-    if (vm.count("bs_type") || vm_file.count("bs_type"))
+    if (vm.count("bs_type"))
    {
-      std::string type_string;
-
-      if(vm_file.count("bs_type")) {
-        type_string = vm_file["bs_type"].as<std::string>();
-        if( vm.count("bs_type") && type_string.compare(vm["bs_type"].as<string>()) != 0)
-          cerr << "You specified a different --bs_type than the one loaded from regressor file. Pursuing with loaded value of: " << type_string << endl;
-      }
-      else {
-        type_string = vm["bs_type"].as<std::string>();
-
-        all.options_from_file.append(" --bs_type ");
-        all.options_from_file.append(type_string);
-      }
-
+      type_string = vm["bs_type"].as<std::string>();
+
      if (type_string.compare("mean") == 0) {
        data->bs_type = BS_TYPE_MEAN;
      }
@@ -256,11 +228,10 @@ namespace BS {
        data->bs_type = BS_TYPE_MEAN;
      }
    }
-    else {
-      //by default use mean
+    else //by default use mean
      data->bs_type = BS_TYPE_MEAN;
-      all.options_from_file.append(" --bs_type mean");
-    }
+    all.file_options.append(" --bs_type ");
+    all.file_options.append(type_string);
    data->pred_vec.reserve(data->B);
    data->all = &all;
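For context on the two --bs_type modes the bootstrap reduction supports, here is an illustrative sketch of the aggregation rules over the per-replicate predictions that bs.cc collects in pred_vec (function names are mine, and preds is assumed non-empty):

```cpp
#include <algorithm>
#include <map>
#include <numeric>
#include <vector>

// --bs_type mean: average the replicate predictions.
float bs_mean(const std::vector<float>& preds) {
  return std::accumulate(preds.begin(), preds.end(), 0.f) / preds.size();
}

// --bs_type vote: return the most frequent prediction (majority vote).
float bs_vote(const std::vector<float>& preds) {
  std::map<float, int> counts;
  for (float p : preds) counts[p]++;
  return std::max_element(counts.begin(), counts.end(),
                          [](const std::pair<const float, int>& a,
                             const std::pair<const float, int>& b) {
                            return a.second < b.second;
                          })->first;
}
```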
diff --git a/vowpalwabbit/bs.h b/vowpalwabbit/bs.h
index c3c1e578..9abbafea 100644
--- a/vowpalwabbit/bs.h
+++ b/vowpalwabbit/bs.h
@@ -13,7 +13,7 @@ license as described in the file LICENSE.
namespace BS
{
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
  void print_result(int f, float res, float weight, v_array<char> tag, float lb, float ub);
  void output_example(vw& all, example* ec, float lb, float ub);
diff --git a/vowpalwabbit/cb_algs.cc b/vowpalwabbit/cb_algs.cc
index 1873b29a..5c79a7fd 100644
--- a/vowpalwabbit/cb_algs.cc
+++ b/vowpalwabbit/cb_algs.cc
@@ -291,7 +291,7 @@ namespace CB_ALGS
    vw* all = c.all;
    CB::label* ld = (CB::label*)ec.ld;
-    //check if this is a test example where we just want a prediction 
+    //check if this is a test example where we just want a prediction
    if( is_test_label(ld) )
    {
      //if so just query base cost-sensitive learner
@@ -299,6 +299,8 @@ namespace CB_ALGS
      ec.ld = &c.cb_cs_ld;
      base.predict(ec);
+      ld->prediction = c.cb_cs_ld.prediction;
+      ec.ld = ld;
      for (size_t i=0; i<ld->costs.size(); i++)
        ld->costs[i].partial_prediction = c.cb_cs_ld.costs[i].partial_prediction;
@@ -477,60 +479,37 @@ namespace CB_ALGS
    VW::finish_example(all, &ec);
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    cb* c = (cb*)calloc_or_die(1, sizeof(cb));
    c->all = &all;
    c->min_cost = 0.;
    c->max_cost = 1.;
-    po::options_description desc("CB options");
-    desc.add_options()
+
+    uint32_t nb_actions = (uint32_t)vm["cb"].as<size_t>();
+    //append cb with nb_actions to file_options so it is saved to regressor later
+
+    po::options_description cb_opts("CB options");
+    cb_opts.add_options()
      ("cb_type", po::value<string>(), "contextual bandit method to use in {ips,dm,dr}");
-    po::parsed_options parsed = po::command_line_parser(opts).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    opts = po::collect_unrecognized(parsed.options, po::include_positional);
-    po::store(parsed, vm);
-    po::notify(vm);
-
-    po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc,all.options_from_file_argv).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    po::store(parsed_file, vm_file);
-    po::notify(vm_file);
-
-    uint32_t nb_actions = 0;
-    if( vm_file.count("cb") ) { //if loaded options from regressor file already
-      nb_actions = (uint32_t)vm_file["cb"].as<size_t>();
-      if( vm.count("cb") && (uint32_t)vm["cb"].as<size_t>() != nb_actions )
-        std::cerr << "warning: you specified a different number of actions through --cb than the one loaded from regressor. Pursuing with loaded value of: " << nb_actions << endl;
-    }
-    else {
-      nb_actions = (uint32_t)vm["cb"].as<size_t>();
-      //append cb with nb_actions to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --cb " << nb_actions;
-      all.options_from_file.append(ss.str());
-    }
+    vm = add_options(all, cb_opts);
+
+    std::stringstream ss;
+    ss << " --cb " << nb_actions;
+    all.file_options.append(ss.str());
+
+    all.sd->k = nb_actions;
    size_t problem_multiplier = 2;//default for DR
-    if (vm.count("cb_type") || vm_file.count("cb_type"))
+    if (vm.count("cb_type"))
    {
      std::string type_string;
-      if(vm_file.count("cb_type")) {
-        type_string = vm_file["cb_type"].as<std::string>();
-        if( vm.count("cb_type") && type_string.compare(vm["cb_type"].as<string>()) != 0)
-          cerr << "You specified a different --cb_type than the one loaded from regressor file. Pursuing with loaded value of: " << type_string << endl;
-      }
-      else {
-        type_string = vm["cb_type"].as<std::string>();
-
-        all.options_from_file.append(" --cb_type ");
-        all.options_from_file.append(type_string);
-      }
+      type_string = vm["cb_type"].as<std::string>();
+
+      all.file_options.append(" --cb_type ");
+      all.file_options.append(type_string);
      if (type_string.compare("dr") == 0)
        c->cb_type = CB_TYPE_DR;
@@ -552,7 +531,7 @@ namespace CB_ALGS
    else {
      //by default use doubly robust
      c->cb_type = CB_TYPE_DR;
-      all.options_from_file.append(" --cb_type dr");
+      all.file_options.append(" --cb_type dr");
    }
    all.p->lp = CB::cb_label;
diff --git a/vowpalwabbit/cb_algs.h b/vowpalwabbit/cb_algs.h
index 730a548a..8fffadbc 100644
--- a/vowpalwabbit/cb_algs.h
+++ b/vowpalwabbit/cb_algs.h
@@ -9,7 +9,7 @@ license as described in the file LICENSE.
//TODO: extend to handle CSOAA_LDF and WAP_LDF
namespace CB_ALGS {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
  template <bool is_learn>
  float get_cost_pred(vw& all, CB::cb_class* known_cost, example& ec, uint32_t index, uint32_t base)
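The cb_algs predict fix copies the base learner's prediction out of the temporary cost-sensitive label before restoring the example's original label pointer; previously the prediction was lost on the test path. The swap-and-restore pattern it tightens, as an illustrative sketch with made-up label types:

```cpp
// Illustrative types; the real code swaps ec.ld between a CB label and a
// cost-sensitive scratch label around base.predict().
struct cs_label { unsigned prediction; };
struct cb_label_t { unsigned prediction; };

void predict_via_base(void (*base_predict)(void* ld), void*& ec_ld,
                      cb_label_t* ld, cs_label& scratch) {
  void* saved = ec_ld;
  ec_ld = &scratch;                     // substitute the scratch label
  base_predict(ec_ld);                  // base fills scratch.prediction
  ld->prediction = scratch.prediction;  // copy out before restoring (the fix)
  ec_ld = saved;                        // restore the caller's label
}
```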
diff --git a/vowpalwabbit/cbify.cc b/vowpalwabbit/cbify.cc
index 270e0b08..4f317380 100644
--- a/vowpalwabbit/cbify.cc
+++ b/vowpalwabbit/cbify.cc
@@ -61,7 +61,7 @@ namespace CBIFY {
      ec.loss = loss(ld->label, ld->prediction);
      data.tau--;
      uint32_t action = ld->prediction;
-      CB::cb_class l = {ec.loss, action, 1.f / data.k};
+      CB::cb_class l = {ec.loss, action, 1.f / data.k, 0};
      data.cb_label.costs.erase();
      data.cb_label.costs.push_back(l);
      ec.ld = &(data.cb_label);
@@ -74,6 +74,7 @@ namespace CBIFY {
      data.cb_label.costs.erase();
      ec.ld = &(data.cb_label);
      base.predict(ec);
+      ld->prediction = data.cb_label.prediction;
      ec.loss = loss(ld->label, ld->prediction);
    }
    ec.ld = ld;
@@ -87,7 +88,7 @@ namespace CBIFY {
    data.cb_label.costs.erase();
    base.predict(ec);
-    uint32_t action = ld->prediction;
+    uint32_t action = data.cb_label.prediction;
    float base_prob = data.epsilon / data.k;
    if (frand48() < 1. - data.epsilon)
@@ -101,17 +102,19 @@ namespace CBIFY {
      action = do_uniform(data);
      CB::cb_class l = {loss(ld->label, action), action, base_prob};
-      if (action == ld->prediction)
+      if (action == data.cb_label.prediction)
        l.probability = 1.f - data.epsilon + base_prob;
      data.cb_label.costs.push_back(l);
    }
+
+    cout << data.cb_label.costs[0].probability << endl;
    if (is_learn)
      base.learn(ec);
    ld->prediction = action;
-    ec.loss = loss(ld->label, ld->prediction);
    ec.ld = ld;
+    ec.loss = loss(ld->label, action);
  }
  template <bool is_learn>
@@ -130,9 +133,9 @@ namespace CBIFY {
    for (size_t i = 0; i < data.bags; i++)
      {
        base.predict(ec,i);
-        data.count[ld->prediction]++;
+        data.count[data.cb_label.prediction]++;
        if (i == bag)
-          action = ld->prediction;
+          action = data.cb_label.prediction;
      }
    assert(action != 0);
    if (is_learn)
@@ -299,7 +302,7 @@ namespace CBIFY {
    VW::finish_example(all, &ec);
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {//parse and set arguments
    cbify* data = (cbify*)calloc_or_die(1, sizeof(cbify));
@@ -307,39 +310,21 @@ namespace CBIFY {
    data->counter = 0;
    data->tau = 1000;
    data->all = &all;
-    po::options_description desc("CBIFY options");
-    desc.add_options()
+    po::options_description cb_opts("CBIFY options");
+    cb_opts.add_options()
      ("first", po::value<size_t>(), "tau-first exploration")
      ("epsilon",po::value<float>() ,"epsilon-greedy exploration")
      ("bag",po::value<size_t>() ,"bagging-based exploration")
      ("cover",po::value<size_t>() ,"bagging-based exploration");
-    po::parsed_options parsed = po::command_line_parser(opts).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    opts = po::collect_unrecognized(parsed.options, po::include_positional);
-    po::store(parsed, vm);
-    po::notify(vm);
+    vm = add_options(all, cb_opts);
-    po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc,all.options_from_file_argv).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    po::store(parsed_file, vm_file);
-    po::notify(vm_file);
+    data->k = (uint32_t)vm["cbify"].as<size_t>();
-    if( vm_file.count("cbify") ) {
-      data->k = (uint32_t)vm_file["cbify"].as<size_t>();
-      if( vm.count("cbify") && (uint32_t)vm["cbify"].as<size_t>() != data->k )
-        std::cerr << "warning: you specified a different number of actions through --cbify than the one loaded from predictor. Pursuing with loaded value of: " << data->k << endl;
-    }
-    else {
-      data->k = (uint32_t)vm["cbify"].as<size_t>();
-
-      //appends nb_actions to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --cbify " << data->k;
-      all.options_from_file.append(ss.str());
-    }
+    //appends nb_actions to options_from_file so it is saved to regressor later
+    std::stringstream ss;
+    ss << " --cbify " << data->k;
+    all.file_options.append(ss.str());
    all.p->lp = MULTICLASS::mc_label;
    learner* l;
diff --git a/vowpalwabbit/cbify.h b/vowpalwabbit/cbify.h
index eb3ad92a..c347dead 100644
--- a/vowpalwabbit/cbify.h
+++ b/vowpalwabbit/cbify.h
@@ -7,7 +7,7 @@ license as described in the file LICENSE.
#define CBIFY_H
namespace CBIFY {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
diff --git a/vowpalwabbit/cost_sensitive.cc b/vowpalwabbit/cost_sensitive.cc
index 3e8834d9..733d5dff 100644
--- a/vowpalwabbit/cost_sensitive.cc
+++ b/vowpalwabbit/cost_sensitive.cc
@@ -1,5 +1,7 @@
#include "float.h"
#include "cost_sensitive.h"
+#include "parse_example.h"
+#include "gd.h"
namespace COST_SENSITIVE {
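The cbify epsilon-greedy path records, for the chosen action, the probability under which it was drawn: every one of k actions gets epsilon/k of the mass, and the greedy action additionally gets the remaining 1 - epsilon. A minimal sketch of that probability computation (frand48 here is a parameter standing in for VW's RNG and is assumed to return values in [0, 1)):

```cpp
// Epsilon-greedy over actions 1..k, returning the chosen action and
// writing out the probability with which it was selected.
unsigned choose(unsigned greedy_action, unsigned k, float epsilon,
                float (*frand48)(), float& probability) {
  float base_prob = epsilon / k;
  unsigned action = greedy_action;
  if (frand48() < epsilon)                    // explore uniformly
    action = 1 + (unsigned)(frand48() * k);
  probability = (action == greedy_action)
                    ? 1.f - epsilon + base_prob  // greedy arm
                    : base_prob;                 // any other arm
  return action;
}
```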
diff --git a/vowpalwabbit/csoaa.cc b/vowpalwabbit/csoaa.cc
index b90cbc6e..d3f7f396 100644
--- a/vowpalwabbit/csoaa.cc
+++ b/vowpalwabbit/csoaa.cc
@@ -66,25 +66,19 @@ namespace CSOAA {
    VW::finish_example(all, &ec);
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    csoaa* c=(csoaa*)calloc_or_die(1,sizeof(csoaa));
    c->all = &all;
    //first parse for number of actions
    uint32_t nb_actions = 0;
-    if( vm_file.count("csoaa") ) { //if loaded options from regressor
-      nb_actions = (uint32_t)vm_file["csoaa"].as<size_t>();
-      if( vm.count("csoaa") && (uint32_t)vm["csoaa"].as<size_t>() != nb_actions ) //if csoaa was also specified in commandline, warn user if its different
-        std::cerr << "warning: you specified a different number of actions through --csoaa than the one loaded from predictor. Pursuing with loaded value of: " << nb_actions << endl;
-    }
-    else {
-      nb_actions = (uint32_t)vm["csoaa"].as<size_t>();
-      //append csoaa with nb_actions to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --csoaa " << nb_actions;
-      all.options_from_file.append(ss.str());
-    }
+    nb_actions = (uint32_t)vm["csoaa"].as<size_t>();
+
+    //append csoaa with nb_actions to file_options so it is saved to regressor later
+    std::stringstream ss;
+    ss << " --csoaa " << nb_actions;
+    all.file_options.append(ss.str());
    all.p->lp = cs_label;
    all.sd->k = nb_actions;
@@ -785,7 +779,7 @@ namespace LabelDict {
      }
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    ldf* ld = (ldf*)calloc_or_die(1, sizeof(ldf));
@@ -794,33 +788,17 @@ namespace LabelDict {
    ld->first_pass = true;
    string ldf_arg;
-    if(vm_file.count("csoaa_ldf")) {
-      ldf_arg = vm_file["csoaa_ldf"].as<string>();
-
-      if(vm.count("csoaa_ldf") && ldf_arg.compare(vm["csoaa_ldf"].as<string>()) != 0) {
-        ldf_arg = vm["csoaa_ldf"].as<string>();
-        //std::cerr << "warning: you specified a different ldf argument through --csoaa_ldf than the one loaded from regressor. Proceeding with value of: " << ldf_arg << endl;
-      }
-    }
-    else if( vm.count("csoaa_ldf") ){
+
+    if( vm.count("csoaa_ldf") ){
      ldf_arg = vm["csoaa_ldf"].as<string>();
-      all.options_from_file.append(" --csoaa_ldf ");
-      all.options_from_file.append(ldf_arg);
-    }
-    else if( vm_file.count("wap_ldf") ) {
-      ldf_arg = vm_file["wap_ldf"].as<string>();
-      ld->is_wap = true;
-
-      if(vm.count("wap_ldf") && ldf_arg.compare(vm["wap_ldf"].as<string>()) != 0) {
-        ldf_arg = vm["csoaa_ldf"].as<string>();
-        //std::cerr << "warning: you specified a different value for --wap_ldf than the one loaded from regressor. Proceeding with value of: " << ldf_arg << endl;
-      }
+      all.file_options.append(" --csoaa_ldf ");
+      all.file_options.append(ldf_arg);
    }
    else {
      ldf_arg = vm["wap_ldf"].as<string>();
      ld->is_wap = true;
-      all.options_from_file.append(" --wap_ldf ");
-      all.options_from_file.append(ldf_arg);
+      all.file_options.append(" --wap_ldf ");
+      all.file_options.append(ldf_arg);
    }
    all.p->lp = COST_SENSITIVE::cs_label;
diff --git a/vowpalwabbit/csoaa.h b/vowpalwabbit/csoaa.h
index 41b8933a..c334f20d 100644
--- a/vowpalwabbit/csoaa.h
+++ b/vowpalwabbit/csoaa.h
@@ -7,11 +7,11 @@ license as described in the file LICENSE.
#define CSOAA_H
namespace CSOAA {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
namespace CSOAA_AND_WAP_LDF {
-  LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
diff --git a/vowpalwabbit/ect.cc b/vowpalwabbit/ect.cc
index 6763a010..76a344a4 100644
--- a/vowpalwabbit/ect.cc
+++ b/vowpalwabbit/ect.cc
@@ -370,59 +370,30 @@ namespace ECT
    VW::finish_example(all, &ec);
  }
-  learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+  learner* setup(vw& all, po::variables_map& vm)
  {
    ect* data = (ect*)calloc_or_die(1, sizeof(ect));
-    po::options_description desc("ECT options");
-    desc.add_options()
+    po::options_description ect_opts("ECT options");
+    ect_opts.add_options()
      ("error", po::value<size_t>(), "error in ECT");
-    po::parsed_options parsed = po::command_line_parser(opts).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    opts = po::collect_unrecognized(parsed.options, po::include_positional);
-    po::store(parsed, vm);
-    po::notify(vm);
-
-    po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc, all.options_from_file_argv).
-      style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
-      options(desc).allow_unregistered().run();
-    po::store(parsed_file, vm_file);
-    po::notify(vm_file);
+
+    vm = add_options(all, ect_opts);
    //first parse for number of actions
-    data->k = 0;
-    if( vm_file.count("ect") ) {
-      data->k = (int)vm_file["ect"].as<size_t>();
-      if( vm.count("ect") && vm["ect"].as<size_t>() != data->k )
-        std::cerr << "warning: you specified a different number of actions through --ect than the one loaded from predictor. Pursuing with loaded value of: " << data->k << endl;
-    }
-    else {
-      data->k = (int)vm["ect"].as<size_t>();
-
-      //append ect with nb_actions to options_from_file so it is saved to regressor later
-      std::stringstream ss;
-      ss << " --ect " << data->k;
-      all.options_from_file.append(ss.str());
-    }
+    data->k = (int)vm["ect"].as<size_t>();
+
+    //append ect with nb_actions to options_from_file so it is saved to regressor later
+    stringstream ss;
+    ss << " --ect " << data->k;
-    if(vm_file.count("error")) {
-      data->errors = (uint32_t)vm_file["error"].as<size_t>();
-      if (vm.count("error") && (uint32_t)vm["error"].as<size_t>() != data->errors) {
-        cerr << "warning: specified value for --error different than the one loaded from predictor file. Pursuing with loaded value of: " << data->errors << endl;
-      }
-    }
-    else if (vm.count("error")) {
+    if (vm.count("error")) {
      data->errors = (uint32_t)vm["error"].as<size_t>();
-
-      //append error flag to options_from_file so it is saved in regressor file later
-      stringstream ss;
-      ss << " --error " << data->errors;
-      all.options_from_file.append(ss.str());
-    } else {
+    } else
      data->errors = 0;
-    }
-
+    //append error flag to options_from_file so it is saved in regressor file later
+    ss << " --error " << data->errors;
+    all.file_options.append(ss.str());
+
    all.p->lp = MULTICLASS::mc_label;
    size_t wpp = create_circuit(all, *data, data->k, data->errors+1);
    data->all = &all;
diff --git a/vowpalwabbit/ect.h b/vowpalwabbit/ect.h
index 4d3a47fc..c26b4115 100644
--- a/vowpalwabbit/ect.h
+++ b/vowpalwabbit/ect.h
@@ -8,7 +8,7 @@ license as described in the file LICENSE.
namespace ECT
{
-  LEARNER::learner* setup(vw&, std::vector<std::string>&, po::variables_map&, po::variables_map& vm_file);
+  LEARNER::learner* setup(vw&, po::variables_map&);
}
#endif
diff --git a/vowpalwabbit/example.cc b/vowpalwabbit/example.cc
index df344eb5..54e1b4f0 100644
--- a/vowpalwabbit/example.cc
+++ b/vowpalwabbit/example.cc
@@ -49,18 +49,83 @@ void vec_store(features_and_source& p, float fx, float& fw) {
  p.feature_map.push_back(f);
}
+audit_data copy_audit_data(audit_data &src) {
+  audit_data dst;
+  dst.space = (char*)calloc_or_die(strlen(src.space)+1, sizeof(char));
+  strcpy(dst.space, src.space);
+  dst.feature = (char*)calloc_or_die(strlen(src.feature)+1, sizeof(char));
+  strcpy(dst.feature, src.feature);
+  dst.weight_index = src.weight_index;
+  dst.x = src.x;
+  dst.alloced = src.alloced;
+  return dst;
+}
+
namespace VW {
+void copy_example_label(example* dst, example* src, size_t label_size, void(*copy_label)(void*&,void*)) {
+  if (!src->ld) {
+    if (dst->ld) free(dst->ld);  // TODO: this should be a delete_label, really
+    dst->ld = NULL;
+  } else {
+    if ((label_size == 0) && (copy_label == NULL)) {
+      if (dst->ld) free(dst->ld);  // TODO: this should be a delete_label, really
+      dst->ld = NULL;
+    } else if (copy_label) {
+      copy_label(dst->ld, src->ld);
+    } else {
+      //dst->ld = (void*)malloc(label_size);
+      memcpy(dst->ld, src->ld, label_size);
+    }
+  }
+}
+
+void copy_example_data(bool audit, example* dst, example* src)
+{
+  //std::cerr << "copy_example_data dst = " << dst << std::endl;
+  copy_array(dst->tag, src->tag);
+  dst->example_counter = src->example_counter;
+
+  copy_array(dst->indices, src->indices);
+  for (size_t i=0; i<256; i++)
+    copy_array(dst->atomics[i], src->atomics[i]);
+  dst->ft_offset = src->ft_offset;
+
+  if (audit)
+    for (size_t i=0; i<256; i++)
+      copy_array(dst->audit_features[i], src->audit_features[i], copy_audit_data);
+
+  dst->num_features = src->num_features;
+  dst->partial_prediction = src->partial_prediction;
+  copy_array(dst->topic_predictions, src->topic_predictions);
+  dst->loss = src->loss;
+  dst->eta_round = src->eta_round;
+  dst->eta_global = src->eta_global;
+  dst->example_t = src->example_t;
+  memcpy(dst->sum_feat_sq, src->sum_feat_sq, 256 * sizeof(float));
+  dst->total_sum_feat_sq = src->total_sum_feat_sq;
+  dst->revert_weight = src->revert_weight;
+  dst->test_only = src->test_only;
+  dst->end_pass = src->end_pass;
+  dst->sorted = src->sorted;
+  dst->in_use = src->in_use;}
+
+void copy_example_data(bool audit, example* dst, example* src, size_t label_size, void(*copy_label)(void*&,void*)) {
+  copy_example_data(audit, dst, src);
+  copy_example_label(dst, src, label_size, copy_label);
+}
flat_example* flatten_example(vw& all, example *ec)
{
  flat_example* fec = (flat_example*) calloc_or_die(1,sizeof(flat_example));
-  fec->ld = ec->ld;
+  fec->ld = (label_data*)calloc_or_die(1, sizeof(label_data));
+  memcpy(fec->ld, ec->ld, sizeof(label_data));
  fec->tag_len = ec->tag.size();
  if (fec->tag_len >0)
-    {
-      fec->tag = ec->tag.begin;
-    }
+    {
+      fec->tag = (char*)calloc_or_die(fec->tag_len+1, sizeof(char));
+      memcpy(fec->tag,ec->tag.begin, fec->tag_len);
+    }
  fec->example_counter = ec->example_counter;
  fec->ft_offset = ec->ft_offset;
@@ -82,9 +147,16 @@ flat_example* flatten_example(vw& all, example *ec)
}
void free_flatten_example(flat_example* fec)
-{
-  if (fec)
-    free(fec);
+{ //note: The label memory should be freed by by freeing the original example.
+  if (fec)
+    {
+      if (fec->feature_map_len > 0)
+        free(fec->feature_map);
+      if (fec->tag_len > 0)
+        free(fec->tag);
+      free(fec->ld);
+      free(fec);
+    }
}
}
@@ -136,68 +208,3 @@ void dealloc_example(void(*delete_label)(void*), example&ec)
  ec.indices.delete_v();
}
-audit_data copy_audit_data(audit_data &src) {
-  audit_data dst;
-  dst.space = (char*)calloc_or_die(strlen(src.space)+1, sizeof(char));
-  strcpy(dst.space, src.space);
-  dst.feature = (char*)calloc_or_die(strlen(src.feature)+1, sizeof(char));
-  strcpy(dst.feature, src.feature);
-  dst.weight_index = src.weight_index;
-  dst.x = src.x;
-  dst.alloced = src.alloced;
-  return dst;
-}
-
-namespace VW {
-void copy_example_label(example*dst, example*src, size_t label_size, void(*copy_label)(void*&,void*)) {
-  if (!src->ld) {
-    if (dst->ld) free(dst->ld);  // TODO: this should be a delete_label, really
-    dst->ld = NULL;
-  } else {
-    if ((label_size == 0) && (copy_label == NULL)) {
-      if (dst->ld) free(dst->ld);  // TODO: this should be a delete_label, really
-      dst->ld = NULL;
-    } else if (copy_label) {
-      copy_label(dst->ld, src->ld);
-    } else {
-      //dst->ld = (void*)malloc(label_size);
-      memcpy(dst->ld, src->ld, label_size);
-    }
-  }
-}
-
-void copy_example_data(bool audit, example* dst, example* src)
-{
-  //std::cerr << "copy_example_data dst = " << dst << std::endl;
-  copy_array(dst->tag, src->tag);
-  dst->example_counter = src->example_counter;
-
-  copy_array(dst->indices, src->indices);
-  for (size_t i=0; i<256; i++)
-    copy_array(dst->atomics[i], src->atomics[i]);
-  dst->ft_offset = src->ft_offset;
-
-  if (audit)
-    for (size_t i=0; i<256; i++)
-      copy_array(dst->audit_features[i], src->audit_features[i], copy_audit_data);
-
-  dst->num_features = src->num_features;
-  dst->partial_prediction = src->partial_prediction;
-  copy_array(dst->topic_predictions, src->topic_predictions);
-  dst->loss = src->loss;
-  dst->eta_round = src->eta_round;
-  dst->eta_global = src->eta_global;
-  dst->example_t = src->example_t;
-  memcpy(dst->sum_feat_sq, src->sum_feat_sq, 256 * sizeof(float));
-  dst->total_sum_feat_sq = src->total_sum_feat_sq;
-  dst->revert_weight = src->revert_weight;
-  dst->test_only = src->test_only;
-  dst->end_pass = src->end_pass;
-  dst->sorted = src->sorted;
-  dst->in_use = src->in_use;}
-
-void copy_example_data(bool audit, example* dst, example* src, size_t label_size, void(*copy_label)(void*&,void*)) {
-  copy_example_data(audit, dst, src);
-  copy_example_label(dst, src, label_size, copy_label);
-}
-}
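Beyond moving the copy helpers up the file, the example.cc change makes flatten_example deep-copy the label and tag instead of aliasing the source example's buffers, with free_flatten_example releasing exactly what was allocated. The ownership pairing, as a self-contained sketch with simplified types:

```cpp
#include <cstdlib>
#include <cstring>

// Whatever flatten allocates (label copy, tag copy), the matching free
// releases, so the flat example never aliases the source's buffers.
struct flat { char* tag; size_t tag_len; void* ld; };

flat* flatten(const char* tag, size_t tag_len, const void* ld, size_t ld_size) {
  flat* f = (flat*)calloc(1, sizeof(flat));
  f->ld = malloc(ld_size);
  memcpy(f->ld, ld, ld_size);            // deep-copy the label
  f->tag_len = tag_len;
  if (tag_len > 0) {
    f->tag = (char*)calloc(tag_len + 1, 1);
    memcpy(f->tag, tag, tag_len);        // deep-copy the tag
  }
  return f;
}

void free_flat(flat* f) {
  if (!f) return;
  if (f->tag_len > 0) free(f->tag);      // free exactly what was allocated
  free(f->ld);
  free(f);
}
```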
diff --git a/vowpalwabbit/gd.cc b/vowpalwabbit/gd.cc
index 9cb788d2..98d3d71f 100644
--- a/vowpalwabbit/gd.cc
+++ b/vowpalwabbit/gd.cc
@@ -19,7 +19,6 @@ license as described in the file LICENSE.
#include <xmmintrin.h>
#endif
-#include "sparse_dense.h"
#include "gd.h"
#include "simple_label.h"
#include "accumulate.h"
@@ -32,7 +31,6 @@ using namespace LEARNER;
namespace GD
{
  struct gd{
-    size_t current_pass;
    bool active;
    bool active_simulation;
    float normalized_sum_norm_x;
@@ -45,31 +43,17 @@ namespace GD
  };
  void sync_weights(vw& all);
+
+  struct power_data {
+    float minus_power_t;
+    float minus_power_t_norm;
+  };
  struct train_data {
-    float avg_norm;
    float update;
-    float power_t;
+    power_data pt;
  };
-  template <void (*T)(train_data&, float, float&)>
-  void generic_train(vw& all, example& ec, float update, bool sqrt_norm)
-  {
-    if (fabs(update) == 0.)
-      return;
-
-    float total_weight = ec.example_t;
-
-    if(!all.holdout_set_off)
-      total_weight -= (float)all.sd->weighted_holdout_examples; //exclude weights from test_only examples
-
-    float avg_norm = all.normalized_sum_norm_x / total_weight;
-    if (sqrt_norm) avg_norm = sqrt(avg_norm);
-
-    train_data d = {avg_norm, update, all.power_t};
-
-    foreach_feature<train_data,T>(all, ec, d);
-  }
float InvSqrt(float x){
  float xhalf = 0.5f * x;
@@ -80,45 +64,82 @@ float InvSqrt(float x){
  return x;
}
-  template<bool adaptive, bool normalized_updates, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
-  inline void general_update(train_data& s, float x, float& fw)
+  template<bool sqrt_rate, size_t adaptive, size_t normalized>
+  inline float compute_rate_decay(power_data& s, float& fw)
  {
    weight* w = &fw;
-    if(feature_mask_off || w[feature_mask_idx]==1.){
-      float t = 1.f;
-      if(adaptive) t = powf(w[1],-s.power_t);
-      if(normalized_updates) {
-        float norm = w[normalized_idx] * s.avg_norm;
-        float power_t_norm = 1.f - (adaptive ? s.power_t : 0.f);
-        t *= powf(norm*norm,-power_t_norm);
+    float rate_decay = 1.f;
+    if(adaptive) {
+      if (sqrt_rate)
+        {
+#if defined(__SSE2__) && !defined(VW_LDA_NO_SSE)
+          __m128 eta = _mm_load_ss(&w[adaptive]);
+          eta = _mm_rsqrt_ss(eta);
+          _mm_store_ss(&rate_decay, eta);
+#else
+          rate_decay = InvSqrt(w[adaptive]);
+#endif
+        }
+      else
+        rate_decay = powf(w[adaptive],s.minus_power_t);
+    }
+    if(normalized) {
+      if (sqrt_rate)
+        {
+          float inv_norm = 1.f / w[normalized];
+          if (adaptive)
+            rate_decay *= inv_norm;
+          else
+            rate_decay *= inv_norm*inv_norm;
+        }
+      else{
+        float norm = w[normalized];
+        rate_decay *= powf(norm*norm,s.minus_power_t_norm);
      }
-      w[0] += s.update * x * t;
    }
+    return rate_decay;
  }
-  template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
-  inline void specialized_update(train_data& s, float x, float& fw)
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
+  inline void update_feature(train_data& s, float x, float& fw)
  {
-    weight* w = &fw;
-    if(feature_mask_off || w[feature_mask_idx]==1.){
-      float t = 1.f;
-      float inv_norm = 1.f;
-      if(normalized) inv_norm /= (w[normalized_idx] * s.avg_norm);
-      if(adaptive) {
-#if defined(__SSE2__) && !defined(VW_LDA_NO_SSE)
-        __m128 eta = _mm_load_ss(&w[1]);
-        eta = _mm_rsqrt_ss(eta);
-        _mm_store_ss(&t, eta);
-        t *= inv_norm;
-#else
-        t = InvSqrt(w[1]) * inv_norm;
-#endif
-      } else {
-        t *= inv_norm*inv_norm; //if only using normalized updates but not adaptive, need to divide by feature norm squared
+    weight* w = &fw;
+    if(feature_mask_off || fw != 0.){
+      float rate_decay = compute_rate_decay<sqrt_rate, adaptive, normalized>(s.pt, fw);
+      w[0] += s.update * rate_decay * x;
    }
-      w[0] += s.update * x * t;
  }
-}
+
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
+  void train(vw& all, example& ec, float update)
+  {
+    if (fabsf(update) == 0.f)
+      return;
+
+    float total_weight = ec.example_t;
+
+    if(!all.holdout_set_off)
+      total_weight -= (float)all.sd->weighted_holdout_examples; //exclude weights from test_only examples
+
+    float avg_norm = all.normalized_sum_norm_x / total_weight;
+    if (sqrt_rate) avg_norm = sqrt(avg_norm);
+
+    float minus_power_t_norm = (adaptive ? all.power_t : 0.f) -1.f;
+
+    if (normalized) {
+      if (sqrt_rate)
+        if (adaptive)
+          update /= avg_norm;
+        else
+          update /= (avg_norm * avg_norm);
+      else
+        update *= powf(avg_norm * avg_norm, minus_power_t_norm);
+    }
+
+    train_data d = {update, {-all.power_t, minus_power_t_norm}};
+
+    foreach_feature<train_data,update_feature<sqrt_rate, feature_mask_off, adaptive, normalized> >(all, ec, d);
+  }
void end_pass(gd& g)
{
@@ -155,6 +176,12 @@ struct string_value {
  friend bool operator<(const string_value& first, const string_value& second);
};
+  inline float sign(float w){ if (w < 0.) return -1.; else return 1.;}
+
+  inline float trunc_weight(const float w, const float gravity){
+    return (gravity < fabsf(w)) ? w - sign(w) * gravity : 0.f;
+  }
+
bool operator<(const string_value& first, const string_value& second)
{
  return fabs(first.v) > fabs(second.v);
@@ -348,82 +375,35 @@ float finalize_prediction(vw& all, float ret)
  return ret;
}
-template<bool normalized_training, bool reg_mode_odd, bool power_t_half>
+  struct trunc_data {
+    float prediction;
+    float gravity;
+  };
+
+  inline void vec_add_trunc(trunc_data& p, const float fx, float& fw) {
+    p.prediction += trunc_weight(fw, p.gravity) * fx;
+  }
+
+  inline float trunc_predict(vw& all, example& ec, float gravity)
+  {
+    label_data* ld = (label_data*)ec.ld;
+    trunc_data temp = {ld->initial, gravity};
+    foreach_feature<trunc_data, vec_add_trunc>(all, ec, temp);
+    return temp.prediction;
+  }
+
+template<bool reg_mode_odd>
void predict(gd& g, learner& base, example& ec)
{
  vw& all = *g.all;
-  if (normalized_training) {
-    if(power_t_half) {
-      if (reg_mode_odd)
-        {
-          float gravity = (float)all.sd->gravity;
-          if (all.adaptive)
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<float, vec_add_trunc_rescale<true, 1> >(all, ec, gravity);
-            else
-              ec.partial_prediction = inline_predict<float, vec_add_trunc_rescale<true, 2> >(all, ec, gravity);
-          else
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<float, vec_add_trunc_rescale<false, 1> >(all, ec, gravity);
-            else
-              ec.partial_prediction = inline_predict<float, vec_add_trunc_rescale<false, 2> >(all, ec, gravity);
-        }
-      else
-        {
-          if (all.adaptive)
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<vec_add_rescale<true, 1> >(all, ec);
-            else
-              ec.partial_prediction = inline_predict<vec_add_rescale<true, 2> >(all, ec);
-          else
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<vec_add_rescale<false, 1> >(all, ec);
-            else
-              ec.partial_prediction = inline_predict<vec_add_rescale<false, 2> >(all, ec);
-        }
-    }
-    else {
-      if (reg_mode_odd)
-        {
-          gnp temp = {(float)all.sd->gravity, all.power_t};
-          if (all.adaptive)
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<gnp, vec_add_trunc_rescale_general<true, 1> >(all, ec, temp);
-            else
-              ec.partial_prediction = inline_predict<gnp, vec_add_trunc_rescale_general<true, 2> >(all, ec, temp);
-          else
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<gnp, vec_add_trunc_rescale_general<false, 1> >(all, ec, temp);
-            else
-              ec.partial_prediction = inline_predict<gnp, vec_add_trunc_rescale_general<false, 2> >(all, ec, temp);
-        }
-      else
-        {
-          float power_t = all.power_t;
-          if (all.adaptive)
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<float, vec_add_rescale_general<true, 1> >(all, ec, power_t);
-            else
-              ec.partial_prediction = inline_predict<float, vec_add_rescale_general<true, 2> >(all, ec, power_t);
-          else
-            if (all.normalized_idx == 1)
-              ec.partial_prediction = inline_predict<float, vec_add_rescale_general<false, 1> >(all, ec, power_t);
-            else
-              ec.partial_prediction = inline_predict<float, vec_add_rescale_general<false, 2> >(all, ec, power_t);
-        }
+  if (reg_mode_odd)
+    {
+      float gravity = (float)all.sd->gravity;
+      ec.partial_prediction = trunc_predict(all, ec, gravity);
    }
-  }
-  else {
-    // no rescaling
-    if (reg_mode_odd)
-      {
-        float gravity = (float)all.sd->gravity;
-        ec.partial_prediction = inline_predict<float, vec_add_trunc>(all, ec, gravity);
-      }
-    else
-      ec.partial_prediction = inline_predict<vec_add>(all, ec);
-  }
+  else
+    ec.partial_prediction = inline_predict(all, ec);
  label_data& ld = *(label_data*)ec.ld;
  ld.prediction = finalize_prediction(all, ec.partial_prediction * (float)all.sd->contraction);
@@ -434,97 +414,70 @@ void predict(gd& g, learner& base, example& ec)
struct norm_data {
  float g;
-  float norm;
+  float pred_per_update;
  float norm_x;
-  float power_t;
+  power_data pd;
};
-template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
-inline void simple_norm_compute(norm_data& nd, float x, float& fw)
-{
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
+inline void pred_per_update_feature(norm_data& nd, float x, float& fw) {
  weight* w = &fw;
-  if(feature_mask_off || w[feature_mask_idx]==1.){
+  if(feature_mask_off || fw != 0.){
    float x2 = x * x;
-    float t = 1.f;
-    float inv_norm = 1.f;
-    float inv_norm2 = 1.f;
+    if(adaptive)
+      w[adaptive] += nd.g * x2;
    if(normalized) {
-      inv_norm /= w[normalized_idx];
-      inv_norm2 = inv_norm*inv_norm;
-      nd.norm_x += x2 * inv_norm2;
-    }
-    if(adaptive){
-      w[1] += nd.g * x2;
-
-#if defined(__SSE2__) && !defined(VW_LDA_NO_SSE)
-      __m128 eta = _mm_load_ss(&w[1]);
-      eta = _mm_rsqrt_ss(eta);
-      _mm_store_ss(&t, eta);
-      t *= inv_norm;
-#else
-      t = InvSqrt(w[1]) * inv_norm;
-#endif
-    } else {
-      t *= inv_norm2; //if only using normalized but not adaptive, we're dividing update by feature norm squared
+      float x_abs = fabsf(x);
+      if( x_abs > w[normalized] ) {// new scale discovered
+        if( w[normalized] > 0. ) {//If the normalizer is > 0 then rescale the weight so it's as if the new scale was the old scale.
+          float rescale = (w[normalized]/x_abs);
+          if (sqrt_rate)
+            w[0] *= (adaptive ? rescale : rescale*rescale);
+          else
+            w[0] *= powf(rescale*rescale, -nd.pd.minus_power_t_norm);
+        }
+        w[normalized] = x_abs;
+      }
+      nd.norm_x += x2 / (w[normalized] * w[normalized]);
    }
-    nd.norm += x2 * t;
-  }
-}
-
-template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
-inline void powert_norm_compute(norm_data& nd, float x, float& fw) {
-  weight* w = &fw;
-  if(feature_mask_off || w[feature_mask_idx]==1.){
-    float power_t_norm = 1.f - (adaptive ? nd.power_t : 0.f);
-
-    float x2 = x * x;
-    float t = 1.f;
-    if(adaptive){
-      w[1] += nd.g * x2;
-      t = powf(w[1], -nd.power_t);
-    }
-    if(normalized) {
-      float range2 = w[normalized_idx] * w[normalized_idx];
-      t *= powf(range2, -power_t_norm);
-      nd.norm_x += x2 / range2;
-    }
-    nd.norm += x2 * t;
+    float rate_decay = compute_rate_decay<sqrt_rate, adaptive, normalized>(nd.pd, fw);
+    nd.pred_per_update += x2 * rate_decay;
  }
}
-  template <void (*T)(norm_data&,float,float&)>
-float compute_norm(vw& all, example& ec)
-{//We must traverse the features in _precisely_ the same order as during training.
-  label_data* ld = (label_data*)ec.ld;
-  float g = all.loss->getSquareGrad(ld->prediction, ld->label) * ld->weight;
-  if (g==0) return 1.;
-
-  norm_data nd = {g, 0., 0., all.power_t};
-
-  foreach_feature<norm_data,T>(all, ec, nd);
-
-  if(all.normalized_updates) {
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
+  float get_pred_per_update(vw& all, example& ec)
+  {//We must traverse the features in _precisely_ the same order as during training.
+    label_data* ld = (label_data*)ec.ld;
+    float g = all.loss->getSquareGrad(ld->prediction, ld->label) * ld->weight;
+    if (g==0) return 1.;
+
+    float minus_power_t_norm = (adaptive ? all.power_t : 0.f) - 1.f;
+    norm_data nd = {g, 0., 0., {-all.power_t, minus_power_t_norm}};
+
+    foreach_feature<norm_data,pred_per_update_feature<sqrt_rate, feature_mask_off, adaptive, normalized> >(all, ec, nd);
+
+    if(normalized) {
    float total_weight = ec.example_t;
-
+
    if(!all.holdout_set_off)
      total_weight -= (float)all.sd->weighted_holdout_examples; //exclude weights from test_only examples
    all.normalized_sum_norm_x += ld->weight * nd.norm_x;
-
-    float avg_sq_norm = all.normalized_sum_norm_x / total_weight;
-    if(all.power_t == 0.5) {
-      if(all.adaptive) nd.norm /= sqrt(avg_sq_norm);
-      else nd.norm /= avg_sq_norm;
-    } else {
-      float power_t_norm = 1.f - (all.adaptive ? all.power_t : 0.f);
-      nd.norm *= powf(avg_sq_norm,-power_t_norm);
-    }
+
+    float avg_norm = all.normalized_sum_norm_x / total_weight;
+    if(sqrt_rate) {
+      if(adaptive) nd.pred_per_update /= sqrt(avg_norm);
+      else nd.pred_per_update /= avg_norm;
+    } else
+      nd.pred_per_update *= powf(avg_norm,minus_power_t_norm);
  }
-  return nd.norm;
+  return nd.pred_per_update;
}
-template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
void compute_update(vw& all, gd& g, example& ec)
{
  label_data* ld = (label_data*)ec.ld;
@@ -556,34 +509,28 @@ void compute_update(vw& all, gd& g, example& ec)
{
  if (all.training && ec.loss > 0.)
    {
-      float eta_t;
-      float norm;
+      float pred_per_update;
      if(adaptive || normalized)
-        if(all.power_t == 0.5)
-          norm = compute_norm<simple_norm_compute<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx> >(all,ec);
-        else
-          norm = compute_norm<powert_norm_compute<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx> >(all,ec);
+        pred_per_update = get_pred_per_update<sqrt_rate, feature_mask_off, adaptive, normalized>(all,ec);
      else
-        norm = ec.total_sum_feat_sq;
+        pred_per_update = ec.total_sum_feat_sq;
-      eta_t = all.eta * norm * ld->weight;
-      if(!adaptive && all.power_t != 0) eta_t *= powf(t,-all.power_t);
+      float delta_pred = pred_per_update * all.eta * ld->weight;
+      if(!adaptive && all.power_t != 0) delta_pred *= powf(t,-all.power_t);
      float update = 0.f;
      if( all.invariant_updates )
-        update = all.loss->getUpdate(ld->prediction, ld->label, eta_t, norm);
+        update = all.loss->getUpdate(ld->prediction, ld->label, delta_pred, pred_per_update);
      else
-        update = all.loss->getUnsafeUpdate(ld->prediction, ld->label, eta_t, norm);
+        update = all.loss->getUnsafeUpdate(ld->prediction, ld->label, delta_pred, pred_per_update);
      ec.eta_round = (float) (update / all.sd->contraction);
-
      if (all.reg_mode && fabs(ec.eta_round) > 1e-8) {
        double dev1 = all.loss->first_derivative(all.sd, ld->prediction, ld->label);
        double eta_bar = (fabs(dev1) > 1e-8) ? (-ec.eta_round / dev1) : 0.0;
        if (fabs(dev1) > 1e-8)
-          all.sd->contraction *= (1. - all.l2_lambda * eta_bar * norm);
-          //all.sd->contraction /= (1. + all.l2_lambda * eta_bar * norm);
-        all.sd->gravity += eta_bar * sqrt(norm) * all.l1_lambda;
+          all.sd->contraction *= (1. - all.l2_lambda * eta_bar);
+        all.sd->gravity += eta_bar * all.l1_lambda;
      }
    }
}
@@ -592,26 +539,23 @@ void compute_update(vw& all, gd& g, example& ec)
}
-  template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
+  template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
void update(gd& g, learner& base, example& ec)
{
  vw* all = g.all;
-  compute_update<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx > (*all, g, ec);
+  compute_update<sqrt_rate, feature_mask_off, adaptive, normalized> (*all, g, ec);
  if (ec.eta_round != 0.)
    {
-      if(all->power_t == 0.5)
-        generic_train<specialized_update<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx> > (*all,ec,(float)ec.eta_round,true);
-      else
-        generic_train<general_update<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx> >(*all,ec,(float)ec.eta_round,false);
+      train<sqrt_rate, feature_mask_off, adaptive, normalized>(*all,ec,(float)ec.eta_round);
      if (all->sd->contraction < 1e-10)  // updating weights now to avoid numerical instability
        sync_weights(*all);
    }
}
-template<bool adaptive, bool normalized, bool feature_mask_off, size_t normalized_idx, size_t feature_mask_idx>
+template<bool sqrt_rate, bool feature_mask_off, size_t adaptive, size_t normalized>
void learn(gd& g, learner& base, example& ec)
{
  vw* all = g.all;
@@ -622,7 +566,7 @@ void learn(gd& g, learner& base, example& ec)
  g.predict(g,base,ec);
  if ((all->holdout_set_off || !ec.test_only) && ld->weight > 0)
-    update<adaptive, normalized, feature_mask_off, normalized_idx, feature_mask_idx>(g,base,ec);
+    update<sqrt_rate, feature_mask_off, adaptive, normalized>(g,base,ec);
  else if(ld->weight > 0)
    ec.loss = all->loss->getLoss(all->sd, ld->prediction, ld->label) * ld->weight;
}
@@ -883,6 +827,51 @@ void save_load(gd& g, io_buf& model_file, bool read, bool text)
    }
}
+template<bool sqrt_rate, size_t adaptive, size_t normalized, size_t next>
+size_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+{
+  all.normalized_idx = normalized;
+  if (feature_mask_off)
+    {
+      ret->set_learn<gd, learn<sqrt_rate, true, adaptive,normalized> >();
+      ret->set_update<gd, update<sqrt_rate, true, adaptive,normalized> >();
+      return next;
+    }
+  else
+    {
+      ret->set_learn<gd, learn<sqrt_rate, false, adaptive,normalized> >();
+      ret->set_update<gd, update<sqrt_rate, false, adaptive,normalized> >();
+      return next;
+    }
+}
+
+template<bool sqrt_rate, size_t adaptive>
+size_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+{
+  // select the appropriate learn function based on adaptive, normalization, and feature mask
+  if (all.normalized_updates)
+    return set_learn<sqrt_rate, adaptive, adaptive+1, adaptive+2>(all, ret, feature_mask_off);
+  else
+    return set_learn<sqrt_rate, adaptive, 0, adaptive+1>(all, ret, feature_mask_off);
+}
+
+template<bool sqrt_rate>
+size_t set_learn(vw& all, learner* ret, bool feature_mask_off)
+{
+  if (all.adaptive)
+    return set_learn<sqrt_rate, 1>(all, ret, feature_mask_off);
+  else
+    return set_learn<sqrt_rate, 0>(all, ret, feature_mask_off);
+}
+
+size_t ceil_log_2(size_t v)
+{
+  if (v==0)
+    return 0;
+  else
+    return 1 + ceil_log_2(v >> 1);
+}
+
learner* setup(vw& all, po::variables_map& vm)
{
  gd* g = (gd*)calloc_or_die(1, sizeof(gd));
@@ -908,146 +897,52 @@ learner* setup(vw& all, po::variables_map& vm)
    g->initial_constant = vm["constant"].as<float>();
  }
-  all.reg.stride_shift = 2; //use stride of 4 for default invariant normalized adaptive updates
-
  //if the user specified anything in sgd,adaptive,invariant,normalized, we turn off default update rules and use whatever user specified
  if( !all.training || ( ( vm.count("sgd") || vm.count("adaptive") || vm.count("invariant") || vm.count("normalized") ) && !vm.count("exact_adaptive_norm")) )
-  {
-    all.adaptive = all.training && vm.count("adaptive");
-    all.invariant_updates = all.training && vm.count("invariant");
-    all.normalized_updates = all.training && vm.count("normalized");
-
-    all.reg.stride_shift = 0;
-
-    if( all.adaptive ) all.reg.stride_shift += 1;
-    else all.normalized_idx = 1; //store per feature norm at 
1 index offset from weight value instead of 2 - - if( all.normalized_updates ) all.reg.stride_shift += 1; - - if(!vm.count("learning_rate") && !vm.count("l") && !(all.adaptive && all.normalized_updates)) - if (all.lda == 0) - all.eta = 10; //default learning rate to 10 for non default update rule - - //if not using normalized or adaptive, default initial_t to 1 instead of 0 - if(!all.adaptive && !all.normalized_updates){ - if (!vm.count("initial_t")) { - all.sd->t = 1.f; - all.sd->weighted_unlabeled_examples = 1.f; - all.initial_t = 1.f; - } - all.eta *= powf((float)(all.sd->t), all.power_t); - } - if (vm.count("feature_mask")){ - if(all.reg.stride_shift == 0){ - all.reg.stride_shift += 1;//if --sgd, stride->2 and use the second position as mask - all.feature_mask_idx = 1; - } - else if(all.reg.stride_shift == 1){ - all.reg.stride_shift += 1;//if either normalized or adaptive, stride->4, mask_idx is still 3 + {//nondefault + all.adaptive = all.training && vm.count("adaptive"); + all.invariant_updates = all.training && vm.count("invariant"); + all.normalized_updates = all.training && vm.count("normalized"); + + if(!vm.count("learning_rate") && !vm.count("l") && !(all.adaptive && all.normalized_updates)) + all.eta = 10; //default learning rate to 10 for non default update rule + + //if not using normalized or adaptive, default initial_t to 1 instead of 0 + if(!all.adaptive && !all.normalized_updates){ + if (!vm.count("initial_t")) { + all.sd->t = 1.f; + all.sd->weighted_unlabeled_examples = 1.f; + all.initial_t = 1.f; + } + all.eta *= powf((float)(all.sd->t), all.power_t); } } - } - + if (pow((double)all.eta_decay_rate, (double)all.numpasses) < 0.0001 ) cerr << "Warning: the learning rate for the last pass is multiplied by: " << pow((double)all.eta_decay_rate, (double)all.numpasses) << " adjust --decay_learning_rate larger to avoid this." 
<< endl;
- learner* ret = new learner(g, 1 << all.reg.stride_shift);
+ learner* ret = new learner(g, 1);
- // select the appropriate predict function based on normalization, regularization, and power_t
- if (all.normalized_updates && all.training)
- if (all.reg_mode % 2)
- if (all.power_t == 0.5)
- {
- g->predict = predict<true, true, true>;
- ret->set_predict<gd, predict<true, true, true> >();
- }
- else
- {
- g->predict = predict<true, true, false>;
- ret->set_predict<gd, predict<true, true, false> >();
- }
- else
- if (all.power_t == 0.5)
- {
- g->predict = predict<true, false, true>;
- ret->set_predict<gd, predict<true, false, true> >();
- }
- else
- {
- g->predict = predict<true, false, false>;
- ret->set_predict<gd, predict<true, false, false> >();
- }
+ if (all.reg_mode % 2)
+ {
+ ret->set_predict<gd, predict<true> >();
+ g->predict = predict<true>;
+ }
else
- if (all.reg_mode % 2)
- if (all.power_t == 0.5)
- {
- g->predict = predict<false, true, true>;
- ret->set_predict<gd, predict<false, true, true> >();
- }
- else
- {
- g->predict = predict<false, true, false>;
- ret->set_predict<gd, predict<false, true, false> >();
- }
- else
- if (all.power_t == 0.5)
- {
- g->predict = predict<false, false, true>;
- ret->set_predict<gd, predict<false, false, true> >();
- }
- else
- {
- g->predict = predict<false, false, true>;
- ret->set_predict<gd, predict<false, false, true> >();
- }
-
- // select the appropriate learn function based on adaptive, normalization, and feature mask
- if (all.adaptive)
- if (all.normalized_updates)
- if (feature_mask_off)
- {
- ret->set_learn<gd, learn<true,true,true, 2, 0> >();
- ret->set_update<gd, update<true,true,true, 2, 0> >();
- }
- else
- {
- ret->set_learn<gd, learn<true,true,false, 2, 3> >();
- ret->set_update<gd, update<true,true,false, 2, 3> >();
- }
- else
- if (feature_mask_off)
- {
- ret->set_learn<gd, learn<true,false,true, 0, 0> >();
- ret->set_update<gd, update<true,false,true, 0, 0> >();
- }
- else
- {
- ret->set_learn<gd, learn<true,false,false, 0, 2> >();
- ret->set_update<gd, update<true,false,false, 0, 2> >();
- }
+ {
+ ret->set_predict<gd, predict<false> >();
+ g->predict = predict<false>;
+ }
+
+ size_t stride;
+ if (all.power_t == 0.5)
+ stride = set_learn<true>(all, ret, feature_mask_off);
else
- if (all.normalized_updates)
- if (feature_mask_off)
- {
- ret->set_learn<gd, learn<false,true,true, 1, 0> >();
- ret->set_update<gd, update<false,true,true, 1, 0> >();
- }
- else
- {
- ret->set_learn<gd, learn<false,true,false, 1, 2> >();
- ret->set_update<gd, update<false,true,false, 1, 2> >();
- }
- else
- if (feature_mask_off)
- {
- ret->set_learn<gd, learn<false,false,true, 0, 0> >();
- ret->set_update<gd, update<false, false, true, 0, 0> >();
- }
- else
- {
- ret->set_learn<gd, learn<false,false,false, 0, 1> >();
- ret->set_update<gd, update<false, false, false, 0, 1> >();
- }
+ stride = set_learn<false>(all, ret, feature_mask_off);
+
+ all.reg.stride_shift = ceil_log_2(stride-1);
+ ret->increment = (1 << all.reg.stride_shift);
+
ret->set_save_load<gd,save_load>();
ret->set_end_pass<gd, end_pass>();
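(Editorial note, not part of the commit: a quick worked example of the stride that set_learn() and ceil_log_2() produce. With --adaptive --normalized and the default power_t = 0.5, set_learn<true> expands to set_learn<true, 1> and then set_learn<true, 1, 2, 3>: the adaptive gradient sum sits at offset 1, the per-feature normalizer at offset 2, and next = 3 is returned as the stride, so

    ceil_log_2(3 - 1) == 2   // stride_shift 2: weights packed 4 floats apart
    ceil_log_2(2 - 1) == 1   // adaptive only: stride 2
    ceil_log_2(1 - 1) == 0   // plain --sgd: stride 1, a dense weight vector

which reproduces the old default stride of 4 for the invariant normalized adaptive update while letting simpler update rules pack tighter.)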
diff --git a/vowpalwabbit/gd.h b/vowpalwabbit/gd.h
index 5c46e008..e8de0629 100644
--- a/vowpalwabbit/gd.h
+++ b/vowpalwabbit/gd.h
@@ -10,12 +10,11 @@ license as described in the file LICENSE.
#include <sys/socket.h>
#endif
-#include <math.h>
#include "example.h"
#include "parse_regressor.h"
#include "parser.h"
-#include "sparse_dense.h"
#include "v_array.h"
+#include "constant.h"
namespace GD{
void print_result(int f, float res, v_array<char> tag);
@@ -74,23 +73,17 @@ void output_and_account_example(example* ec);
}
}
- template <class R, void (*T)(predict_data<R>&, const float, float&)>
- inline float inline_predict(vw& all, example& ec, R extra)
- {
- label_data* ld = (label_data*)ec.ld;
- predict_data<R> temp = {ld->initial, extra};
- foreach_feature<predict_data<R>, T>(all, ec, temp);
- return temp.prediction;
- }
+ inline void vec_add(float& p, const float fx, float& fw) {
+ p += fw * fx;
+ }
- template <void (*T)(float&, const float, float&)>
- inline float inline_predict(vw& all, example& ec)
- {
- label_data* ld = (label_data*)ec.ld;
- float temp = ld->initial;
- foreach_feature<float, T>(all, ec, temp);
- return temp;
- }
+ inline float inline_predict(vw& all, example& ec)
+ {
+ label_data* ld = (label_data*)ec.ld;
+ float temp = ld->initial;
+ foreach_feature<float, vec_add>(all, ec, temp);
+ return temp;
+ }
}
#endif
diff --git a/vowpalwabbit/gd_mf.cc b/vowpalwabbit/gd_mf.cc
index 3f1ca331..635d34b5 100644
--- a/vowpalwabbit/gd_mf.cc
+++ b/vowpalwabbit/gd_mf.cc
@@ -14,7 +14,6 @@ license as described in the file LICENSE.
#endif
#include "constant.h"
-#include "sparse_dense.h"
#include "gd.h"
#include "simple_label.h"
#include "rand48.h"
@@ -48,7 +47,7 @@ void mf_print_offset_features(vw& all, example& ec, size_t offset)
cout << "\tConstant:";
cout << ((index >> all.reg.stride_shift) & all.parse_mask) << ':' << f->x;
- cout << ':' << trunc_weight(weights[index], (float)all.sd->gravity) * (float)all.sd->contraction;
+ cout << ':' << weights[index];
for (vector<string>::iterator i = all.pairs.begin(); i != all.pairs.end();i++)
if (ec.atomics[(int)(*i)[0]].size() > 0 && ec.atomics[(int)(*i)[1]].size() > 0)
@@ -96,7 +95,7 @@ float mf_predict(vw& all, example& ec)
float linear_prediction = 0.;
// linear terms
for (unsigned char* i = ec.indices.begin; i != ec.indices.end; i++)
- GD::foreach_feature<float, vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[*i].begin, ec.atomics[*i].end, linear_prediction);
+ GD::foreach_feature<float, GD::vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[*i].begin, ec.atomics[*i].end, linear_prediction);
// store constant + linear prediction
// note: constant is now automatically added
@@ -115,12 +114,12 @@ float mf_predict(vw& all, example& ec)
// l^k is from index+1 to index+all.rank
//float x_dot_l = sd_offset_add(weights, mask, ec.atomics[(int)(*i)[0]].begin, ec.atomics[(int)(*i)[0]].end, k);
float x_dot_l = 0.;
- GD::foreach_feature<float, vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[(int)(*i)[0]].begin, ec.atomics[(int)(*i)[0]].end, x_dot_l, k);
+ GD::foreach_feature<float, GD::vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[(int)(*i)[0]].begin, ec.atomics[(int)(*i)[0]].end, x_dot_l, k);
// x_r * r^k
// r^k is from index+all.rank+1 to index+2*all.rank
//float x_dot_r = sd_offset_add(weights, mask, ec.atomics[(int)(*i)[1]].begin, ec.atomics[(int)(*i)[1]].end, k+all.rank);
float x_dot_r = 0.;
- GD::foreach_feature<float,vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[(int)(*i)[1]].begin, ec.atomics[(int)(*i)[1]].end, x_dot_r, k+all.rank);
+ GD::foreach_feature<float,GD::vec_add>(all.reg.weight_vector, all.reg.weight_mask, ec.atomics[(int)(*i)[1]].begin,
ec.atomics[(int)(*i)[1]].end, x_dot_r, k+all.rank); prediction += x_dot_l * x_dot_r; @@ -156,6 +155,12 @@ float mf_predict(vw& all, example& ec) } +void sd_offset_update(weight* weights, size_t mask, feature* begin, feature* end, size_t offset, float update, float regularization) +{ + for (feature* f = begin; f!= end; f++) + weights[(f->weight_index + offset) & mask] += update * f->x - regularization * weights[(f->weight_index + offset) & mask]; +} + void mf_train(vw& all, example& ec, float update) { weight* weights = all.reg.weight_vector; diff --git a/vowpalwabbit/global_data.cc b/vowpalwabbit/global_data.cc index 0f122ec6..1b939a36 100644 --- a/vowpalwabbit/global_data.cc +++ b/vowpalwabbit/global_data.cc @@ -215,6 +215,25 @@ void compile_gram(vector<string> grams, uint32_t* dest, char* descriptor, bool q } } +po::variables_map add_options(vw& all, po::options_description& opts) +{ + all.opts.add(opts); + po::variables_map new_vm; + + //parse local opts once for notifications. + po::parsed_options parsed = po::command_line_parser(all.args). + style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). + options(opts).allow_unregistered().run(); + po::store(parsed, new_vm); + po::notify(new_vm); + //parse all opts for a complete variable map. + parsed = po::command_line_parser(all.args). + style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). + options(all.opts).allow_unregistered().run(); + po::store(parsed, new_vm); + return new_vm; +} + vw::vw() { sd = (shared_data *) calloc_or_die(1, sizeof(shared_data)); @@ -266,7 +285,7 @@ vw::vw() per_feature_regularizer_output = ""; per_feature_regularizer_text = ""; - options_from_file = ""; + file_options = ""; #ifdef _WIN32 stdout_fileno = _fileno(stdout); @@ -304,7 +323,6 @@ vw::vw() normalized_sum_norm_x = 0.; normalized_idx = 2; - feature_mask_idx = 3;//by default use the 4th position as mask add_constant = true; audit = false; diff --git a/vowpalwabbit/global_data.h b/vowpalwabbit/global_data.h index 1d44924a..1df07f93 100644 --- a/vowpalwabbit/global_data.h +++ b/vowpalwabbit/global_data.h @@ -9,6 +9,9 @@ license as described in the file LICENSE. #include <map> #include <stdint.h> #include <cstdio> +#include <boost/program_options.hpp> +namespace po = boost::program_options; + #include "v_array.h" #include "parse_primitives.h" #include "loss_functions.h" @@ -156,9 +159,9 @@ struct vw { bool save_resume; - std::string options_from_file; - char** options_from_file_argv; - int options_from_file_argc; + po::options_description opts; + std::string file_options; + vector<std::string> args; bool searn; void* /*Searn::searn*/ searnstr; @@ -215,7 +218,6 @@ struct vw { float normalized_sum_norm_x; size_t normalized_idx; //offset idx where the norm is stored (1 or 2 depending on whether adaptive is true) - size_t feature_mask_idx; //offset idx where mask is stored uint32_t lda; float lda_alpha; @@ -277,6 +279,7 @@ void print_lda_result(vw& all, int f, float* res, float weight, v_array<char> ta void get_prediction(int sock, float& res, float& weight); void compile_gram(vector<string> grams, uint32_t* dest, char* descriptor, bool quiet); int print_tag(std::stringstream& ss, v_array<char> tag); +po::variables_map add_options(vw& all, po::options_description& opts); #endif diff --git a/vowpalwabbit/lda_core.cc b/vowpalwabbit/lda_core.cc index e07783c0..612253ff 100644 --- a/vowpalwabbit/lda_core.cc +++ b/vowpalwabbit/lda_core.cc @@ -15,7 +15,6 @@ license as described in the file LICENSE. 
#include <stdio.h>
#include <assert.h>
#include "constant.h"
-#include "sparse_dense.h"
#include "gd.h"
#include "simple_label.h"
#include "rand48.h"
@@ -750,7 +749,7 @@ void end_examples(lda& l) ld.v.delete_v();
}
-learner* setup(vw&all, vector<string>&opts, po::variables_map& vm)
+learner* setup(vw&all, po::variables_map& vm)
{
lda* ld = (lda*)calloc_or_die(1,sizeof(lda));
ld->sorted_features = vector<index_feature>();
@@ -758,20 +757,15 @@ learner* setup(vw&all, vector<string>&opts, po::variables_map& vm) ld->all = &all;
ld->example_t = all.initial_t;
- po::options_description desc("LDA options");
- desc.add_options()
+ po::options_description lda_opts("LDA options");
+ lda_opts.add_options()
("lda_alpha", po::value<float>(&all.lda_alpha), "Prior on sparsity of per-document topic weights")
("lda_rho", po::value<float>(&all.lda_rho), "Prior on sparsity of topic distributions")
("lda_D", po::value<float>(&all.lda_D), "Number of documents")
("lda_epsilon", po::value<float>(&all.lda_epsilon), "Loop convergence threshold")
("minibatch", po::value<size_t>(&all.minibatch), "Minibatch size, for LDA");
- po::parsed_options parsed = po::command_line_parser(opts).
- style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
- options(desc).allow_unregistered().run();
- opts = po::collect_unrecognized(parsed.options, po::include_positional);
- po::store(parsed, vm);
- po::notify(vm);
+ vm = add_options(all, lda_opts);
all.p->sort_features = true;
float temp = ceilf(logf((float)(all.lda*2+1)) / logf (2.f));
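(Editorial sketch, not part of the commit: the lda_core.cc hunk above is the template every reduction now follows. add_options() is the helper added to global_data.cc in this commit; `my_opts` and `--my_param` below are illustrative placeholders, not real VW options.)

    learner* setup(vw& all, po::variables_map& vm)
    {
      po::options_description my_opts("My reduction options");
      my_opts.add_options()
        ("my_param", po::value<size_t>(), "an illustrative parameter");

      // merges my_opts into the global all.opts, reparses all.args,
      // and returns the resulting variables_map
      vm = add_options(all, my_opts);

      if (vm.count("my_param")) {
        std::stringstream ss;                // persist the flag with the model,
        ss << " --my_param " << vm["my_param"].as<size_t>();
        all.file_options.append(ss.str());   // as oaa.cc and nn.cc do below
      }
      return NULL; // a real reduction constructs and returns its learner here
    }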
diff --git a/vowpalwabbit/lda_core.h b/vowpalwabbit/lda_core.h index b934548f..3dc42be5 100644 --- a/vowpalwabbit/lda_core.h +++ b/vowpalwabbit/lda_core.h @@ -7,7 +7,7 @@ license as described in the file LICENSE. #define LDA_CORE_H namespace LDA{ - LEARNER::learner* setup(vw&, std::vector<std::string>&, po::variables_map&); + LEARNER::learner* setup(vw&, po::variables_map&); } #endif diff --git a/vowpalwabbit/loss_functions.cc b/vowpalwabbit/loss_functions.cc index 52a619f2..0cdc5fa7 100644 --- a/vowpalwabbit/loss_functions.cc +++ b/vowpalwabbit/loss_functions.cc @@ -39,20 +39,20 @@ public: + 2. * (sd->max_label - label) * (prediction - sd->max_label)); } - float getUpdate(float prediction, float label,float eta_t, float norm) { - + float getUpdate(float prediction, float label, float eta_t, float pred_per_update) + { if (eta_t < 1e-6){ /* When exp(-eta_t)~= 1 we replace 1-exp(-eta_t) * with its first order Taylor expansion around 0 * to avoid catastrophic cancellation. */ - return 2.f*(label - prediction)*eta_t/norm; + return 2.f*(label - prediction)*eta_t/pred_per_update; } - return (label - prediction)*(1.f-exp(-2.f*eta_t))/norm; + return (label - prediction)*(1.f-exp(-2.f*eta_t))/pred_per_update; } - float getUnsafeUpdate(float prediction, float label,float eta_t,float norm) { - return 2.f*(label - prediction)*eta_t/norm; + float getUnsafeUpdate(float prediction, float label, float eta_t, float pred_per_update) { + return 2.f*(label - prediction)*eta_t/pred_per_update; } float getRevertingWeight(shared_data* sd, float prediction, float eta_t){ @@ -92,12 +92,12 @@ public: return example_loss; } - float getUpdate(float prediction, float label,float eta_t, float norm) { - return 2.f*eta_t*(label - prediction)/norm; + float getUpdate(float prediction, float label,float eta_t, float pred_per_update) { + return 2.f*eta_t*(label - prediction)/pred_per_update; } - float getUnsafeUpdate(float prediction, float label,float eta_t,float norm) { - return 2.f*(label - prediction)*eta_t/norm; + float getUnsafeUpdate(float prediction, float label,float eta_t,float pred_per_update) { + return 2.f*(label - prediction)*eta_t/pred_per_update; } float getRevertingWeight(shared_data* sd, float prediction, float eta_t){ @@ -132,16 +132,15 @@ public: return (e > 0) ? e : 0; } - float getUpdate(float prediction, float label,float eta_t, float norm) { + float getUpdate(float prediction, float label,float eta_t, float pred_per_update) { if(label*prediction >= 1) return 0; float err = 1 - label*prediction; - float normal= eta_t; - return label * (normal < err ? normal : err)/norm; + return label * (eta_t < err ? 
eta_t : err)/pred_per_update; } - float getUnsafeUpdate(float prediction, float label,float eta_t, float norm) { + float getUnsafeUpdate(float prediction, float label,float eta_t, float pred_per_update) { if(label*prediction >= 1) return 0; - return label * eta_t/norm; + return label * eta_t/pred_per_update; } float getRevertingWeight(shared_data*, float prediction, float eta_t){ @@ -175,23 +174,23 @@ public: return log(1 + exp(-label * prediction)); } - float getUpdate(float prediction, float label, float eta_t, float norm) { + float getUpdate(float prediction, float label, float eta_t, float pred_per_update) { float w,x; float d = exp(label * prediction); if(eta_t < 1e-6){ /* As with squared loss, for small eta_t we replace the update * with its first order Taylor expansion to avoid numerical problems */ - return label*eta_t/((1+d)*norm); + return label*eta_t/((1+d)*pred_per_update); } x = eta_t + label*prediction + d; w = wexpmx(x); - return -(label*w+prediction)/norm; + return -(label*w+prediction)/pred_per_update; } - float getUnsafeUpdate(float prediction, float label, float eta_t, float norm) { + float getUnsafeUpdate(float prediction, float label, float eta_t, float pred_per_update) { float d = exp(label * prediction); - return label*eta_t/((1+d)*norm); + return label*eta_t/((1+d)*pred_per_update); } inline float wexpmx(float x){ @@ -246,24 +245,24 @@ public: } - float getUpdate(float prediction, float label, float eta_t, float norm) { + float getUpdate(float prediction, float label, float eta_t, float pred_per_update) { float err = label - prediction; if(err == 0) return 0; float normal = eta_t;//base update size if(err > 0) { normal = tau*normal; - return (normal < err ? normal : err) / norm; + return (normal < err ? normal : err) / pred_per_update; } else { normal = -(1-tau) * normal; - return ( normal > err ? normal : err) / norm; + return ( normal > err ? 
normal : err) / pred_per_update;
}
}
- float getUnsafeUpdate(float prediction, float label, float eta_t, float norm) {
+ float getUnsafeUpdate(float prediction, float label, float eta_t, float pred_per_update) {
float err = label - prediction;
if(err == 0) return 0;
- if(err > 0) return tau*eta_t/norm;
- return -(1-tau)*eta_t/norm;
+ if(err > 0) return tau*eta_t/pred_per_update;
+ return -(1-tau)*eta_t/pred_per_update;
}
float getRevertingWeight(shared_data* sd, float prediction, float eta_t){
diff --git a/vowpalwabbit/lrq.cc b/vowpalwabbit/lrq.cc
index f7e9e466..8e71f85b 100644
--- a/vowpalwabbit/lrq.cc
+++ b/vowpalwabbit/lrq.cc
@@ -178,7 +178,7 @@ namespace LRQ {
}
}
- learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+ learner* setup(vw& all, po::variables_map& vm)
{//parse and set arguments
LRQstate* lrq = (LRQstate*) calloc (1, sizeof (LRQstate));
unsigned int maxk = 0;
@@ -186,32 +186,22 @@ namespace LRQ {
size_t random_seed = 0;
if (vm.count("random_seed")) random_seed = vm["random_seed"].as<size_t> ();
- if (vm_file.count("random_seed")) random_seed = vm_file["random_seed"].as<size_t> ();
lrq->initial_seed = lrq->seed = random_seed | 8675309;
- lrq->dropout = vm.count("lrqdropout") || vm_file.count("lrqdropout");
+ lrq->dropout = vm.count("lrqdropout");
- if (lrq->dropout && !vm_file.count("lrqdropout"))
- all.options_from_file.append(" --lrqdropout");
-
- if (!vm_file.count("lrq"))
- {
- lrq->lrpairs = vm["lrq"].as<vector<string> > ();
-
- // TODO: doesn't work for non-printable stuff
-
- stringstream ss;
- for (vector<string>::iterator i = lrq->lrpairs.begin ();
- i != lrq->lrpairs.end ();
- ++i)
- {
- ss << " --lrq " << *i;
- }
-
- all.options_from_file.append(ss.str());
- }
- else
- lrq->lrpairs = vm_file["lrq"].as<vector<string> > ();
+ if (lrq->dropout)
+ all.file_options.append(" --lrqdropout");
+
+ lrq->lrpairs = vm["lrq"].as<vector<string> > ();
+
+ stringstream ss;
+ for (vector<string>::iterator i = lrq->lrpairs.begin ();
+ i != lrq->lrpairs.end ();
+ ++i)
+ ss << " --lrq " << *i;
+
+ all.file_options.append(ss.str());
+
if (! all.quiet)
{
diff --git a/vowpalwabbit/lrq.h b/vowpalwabbit/lrq.h
index 6c4254e8..13015cda 100644
--- a/vowpalwabbit/lrq.h
+++ b/vowpalwabbit/lrq.h
@@ -2,6 +2,6 @@
#ifndef LRQ_HEADER
#define LRQ_HEADER
namespace LRQ {
- LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file);
+ LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
diff --git a/vowpalwabbit/nn.cc b/vowpalwabbit/nn.cc
index 278b6e55..fe22eec9 100644
--- a/vowpalwabbit/nn.cc
+++ b/vowpalwabbit/nn.cc
@@ -304,59 +304,34 @@ CONVERSE: // That's right, I'm using goto. So sue me.
free (n.output_layer.atomics[nn_output_namespace].begin);
}
- learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file)
+ learner* setup(vw& all, po::variables_map& vm)
{
nn* n = (nn*)calloc_or_die(1,sizeof(nn));
n->all = &all;
- po::options_description desc("NN options");
- desc.add_options()
+ po::options_description nn_opts("NN options");
+ nn_opts.add_options()
("inpass", "Train or test sigmoidal feedforward network with input passthrough.")
("dropout", "Train or test sigmoidal feedforward network using dropout.")
("meanfield", "Train or test sigmoidal feedforward network using mean field.");
- po::parsed_options parsed = po::command_line_parser(opts).
- style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
- options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); - - po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc,all.options_from_file_argv). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - po::store(parsed_file, vm_file); - po::notify(vm_file); + vm = add_options(all, nn_opts); //first parse for number of hidden units - n->k = 0; - if( vm_file.count("nn") ) { - n->k = (uint32_t)vm_file["nn"].as<size_t>(); - if( vm.count("nn") && (uint32_t)vm["nn"].as<size_t>() != n->k ) - std::cerr << "warning: you specified a different number of hidden units through --nn than the one loaded from predictor. Pursuing with loaded value of: " << n->k << endl; - } - else { - n->k = (uint32_t)vm["nn"].as<size_t>(); - - std::stringstream ss; - ss << " --nn " << n->k; - all.options_from_file.append(ss.str()); - } + n->k = (uint32_t)vm["nn"].as<size_t>(); + + std::stringstream ss; + ss << " --nn " << n->k; + all.file_options.append(ss.str()); - if( vm_file.count("dropout") ) { - n->dropout = all.training || vm.count("dropout"); - - if (! n->dropout && ! vm.count("meanfield") && ! all.quiet) - std::cerr << "using mean field for testing, specify --dropout explicitly to override" << std::endl; - } - else if ( vm.count("dropout") ) { + if ( vm.count("dropout") ) { n->dropout = true; - + std::stringstream ss; ss << " --dropout "; - all.options_from_file.append(ss.str()); + all.file_options.append(ss.str()); } - + if ( vm.count("meanfield") ) { n->dropout = false; if (! all.quiet) @@ -371,15 +346,12 @@ CONVERSE: // That's right, I'm using goto. So sue me. << (all.training ? "training" : "testing") << std::endl; - if( vm_file.count("inpass") ) { - n->inpass = true; - } - else if (vm.count ("inpass")) { + if (vm.count ("inpass")) { n->inpass = true; std::stringstream ss; ss << " --inpass"; - all.options_from_file.append(ss.str()); + all.file_options.append(ss.str()); } if (n->inpass && ! all.quiet) diff --git a/vowpalwabbit/nn.h b/vowpalwabbit/nn.h index 9e3237cc..a0eaa7c9 100644 --- a/vowpalwabbit/nn.h +++ b/vowpalwabbit/nn.h @@ -11,7 +11,7 @@ license as described in the file LICENSE. namespace NN { - LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/oaa.cc b/vowpalwabbit/oaa.cc index 836b4d8a..cdd97062 100644 --- a/vowpalwabbit/oaa.cc +++ b/vowpalwabbit/oaa.cc @@ -82,24 +82,17 @@ namespace OAA { VW::finish_example(all, &ec); } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { oaa* data = (oaa*)calloc_or_die(1, sizeof(oaa)); //first parse for number of actions - if( vm_file.count("oaa") ) { - data->k = (uint32_t)vm_file["oaa"].as<size_t>(); - if( vm.count("oaa") && (uint32_t)vm["oaa"].as<size_t>() != data->k ) - std::cerr << "warning: you specified a different number of actions through --oaa than the one loaded from predictor. 
Pursuing with loaded value of: " << data->k << endl;
- }
- else {
- data->k = (uint32_t)vm["oaa"].as<size_t>();
-
- //append oaa with nb_actions to options_from_file so it is saved to regressor later
- std::stringstream ss;
- ss << " --oaa " << data->k;
- all.options_from_file.append(ss.str());
- }
+ data->k = (uint32_t)vm["oaa"].as<size_t>();
+
+ //append oaa with nb_actions to file_options so it is saved to regressor later
+ std::stringstream ss;
+ ss << " --oaa " << data->k;
+ all.file_options.append(ss.str());
data->shouldOutput = all.raw_prediction > 0;
data->all = &all;
diff --git a/vowpalwabbit/oaa.h b/vowpalwabbit/oaa.h
index 74ec99b0..302d15dd 100644
--- a/vowpalwabbit/oaa.h
+++ b/vowpalwabbit/oaa.h
@@ -8,7 +8,7 @@ license as described in the file LICENSE.
namespace OAA
{
- LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file);
+ LEARNER::learner* setup(vw& all, po::variables_map& vm);
}
#endif
diff --git a/vowpalwabbit/parse_args.cc b/vowpalwabbit/parse_args.cc
index 3edc34f5..47925673 100644
--- a/vowpalwabbit/parse_args.cc
+++ b/vowpalwabbit/parse_args.cc
@@ -103,14 +103,18 @@ void parse_affix_argument(vw&all, string str) {
free(cstr);
}
-void parse_diagnostics(vw& all, po::variables_map& vm, po::options_description& desc, int argc)
+void parse_diagnostics(vw& all, po::variables_map& vm, int argc)
{
- // Begin diagnostic options
- if (vm.count("help") || argc == 1) {
- /* upon direct query for help -- spit it out to stdout */
- cout << "\n" << desc << "\n";
- exit(0);
- }
+ po::options_description diag_opt("Diagnostic options");
+
+ diag_opt.add_options()
+ ("version","Version information")
+ ("audit,a", "print weights of features")
+ ("progress,P", po::value< string >(), "Progress update frequency. int: additive, float: multiplicative")
+ ("quiet", "Don't output diagnostics and progress updates")
+ ("help,h","Look here: http://hunch.net/~vw/ and click on Tutorial.");
+
+ vm = add_options(all, diag_opt);
if (vm.count("version")) {
/* upon direct query for version -- spit it out to stdout */
@@ -122,6 +126,9 @@ void parse_diagnostics(vw& all, po::variables_map& vm, po::options_description& desc, int argc)
all.quiet = true; // --quiet wins over --progress
}
else
{
+ if (argc == 1)
+ cerr << "For more information use: vw --help" << endl;
+
all.quiet = false;
if (vm.count("progress")) {
@@ -166,6 +173,34 @@ void parse_diagnostics(vw& all, po::variables_map& vm, po::options_description& desc, int argc)
void parse_source(vw& all, po::variables_map& vm)
{
+ po::options_description in_opt("Input options");
+
+ in_opt.add_options()
+ ("data,d", po::value< string >(), "Example Set")
+ ("daemon", "persistent daemon mode on port 26542")
+ ("port", po::value<size_t>(),"port to listen on; use 0 to pick unused port")
+ ("num_children", po::value<size_t>(&(all.num_children)), "number of children for persistent daemon mode")
+ ("pid_file", po::value< string >(), "Write pid file in persistent daemon mode")
+ ("port_file", po::value< string >(), "Write port used in persistent daemon mode")
+ ("cache,c", "Use a cache. The default is <data>.cache")
+ ("cache_file", po::value< vector<string> >(), "The location(s) of cache_file.")
+ ("kill_cache,k", "do not reuse existing cache: create a new one always")
+ ("compressed", "use gzip format whenever possible. If a cache file is being created, this option creates a compressed cache file. A mixture of raw-text & compressed inputs is supported with autodetection.")
+ ("no_stdin", "do not default to reading from stdin");
+
+ vm = add_options(all, in_opt);
+
+ // Be friendly: if -d was left out, treat positional param as data file
+ po::positional_options_description p;
+ p.add("data", -1);
+
+ po::parsed_options pos = po::command_line_parser(all.args).
+ style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
+ options(all.opts).positional(p).run();
+ vm = po::variables_map();
+ po::store(pos, vm);
+
//begin input source
if (vm.count("no_stdin"))
all.stdin_off = true;
@@ -192,10 +227,41 @@ void parse_source(vw& all, po::variables_map& vm)
set_compressed(all.p);
} else
all.data_filename = "";
+
+ if ((vm.count("cache") || vm.count("cache_file")) && vm.count("invert_hash"))
+ {
+ cout << "invert_hash is incompatible with a cache file. Use it in single pass mode only." << endl;
+ throw exception();
+ }
+
+ if(!all.holdout_set_off && (vm.count("output_feature_regularizer_binary") || vm.count("output_feature_regularizer_text")))
+ {
+ all.holdout_set_off = true;
+ cerr<<"Making holdout_set_off=true since output regularizer specified\n";
+ }
}
-void parse_feature_tweaks(vw& all, po::variables_map& vm, po::variables_map& vm_file)
+void parse_feature_tweaks(vw& all, po::variables_map& vm)
{
+ po::options_description feature_opt("Feature options");
+ feature_opt.add_options()
+ ("hash", po::value< string > (), "how to hash the features. Available options: strings, all")
+ ("ignore", po::value< vector<unsigned char> >(), "ignore namespaces beginning with character <arg>")
+ ("keep", po::value< vector<unsigned char> >(), "keep namespaces beginning with character <arg>")
+ ("bit_precision,b", po::value<size_t>(), "number of bits in the feature table")
+ ("noconstant", "Don't add a constant feature")
+ ("constant,C", po::value<float>(&(all.initial_constant)), "Set initial value of constant")
+ ("ngram", po::value< vector<string> >(), "Generate N grams. To generate N grams for a single namespace 'foo', arg should be fN.")
+ ("skips", po::value< vector<string> >(), "Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace 'foo', arg should be fn.")
+ ("affix", po::value<string>(), "generate prefixes/suffixes of features; argument '+2a,-3b,+1' means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace")
+ ("spelling", po::value< vector<string> >(), "compute spelling features for a given namespace (use '_' for default namespace)")
+ ("quadratic,q", po::value< vector<string> > (), "Create and use quadratic features")
+ ("q:", po::value< string >(), ": corresponds to a wildcard for all printable characters")
+ ("cubic", po::value< vector<string> > (),
+ "Create and use cubic features");
+
+ vm = add_options(all, feature_opt);
+
//feature manipulation
string hash_function("strings");
if(vm.count("hash"))
@@ -209,17 +275,11 @@ void parse_feature_tweaks(vw& all, po::variables_map& vm, po::variables_map& vm_file)
else
all.spelling_features[(size_t)spelling_ns[id][0]] = true;
}
- if (vm_file.count("affix") && vm.count("affix")) {
- cerr << "should not specify --affix when loading a model trained with affix features (they're turned on by default)" << endl;
- throw exception();
- }
- if (vm_file.count("affix"))
- parse_affix_argument(all, vm_file["affix"].as<string>());
if (vm.count("affix")) {
parse_affix_argument(all, vm["affix"].as<string>());
stringstream ss;
ss << " --affix " << vm["affix"].as<string>();
- all.options_from_file.append(ss.str());
+ all.file_options.append(ss.str());
}
if(vm.count("ngram")){
@@ -301,6 +361,9 @@ void parse_feature_tweaks(vw& all, po::variables_map& vm, po::variables_map& vm_file)
else if((*i)[0]==':'&&(*i)[1]==':'){
cout << "in pair creation" << endl;
newpairs.reserve(newpairs.size() + valid_ns_size*valid_ns_size);
+ stringstream ss;
+ ss << ' ' << ' ';
+ newpairs.push_back(ss.str());
for (char j=printable_start; j<=printable_end; j++){
if(valid_ns(j)){
for (char k=printable_start; k<=printable_end; k++){
@@ -392,6 +455,27 @@ void parse_feature_tweaks(vw& all, po::variables_map& vm, po::variables_map& vm_file)
void parse_example_tweaks(vw& all, po::variables_map& vm)
{
+ po::options_description example_opts("Example options");
+
+ example_opts.add_options()
+ ("testonly,t", "Ignore label information and just test")
+ ("holdout_off", "no holdout data in multiple passes")
+ ("holdout_period", po::value<uint32_t>(&(all.holdout_period)), "holdout period for test only, default 10")
+ ("holdout_after", po::value<uint32_t>(&(all.holdout_after)), "holdout after n training examples, default off (disables holdout_period)")
+ ("early_terminate", po::value<size_t>(), "Specify the number of passes tolerated when holdout loss doesn't decrease before early termination, default is 3")
+ ("passes", po::value<size_t>(&(all.numpasses)),"Number of Training Passes")
+ ("initial_pass_length", po::value<size_t>(&(all.pass_length)), "initial number of examples per pass")
+ ("examples", po::value<size_t>(&(all.max_examples)), "number of examples to parse")
+ ("min_prediction", po::value<float>(&(all.sd->min_label)), "Smallest prediction to output")
+ ("max_prediction", po::value<float>(&(all.sd->max_label)), "Largest prediction to output")
+ ("sort_features", "turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes")
+ ("loss_function", po::value<string>()->default_value("squared"), "Specify the loss function to be used, uses squared by default.
Currently available ones are squared, classic, hinge, logistic and quantile.") + ("quantile_tau", po::value<float>()->default_value(0.5), "Parameter \\tau associated with Quantile loss. Defaults to 0.5") + ("l1", po::value<float>(&(all.l1_lambda)), "l_1 lambda") + ("l2", po::value<float>(&(all.l2_lambda)), "l_2 lambda"); + + vm = add_options(all, example_opts); + if (vm.count("testonly") || all.eta == 0.) { if (!all.quiet) @@ -409,12 +493,6 @@ void parse_example_tweaks(vw& all, po::variables_map& vm) if(vm.count("holdout_off")) all.holdout_set_off = true; - if(!all.holdout_set_off && (vm.count("output_feature_regularizer_binary") || vm.count("output_feature_regularizer_text"))) - { - all.holdout_set_off = true; - cerr<<"Making holdout_set_off=true since output regularizer specified\n"; - } - if(vm.count("sort_features")) all.p->sort_features = true; @@ -455,8 +533,17 @@ void parse_example_tweaks(vw& all, po::variables_map& vm) } } -void parse_output_preds(vw& all, po::variables_map& vm, po::variables_map& vm_file) +void parse_output_preds(vw& all, po::variables_map& vm) { + po::options_description out_opt("Output options"); + + out_opt.add_options() + ("predictions,p", po::value< string >(), "File to output predictions to") + ("raw_predictions,r", po::value< string >(), "File to output unnormalized predictions to") + ; + + vm = add_options(all, out_opt); + if (vm.count("predictions")) { if (!all.quiet) cerr << "predictions = " << vm["predictions"].as< string >() << endl; @@ -482,7 +569,7 @@ void parse_output_preds(vw& all, po::variables_map& vm, po::variables_map& vm_fi if (vm.count("raw_predictions")) { if (!all.quiet) { cerr << "raw predictions = " << vm["raw_predictions"].as< string >() << endl; - if (vm.count("binary") || vm_file.count("binary")) + if (vm.count("binary")) cerr << "Warning: --raw has no defined value when --binary specified, expect no output" << endl; } if (strcmp(vm["raw_predictions"].as< string >().c_str(), "stdout") == 0) @@ -503,6 +590,19 @@ void parse_output_preds(vw& all, po::variables_map& vm, po::variables_map& vm_fi void parse_output_model(vw& all, po::variables_map& vm) { + po::options_description output_model("Output model"); + + output_model.add_options() + ("final_regressor,f", po::value< string >(), "Final regressor") + ("readable_model", po::value< string >(), "Output human-readable final regressor with numeric features") + ("invert_hash", po::value< string >(), "Output human-readable final regressor with feature names. Computationally expensive.") + ("save_resume", "save extra state so learning can be resumed later with new data") + ("save_per_pass", "Save the model after every pass over data") + ("output_feature_regularizer_binary", po::value< string >(&(all.per_feature_regularizer_output)), "Per feature regularization output file") + ("output_feature_regularizer_text", po::value< string >(&(all.per_feature_regularizer_text)), "Per feature regularization output file, in text"); + + vm = add_options(all, output_model); + if (vm.count("final_regressor")) { all.final_regressor_name = vm["final_regressor"].as<string>(); if (!all.quiet) @@ -526,13 +626,30 @@ void parse_output_model(vw& all, po::variables_map& vm) all.save_resume = true; } -void parse_base_algorithm(vw& all, vector<string>& to_pass_further, po::variables_map& vm) +void parse_base_algorithm(vw& all, po::variables_map& vm) { //base learning algorithm. 
+ po::options_description base_opt("base algorithms (these are exclusive)"); + + base_opt.add_options() + ("sgd", "use regular stochastic gradient descent update.") + ("adaptive", "use adaptive, individual learning rates.") + ("invariant", "use safe/importance aware updates.") + ("normalized", "use per feature normalized updates") + ("exact_adaptive_norm", "use current default invariant normalized adaptive update rule") + ("bfgs", "use bfgs optimization") + ("lda", po::value<uint32_t>(&(all.lda)), "Run lda with <int> topics") + ("rank", po::value<uint32_t>(&(all.rank)), "rank for matrix factorization.") + ("noop","do no learning") + ("print","print examples") + ("sendto", po::value< vector<string> >(), "send examples to <host>"); + + vm = add_options(all, base_opt); + if (vm.count("bfgs") || vm.count("conjugate_gradient")) - all.l = BFGS::setup(all, to_pass_further, vm); + all.l = BFGS::setup(all, vm); else if (vm.count("lda")) - all.l = LDA::setup(all, to_pass_further, vm); + all.l = LDA::setup(all, vm); else if (vm.count("noop")) all.l = NOOP::setup(all); else if (vm.count("print")) @@ -571,192 +688,191 @@ void load_input_model(vw& all, po::variables_map& vm, io_buf& io_temp) } } -void parse_scorer_reductions(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file) +void parse_scorer_reductions(vw& all, po::variables_map& vm) { - if(vm.count("nn") || vm_file.count("nn") ) - all.l = NN::setup(all, to_pass_further, vm, vm_file); + po::options_description score_mod_opt("Score modifying options (can be combined)"); + + score_mod_opt.add_options() + ("nn", po::value<size_t>(), "Use sigmoidal feedforward network with <k> hidden units") + ("new_mf", "use new, reduction-based matrix factorization") + ("autolink", po::value<size_t>(), "create link function with polynomial d") + ("lrq", po::value<vector<string> > (), "use low rank quadratic features") + ("lrqdropout", "use dropout training for low rank quadratic features"); + + vm = add_options(all, score_mod_opt); + + if(vm.count("nn")) + all.l = NN::setup(all, vm); if (vm.count("new_mf") && all.rank > 0) all.l = MF::setup(all, vm); - if(vm.count("autolink") || vm_file.count("autolink") ) - all.l = ALINK::setup(all, to_pass_further, vm, vm_file); + if(vm.count("autolink")) + all.l = ALINK::setup(all, vm); - if (vm.count("lrq") || vm_file.count("lrq")) - all.l = LRQ::setup(all, to_pass_further, vm, vm_file); + if (vm.count("lrq")) + all.l = LRQ::setup(all, vm); - all.l = Scorer::setup(all, to_pass_further, vm, vm_file); + all.l = Scorer::setup(all, vm); } -LEARNER::learner* exclusive_setup(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file, bool& score_consumer, LEARNER::learner* (*setup)(vw&, vector<string>&, po::variables_map&, po::variables_map&)) +LEARNER::learner* exclusive_setup(vw& all, po::variables_map& vm, bool& score_consumer, LEARNER::learner* (*setup)(vw&, po::variables_map&)) { if (score_consumer) { cerr << "error: cannot specify multiple direct score consumers" << endl; throw exception(); } score_consumer = true; - return setup(all, to_pass_further, vm, vm_file); + return setup(all, vm); } -void parse_score_users(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file, bool& got_cs) +void parse_score_users(vw& all, po::variables_map& vm, bool& got_cs) { + po::options_description multiclass_opt("Score user options (these are exclusive)"); + multiclass_opt.add_options() + ("top", po::value<size_t>(), "top k recommendation") + 
("binary", "report loss as binary classification on -1,1") + ("oaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> labels") + ("ect", po::value<size_t>(), "Use error correcting tournament with <k> labels") + ("csoaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> costs") + ("wap", po::value<size_t>(), "Use weighted all-pairs multiclass learning with <k> costs") + ("csoaa_ldf", po::value<string>(), "Use one-against-all multiclass learning with label dependent features. Specify singleline or multiline.") + ("wap_ldf", po::value<string>(), "Use weighted all-pairs multiclass learning with label dependent features. Specify singleline or multiline.") + ; + + vm = add_options(all, multiclass_opt); bool score_consumer = false; - if(vm.count("top") || vm_file.count("top") ) - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, TOPK::setup); + if(vm.count("top")) + all.l = exclusive_setup(all, vm, score_consumer, TOPK::setup); - if (vm.count("binary") || vm_file.count("binary")) - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, BINARY::setup); + if (vm.count("binary")) + all.l = exclusive_setup(all, vm, score_consumer, BINARY::setup); - if (vm.count("oaa") || vm_file.count("oaa") ) - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, OAA::setup); + if (vm.count("oaa")) + all.l = exclusive_setup(all, vm, score_consumer, OAA::setup); - if (vm.count("ect") || vm_file.count("ect") ) - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, ECT::setup); + if (vm.count("ect")) + all.l = exclusive_setup(all, vm, score_consumer, ECT::setup); - if(vm.count("csoaa") || vm_file.count("csoaa") ) { - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, CSOAA::setup); + if(vm.count("csoaa")) { + all.l = exclusive_setup(all, vm, score_consumer, CSOAA::setup); all.cost_sensitive = all.l; got_cs = true; } - if(vm.count("wap") || vm_file.count("wap") ) { - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, WAP::setup); + if(vm.count("wap")) { + all.l = exclusive_setup(all, vm, score_consumer, WAP::setup); all.cost_sensitive = all.l; got_cs = true; } - if(vm.count("csoaa_ldf") || vm_file.count("csoaa_ldf")) { - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, CSOAA_AND_WAP_LDF::setup); + if(vm.count("csoaa_ldf") || vm.count("csoaa_ldf")) { + all.l = exclusive_setup(all, vm, score_consumer, CSOAA_AND_WAP_LDF::setup); all.cost_sensitive = all.l; got_cs = true; } - if(vm.count("wap_ldf") || vm_file.count("wap_ldf") ) { - all.l = exclusive_setup(all, to_pass_further, vm, vm_file, score_consumer, CSOAA_AND_WAP_LDF::setup); + if(vm.count("wap_ldf") || vm.count("wap_ldf") ) { + all.l = exclusive_setup(all, vm, score_consumer, CSOAA_AND_WAP_LDF::setup); all.cost_sensitive = all.l; got_cs = true; } } -void parse_cb(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file, bool& got_cs, bool& got_cb) +void parse_cb(vw& all, po::variables_map& vm, bool& got_cs, bool& got_cb) { - if( vm.count("cb") || vm_file.count("cb") ) + po::options_description cb_opts("Contextual Bandit options"); + + cb_opts.add_options() + ("cb", po::value<size_t>(), "Use contextual bandit learning with <k> costs") + ("cbify", po::value<size_t>(), "Convert multiclass on <k> classes into a contextual bandit problem and solve"); + + vm = add_options(all,cb_opts); + + if( vm.count("cb")) { if(!got_cs) { - if( 
vm_file.count("cb") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm_file["cb"])); + if( vm.count("cb") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["cb"])); else vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["cb"])); - all.l = CSOAA::setup(all, to_pass_further, vm, vm_file); // default to CSOAA unless wap is specified + all.l = CSOAA::setup(all, vm); // default to CSOAA unless wap is specified all.cost_sensitive = all.l; got_cs = true; } - all.l = CB_ALGS::setup(all, to_pass_further, vm, vm_file); + all.l = CB_ALGS::setup(all, vm); got_cb = true; } - if (vm.count("cbify") || vm_file.count("cbify")) + if (vm.count("cbify")) { if(!got_cs) { - if( vm_file.count("cbify") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm_file["cbify"])); - else vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["cbify"])); + vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["cbify"])); - all.l = CSOAA::setup(all, to_pass_further, vm, vm_file); // default to CSOAA unless wap is specified + all.l = CSOAA::setup(all, vm); // default to CSOAA unless wap is specified all.cost_sensitive = all.l; got_cs = true; } - + if (!got_cb) { - if( vm_file.count("cbify") ) vm.insert(pair<string,po::variable_value>(string("cb"),vm_file["cbify"])); - else vm.insert(pair<string,po::variable_value>(string("cb"),vm["cbify"])); - all.l = CB_ALGS::setup(all, to_pass_further, vm, vm_file); + vm.insert(pair<string,po::variable_value>(string("cb"),vm["cbify"])); + all.l = CB_ALGS::setup(all, vm); got_cb = true; } - all.l = CBIFY::setup(all, to_pass_further, vm, vm_file); + all.l = CBIFY::setup(all, vm); } } -void parse_search(vw& all, vector<string>& to_pass_further, po::variables_map& vm, po::variables_map vm_file, bool& got_cs, bool& got_cb) +void parse_search(vw& all, po::variables_map& vm, bool& got_cs, bool& got_cb) { - if (vm.count("search") || vm_file.count("search") ) { + po::options_description search_opts("Search"); + + search_opts.add_options() + ("search", po::value<size_t>(), "use search-based structured prediction, argument=maximum action id or 0 for LDF"); + + vm = add_options(all,search_opts); + + if (vm.count("search")) { if (!got_cs && !got_cb) { - if( vm_file.count("search") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm_file["search"])); + if( vm.count("search") ) vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["search"])); else vm.insert(pair<string,po::variable_value>(string("csoaa"),vm["search"])); - all.l = CSOAA::setup(all, to_pass_further, vm, vm_file); // default to CSOAA unless others have been specified + all.l = CSOAA::setup(all, vm); // default to CSOAA unless others have been specified all.cost_sensitive = all.l; got_cs = true; } //all.searnstr = (Searn::searn*)calloc_or_die(1, sizeof(Searn::searn)); - all.l = Searn::setup(all, to_pass_further, vm, vm_file); + all.l = Searn::setup(all, vm); } } -vw* parse_args(int argc, char *argv[]) +void add_to_args(vw& all, int argc, char* argv[]) { - po::options_description desc("VW options"); + for (int i = 1; i < argc; i++) + all.args.push_back(string(argv[i])); +} +vw* parse_args(int argc, char *argv[]) +{ vw* all = new vw(); + add_to_args(*all, argc, argv); + size_t random_seed = 0; all->program_name = argv[0]; - po::options_description in_opt("Input options"); - - in_opt.add_options() - ("data,d", po::value< string >(), "Example Set") - ("ring_size", po::value<size_t>(&(all->p->ring_size)), "size of example ring") - ("examples", 
po::value<size_t>(&(all->max_examples)), "number of examples to parse") - ("testonly,t", "Ignore label information and just test") - ("daemon", "persistent daemon mode on port 26542") - ("port", po::value<size_t>(),"port to listen on; use 0 to pick unused port") - ("num_children", po::value<size_t>(&(all->num_children)), "number of children for persistent daemon mode") - ("pid_file", po::value< string >(), "Write pid file in persistent daemon mode") - ("port_file", po::value< string >(), "Write port used in persistent daemon mode") - ("passes", po::value<size_t>(&(all->numpasses)),"Number of Training Passes") - ("cache,c", "Use a cache. The default is <data>.cache") - ("cache_file", po::value< vector<string> >(), "The location(s) of cache_file.") - ("kill_cache,k", "do not reuse existing cache: create a new one always") - ("compressed", "use gzip format whenever possible. If a cache file is being created, this option creates a compressed cache file. A mixture of raw-text & compressed inputs are supported with autodetection.") - ("no_stdin", "do not default to reading from stdin") - ("save_resume", "save extra state so learning can be resumed later with new data") - ; - - po::options_description out_opt("Output options"); + po::options_description desc("VW options"); - out_opt.add_options() - ("audit,a", "print weights of features") - ("predictions,p", po::value< string >(), "File to output predictions to") - ("raw_predictions,r", po::value< string >(), "File to output unnormalized predictions to") - ("sendto", po::value< vector<string> >(), "send examples to <host>") - ("quiet", "Don't output disgnostics and progress updates") - ("progress,P", po::value< string >(), "Progress update frequency. int: additive, float: multiplicative") - ("binary", "report loss as binary classification on -1,1") - ("min_prediction", po::value<float>(&(all->sd->min_label)), "Smallest prediction to output") - ("max_prediction", po::value<float>(&(all->sd->max_label)), "Largest prediction to output") - ; + desc.add_options() + ("random_seed", po::value<size_t>(&random_seed), "seed random number generator") + ("ring_size", po::value<size_t>(&(all->p->ring_size)), "size of example ring"); po::options_description update_opt("Update options"); update_opt.add_options() - ("sgd", "use regular stochastic gradient descent update.") - ("hessian_on", "use second derivative in line search") - ("bfgs", "use bfgs optimization") - ("mem", po::value<int>(&(all->m)), "memory in bfgs") - ("termination", po::value<float>(&(all->rel_threshold)),"Termination threshold") - ("adaptive", "use adaptive, individual learning rates.") - ("invariant", "use safe/importance aware updates.") - ("normalized", "use per feature normalized updates") - ("exact_adaptive_norm", "use current default invariant normalized adaptive update rule") - ("conjugate_gradient", "use conjugate gradient based optimization") - ("l1", po::value<float>(&(all->l1_lambda)), "l_1 lambda") - ("l2", po::value<float>(&(all->l2_lambda)), "l_2 lambda") - ("learning_rate,l", po::value<float>(&(all->eta)), "Set Learning Rate") - ("loss_function", po::value<string>()->default_value("squared"), "Specify the loss function to be used, uses squared by default. Currently available ones are squared, classic, hinge, logistic and quantile.") - ("quantile_tau", po::value<float>()->default_value(0.5), "Parameter \\tau associated with Quantile loss. 
Defaults to 0.5") + ("learning_rate,l", po::value<float>(&(all->eta)), "Set learning rate") ("power_t", po::value<float>(&(all->power_t)), "t power value") ("decay_learning_rate", po::value<float>(&(all->eta_decay_rate)), "Set Decay factor for learning_rate between passes") - ("initial_pass_length", po::value<size_t>(&(all->pass_length)), "initial number of examples per pass") ("initial_t", po::value<double>(&((all->sd->t))), "initial t value") ("feature_mask", po::value< string >(), "Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights.") ; @@ -764,66 +880,10 @@ vw* parse_args(int argc, char *argv[]) po::options_description weight_opt("Weight options"); weight_opt.add_options() - ("bit_precision,b", po::value<size_t>(), "number of bits in the feature table") ("initial_regressor,i", po::value< vector<string> >(), "Initial regressor(s)") - ("final_regressor,f", po::value< string >(), "Final regressor") ("initial_weight", po::value<float>(&(all->initial_weight)), "Set all weights to an initial value of 1.") ("random_weights", po::value<bool>(&(all->random_weights)), "make initial weights random") - ("readable_model", po::value< string >(), "Output human-readable final regressor with numeric features") - ("invert_hash", po::value< string >(), "Output human-readable final regressor with feature names") - ("save_per_pass", "Save the model after every pass over data") ("input_feature_regularizer", po::value< string >(&(all->per_feature_regularizer_input)), "Per feature regularization input file") - ("output_feature_regularizer_binary", po::value< string >(&(all->per_feature_regularizer_output)), "Per feature regularization output file") - ("output_feature_regularizer_text", po::value< string >(&(all->per_feature_regularizer_text)), "Per feature regularization output file, in text") - ; - - po::options_description holdout_opt("Holdout options"); - holdout_opt.add_options() - ("holdout_off", "no holdout data in multiple passes") - ("holdout_period", po::value<uint32_t>(&(all->holdout_period)), "holdout period for test only, default 10") - ("holdout_after", po::value<uint32_t>(&(all->holdout_after)), "holdout after n training examples, default off (disables holdout_period)") - ("early_terminate", po::value<size_t>(), "Specify the number of passes tolerated when holdout loss doesn't decrease before early termination, default is 3") - ; - - po::options_description namespace_opt("Feature namespace options"); - namespace_opt.add_options() - ("hash", po::value< string > (), "how to hash the features. Available options: strings, all") - ("ignore", po::value< vector<unsigned char> >(), "ignore namespaces beginning with character <arg>") - ("keep", po::value< vector<unsigned char> >(), "keep namespaces beginning with character <arg>") - ("noconstant", "Don't add a constant feature") - ("constant,C", po::value<float>(&(all->initial_constant)), "Set initial value of constant") - ("sort_features", "turn this on to disregard order in which features have been defined. This will lead to smaller cache sizes") - ("ngram", po::value< vector<string> >(), "Generate N grams. To generate N grams for a single namespace 'foo', arg should be fN.") - ("skips", po::value< vector<string> >(), "Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. 
To generate n-skips for a single namespace 'foo', arg should be fn.") - ("affix", po::value<string>(), "generate prefixes/suffixes of features; argument '+2a,-3b,+1' means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace") - ("spelling", po::value< vector<string> >(), "compute spelling features for a give namespace (use '_' for default namespace)"); - ; - - po::options_description mf_opt("Matrix factorization options"); - mf_opt.add_options() - ("quadratic,q", po::value< vector<string> > (), - "Create and use quadratic features") - ("q:", po::value< string >(), ": corresponds to a wildcard for all printable characters") - ("cubic", po::value< vector<string> > (), - "Create and use cubic features") - ("rank", po::value<uint32_t>(&(all->rank)), "rank for matrix factorization.") - ("new_mf", "use new, reduction-based matrix factorization") - ; - - po::options_description lrq_opt("Low Rank Quadratic options"); - lrq_opt.add_options() - ("lrq", po::value<vector<string> > (), "use low rank quadratic features") - ("lrqdropout", "use dropout training for low rank quadratic features") - ; - - po::options_description multiclass_opt("Multiclass options"); - multiclass_opt.add_options() - ("oaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> labels") - ("ect", po::value<size_t>(), "Use error correcting tournament with <k> labels") - ("csoaa", po::value<size_t>(), "Use one-against-all multiclass learning with <k> costs") - ("wap", po::value<size_t>(), "Use weighted all-pairs multiclass learning with <k> costs") - ("csoaa_ldf", po::value<string>(), "Use one-against-all multiclass learning with label dependent features. Specify singleline or multiline.") - ("wap_ldf", po::value<string>(), "Use weighted all-pairs multiclass learning with label dependent features. Specify singleline or multiline.") ; po::options_description active_opt("Active Learning options"); @@ -843,60 +903,20 @@ vw* parse_args(int argc, char *argv[]) po::options_description other_opt("Other options"); other_opt.add_options() - ("bs", po::value<size_t>(), "bootstrap mode with k rounds by online importance resampling") - ("top", po::value<size_t>(), "top k recommendation") - ("bs_type", po::value<string>(), "bootstrap mode - currently 'mean' or 'vote'") - ("autolink", po::value<size_t>(), "create link function with polynomial d") - ("cb", po::value<size_t>(), "Use contextual bandit learning with <k> costs") - ("lda", po::value<uint32_t>(&(all->lda)), "Run lda with <int> topics") - ("nn", po::value<size_t>(), "Use sigmoidal feedforward network with <k> hidden units") - ("cbify", po::value<size_t>(), "Convert multiclass on <k> classes into a contextual bandit problem and solve") - ("search", po::value<size_t>(), "use search-based structured prediction, argument=maximum action id or 0 for LDF") + ("bootstrap,B", po::value<size_t>(), "bootstrap mode with k rounds by online importance resampling") ; - // Declare the supported options. 
- desc.add_options() - ("help,h","Look here: http://hunch.net/~vw/ and click on Tutorial.") - ("version","Version information") - ("random_seed", po::value<size_t>(&random_seed), "seed random number generator") - ("noop","do no learning") - ("print","print examples"); - - //po::positional_options_description p; - // Be friendly: if -d was left out, treat positional param as data file - //p.add("data", -1); - - desc.add(in_opt) - .add(out_opt) - .add(update_opt) + desc.add(update_opt) .add(weight_opt) - .add(holdout_opt) - .add(namespace_opt) - .add(mf_opt) - .add(lrq_opt) - .add(multiclass_opt) .add(active_opt) .add(cluster_opt) .add(other_opt); - po::variables_map vm = po::variables_map(); - po::variables_map vm_file = po::variables_map(); //separate variable map for storing flags in regressor file - - po::parsed_options parsed = po::command_line_parser(argc, argv). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); // got rid of ".positional(p)" because it doesn't work well with unrecognized options - vector<string> to_pass_further = po::collect_unrecognized(parsed.options, po::include_positional); - string last_unrec_arg = - (to_pass_further.size() > 0) - ? string(to_pass_further[to_pass_further.size()-1]) // we want to write this down in case it's a data argument ala the positional option we got rid of - : ""; - - po::store(parsed, vm); - po::notify(vm); + po::variables_map vm = add_options(*all, desc); msrand48(random_seed); - parse_diagnostics(*all, vm, desc, argc); + parse_diagnostics(*all, vm, argc); if (vm.count("active_simulation")) all->active_simulation = true; @@ -904,8 +924,6 @@ vw* parse_args(int argc, char *argv[]) if (vm.count("active_learning") && !all->active_simulation) all->active = true; - parse_source(*all, vm); - all->sd->weighted_unlabeled_examples = all->sd->t; all->initial_t = (float)all->sd->t; @@ -915,21 +933,29 @@ vw* parse_args(int argc, char *argv[]) //Input regressor header io_buf io_temp; parse_regressor_args(*all, vm, io_temp); - - all->options_from_file_argv = VW::get_argv_from_string(all->options_from_file,all->options_from_file_argc); - - po::parsed_options parsed_file = po::command_line_parser(all->options_from_file_argc, all->options_from_file_argv). + + int temp_argc = 0; + char** temp_argv = VW::get_argv_from_string(all->file_options, temp_argc); + add_to_args(*all, temp_argc, temp_argv); + for (int i = 0; i < temp_argc; i++) + free(temp_argv[i]); + free(temp_argv); + + po::parsed_options pos = po::command_line_parser(all->args). style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). 
- options(desc).allow_unregistered().run(); + options(all->opts).allow_unregistered().run(); - po::store(parsed_file, vm_file); - po::notify(vm_file); + vm = po::variables_map(); + + po::store(pos, vm); + po::notify(vm); + all->file_options = ""; - parse_feature_tweaks(*all, vm, vm_file); //feature tweaks + parse_feature_tweaks(*all, vm); //feature tweaks parse_example_tweaks(*all, vm); //example manipulation - parse_base_algorithm(*all, to_pass_further, vm); + parse_base_algorithm(*all, vm); if (!all->quiet) { @@ -945,53 +971,26 @@ vw* parse_args(int argc, char *argv[]) parse_output_model(*all, vm); - parse_output_preds(*all, vm, vm_file); + parse_output_preds(*all, vm); - load_input_model(*all, vm, io_temp); - - parse_scorer_reductions(*all, to_pass_further, vm, vm_file); + parse_scorer_reductions(*all, vm); bool got_cs = false; - parse_score_users(*all, to_pass_further, vm, vm_file, got_cs); + parse_score_users(*all, vm, got_cs); bool got_cb = false; - parse_cb(*all, to_pass_further, vm, vm_file, got_cs, got_cb); - - parse_search(*all, to_pass_further, vm, vm_file, got_cs, got_cb); + parse_cb(*all, vm, got_cs, got_cb); - if(vm.count("bs") || vm_file.count("bs") ) - all->l = BS::setup(*all, to_pass_further, vm, vm_file); + parse_search(*all, vm, got_cs, got_cb); - if (to_pass_further.size() > 0) { - bool is_actually_okay = false; + if(vm.count("bootstrap")) + all->l = BS::setup(*all, vm); - // special case to try to emulate the missing -d - if ((to_pass_further.size() == 1) && - (to_pass_further[to_pass_further.size()-1] == last_unrec_arg)) { - int f = io_buf().open_file(last_unrec_arg.c_str(), all->stdin_off, io_buf::READ); - if (f != -1) { -#ifdef _WIN32 - _close(f); -#else - close(f); -#endif - all->data_filename = last_unrec_arg; - if (ends_with(last_unrec_arg, ".gz")) - set_compressed(all->p); - is_actually_okay = true; - } - } + load_input_model(*all, vm, io_temp); - if (!is_actually_okay) { - cerr << "unrecognized options:"; - for (size_t i=0; i<to_pass_further.size(); i++) - cerr << " " << to_pass_further[i]; - cerr << endl; - throw exception(); - } - } + parse_source(*all, vm); enable_sources(*all, vm, all->quiet,all->numpasses); @@ -1002,6 +1001,12 @@ vw* parse_args(int argc, char *argv[]) i++; all->wpp = (1 << i) >> all->reg.stride_shift; + if (vm.count("help")) { + /* upon direct query for help -- spit it out to stdout */ + cout << "\n" << all->opts << "\n"; + exit(0); + } + return all; } @@ -1091,9 +1096,6 @@ namespace VW { all.p->parse_name.delete_v(); free(all.p); free(all.sd); - for (int i = 0; i < all.options_from_file_argc; i++) - free(all.options_from_file_argv[i]); - free(all.options_from_file_argv); for (size_t i = 0; i < all.final_prediction_sink.size(); i++) if (all.final_prediction_sink[i] != 1) io_buf::close_file_or_socket(all.final_prediction_sink[i]); diff --git a/vowpalwabbit/parse_args.h b/vowpalwabbit/parse_args.h index 744fece9..df713017 100644 --- a/vowpalwabbit/parse_args.h +++ b/vowpalwabbit/parse_args.h @@ -6,10 +6,6 @@ license as described in the file LICENSE. 
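A note on the option-handling rewrite in parse_args.cc above: the raw command line is now stashed in all->args (via add_to_args), each reduction registers its options incrementally, and the whole argument vector is re-parsed on every registration. The add_options helper itself is defined elsewhere in this commit; the following is only a sketch of what it presumably does, inferred from its call sites (vm = add_options(*all, desc) together with options(all->opts) above) -- all.opts is assumed here to be the running po::options_description:

po::variables_map add_options(vw& all, po::options_description& opts)
{
  all.opts.add(opts);                  // accumulate the newly registered options
  po::variables_map new_vm;
  po::parsed_options parsed = po::command_line_parser(all.args).
    style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
    options(all.opts).allow_unregistered().run();
  po::store(parsed, new_vm);           // re-parse the saved argument vector
  po::notify(new_vm);
  return new_vm;
}

Under this scheme vm_file, to_pass_further, and the second pass over options_from_file_argv all become unnecessary: options recovered from a model header are appended to all->args (the temp_argv block above) and flow through the same parse.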
#ifndef PARSE_ARGS_H #define PARSE_ARGS_H -#include <boost/program_options.hpp> -#include <boost/program_options/parsers.hpp> -namespace po = boost::program_options; -#include "gd.h" #include "global_data.h" vw* parse_args(int argc, char *argv[]); diff --git a/vowpalwabbit/parse_primitives.h b/vowpalwabbit/parse_primitives.h index 0fde7236..89f4c0f2 100644 --- a/vowpalwabbit/parse_primitives.h +++ b/vowpalwabbit/parse_primitives.h @@ -92,7 +92,8 @@ struct parser { bool sorted_cache; size_t ring_size; - uint64_t parsed_examples; // The index of the parsed example. + uint64_t begin_parsed_examples; // The index of the beginning parsed example. + uint64_t end_parsed_examples; // The index of the fully parsed example. uint64_t local_example_number; uint32_t in_pass_counter; example* examples; diff --git a/vowpalwabbit/parse_regressor.cc b/vowpalwabbit/parse_regressor.cc index cd57ee0e..757e4051 100644 --- a/vowpalwabbit/parse_regressor.cc +++ b/vowpalwabbit/parse_regressor.cc @@ -224,16 +224,16 @@ void save_load_header(vw& all, io_buf& model_file, bool read, bool text) "", read, "\n",1, text); - text_len = sprintf(buff, "options:%s\n", all.options_from_file.c_str()); - uint32_t len = (uint32_t)all.options_from_file.length()+1; - memcpy(buff2, all.options_from_file.c_str(),len); + text_len = sprintf(buff, "options:%s\n", all.file_options.c_str()); + uint32_t len = (uint32_t)all.file_options.length()+1; + memcpy(buff2, all.file_options.c_str(),len); if (read) len = buf_size; bin_text_read_write(model_file,buff2, len, "", read, buff, text_len, text); if (read) - all.options_from_file.assign(buff2); + all.file_options.assign(buff2); } } @@ -316,10 +316,6 @@ void parse_mask_regressor_args(vw& all, po::variables_map& vm){ vector<string> init_filename = vm["initial_regressor"].as< vector<string> >(); if(mask_filename == init_filename[0]){//-i and -mask are from same file, just generate mask - for (size_t j = 0; j < length; j++){ - if(all.reg.weight_vector[j << all.reg.stride_shift] != 0.) - all.reg.weight_vector[(j << all.reg.stride_shift) + all.feature_mask_idx] = 1.; - } return; } } @@ -330,10 +326,6 @@ void parse_mask_regressor_args(vw& all, po::variables_map& vm){ save_load_header(all, io_temp_mask, true, false); all.l->save_load(io_temp_mask, true, false); io_temp_mask.close_file(); - for (size_t j = 0; j < length; j++){ - if(all.reg.weight_vector[j << all.reg.stride_shift] != 0.) - all.reg.weight_vector[(j << all.reg.stride_shift) + all.feature_mask_idx] = 1.; - } // Deal with the over-written header from initial regressor if (vm.count("initial_regressor")) { @@ -351,7 +343,7 @@ void parse_mask_regressor_args(vw& all, po::variables_map& vm){ } } else { // If no initial regressor, just clear out the options loaded from the header. 
- all.options_from_file.assign(""); + all.file_options.assign(""); } } } diff --git a/vowpalwabbit/parser.cc b/vowpalwabbit/parser.cc index d42179c3..cb1397eb 100644 --- a/vowpalwabbit/parser.cc +++ b/vowpalwabbit/parser.cc @@ -251,7 +251,7 @@ void reset_source(vw& all, size_t numbits) { // wait for all predictions to be sent back to client mutex_lock(&all.p->output_lock); - while (all.p->local_example_number != all.p->parsed_examples) + while (all.p->local_example_number != all.p->end_parsed_examples) condition_variable_wait(&all.p->output_done, &all.p->output_lock); mutex_unlock(&all.p->output_lock); @@ -364,8 +364,6 @@ void parse_cache(vw& all, po::variables_map &vm, string source, make_write_cache(all, caches[i], quiet); else { uint32_t c = cache_numbits(all.p->input, f); - if (all.default_bits) - all.num_bits = c; if (c < all.num_bits) { all.p->input->close_file(); make_write_cache(all, caches[i], quiet); @@ -624,11 +622,19 @@ void enable_sources(vw& all, po::variables_map& vm, bool quiet, size_t passes) cerr << "num sources = " << all.p->input->files.size() << endl; } +/*Race condition hypothesis: + + the parser gets an unused example, discovers the source is done, creates an end-of-pass example, and sets done=true; + the learner finishes the preceding example and calls get_example(); no examples remain but done is set, so NULL is returned; + parser_done() returns true and the learner thread exits; + the parser thread then increments end_parsed_examples and exits. + + */ bool parser_done(parser* p) { if (p->done) { - if (p->used_index != p->parsed_examples) + if (p->used_index != p->begin_parsed_examples) return false; return true; } @@ -716,11 +722,12 @@ example* get_unused_example(vw& all) while (true) { mutex_lock(&all.p->examples_lock); - if (all.p->examples[all.p->parsed_examples % all.p->ring_size].in_use == false) + if (all.p->examples[all.p->begin_parsed_examples % all.p->ring_size].in_use == false) { - all.p->examples[all.p->parsed_examples % all.p->ring_size].in_use = true; + example& ret = all.p->examples[all.p->begin_parsed_examples++ % all.p->ring_size]; + ret.in_use = true; mutex_unlock(&all.p->examples_lock); - return all.p->examples + (all.p->parsed_examples % all.p->ring_size); + return &ret; } else condition_variable_wait(&all.p->example_unused, &all.p->examples_lock); @@ -734,14 +741,13 @@ bool parse_atomic_example(vw& all, example* ae, bool do_read = true) return false; if(all.p->sort_features && ae->sorted == false) - unique_sort_features(all.audit, ae); + unique_sort_features(all.audit, all.parse_mask, ae); if (all.p->write_cache) { all.p->lp.cache_label(ae->ld,*(all.p->output)); cache_features(*(all.p->output), ae, (uint32_t)all.parse_mask); } - return true; } @@ -759,7 +765,7 @@ void setup_example(vw& all, example* ae) ae->total_sum_feat_sq = 0; ae->loss = 0.; - ae->example_counter = (size_t)(all.p->parsed_examples + 1); + ae->example_counter = (size_t)(all.p->end_parsed_examples); if ((!all.p->emptylines_separate_examples) || example_is_newline(*ae)) all.p->in_pass_counter++; @@ -855,8 +861,8 @@ namespace VW{ example* new_unused_example(vw& all) { example* ec = get_unused_example(all); all.p->lp.default_label(ec->ld); - all.p->parsed_examples++; - ec->example_counter = all.p->parsed_examples; + all.p->begin_parsed_examples++; + ec->example_counter = all.p->begin_parsed_examples; return ec; } example* read_example(vw& all, char* example_line) @@ -866,7 +872,7 @@ namespace VW{ read_line(all, ret, example_line); parse_atomic_example(all,ret,false); setup_example(all, ret); - all.p->parsed_examples++; 
+ all.p->end_parsed_examples++; return ret; } @@ -904,7 +910,7 @@ namespace VW{ } parse_atomic_example(all,ret,false); setup_example(all, ret); - all.p->parsed_examples++; + all.p->end_parsed_examples++; return ret; } @@ -1059,7 +1065,7 @@ void *main_parse_loop(void *in) example_number = 0; } mutex_lock(&all->p->examples_lock); - all->p->parsed_examples++; + all->p->end_parsed_examples++; condition_variable_signal_all(&all->p->example_available); mutex_unlock(&all->p->examples_lock); @@ -1071,10 +1077,10 @@ namespace VW{ example* get_example(parser* p) { mutex_lock(&p->examples_lock); - if (p->parsed_examples != p->used_index) { + if (p->end_parsed_examples != p->used_index) { size_t ring_index = p->used_index++ % p->ring_size; if (!(p->examples+ring_index)->in_use) - cout << p->used_index << " " << p->parsed_examples << " " << ring_index << endl; + cout << p->used_index << " " << p->end_parsed_examples << " " << ring_index << endl; assert((p->examples+ring_index)->in_use); mutex_unlock(&p->examples_lock); @@ -1103,7 +1109,8 @@ label_data* get_label(example* ec) void initialize_examples(vw& all) { all.p->used_index = 0; - all.p->parsed_examples = 0; + all.p->begin_parsed_examples = 0; + all.p->end_parsed_examples = 0; all.p->done = false; all.p->examples = (example*)calloc_or_die(all.p->ring_size, sizeof(example)); @@ -1117,7 +1124,7 @@ void initialize_examples(vw& all) void adjust_used_index(vw& all) { - all.p->used_index=all.p->parsed_examples; + all.p->used_index=all.p->begin_parsed_examples; } void initialize_parser_datastructures(vw& all) diff --git a/vowpalwabbit/scorer.cc b/vowpalwabbit/scorer.cc index 7d3fcfab..bdef00f3 100644 --- a/vowpalwabbit/scorer.cc +++ b/vowpalwabbit/scorer.cc @@ -19,7 +19,7 @@ namespace Scorer { base.predict(ec); } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { scorer* s = (scorer*)calloc_or_die(1, sizeof(scorer)); s->all = &all; diff --git a/vowpalwabbit/scorer.h b/vowpalwabbit/scorer.h index 18c0bef5..328b0ad7 100644 --- a/vowpalwabbit/scorer.h +++ b/vowpalwabbit/scorer.h @@ -2,6 +2,6 @@ #define SCORER_H namespace Scorer { - LEARNER::learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/searn.cc b/vowpalwabbit/searn.cc index 1c93a2b4..618e728b 100644 --- a/vowpalwabbit/searn.cc +++ b/vowpalwabbit/searn.cc @@ -38,7 +38,6 @@ namespace Searn { using namespace std; - const bool PRINT_DEBUG_INFO =0; const bool PRINT_UPDATE_EVERY_EXAMPLE =0; const bool PRINT_UPDATE_EVERY_PASS =0; const bool PRINT_CLOCK_TIME =0; @@ -114,7 +113,7 @@ namespace Searn bool auto_hamming_loss; // if you're just optimizing hamming loss, we can do it for you! 
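Stepping back to the parser.cc hunks above: the single parsed_examples counter is split into begin_parsed_examples, bumped when a ring slot is claimed, and end_parsed_examples, bumped only once the example is fully parsed, so the learner thread can no longer pick up a slot that has been claimed but not yet filled. A condensed sketch of the resulting handoff, with locking elided and the function names here (reserve_slot, publish_example, learner_has_work) purely illustrative:

example* reserve_slot(parser* p)
{ // producer: claim the next ring slot (cf. get_unused_example above)
  example& e = p->examples[p->begin_parsed_examples++ % p->ring_size];
  e.in_use = true;
  return &e;
}

void publish_example(parser* p)
{ // producer: the example is now fully parsed (cf. main_parse_loop above)
  p->end_parsed_examples++;
}

bool learner_has_work(parser* p)
{ // consumer: only published examples are visible (cf. get_example above)
  return p->used_index != p->end_parsed_examples;
}

parser_done() correspondingly compares used_index against begin_parsed_examples, since at shutdown every claimed slot must also have been consumed.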
bool examples_dont_change; // set to true if you don't do any internal example munging bool is_ldf; // set to true if you'll generate LDF data - + size_t A; // total number of actions, [1..A]; 0 means ldf SearnState state; // current state of learning size_t learn_t; // when LEARN, this is the t at which we're varying a @@ -136,7 +135,7 @@ namespace Searn v_array<snapshot_item> beam_restore_to_end; v_array<uint32_t> beam_final_action_sequence; bool beam_is_training; - + size_t most_recent_snapshot_begin; size_t most_recent_snapshot_end; uint32_t most_recent_snapshot_hash; @@ -172,7 +171,7 @@ namespace Searn short rollout_method; // 0=policy, 1=oracle, 2=none bool trajectory_oracle; // if true, only construct trajectories using the oracle - + bool allow_current_policy; // should the current policy be used for training? true for dagger //bool rollout_oracle; //if true then rollout are performed using oracle instead (optimal approximation discussed in searn's paper). this should be set to true for dagger bool adaptive_beta; //used to implement dagger through searn. if true, beta = 1-(1-alpha)^n after n updates, and policy is mixed with oracle as \pi' = (1-beta)\pi^* + beta \pi @@ -182,7 +181,7 @@ namespace Searn size_t beam_size; size_t kbest; bool allow_unsafe_fast_forward; - + size_t num_features; uint32_t total_number_of_policies; bool do_snapshot; @@ -196,7 +195,7 @@ namespace Searn size_t total_predictions_made; bool hit_new_pass; - + size_t passes_per_policy; vector<example*> ec_seq; @@ -204,7 +203,7 @@ namespace Searn LEARNER::learner* base_learner; void* valid_labels; clock_t start_clock_time; - + example*empty_example; }; @@ -214,7 +213,7 @@ namespace Searn uint32_t hash_example(example&ec, uint32_t seed) { uint32_t hash = seed; - + for (unsigned char* i=ec.indices.begin; i != ec.indices.end; i++) hash = uniform_hash((unsigned char*) ec.atomics[*i].begin, sizeof(feature) * (ec.atomics[*i].end - ec.atomics[*i].begin), @@ -223,7 +222,7 @@ namespace Searn hash = uniform_hash( (unsigned char*) &ec.ft_offset, sizeof(uint32_t), hash ); - + return hash; } @@ -248,10 +247,10 @@ namespace Searn std::cerr << "internal error (bug): no valid policies to choose from! defaulting to current" << std::endl; return (int)current_policy; } - + int num_valid_policies = (int)current_policy + allow_optimal + allow_current_policy; int pid = -1; - + if (num_valid_policies == 0) { std::cerr << "internal error (bug): no valid policies to choose from! 
defaulting to current" << std::endl; return (int)current_policy; @@ -260,7 +259,7 @@ namespace Searn } else { float r = frand48(); pid = 0; - + if (r > beta) { r -= beta; while ((r > 0) && (pid < num_valid_policies-1)) { @@ -272,7 +271,7 @@ namespace Searn // figure out which policy pid refers to if (allow_optimal && (pid == num_valid_policies-1)) return -1; // this is the optimal policy - + pid = (int)current_policy - pid; if (!allow_current_policy) pid--; @@ -348,7 +347,7 @@ namespace Searn char buf[32]; sprintf(buf, "{%d}", f->weight_index); fstring = string(buf); - } else + } else fstring = string(ec->audit_features[*i][feature_index].feature); feature_index++; } @@ -357,7 +356,7 @@ namespace Searn for (uint32_t t=1; t<=hinfo.features; t++) { v0 = ((h[hinfo.length-t]+1) * quadratic_constant * (additional_offset+1) + t) * history_constant; - + // add the history/feature pair feature temp = {history_value, (uint32_t) ( ((v0 + v)*wpp) & all.reg.weight_mask )}; ec->atomics[history_namespace].push_back(temp); @@ -533,7 +532,7 @@ namespace Searn searn *srn = (searn*)all.searnstr; COST_SENSITIVE::label test_label; COST_SENSITIVE::cs_label.default_label(&test_label); - + // TODO: modify this to handle contextual bandit base learner with ldf float best_prediction = 0; uint32_t best_action = 0; @@ -553,14 +552,14 @@ namespace Searn cdbg << "predict: partial_prediction[" << action << "] = " << ecs[action].partial_prediction << endl; valid_labels->costs[action].partial_prediction = ecs[action].partial_prediction; - if ((action == 0) || + if ((action == 0) || (ecs[action].partial_prediction < best_prediction)) { cdbg << "best action = " << action << endl; best_prediction = ecs[action].partial_prediction; best_action = action; // ((COST_SENSITIVE::label*)ecs[action].ld)->costs[0].weight_index; } } - + if ((srn->priv->state == INIT_TEST) && (all.raw_prediction > 0)) { string outputString; stringstream outputStringStream(outputString); @@ -573,7 +572,7 @@ namespace Searn //if (allow_exploration && (srn->priv->exploration_temperature > 0.)) // best_action = sample_with_temperature_partial_prediction(ecs, num_ec, srn->priv->exploration_temperature); - + return best_action; } @@ -631,7 +630,7 @@ namespace Searn // else // final_prediction = sample_with_temperature_cb(ld, srn.priv->exploration_temperature); // } - + if ((srn.priv->state == INIT_TEST) && (all.raw_prediction > 0) && (srn.priv->rollout_all_actions)) { // srn.priv->rollout_all_actions ==> this is not CB, so we have COST_SENSITIVE::labels string outputString; stringstream outputStringStream(outputString); @@ -642,7 +641,7 @@ namespace Searn } all.print_text(all.raw_prediction, outputStringStream.str(), ec.tag); } - + ec.ld = old_label; return final_prediction; @@ -806,7 +805,7 @@ namespace Searn } } } - + // if not LDF: // *ecs should be a pointer to THE example // num_ec == 0 @@ -826,7 +825,7 @@ namespace Searn uint32_t searn_predict_without_loss(vw& all, learner& base, example* ecs, size_t num_ec, v_array<uint32_t> *yallowed, v_array<uint32_t> *ystar, bool ystar_is_uint32t) // num_ec == 0 means normal example, >0 means ldf, yallowed==NULL means all allowed, ystar==NULL means don't know; ystar_is_uint32t means that the ystar ref is really just a uint32_t { searn* srn=(searn*)all.searnstr; - + // check ldf sanity if (!srn->priv->is_ldf) { assert(num_ec == 0); // searntask is trying to define an ldf example in a non-ldf problem @@ -902,7 +901,7 @@ namespace Searn return srn->priv->learn_a; } else { // t > learn_t size_t this_a = 0; - + if 
(srn->priv->rollout_method == 1) { // rollout by oracle assert(ystar_is_uint32t); this_a = *(uint32_t*)ystar; @@ -1033,9 +1032,9 @@ namespace Searn throw exception(); } */ - + srn->priv->loss_declared = true; - + srn->priv->loss_last_step = srn->priv->t; cdbg<<"new loss_last_step="<<srn->priv->t<<" incr_loss=" << incr_loss <<endl; if (srn->priv->state == INIT_TEST) @@ -1082,8 +1081,8 @@ namespace Searn } return a; - } - + } + bool snapshot_binary_search_lt(v_array<snapshot_item> a, size_t desired_t, size_t tag, size_t &pos, size_t last_found_pos) { size_t hi = a.size(); @@ -1109,7 +1108,7 @@ namespace Searn ((a[pos].pred_step == desired_t) && (a[pos].tag > tag))) hi = pos; else break; - + pos = hi >> 1; } @@ -1129,7 +1128,7 @@ namespace Searn if (last_found_pos + 1 < hi) { pos = last_found_pos+1; if ((a[pos].index == desired_index) && (a[pos].tag == tag)) return true; - } else + } else pos = (lo + hi) >> 1; while (true) { if (lo == hi) return false; @@ -1195,7 +1194,7 @@ namespace Searn priv->snapshot_data.push_back(item); //cerr << "priv->snapshot_data.push_back(item);" << endl; return; - } + } if (priv->state == FAST_FORWARD) { // go to the end! snapshot_item &me = priv->snapshot_data[priv->fast_forward_position + tag]; // TODO: generalize or ensure that tags are +=1 each time, also this is broken if they don't auto-history @@ -1212,7 +1211,7 @@ namespace Searn if ((cur_size > 0) && // only need to keep around the NEWEST set of snapshots (priv->snapshot_data[cur_size - 1].pred_step < priv->t)) clear_snapshot(*all, *srn, true); - + void* new_data = malloc(sizeof_data); memcpy(new_data, data_ptr, sizeof_data); snapshot_item item = { index, tag, new_data, sizeof_data, priv->t }; @@ -1269,7 +1268,7 @@ namespace Searn return; } - + cerr << "yikes, how did I get here? 
state = " << priv->state << endl; throw exception(); } @@ -1305,7 +1304,7 @@ namespace Searn if (priv->state == INIT_TRAIN) priv->final_snapshot_end = priv->most_recent_snapshot_end; } - + inline bool cmp_size_t(const size_t a, const size_t b) { return a < b; } @@ -1332,7 +1331,7 @@ namespace Searn } std::sort(timesteps.begin, timesteps.end, cmp_size_t); } - + return timesteps; } @@ -1445,9 +1444,9 @@ namespace Searn srn.priv->loss_declared = false; if (srn.priv->auto_history) clear_rollout_actions(srn); - // srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; - // srn.priv->snapshot_could_match = false; - // srn.priv->snapshot_last_found_pos = (size_t)-1; + srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; + srn.priv->snapshot_could_match = false; + srn.priv->snapshot_last_found_pos = (size_t)-1; } void mark_hyp_pruned(void*data) { ((beam_hyp*)data)->pruned = true; } @@ -1458,7 +1457,7 @@ namespace Searn beam* cur_beam = new beam(srn.priv->beam_size); beam* next_beam = new beam(srn.priv->beam_size); - + // initialize first beam { // in this call to structured_predict, we do the following: @@ -1466,7 +1465,7 @@ namespace Searn // 2) collect the initial snapshot // 3) store the final snapshot so we can fast-foward to the end at will beam_hyp *hyp = hyp_pool.begin; - + hyp->t = 0; hyp->parent = NULL; hyp->action_taken = 0; // irrelevant because parent==NULL @@ -1482,7 +1481,7 @@ namespace Searn srn.priv->state = BEAM_INIT; srn.priv->cur_beam_hyp = hyp; srn.task->structured_predict(srn, ec); - + assert(hyp->filled_in_prediction); // TODO: handle the case that structured_predict just returns or something else weird happens // collect the costs @@ -1497,7 +1496,7 @@ namespace Searn } cdbg << endl; } - + // collect the final snapshot copy_array(srn.priv->beam_restore_to_end, srn.priv->snapshot_data); @@ -1574,7 +1573,7 @@ namespace Searn beam_hyp* hyp = (beam_hyp*) be->data; /*UNDOME*/cdbg << "\t{ cost=" << be->cost << " t=" << hyp->t << " action_taken=" << hyp->action_taken << " incr_cost=" << hyp->incr_cost << " num_actions=" << hyp->num_actions << " parent=" << hyp->parent << " }" << endl; } - } + } // debug print the final beam /*UNDOME*/cdbg << "FINAL BEAM =" << endl; @@ -1614,9 +1613,17 @@ namespace Searn void beam_predict(vw&all, searn&srn, vector<example*>ec, v_array<beam_hyp> &hyp_pool, size_t &hyp_pool_id, bool is_learn) { using namespace Beam; - + + if (might_print_update(all)) { + reset_searn_structure(srn); + srn.priv->state = GET_TRUTH_STRING; + srn.priv->should_produce_string = true; + srn.priv->truth_string->str(""); + srn.task->structured_predict(srn, ec); + } + beam* final_beam = new beam(max(1, min(srn.priv->beam_size, srn.priv->kbest))); // at least 1, but otherwise the min of beam_size and kbest - + compute_full_beam(all, srn, ec, hyp_pool, hyp_pool_id, final_beam); if (srn.priv->should_produce_string && !is_learn) { // TODO: check if this is going to be used at all!!! 
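One more annotation on the beam_predict change above: progress printing under beam search previously had no truth string to show, so the patch now runs the task once in GET_TRUTH_STRING mode (a truth-only pass that records the oracle's output string and makes no real predictions) before the beam is computed. Condensed restatement of the added guard:

// condensed from the beam_predict hunk above
if (might_print_update(all)) {         // a progress line may be printed soon
  reset_searn_structure(srn);
  srn.priv->state = GET_TRUTH_STRING;  // truth-only pass
  srn.priv->should_produce_string = true;
  srn.priv->truth_string->str("");     // clear any stale truth string
  srn.task->structured_predict(srn, ec);
}

The non-beam path already performs an equivalent truth pass in do_actual_learning further down.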
@@ -1660,7 +1667,7 @@ namespace Searn for (int* sink = all.final_prediction_sink.begin; sink != all.final_prediction_sink.end; ++sink) all.print_text(*sink, "", ec[0]->tag); } - + final_beam->erase(); delete final_beam; } @@ -1688,7 +1695,6 @@ namespace Searn if (must_run_test(all, ec)) { srn.priv->should_produce_string = might_print_update(all) || (all.final_prediction_sink.size() > 0) || (all.raw_prediction > 0); srn.priv->pred_string->str(""); - assert(srn.priv->truth_string != NULL); srn.task->structured_predict(srn, ec); srn.priv->should_produce_string = false; @@ -1715,12 +1721,12 @@ namespace Searn srn.priv->loss_last_step = 0; clear_snapshot(all, srn, true); - // srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; - // srn.priv->snapshot_last_found_pos = (size_t)-1; - // srn.priv->snapshot_could_match = false; + srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; + srn.priv->snapshot_last_found_pos = (size_t)-1; + srn.priv->snapshot_could_match = false; srn.priv->loss_declared = false; srn.priv->should_produce_string = false; - + srn.task->structured_predict(srn, ec); if ( (! srn.priv->loss_declared) && // no loss was declared @@ -1728,7 +1734,7 @@ namespace Searn (all.training) && // in training mode (! ec[0]->test_only) ) // and not a test example cerr << "warning: no loss declared by task on something that looks like a training example!" << endl; - + if (srn.priv->t == 0) { clear_snapshot(all, srn, true); return; // there was no data @@ -1769,7 +1775,6 @@ namespace Searn srn.priv->learn_loss = 0.f; srn.priv->learn_example_len = 0; cdbg << "learn_example_len = 0" << endl; - cdbg << "learn_t = " << srn.priv->learn_t << " || learn_a = " << srn.priv->learn_a << endl; // srn.priv->snapshot_is_equivalent_to_t = (size_t)-1; // srn.priv->snapshot_could_match = true; @@ -1806,7 +1811,7 @@ namespace Searn if (!srn.priv->examples_dont_change) { cdbg << "deleting labels for " << srn.priv->learn_example_len << " learn_example_copy items" << endl; - for (size_t n=0; n<srn.priv->learn_example_len; n++) + for (size_t n=0; n<srn.priv->learn_example_len; n++) //cdbg << "free_example_data[" << n << "]: "; GD::print_audit_features(all, &srn.priv->learn_example_copy[n]); if (srn.priv->is_ldf) COST_SENSITIVE::cs_label.delete_label(srn.priv->learn_example_copy[n].ld); else MULTICLASS::mc_label.delete_label(srn.priv->learn_example_copy[n].ld); @@ -1819,7 +1824,7 @@ namespace Searn tset.erase(); tset.delete_v(); cdbg << endl; } - + clear_snapshot(all, srn, true); srn.priv->train_action.delete_v(); srn.priv->train_action_ids.delete_v(); @@ -1837,7 +1842,7 @@ namespace Searn } srn.priv->train_labels.erase(); srn.priv->train_labels.delete_v(); - + cdbg << "======================================== DONE (" << srn.priv->current_policy << "," << srn.priv->read_example_last_pass << ") ========================================" << endl; } @@ -1863,9 +1868,9 @@ namespace Searn out[i] = in[i]; } - if (in.length() > max_len) { - out[max_len-2] = '.'; - out[max_len-1] = '.'; + if (in.length() > max_len) { + out[max_len-2] = '.'; + out[max_len-1] = '.'; } out[max_len] = 0; } @@ -1923,7 +1928,7 @@ void print_update(vw& all, searn& srn) if (!all.holdout_set_off && all.current_pass >= 1) fprintf(stderr, " h"); - + fprintf(stderr, "\n"); all.sd->sum_loss_since_last_dump = 0.0; @@ -1947,7 +1952,7 @@ void print_update(vw& all, searn& srn) size_t enc_offset = wpp * ((2 * (size_t)(*enc)) + ((*enc < 0) ? 
1 : 0)); cdbg << "old_ns = " << old_ns << endl; - + if ((n + offset >= 0) && (n + offset < (int32_t)srn.priv->ec_seq.size())) { // we're okay on position example*you = srn.priv->ec_seq[n+offset]; size_t you_size = you->atomics[old_ns].size(); @@ -1988,7 +1993,7 @@ void print_update(vw& all, searn& srn) //cdbg << "copying " << you_size << " features" << endl; me->sum_feat_sq[neighbor_namespace] += you->sum_feat_sq[old_ns]; me->total_sum_feat_sq += you->sum_feat_sq[old_ns]; - me->num_features += you_size; + me->num_features += you_size; } } else if ((n + offset == -1) || (n + offset == (int32_t)srn.priv->ec_seq.size())) { // handle <s> and </s> size_t bias = constant * ((n + offset < 0) ? 2 : 3); @@ -2036,7 +2041,7 @@ void print_update(vw& all, searn& srn) cdbg << "del n=" << me->num_features; size_t total_size = 0; float total_sfs = 0.; - + for (int32_t*enc=srn.priv->neighbor_features.begin; enc!=srn.priv->neighbor_features.end; ++enc) { int32_t offset = (*enc) >> 24; size_t old_ns = (*enc) & 0xFF; @@ -2075,10 +2080,10 @@ void print_update(vw& all, searn& srn) free(ad->space); free(ad->feature); } - + me->audit_features[neighbor_namespace].end -= total_size; } - + me->sum_feat_sq[neighbor_namespace] -= total_sfs; me->total_sum_feat_sq -= total_sfs; me->num_features -= total_size; @@ -2107,13 +2112,13 @@ void print_update(vw& all, searn& srn) priv->learn_losses.erase(); COST_SENSITIVE::label aset; - + // TODO: prune training steps ala get_training_timesteps cdbg << "t=" << me.t << ", labelset_size=" << me.num_actions << endl; - for (size_t aid=0; aid<me.num_actions; aid++) { + for (uint32_t aid=0; aid<me.num_actions; aid++) { COST_SENSITIVE::wclass my_class = { 0., aid+1, 0., 0. }; // TODO: make this valid for LDF aset.costs.push_back( my_class ); - + if (srn.priv->auto_history) clear_rollout_actions(srn); @@ -2122,8 +2127,8 @@ void print_update(vw& all, searn& srn) beam_hyp *h = &me; for (size_t t=0; t<me.t; t++) { assert(h != NULL); - priv->train_action[me.t - t - 1] = h->action_taken+1; - priv->train_action_ids[me.t - t - 1] = h->action_taken+1; // TODO: make this valid for ldf + priv->train_action[me.t - t - 1] = (uint32_t)h->action_taken+1; + priv->train_action_ids[me.t - t - 1] = (uint32_t)h->action_taken+1; // TODO: make this valid for ldf cdbg << "set train_action[" << (me.t-t-1) << "] = " << h->action_taken+1 << endl; h = h->parent; } @@ -2133,7 +2138,7 @@ void print_update(vw& all, searn& srn) cdbg << "train_action.size = " << priv->train_action.size() << endl; for (size_t t=0; t<me.t; t++) priv->rollout_action.push_back(priv->train_action[t]); - + priv->t = 0; priv->learn_a = (uint32_t)aid+1; // TODO: make this LDF compatible (by remembering what the actions were!) 
priv->loss_last_step = 0; @@ -2157,7 +2162,7 @@ void print_update(vw& all, searn& srn) if (!priv->examples_dont_change) { cdbg << "deleting labels for " << priv->learn_example_len << " learn_example_copy items" << endl; - for (size_t n=0; n<priv->learn_example_len; n++) + for (size_t n=0; n<priv->learn_example_len; n++) //cdbg << "free_example_data[" << n << "]: "; GD::print_audit_features(all, &priv->learn_example_copy[n]); if (priv->is_ldf) COST_SENSITIVE::cs_label.delete_label(priv->learn_example_copy[n].ld); else MULTICLASS::mc_label.delete_label(priv->learn_example_copy[n].ld); @@ -2185,8 +2190,7 @@ void print_update(vw& all, searn& srn) srn.priv->truth_string->str(""); srn.task->structured_predict(srn, srn.priv->ec_seq); } - - + if (srn.priv->beam_size == 0) train_single_example<is_learn>(all, srn, srn.priv->ec_seq); else { @@ -2221,7 +2225,7 @@ void print_update(vw& all, searn& srn) hyp_pool.delete_v(); srn.priv->test_loss = cached_test_loss; } - + del_neighbor_features(srn); if (srn.priv->ec_seq[0]->test_only) { @@ -2246,7 +2250,7 @@ void print_update(vw& all, searn& srn) srn.priv->base_learner = &base; bool is_real_example = true; - if (example_is_newline(ec) || srn.priv->ec_seq.size() >= all->p->ring_size - 2) { + if (example_is_newline(ec) || srn.priv->ec_seq.size() >= all->p->ring_size - 2) { if (srn.priv->ec_seq.size() >= all->p->ring_size - 2) { // give some wiggle room std::cerr << "warning: length of sequence at " << ec.example_counter << " exceeds ring size; breaking apart" << std::endl; } @@ -2254,13 +2258,13 @@ void print_update(vw& all, searn& srn) do_actual_learning<is_learn>(*all, srn); clear_seq(*all, srn); srn.priv->hit_new_pass = false; - + //VW::finish_example(*all, ec); is_real_example = false; } else { srn.priv->ec_seq.push_back(&ec); } - + if (is_real_example) { srn.priv->read_example_last_id = ec.example_counter; } @@ -2283,7 +2287,7 @@ void print_update(vw& all, searn& srn) //reset searn_trained_nb_policies in options_from_file so it is saved to regressor file later std::stringstream ss; ss << srn.priv->current_policy; - VW::cmd_string_replace_value(all->options_from_file,"--search_trained_nb_policies", ss.str()); + VW::cmd_string_replace_value(all->file_options,"--search_trained_nb_policies", ss.str()); } } @@ -2304,10 +2308,10 @@ void print_update(vw& all, searn& srn) std::stringstream ss2; ss1 << ((srn.priv->passes_since_new_policy == 0) ? 
srn.priv->current_policy : (srn.priv->current_policy+1)); //use cmd_string_replace_value in case we already loaded a predictor which had a value stored for --search_trained_nb_policies - VW::cmd_string_replace_value(all->options_from_file,"--search_trained_nb_policies", ss1.str()); + VW::cmd_string_replace_value(all->file_options,"--search_trained_nb_policies", ss1.str()); ss2 << srn.priv->total_number_of_policies; //use cmd_string_replace_value in case we already loaded a predictor which had a value stored for --search_total_nb_policies - VW::cmd_string_replace_value(all->options_from_file,"--search_total_nb_policies", ss2.str()); + VW::cmd_string_replace_value(all->file_options,"--search_total_nb_policies", ss2.str()); } } @@ -2371,7 +2375,7 @@ void print_update(vw& all, searn& srn) void searn_initialize(vw& all, searn& srn) { srn.priv->examples_dont_change = false; - + srn.priv->beta = 0.5; srn.priv->alpha = 1e-10f; srn.priv->allow_current_policy = false; @@ -2388,14 +2392,13 @@ void print_update(vw& all, searn& srn) srn.priv->beam_size = 0; // 0 ==> no beam srn.priv->kbest = 0; // 0 or 1 means just 1 best srn.priv->allow_unsafe_fast_forward = true; - srn.priv->neighbor_features_string = new string(); - + srn.priv->passes_per_policy = 1; //this should be set to the same value as --passes for dagger srn.task = NULL; srn.task_data = NULL; - + srn.priv->read_example_last_id = 0; srn.priv->passes_since_new_policy = 0; srn.priv->read_example_last_pass = 0; @@ -2403,14 +2406,14 @@ void print_update(vw& all, searn& srn) srn.priv->total_predictions_made = 0; srn.priv->hit_new_pass = false; srn.priv->subsample_timesteps = 0.; - + srn.priv->total_number_of_policies = 1; srn.priv->truth_string = new stringstream(); srn.priv->pred_string = new stringstream(); srn.priv->bad_string_stream = new stringstream(); srn.priv->bad_string_stream->clear(srn.priv->bad_string_stream->badbit); - + srn.priv->should_produce_string = false; srn.priv->printed_output_header = false; @@ -2422,7 +2425,7 @@ void print_update(vw& all, searn& srn) snapshot_item_result def_snapshot_result = { 0, -1.f }; srn.priv->snapshot_map = new snapmap(102341, def_snapshot_result, snapshot_item_ptr_eq, &srn.priv->snapshot_data); - + srn.priv->empty_example = alloc_examples(sizeof(COST_SENSITIVE::label), 1); COST_SENSITIVE::cs_label.default_label(srn.priv->empty_example->ld); srn.priv->empty_example->in_use = true; @@ -2439,7 +2442,7 @@ void print_update(vw& all, searn& srn) delete srn.priv->neighbor_features_string; srn.priv->neighbor_features.erase(); srn.priv->neighbor_features.delete_v(); - + if (srn.priv->rollout_all_actions) { // dst should be a COST_SENSITIVE::label* ((COST_SENSITIVE::label*)srn.priv->valid_labels)->costs.erase(); ((COST_SENSITIVE::label*)srn.priv->valid_labels)->costs.delete_v(); @@ -2447,7 +2450,7 @@ void print_update(vw& all, searn& srn) ((CB::label*)srn.priv->valid_labels)->costs.erase(); ((CB::label*)srn.priv->valid_labels)->costs.delete_v(); } - + if (srn.priv->rollout_all_actions) // labels are COST_SENSITIVE delete (COST_SENSITIVE::label*)srn.priv->valid_labels; else // labels are CB @@ -2460,7 +2463,7 @@ void print_update(vw& all, searn& srn) clear_snapshot(*all, srn, true); srn.priv->snapshot_data.delete_v(); - + for (size_t i=0; i<srn.priv->train_labels.size(); i++) { if (srn.priv->rollout_all_actions) { ((COST_SENSITIVE::label*)srn.priv->train_labels[i])->costs.erase(); @@ -2481,11 +2484,11 @@ void print_update(vw& all, searn& srn) dealloc_example(delete_label, srn.priv->learn_example_copy[n]); } - + 
if (srn.task->finish != NULL) { srn.task->finish(srn); } - + srn.priv->train_labels.delete_v(); srn.priv->train_action.delete_v(); srn.priv->train_action_ids.delete_v(); @@ -2511,79 +2514,48 @@ void print_update(vw& all, searn& srn) bool uint32_equal(uint32_t a, uint32_t b) { return a==b; } bool size_equal(size_t a, size_t b) { return a==b; } - template<class T> void check_option(T& ret, vw&all, po::variables_map& vm, po::variables_map& vm_file, const char* opt_name, bool default_to_cmdline, bool(*equal)(T,T), const char* mismatch_error_string, const char* required_error_string) { - if (vm_file.count(opt_name)) { // loaded from regressor file - ret = vm_file[opt_name].as<T>(); - if (vm.count(opt_name) && !equal(ret, vm[opt_name].as<T>())) { - if (default_to_cmdline) - ret = vm[opt_name].as<T>(); - std::cerr << mismatch_error_string << ret << endl; - } - } else if (vm.count(opt_name)) { + template<class T> void check_option(T& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, bool(*equal)(T,T), const char* mismatch_error_string, const char* required_error_string) { + if (vm.count(opt_name)) { ret = vm[opt_name].as<T>(); stringstream ss; ss << " --" << opt_name << " " << ret; - all.options_from_file.append(ss.str()); + all.file_options.append(ss.str()); } else if (strlen(required_error_string)>0) { std::cerr << required_error_string << endl; throw exception(); } - } + } - void check_option(bool& ret, vw&all, po::variables_map& vm, po::variables_map& vm_file, const char* opt_name, bool default_to_cmdline, const char* mismatch_error_string) { - if (vm_file.count(opt_name)) { // loaded from regressor file - ret = true; - if (!vm.count(opt_name)) { - if (default_to_cmdline) - ret = false; - std::cerr << mismatch_error_string << ret << endl; - } - } else if (vm.count(opt_name)) { + void check_option(bool& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, const char* mismatch_error_string) { + if (vm.count(opt_name)) { ret = true; stringstream ss; ss << " " << opt_name; - all.options_from_file.append(ss.str()); - } else { + all.file_options.append(ss.str()); + } else ret = false; - } - } - - void setup_searn_options(po::options_description& desc, vw&vw, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { - po::parsed_options parsed = po::command_line_parser(opts). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); - - po::parsed_options parsed_file = po::command_line_parser(vw.options_from_file_argc, vw.options_from_file_argv). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). 
- options(desc).allow_unregistered().run(); - po::store(parsed_file, vm_file); - po::notify(vm_file); } - - void handle_history_options(vw& vw, history_info &hinfo, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { - po::options_description desc("search options"); - desc.add_options() + void handle_history_options(vw& vw, history_info &hinfo, po::variables_map& vm) { + po::options_description history_options("history options"); + history_options.add_options() ("search_history", po::value<size_t>(), "length of history to use") ("search_features", po::value<size_t>(), "length of history to pair with observed features") ("search_bigrams", "use bigrams from history") ("search_bigram_features", "use bigrams from history paired with observed features"); - setup_searn_options(desc, vw, opts, vm, vm_file); - - check_option<size_t>(hinfo.length, vw, vm, vm_file, "search_history", false, size_equal, + vm = add_options(vw, history_options); + + check_option<size_t>(hinfo.length, vw, vm, "search_history", false, size_equal, "warning: you specified a different value for --search_history than the one loaded from regressor. proceeding with loaded value: ", ""); - - check_option<size_t>(hinfo.features, vw, vm, vm_file, "search_features", false, size_equal, + + check_option<size_t>(hinfo.features, vw, vm, "search_features", false, size_equal, "warning: you specified a different value for --search_features than the one loaded from regressor. proceeding with loaded value: ", ""); - - check_option (hinfo.bigrams, vw, vm, vm_file, "search_bigrams", false, + + check_option (hinfo.bigrams, vw, vm, "search_bigrams", false, "warning: you specified --search_bigrams but that wasn't loaded from regressor. proceeding with loaded value: "); - - check_option (hinfo.bigram_features, vw, vm, vm_file, "search_bigram_features", false, + + check_option (hinfo.bigram_features, vw, vm, "search_bigram_features", false, "warning: you specified --search_bigram_features but that wasn't loaded from regressor. proceeding with loaded value: "); } @@ -2603,12 +2575,12 @@ void print_update(vw& all, searn& srn) count++; } fclose(f); - + v_array<COST_SENSITIVE::label> allowed; for (size_t from=0; from<A; from++) { v_array<COST_SENSITIVE::wclass> costs; - + for (size_t to=0; to<A; to++) if (bg[from * (A+1) + to]) { COST_SENSITIVE::wclass c = { FLT_MAX, (uint32_t)to, 0., 0. }; @@ -2621,7 +2593,7 @@ void print_update(vw& all, searn& srn) free(bg); cerr << "read " << count << " allowed transitions from " << filename << endl; - + return allowed; } @@ -2654,25 +2626,26 @@ void print_update(vw& all, searn& srn) } int32_t enc = (posn << 24) | (ns & 0xFF); srn.priv->neighbor_features.push_back(enc); - + p = strtok(NULL, ","); } cmd.erase(); cmd.delete_v(); - + delete cstr; } - learner* setup(vw&all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw&all, po::variables_map& vm) { searn* srn = (searn*)calloc_or_die(1,sizeof(searn)); srn->priv = new searn_private(); srn->priv->all = &all; + srn->all = &all; searn_initialize(all, *srn); - po::options_description desc("Searn options"); - desc.add_options() + po::options_description searn_opts("Searn options"); + searn_opts.add_options() ("search_task", po::value<string>(), "the search task") ("search_interpolation", po::value<string>(), "at what level should interpolation happen? [*data|policy]") ("search_rollout", po::value<string>(), "how should rollouts be executed? 
[*policy|oracle|none]") @@ -2684,7 +2657,9 @@ void print_update(vw& all, searn& srn) ("search_alpha", po::value<float>(), "annealed beta = 1-(1-alpha)^t (only valid for search_interpolation=data) [def=1e-10]") ("search_total_nb_policies", po::value<size_t>(), "if we are going to train the policies through multiple separate calls to vw, we need to specify this parameter and tell vw how many policies are eventually going to be trained") - + + ("search_trained_nb_policies", po::value<size_t>(), "the number of trained policies in a file") + ("search_allowed_transitions",po::value<string>(),"read file of allowed transitions [def: all transitions are allowed]") ("search_subsample_time", po::value<float>(), "instead of training at all timesteps, use a subset. if value in (0,1), train on a random v%. if v>=1, train on precisely v steps per example") ("search_neighbor_features", po::value<string>(), "copy features from neighboring lines. argument looks like: '-1:a,+2' meaning copy previous line namespace a and next next line from namespace _unnamed_, where ',' separates them") @@ -2695,55 +2670,31 @@ void print_update(vw& all, searn& srn) ("search_no_snapshot", "turn off snapshotting capabilities") ("search_no_fastforward", "turn off fastforwarding (note: fastforwarding requires snapshotting)"); - - // removed options: - //("search_allow_current_policy", "allow searn labeling to use the current policy") - //("search_rollout_oracle", "allow searn/dagger to do rollouts with the oracle when estimating cost-to-go") - //("search_as_dagger", po::value<float>(), "sets options to make searn operate as dagger. parameter is the sliding autonomy rate (rate at which beta tends to 1).") - //("search_exploration_temperature", po::value<float>(), "if <0, always choose policy action (default); if T>=0, choose according to e^{-prediction / T} -- done to avoid overfitting") - - po::options_description add_desc_file("Searn options only available in regressor file"); - add_desc_file.add_options()("search_trained_nb_policies", po::value<size_t>(), "the number of trained policies in the regressor file"); - - po::options_description desc_file; - desc_file.add(desc).add(add_desc_file); - - po::parsed_options parsed = po::command_line_parser(opts). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); - - po::parsed_options parsed_file = po::command_line_parser(all.options_from_file_argc, all.options_from_file_argv). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc_file).allow_unregistered().run(); - po::store(parsed_file, vm_file); - po::notify(vm_file); - + vm = add_options(all, searn_opts); + std::string task_string; std::string interpolation_string = "data"; std::string rollout_string = "policy"; std::string trajectory_string = "policy"; - check_option<string>(task_string, all, vm, vm_file, "search_task", false, string_equal, + check_option<string>(task_string, all, vm, "search_task", false, string_equal, "warning: specified --search_task different than the one loaded from regressor. 
using loaded value of: ", "error: you must specify a task using --search_task"); - check_option<string>(interpolation_string, all, vm, vm_file, "search_interpolation", false, string_equal, + check_option<string>(interpolation_string, all, vm, "search_interpolation", false, string_equal, "warning: specified --search_interpolation different than the one loaded from regressor. using loaded value of: ", ""); - check_option<string>(rollout_string, all, vm, vm_file, "search_rollout", false, string_equal, + check_option<string>(rollout_string, all, vm, "search_rollout", false, string_equal, "warning: specified --search_rollout different than the one loaded from regressor. using loaded value of: ", ""); - check_option<string>(trajectory_string, all, vm, vm_file, "search_trajectory", false, string_equal, + check_option<string>(trajectory_string, all, vm, "search_trajectory", false, string_equal, "warning: specified --search_trajectory different than the one loaded from regressor. using loaded value of: ", ""); - + if (vm.count("search_passes_per_policy")) srn->priv->passes_per_policy = vm["search_passes_per_policy"].as<size_t>(); if (vm.count("search_beta")) srn->priv->beta = vm["search_beta" ].as<float>(); if (vm.count("search_alpha")) srn->priv->alpha = vm["search_alpha" ].as<float>(); if (vm.count("search_subsample_time")) srn->priv->subsample_timesteps = vm["search_subsample_time"].as<float>(); - - check_option<string>(*srn->priv->neighbor_features_string, all, vm, vm_file, "search_neighbor_features", false, string_equal, + + check_option<string>(*srn->priv->neighbor_features_string, all, vm, "search_neighbor_features", false, string_equal, "warning: you specified a different feature structure with --search_neighbor_features than the one loaded from predictor. using loaded value of: ", ""); parse_neighbor_features(*srn); @@ -2785,11 +2736,11 @@ void print_update(vw& all, searn& srn) throw exception(); } - //check_option<float >(srn->priv->exploration_temperature, all, vm, vm_file, "search_exploration_temperature", false, float_equal, + //check_option<float >(srn->priv->exploration_temperature, all, vm, "search_exploration_temperature", false, float_equal, // "warning: you specified a different value through --search_exploration_temperature than the one loaded from predictor. using loaded value of: ", ""); - check_option<size_t>(srn->priv->A, all, vm, vm_file, "search", false, size_equal, + check_option<size_t>(srn->priv->A, all, vm, "search", false, size_equal, "warning: you specified a different number of actions through --search than the one loaded from predictor. using loaded value of: ", ""); - + //if (vm.count("search_allow_current_policy")) srn->priv->allow_current_policy = true; //if (vm.count("search_rollout_oracle")) srn->priv->rollout_oracle = true; @@ -2807,24 +2758,19 @@ void print_update(vw& all, searn& srn) cerr << "error: kbest must be at least equal to beam_size" << endl; throw exception(); } - + //check if the base learner is contextual bandit, in which case, we dont rollout all actions. 
- if (vm.count("cb") || vm_file.count("cb")) { + if (vm.count("cb")) { srn->priv->rollout_all_actions = false; srn->priv->valid_labels = new CB::label(); } else { srn->priv->rollout_all_actions = true; srn->priv->valid_labels = new COST_SENSITIVE::label(); } - + //if we loaded a regressor with -i option, --search_trained_nb_policies contains the number of trained policies in the file // and --search_total_nb_policies contains the total number of policies in the file - if ( vm_file.count("search_total_nb_policies") ) { - srn->priv->current_policy = (uint32_t)vm_file["search_trained_nb_policies"].as<size_t>(); - srn->priv->total_number_of_policies = (uint32_t)vm_file["search_total_nb_policies"].as<size_t>(); - if (vm.count("search_total_nb_policies") && (uint32_t)vm["search_total_nb_policies"].as<size_t>() != srn->priv->total_number_of_policies) - std::cerr << "warning: --search_total_nb_policies doesn't match the total number of policies stored in initial predictor. Using loaded value of: " << srn->priv->total_number_of_policies << endl; - } else if (vm.count("search_total_nb_policies")) + if (vm.count("search_total_nb_policies")) srn->priv->total_number_of_policies = (uint32_t)vm["search_total_nb_policies"].as<size_t>(); ensure_param(srn->priv->beta , 0.0, 1.0, 0.5, "warning: search_beta must be in (0,1); resetting to 0.5"); @@ -2832,11 +2778,11 @@ void print_update(vw& all, searn& srn) //compute total number of policies we will have at end of training // we add current_policy for cases where we start from an initial set of policies loaded through -i option - uint32_t tmp_number_of_policies = srn->priv->current_policy; + uint32_t tmp_number_of_policies = srn->priv->current_policy; if( all.training ) tmp_number_of_policies += (int)ceil(((float)all.numpasses) / ((float)srn->priv->passes_per_policy)); - //the user might have specified the number of policies that will eventually be trained through multiple vw calls, + //the user might have specified the number of policies that will eventually be trained through multiple vw calls, //so only set total_number_of_policies to computed value if it is larger cdbg << "current_policy=" << srn->priv->current_policy << " tmp_number_of_policies=" << tmp_number_of_policies << " total_number_of_policies=" << srn->priv->total_number_of_policies << endl; if( tmp_number_of_policies > srn->priv->total_number_of_policies ) { @@ -2852,8 +2798,8 @@ void print_update(vw& all, searn& srn) srn->priv->current_policy--; std::stringstream ss1, ss2; - ss1 << srn->priv->current_policy; VW::cmd_string_replace_value(all.options_from_file,"--search_trained_nb_policies", ss1.str()); - ss2 << srn->priv->total_number_of_policies; VW::cmd_string_replace_value(all.options_from_file,"--search_total_nb_policies", ss2.str()); + ss1 << srn->priv->current_policy; VW::cmd_string_replace_value(all.file_options,"--search_trained_nb_policies", ss1.str()); + ss2 << srn->priv->total_number_of_policies; VW::cmd_string_replace_value(all.file_options,"--search_total_nb_policies", ss2.str()); cdbg << "search current_policy = " << srn->priv->current_policy << " total_number_of_policies = " << srn->priv->total_number_of_policies << endl; @@ -2870,20 +2816,20 @@ void print_update(vw& all, searn& srn) all.p->emptylines_separate_examples = true; // default to OAA labels unless the task wants to override this! 
- all.p->lp = MULTICLASS::mc_label; - srn->task->initialize(*srn, srn->priv->A, opts, vm, vm_file); + all.p->lp = MULTICLASS::mc_label; + srn->task->initialize(*srn, srn->priv->A, vm); if (vm.count("search_allowed_transitions")) read_allowed_transitions((uint32_t)srn->priv->A, vm["search_allowed_transitions"].as<string>().c_str()); - + // set up auto-history if they want it if (srn->priv->auto_history) { default_info(&srn->priv->hinfo); - handle_history_options(all, srn->priv->hinfo, opts, vm, vm_file); - + handle_history_options(all, srn->priv->hinfo, vm); + if (srn->priv->hinfo.length < srn->priv->hinfo.features) srn->priv->hinfo.length = srn->priv->hinfo.features; - + if (srn->priv->hinfo.length == 0) srn->priv->auto_history = false; } else { @@ -2914,38 +2860,38 @@ void print_update(vw& all, searn& srn) l->set_end_examples<searn,end_examples>(); l->set_finish<searn,searn_finish>(); l->set_end_pass<searn,end_pass>(); - + return l; } // the interface: - uint32_t searn::predict(example* ecs, size_t ec_len, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed) // for LDF + uint32_t searn::predictLDF(example* ecs, size_t ec_len, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed) // for LDF { return searn_predict(this->priv, ecs, ec_len, yallowed, ystar, false); } - uint32_t searn::predict(example* ecs, size_t ec_len, uint32_t one_ystar, v_array<uint32_t>* yallowed) // for LDF + uint32_t searn::predictLDF(example* ecs, size_t ec_len, uint32_t one_ystar, v_array<uint32_t>* yallowed) // for LDF { if (one_ystar == (uint32_t)-1) // test example return searn_predict(this->priv, ecs, ec_len, yallowed, NULL, false); else return searn_predict(this->priv, ecs, ec_len, yallowed, (v_array<uint32_t>*)&one_ystar, true); } - + uint32_t searn::predict(example* ec, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed) // for not LDF { return searn_predict(this->priv, ec, 0, yallowed, ystar, false); } - + uint32_t searn::predict(example* ec, uint32_t one_ystar, v_array<uint32_t>* yallowed) // for not LDF { if (one_ystar == (uint32_t)-1) // test example return searn_predict(this->priv, ec, 0, yallowed, NULL, false); else return searn_predict(this->priv, ec, 0, yallowed, (v_array<uint32_t>*)&one_ystar, true); } - + void searn::loss(float incr_loss, size_t predictions_since_last) { searn_declare_loss(this->priv, predictions_since_last, incr_loss); } - + void searn::snapshot(size_t index, size_t tag, void* data_ptr, size_t sizeof_data, bool used_for_prediction) { searn_snapshot(this->priv, index, tag, data_ptr, sizeof_data, used_for_prediction); } - + stringstream& searn::output() { return searn_output_streamstream(this->priv); } @@ -2975,26 +2921,26 @@ void print_update(vw& all, searn& srn) NON-LDF BEAM 100 ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task sequence --search 5 -i z2.model -p out --search_beam 100 --search_kbest 100 - - + + LDF TRAIN ./vw -k -c -d z2 --passes 50 --invariant --search_as_dagger 1e-8 --search_task sequence_demoldf --search 5 --holdout_off -f z2.model --audit --csoaa_ldf m LDF NO BEAM - ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task sequence_demoldf --search 5 -i z2.model --audit --csoaa_ldf m -p out + ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task sequence_demoldf --search 5 -i z2.model --audit --csoaa_ldf m -p out LDF BEAM 1 - + ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task sequence_demoldf --search 5 -i z2.model --audit --csoaa_ldf m -p out --search_beam 1 LDF BEAM 100 - + ./vw -k -c -d z2 -t --search_as_dagger 1e-8 --search_task 
sequence_demoldf --search 5 -i z2.model --audit --csoaa_ldf m -p out --search_beam 100 --search_kbest 100 === SPAN === - + TRAIN ./vw -k -c -d seqtest --passes 50 --invariant --search_as_dagger 1e-8 --search_task sequencespan --search 7 --holdout_off -f seqtest.model @@ -3004,11 +2950,11 @@ void print_update(vw& all, searn& srn) ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out BEAM 1 - + ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_beam 1 BEAM 100 - + ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_beam 100 --search_kbest 100 @@ -3023,14 +2969,14 @@ void print_update(vw& all, searn& srn) ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_bilou BEAM 1 - + ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_bilou --search_beam 1 BEAM 100 - + ./vw -k -c -d seqtest -t --search_as_dagger 1e-8 --search_task sequencespan --search 7 -i seqtest.model -p out --search_beam 100 --search_kbest 100 --search_bilou - - + + */ @@ -3075,4 +3021,3 @@ next slow commit ab38c2fedee0ba7ca86bd4cf4d145a27d4a6f5d5 */ - diff --git a/vowpalwabbit/searn.h b/vowpalwabbit/searn.h index 1caa8f7e..3c6031c2 100644 --- a/vowpalwabbit/searn.h +++ b/vowpalwabbit/searn.h @@ -48,20 +48,21 @@ namespace Searn { uint32_t predict(example* ec, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed=NULL); // if there are multiple oracle actions // for making predictions in LDF mode: - uint32_t predict(example* ecs, size_t ec_len, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed=NULL); // if there are multiple oracle actions - uint32_t predict(example* ecs, size_t ec_len, uint32_t one_ystar, v_array<uint32_t>* yallowed=NULL); // if there is a single oracle action + uint32_t predictLDF(example* ecs, size_t ec_len, v_array<uint32_t>* ystar, v_array<uint32_t>* yallowed=NULL); // if there are multiple oracle actions + uint32_t predictLDF(example* ecs, size_t ec_len, uint32_t one_ystar, v_array<uint32_t>* yallowed=NULL); // if there is a single oracle action // for generating output (check to see if output().good() before attempting to write!)
stringstream& output(); // internal data searn_task* task; + vw* all; searn_private* priv; void* task_data; }; - template<class T> void check_option(T& ret, vw&all, po::variables_map& vm, po::variables_map& vm_file, const char* opt_name, bool default_to_cmdline, bool(*equal)(T,T), const char* mismatch_error_string, const char* required_error_string); - void check_option(bool& ret, vw&all, po::variables_map& vm, po::variables_map& vm_file, const char* opt_name, bool default_to_cmdline, const char* mismatch_error_string); + template<class T> void check_option(T& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, bool(*equal)(T,T), const char* mismatch_error_string, const char* required_error_string); + void check_option(bool& ret, vw&all, po::variables_map& vm, const char* opt_name, bool default_to_cmdline, const char* mismatch_error_string); bool string_equal(string a, string b); bool float_equal(float a, float b); bool uint32_equal(uint32_t a, uint32_t b); @@ -69,12 +70,12 @@ namespace Searn { struct searn_task { const char* task_name; - void (*initialize)(searn&,size_t&,std::vector<std::string>&, po::variables_map&, po::variables_map&); + void (*initialize)(searn&,size_t&, po::variables_map&); void (*finish)(searn&); void (*structured_predict)(searn&, std::vector<example*>); }; - LEARNER::learner* setup(vw&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + LEARNER::learner* setup(vw&, po::variables_map&); void searn_finish(void*); void searn_drive(void*); void searn_learn(void*,example*); diff --git a/vowpalwabbit/searn_sequencetask.cc b/vowpalwabbit/searn_sequencetask.cc index 7c761fb5..df962943 100644 --- a/vowpalwabbit/searn_sequencetask.cc +++ b/vowpalwabbit/searn_sequencetask.cc @@ -18,7 +18,7 @@ namespace SequenceSpanTask { Searn::searn_task task = { "sequencespan", namespace SequenceTask { using namespace Searn; - void initialize(searn& srn, size_t& num_actions, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { + void initialize(searn& srn, size_t& num_actions, po::variables_map& vm) { srn.set_options( AUTO_HISTORY | // automatically add history features to our examples, please AUTO_HAMMING_LOSS | // please just use hamming loss on individual predictions -- we won't declare loss EXAMPLES_DONT_CHANGE ); // we don't do any internal example munging @@ -44,25 +44,28 @@ namespace ArgmaxTask { struct task_data { float false_negative_cost; + float negative_weight; bool predict_max; }; - void initialize(searn& srn, size_t& num_actions, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + void initialize(searn& srn, size_t& num_actions, po::variables_map& vm) { task_data* my_task_data = new task_data(); - po::options_description desc("search sequencespan options"); - desc.add_options()("cost", po::value<float>(&(my_task_data->false_negative_cost))->default_value(10.0), "False Negative Cost"); - desc.add_options()("max", po::value<bool>(&(my_task_data->predict_max))->default_value(false), "Disable structure: just predict the max"); + po::options_description argmax_opts("argmax options"); + argmax_opts.add_options() + ("cost", po::value<float>(&(my_task_data->false_negative_cost))->default_value(10.0), "False Negative Cost") + ("negative_weight", po::value<float>(&(my_task_data->negative_weight))->default_value(1), "Relative weight of negative examples") + ("max", "Disable structure: just predict the max"); - po::parsed_options parsed = po::command_line_parser(opts). 
- style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); + vm = add_options(*srn.all, argmax_opts); + if (vm.count("max")) + my_task_data->predict_max = true; + else + my_task_data->predict_max = false; + srn.set_task_data(my_task_data); if (my_task_data->predict_max) @@ -97,9 +100,9 @@ namespace ArgmaxTask { } float loss = 0.; if (max_label > max_prediction) - loss = my_task_data->false_negative_cost; + loss = my_task_data->false_negative_cost / my_task_data->negative_weight; else if (max_prediction > max_label) - loss = 1.; + loss = 1.; srn.loss(loss); if (srn.output().good()) @@ -167,20 +170,15 @@ namespace SequenceSpanTask { v_array<uint32_t> only_two_allowed; // used for BILOU encoding }; - void initialize(searn& srn, size_t& num_actions, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { + void initialize(searn& srn, size_t& num_actions, po::variables_map& vm) { task_data * my_task_data = new task_data(); - po::options_description desc("search sequencespan options"); - desc.add_options()("search_span_bilou", "switch to (internal) BILOU encoding instead of BIO encoding"); + po::options_description sspan_opts("search sequencespan options"); + sspan_opts.add_options()("search_span_bilou", "switch to (internal) BILOU encoding instead of BIO encoding"); - po::parsed_options parsed = po::command_line_parser(opts). - style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing). - options(desc).allow_unregistered().run(); - opts = po::collect_unrecognized(parsed.options, po::include_positional); - po::store(parsed, vm); - po::notify(vm); + vm = add_options(*srn.all, sspan_opts); - if (vm.count("search_span_bilou") || vm_file.count("search_span_bilou")) { + if (vm.count("search_span_bilou")) { cerr << "switching to BILOU encoding for sequence span labeling" << endl; my_task_data->encoding = BILOU; num_actions = num_actions * 2 - 1; @@ -268,7 +266,7 @@ namespace SequenceTask_DemoLDF { // this is just to debug/show off how to do LD size_t num_actions; }; - void initialize(searn& srn, size_t& num_actions, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) { + void initialize(searn& srn, size_t& num_actions, po::variables_map& vm) { COST_SENSITIVE::wclass default_wclass = { 0., 0, 0., 0. }; example* ldf_examples = alloc_examples(sizeof(COST_SENSITIVE::label), num_actions); @@ -315,7 +313,7 @@ namespace SequenceTask_DemoLDF { // this is just to debug/show off how to do LD lab->costs[0].wap_value = 0.; } - size_t pred_id = srn.predict(data->ldf_examples, data->num_actions, MULTICLASS::get_example_label(ec[i]) - 1); + size_t pred_id = srn.predictLDF(data->ldf_examples, data->num_actions, MULTICLASS::get_example_label(ec[i]) - 1); size_t prediction = pred_id + 1; // or ldf_examples[pred_id]->ld.costs[0].weight_index if (srn.output().good()) diff --git a/vowpalwabbit/searn_sequencetask.h b/vowpalwabbit/searn_sequencetask.h index e7a8a703..92621e68 100644 --- a/vowpalwabbit/searn_sequencetask.h +++ b/vowpalwabbit/searn_sequencetask.h @@ -9,28 +9,28 @@ license as described in the file LICENSE. 
#include "searn.h" namespace SequenceTask { - void initialize(Searn::searn&, size_t&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + void initialize(Searn::searn&, size_t&, po::variables_map&); void finish(Searn::searn&); void structured_predict(Searn::searn&, vector<example*>); extern Searn::searn_task task; } namespace ArgmaxTask { - void initialize(Searn::searn&, size_t&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + void initialize(Searn::searn&, size_t&, po::variables_map&); void finish(Searn::searn&); void structured_predict(Searn::searn&, vector<example*>); extern Searn::searn_task task; } namespace SequenceSpanTask { - void initialize(Searn::searn&, size_t&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + void initialize(Searn::searn&, size_t&, po::variables_map&); void finish(Searn::searn&); void structured_predict(Searn::searn&, vector<example*>); extern Searn::searn_task task; } namespace SequenceTask_DemoLDF { - void initialize(Searn::searn&, size_t&, std::vector<std::string>&, po::variables_map&, po::variables_map&); + void initialize(Searn::searn&, size_t&, po::variables_map&); void finish(Searn::searn&); void structured_predict(Searn::searn&, vector<example*>); void update_example_indicies(bool audit, example* ec, uint32_t mult_amount, uint32_t plus_amount); diff --git a/vowpalwabbit/sparse_dense.cc b/vowpalwabbit/sparse_dense.cc deleted file mode 100644 index 417cae28..00000000 --- a/vowpalwabbit/sparse_dense.cc +++ /dev/null @@ -1,17 +0,0 @@ -/* -Copyright (c) by respective owners including Yahoo!, Microsoft, and -individual contributors. All rights reserved. Released under a BSD (revised) -license as described in the file LICENSE. - */ - -#include "sparse_dense.h" -#include "constant.h" -#include <math.h> - - -void sd_offset_update(weight* weights, size_t mask, feature* begin, feature* end, size_t offset, float update, float regularization) -{ - for (feature* f = begin; f!= end; f++) - weights[(f->weight_index + offset) & mask] += update * f->x - regularization * weights[(f->weight_index + offset) & mask]; -} - diff --git a/vowpalwabbit/sparse_dense.h b/vowpalwabbit/sparse_dense.h deleted file mode 100644 index 1d6140e0..00000000 --- a/vowpalwabbit/sparse_dense.h +++ /dev/null @@ -1,98 +0,0 @@ -/* -Copyright (c) by respective owners including Yahoo!, Microsoft, and -individual contributors. All rights reserved. Released under a BSD -license as described in the file LICENSE. - */ -#ifndef SPARSE_DENSE_H -#define SPARSE_DENSE_H - -#include <math.h> -#include "parse_example.h" -#include "constant.h" - -template<class R> -struct predict_data { - float prediction; - R extra; -}; - -inline float sign(float w){ if (w < 0.) return -1.; else return 1.;} - -inline float trunc_weight(const float w, const float gravity){ - return (gravity < fabsf(w)) ? w - sign(w) * gravity : 0.f; -} - -inline void vec_add(float& p, const float fx, float& fw) { - p += fw * fx; -} - -inline void vec_add_trunc(predict_data<float>& p, const float fx, float& fw) { - p.prediction += trunc_weight(fw, p.extra) * fx; -} - -template <bool adaptive, size_t normalized_idx> -inline void vec_add_rescale(float& p, const float fx, float& fw) { - weight* w = &fw; - float x_abs = fabs(fx); - if( x_abs > w[normalized_idx] ) {// new scale discovered - if( w[normalized_idx] > 0. ) {//If the normalizer is > 0 then rescale the weight so it's as if the new scale was the old scale. - float rescale = (w[normalized_idx]/x_abs); - w[0] *= (adaptive ? 
rescale : rescale*rescale); - } - w[normalized_idx] = x_abs; - } - p += fw * fx; -} - -template <bool adaptive, size_t normalized_idx> -inline void vec_add_trunc_rescale(predict_data<float>& p, const float fx, float& fw) { - weight* w = &fw; - float x_abs = fabs(fx); - if( x_abs > w[normalized_idx] ) { - if( w[normalized_idx] > 0. ) { - float rescale = (w[normalized_idx]/x_abs); - w[0] *= (adaptive ? rescale : rescale*rescale); - } - w[normalized_idx] = x_abs; - } - p.prediction += trunc_weight(fw, p.extra) * fx; -} - -template <bool adaptive, size_t normalized_idx> -inline void vec_add_rescale_general(predict_data<float>& p, const float fx, float& fw) { - weight* w = &fw; - float x_abs = fabs(fx); - float power_t_norm = 1.f - (adaptive ? p.extra : 0.f); - if( x_abs > w[normalized_idx] ) { - if( w[normalized_idx] > 0. ) { - float rescale = (w[normalized_idx]/x_abs); - w[0] *= powf(rescale*rescale,power_t_norm); - } - w[normalized_idx] = x_abs; - } - p.prediction += fw * fx; -} - -struct gnp { - float gravity; - float power_t; -}; - -template <bool adaptive, size_t normalized_idx> -inline void vec_add_trunc_rescale_general(predict_data<gnp>& p, const float fx, float& fw) { - weight* w = &fw; - float x_abs = fabs(fx); - float power_t_norm = 1.f - (adaptive ? p.extra.power_t : 0.f); - if( x_abs > w[normalized_idx] ) { - if( w[normalized_idx] > 0. ) { - float rescale = (w[normalized_idx]/x_abs); - w[0] *= powf(rescale*rescale,power_t_norm); - } - w[normalized_idx] = x_abs; - } - p.prediction += trunc_weight(fw, p.extra.gravity) * fx; -} - -void sd_offset_update(weight* weights, size_t mask, feature* begin, feature* end, size_t offset, float update, float regularization); - -#endif diff --git a/vowpalwabbit/topk.cc b/vowpalwabbit/topk.cc index e0a50393..89432fa2 100644 --- a/vowpalwabbit/topk.cc +++ b/vowpalwabbit/topk.cc @@ -109,7 +109,7 @@ namespace TOPK { VW::finish_example(all, &ec); } - learner* setup(vw& all, std::vector<std::string>&opts, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { topk* data = (topk*)calloc_or_die(1, sizeof(topk)); diff --git a/vowpalwabbit/topk.h b/vowpalwabbit/topk.h index b1b8adf3..e29e4a09 100644 --- a/vowpalwabbit/topk.h +++ b/vowpalwabbit/topk.h @@ -16,7 +16,7 @@ license as described in the file LICENSE. 
namespace TOPK { - LEARNER::learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw& all, po::variables_map& vm); } #endif diff --git a/vowpalwabbit/unique_sort.cc b/vowpalwabbit/unique_sort.cc index da88b226..8022d1ec 100644 --- a/vowpalwabbit/unique_sort.cc +++ b/vowpalwabbit/unique_sort.cc @@ -41,21 +41,29 @@ void unique_audit_features(v_array<audit_data> &features) features.end = ++last; } -void unique_sort_features(bool audit, example* ae) +void unique_sort_features(bool audit, uint32_t parse_mask, example* ae) { ae->sorted=true; for (unsigned char* b = ae->indices.begin; b != ae->indices.end; b++) { - qsort(ae->atomics[*b].begin, ae->atomics[*b].size(), sizeof(feature), + v_array<feature> features = ae->atomics[*b]; + + for (size_t i = 0; i < features.size(); i++) + features[i].weight_index &= parse_mask; + qsort(features.begin, features.size(), sizeof(feature), order_features); unique_features(ae->atomics[*b]); if (audit) { - qsort(ae->audit_features[*b].begin, ae->audit_features[*b].size(), sizeof(audit_data), + v_array<audit_data> afeatures = ae->audit_features[*b]; + + for (size_t i = 0; i < ae->atomics[*b].size(); i++) + afeatures[i].weight_index &= parse_mask; + + qsort(afeatures.begin, afeatures.size(), sizeof(audit_data), order_audit_features); - unique_audit_features(ae->audit_features[*b]); + unique_audit_features(afeatures); } } } - diff --git a/vowpalwabbit/unique_sort.h b/vowpalwabbit/unique_sort.h index 5b3b91b7..3e5e0d05 100644 --- a/vowpalwabbit/unique_sort.h +++ b/vowpalwabbit/unique_sort.h @@ -8,6 +8,6 @@ license as described in the file LICENSE. #include "parser.h" #include "example.h" -void unique_sort_features(bool audit, example* ae); +void unique_sort_features(bool audit, uint32_t parse_mask, example* ae); #endif diff --git a/vowpalwabbit/vw_static.vcxproj b/vowpalwabbit/vw_static.vcxproj index 9ea2895b..42656934 100644 --- a/vowpalwabbit/vw_static.vcxproj +++ b/vowpalwabbit/vw_static.vcxproj @@ -281,7 +281,6 @@ <ClInclude Include="searn_sequencetask.h" />
<ClInclude Include="sender.h" />
<ClInclude Include="simple_label.h" />
- <ClInclude Include="sparse_dense.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="topk.h" />
<ClInclude Include="unique_sort.h" />
@@ -333,7 +332,6 @@ <ClCompile Include="searn_sequencetask.cc" />
<ClCompile Include="sender.cc" />
<ClCompile Include="simple_label.cc" />
- <ClCompile Include="sparse_dense.cc" />
<ClCompile Include="topk.cc" />
<ClCompile Include="unique_sort.cc" />
<ClCompile Include="wap.cc" />
diff --git a/vowpalwabbit/wap.cc b/vowpalwabbit/wap.cc index 35d796c5..1a0cef0a 100644 --- a/vowpalwabbit/wap.cc +++ b/vowpalwabbit/wap.cc @@ -234,25 +234,19 @@ namespace WAP { VW::finish_example(all, &ec); } - learner* setup(vw& all, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file) + learner* setup(vw& all, po::variables_map& vm) { wap* w=(wap*)calloc_or_die(1,sizeof(wap)); w->all = &all; uint32_t nb_actions = 0; - if( vm_file.count("wap") ) { //if loaded options from regressor - nb_actions = (uint32_t)vm_file["wap"].as<size_t>(); - if( vm.count("wap") && (uint32_t)vm["wap"].as<size_t>() != nb_actions ) - std::cerr << "warning: you specified a different number of actions through --wap than the one loaded from regressor. Pursuing with loaded value of: " << nb_actions << endl; - } - else { - nb_actions = (uint32_t)vm["wap"].as<size_t>(); - //append wap with nb_actions to options_from_file so it is saved to regressor later - std::stringstream ss; - ss << " --wap " << nb_actions; - all.options_from_file.append(ss.str()); - } + nb_actions = (uint32_t)vm["wap"].as<size_t>(); + + //append wap with nb_actions to file_options so it is saved to regressor later + std::stringstream ss; + ss << " --wap " << nb_actions; + all.file_options.append(ss.str()); all.p->lp = COST_SENSITIVE::cs_label; all.sd->k = (uint32_t)nb_actions; diff --git a/vowpalwabbit/wap.h b/vowpalwabbit/wap.h index f1254858..decd9e03 100644 --- a/vowpalwabbit/wap.h +++ b/vowpalwabbit/wap.h @@ -10,7 +10,7 @@ license as described in the file LICENSE. #include "parse_args.h" namespace WAP { - LEARNER::learner* setup(vw&, std::vector<std::string>&, po::variables_map& vm, po::variables_map& vm_file); + LEARNER::learner* setup(vw&, po::variables_map& vm); } #endif
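The pattern running through this whole commit is visible in miniature in the wap.cc hunk just above: instead of consulting a second variables_map (vm_file) populated from the saved regressor, setup code appends any option that must survive a save/load round trip to all.file_options as a plain string ("--wap <k>"), and a single add_options call re-parses both sources into one vm. The real helper lives in VW's parse_args machinery; the sketch below only illustrates the round-trip idea with boost::program_options and a hypothetical fake_vw struct, not the actual implementation:

    #include <boost/program_options.hpp>
    #include <iostream>
    #include <string>
    #include <vector>

    namespace po = boost::program_options;

    // Hypothetical stand-in for the relevant slice of vw's global state.
    struct fake_vw { std::string file_options; };

    // Parse the live command line and the option string recovered from the
    // model file against ONE description, so callers consult a single vm.
    // boost::program_options keeps the first value stored for an option,
    // so here the live command line takes precedence over the file copy.
    po::variables_map parse_both(fake_vw& all, int argc, const char* const argv[],
                                 const po::options_description& desc) {
      po::variables_map vm;
      po::store(po::command_line_parser(argc, argv)
                    .options(desc).allow_unregistered().run(), vm);
      std::vector<std::string> from_file = po::split_unix(all.file_options);
      po::store(po::command_line_parser(from_file)
                    .options(desc).allow_unregistered().run(), vm);
      po::notify(vm);
      return vm;
    }

    int main(int argc, char* argv[]) {
      fake_vw all;
      all.file_options = " --wap 4";   // as WAP::setup appends at save time

      po::options_description desc("wap options");
      desc.add_options()("wap", po::value<size_t>(), "number of actions");

      po::variables_map vm = parse_both(all, argc, argv, desc);
      if (vm.count("wap"))
        std::cout << "wap actions: " << vm["wap"].as<size_t>() << "\n";
      return 0;
    }

Note that the precedence warnings ("using loaded value of: ...") are handled one level up by check_option, whose new single-map signature appears in the searn.h hunk earlier in this diff.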