github.com/moses-smt/vowpal_wabbit.git
author     Hal Daume III <me@hal3.name>  2014-03-26 19:54:10 +0400
committer  Hal Daume III <me@hal3.name>  2014-03-26 19:54:10 +0400
commit     aca18de090b46b5141411af499a6e1311c94e24e (patch)
tree       fa7b5bb752f773f7a46349b197a7ff7eece58698 /vowpalwabbit/csoaa.cc
parent     4bb13399b3bc9094de2bab9e06a1c8cdf77316f8 (diff)
parent     4ba0bef0e733da4bba4bad127287720f5fe1a0a8 (diff)
integrated john's changes
Diffstat (limited to 'vowpalwabbit/csoaa.cc')
-rw-r--r--  vowpalwabbit/csoaa.cc  343
1 file changed, 25 insertions(+), 318 deletions(-)
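
For context (not part of the patch itself): the bulk of this diff deletes csoaa.cc's local label plumbing (name_value, parse_label, the cache/bufread helpers, print_update, output_example) and switches the surviving call sites from the CSOAA:: label types to the shared COST_SENSITIVE:: ones pulled in via cost_sensitive.h. The snippet below is a minimal sketch of that call-site pattern, assuming the vw.h / cost_sensitive.h headers from the tree are available; example_loss is a hypothetical helper, not a function in the repository, and it mirrors the loss computation of the removed output_example.

    #include <float.h>
    #include "vw.h"
    #include "cost_sensitive.h"  // replaces the per-reduction label code from csoaa.h

    float example_loss(example& ec, size_t predicted_class)
    {
      // Before this commit the cast was to CSOAA::label / CSOAA::wclass;
      // after it, only the shared COST_SENSITIVE types remain.
      v_array<COST_SENSITIVE::wclass> costs = ((COST_SENSITIVE::label*)ec.ld)->costs;

      float chosen  = FLT_MAX;   // cost of the predicted class
      float minimum = FLT_MAX;   // best achievable cost over all classes
      for (size_t j = 0; j < costs.size(); j++) {
        if (costs[j].weight_index == predicted_class) chosen = costs[j].x;
        if (costs[j].x < minimum)                     minimum = costs[j].x;
      }
      return chosen - minimum;   // regret, as in the removed output_example below
    }
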
diff --git a/vowpalwabbit/csoaa.cc b/vowpalwabbit/csoaa.cc
index fd202c24..df6b73c6 100644
--- a/vowpalwabbit/csoaa.cc
+++ b/vowpalwabbit/csoaa.cc
@@ -7,10 +7,9 @@ license as described in the file LICENSE.
#include <math.h>
#include <stdio.h>
-#include "csoaa.h"
+#include "cost_sensitive.h"
#include "simple_label.h"
#include "cache.h"
-#include "oaa.h"
#include "v_hashmap.h"
#include "parse_example.h"
#include "vw.h"
@@ -19,295 +18,13 @@ using namespace std;
using namespace LEARNER;
+using namespace COST_SENSITIVE;
+
namespace CSOAA {
struct csoaa{
vw* all;
};
- void name_value(substring &s, v_array<substring>& name, float &v)
- {
- tokenize(':', s, name);
-
- switch (name.size()) {
- case 0:
- case 1:
- v = 1.;
- break;
- case 2:
- v = float_of_substring(name[1]);
- if ( nanpattern(v))
- {
- cerr << "error NaN value for: ";
- cerr.write(name[0].begin, name[0].end - name[0].begin);
- cerr << " terminating." << endl;
- throw exception();
- }
- break;
- default:
- cerr << "example with a wierd name. What is '";
- cerr.write(s.begin, s.end - s.begin);
- cerr << "'?\n";
- }
- }
-
- bool is_test_label(label* ld)
- {
- if (ld->costs.size() == 0)
- return true;
- for (unsigned int i=0; i<ld->costs.size(); i++)
- if (FLT_MAX != ld->costs[i].x)
- return false;
- return true;
- }
-
- char* bufread_label(label* ld, char* c, io_buf& cache)
- {
- size_t num = *(size_t *)c;
- ld->costs.erase();
- c += sizeof(size_t);
- size_t total = sizeof(wclass)*num;
- if (buf_read(cache, c, (int)total) < total)
- {
- cout << "error in demarshal of cost data" << endl;
- return c;
- }
- for (size_t i = 0; i<num; i++)
- {
- wclass temp = *(wclass *)c;
- c += sizeof(wclass);
- ld->costs.push_back(temp);
- }
-
- return c;
- }
-
- size_t read_cached_label(shared_data*, void* v, io_buf& cache)
- {
- label* ld = (label*) v;
- ld->costs.erase();
- char *c;
- size_t total = sizeof(size_t);
- if (buf_read(cache, c, (int)total) < total)
- return 0;
- c = bufread_label(ld,c, cache);
-
- return total;
- }
-
- float weight(void* v)
- {
- return 1.;
- }
-
- char* bufcache_label(label* ld, char* c)
- {
- *(size_t *)c = ld->costs.size();
- c += sizeof(size_t);
- for (unsigned int i = 0; i< ld->costs.size(); i++)
- {
- *(wclass *)c = ld->costs[i];
- c += sizeof(wclass);
- }
- return c;
- }
-
- void cache_label(void* v, io_buf& cache)
- {
- char *c;
- label* ld = (label*) v;
- buf_write(cache, c, sizeof(size_t)+sizeof(wclass)*ld->costs.size());
- bufcache_label(ld,c);
- }
-
- void default_label(void* v)
- {
- label* ld = (label*) v;
- ld->costs.erase();
- }
-
- void delete_label(void* v)
- {
- label* ld = (label*)v;
- ld->costs.delete_v();
- }
-
- void copy_label(void*&dst, void*src)
- {
- label*&ldD = (label*&)dst;
- label* ldS = (label* )src;
- copy_array(ldD->costs, ldS->costs);
- }
-
- bool substring_eq(substring ss, const char* str) {
- size_t len_ss = ss.end - ss.begin;
- size_t len_str = strlen(str);
- if (len_ss != len_str) return false;
- return (strncmp(ss.begin, str, len_ss) == 0);
- }
-
- void parse_label(parser* p, shared_data* sd, void* v, v_array<substring>& words)
- {
- label* ld = (label*)v;
-
- ld->costs.erase();
- for (unsigned int i = 0; i < words.size(); i++) {
- wclass f = {0.,0,0.,0.};
- name_value(words[i], p->parse_name, f.x);
-
- if (p->parse_name.size() == 0)
- cerr << "invalid cost: specification -- no names!" << endl;
- else {
- if (substring_eq(p->parse_name[0], "shared")) {
- if (p->parse_name.size() == 1) {
- f.x = -1;
- f.weight_index = 0;
- } else
- cerr << "shared feature vectors should not have costs" << endl;
- } else if (substring_eq(p->parse_name[0], "label")) {
- if (p->parse_name.size() == 2) {
- f.weight_index = (size_t)f.x;
- f.x = -1;
- } else
- cerr << "label feature vectors must have label ids" << endl;
- } else {
- f.weight_index = 0;
- if (p->parse_name.size() == 1 || p->parse_name.size() == 2 || p->parse_name.size() == 3) {
- f.weight_index = (uint32_t)hashstring(p->parse_name[0], 0);
- if (p->parse_name.size() == 1 && f.x >= 0) // test examples are specified just by un-valued class #s
- f.x = FLT_MAX;
-
- if ((f.weight_index >= 1) && (f.weight_index <= sd->k) && (f.x >= 0)) {} // normal example
- else if ((f.weight_index >= 1) && (f.weight_index <= sd->k) && (f.x <= -1)) {} // label definition
- else if ((f.weight_index == 0) && (f.x <= -1)) {} // shared header
- else
- cerr << "invalid cost specification: " << f.weight_index << endl;
- } else
- cerr << "malformed cost specification on '" << (p->parse_name[0].begin) << "'" << endl;
- }
- ld->costs.push_back(f);
- }
- }
-
- if (words.size() == 0) {
- if (sd->k != (uint32_t)-1) {
- for (uint32_t i = 1; i <= sd->k; i++) {
- wclass f = {FLT_MAX, i, 0., 0.};
- ld->costs.push_back(f);
- }
- } else {
- //cerr << "ldf test examples must have possible labels listed" << endl;
- //throw exception();
- }
- }
- }
-
- void print_update(vw& all, bool is_test, example& ec)
- {
- if (all.sd->weighted_examples >= all.sd->dump_interval && !all.quiet && !all.bfgs)
- {
- char label_buf[32];
- if (is_test)
- strcpy(label_buf," unknown");
- else
- sprintf(label_buf," known");
-
- if(!all.holdout_set_off && all.current_pass >= 1)
- {
- if(all.sd->holdout_sum_loss == 0. && all.sd->weighted_holdout_examples == 0.)
- fprintf(stderr, " unknown ");
- else
- fprintf(stderr, "%-10.6f " , all.sd->holdout_sum_loss/all.sd->weighted_holdout_examples);
-
- if(all.sd->holdout_sum_loss_since_last_dump == 0. && all.sd->weighted_holdout_examples_since_last_dump == 0.)
- fprintf(stderr, " unknown ");
- else
- fprintf(stderr, "%-10.6f " , all.sd->holdout_sum_loss_since_last_dump/all.sd->weighted_holdout_examples_since_last_dump);
-
- fprintf(stderr, "%8ld %8.1f %s %8lu %8lu h\n",
- (long int)all.sd->example_number,
- all.sd->weighted_examples,
- label_buf,
- (long unsigned int)ec.final_prediction,
- (long unsigned int)ec.num_features);
-
- all.sd->weighted_holdout_examples_since_last_dump = 0;
- all.sd->holdout_sum_loss_since_last_dump = 0.0;
- }
- else
- fprintf(stderr, "%-10.6f %-10.6f %8ld %8.1f %s %8lu %8lu\n",
- all.sd->sum_loss/all.sd->weighted_examples,
- all.sd->sum_loss_since_last_dump / (all.sd->weighted_examples - all.sd->old_weighted_examples),
- (long int)all.sd->example_number,
- all.sd->weighted_examples,
- label_buf,
- (long unsigned int)ec.final_prediction,
- (long unsigned int)ec.num_features);
-
- all.sd->sum_loss_since_last_dump = 0.0;
- all.sd->old_weighted_examples = all.sd->weighted_examples;
- VW::update_dump_interval(all);
- }
- }
-
- void output_example(vw& all, example& ec)
- {
- label* ld = (label*)ec.ld;
-
- float loss = 0.;
- if (!is_test_label(ld))
- {//need to compute exact loss
- size_t pred = (size_t)ec.final_prediction;
-
- float chosen_loss = FLT_MAX;
- float min = FLT_MAX;
- for (wclass *cl = ld->costs.begin; cl != ld->costs.end; cl ++) {
- if (cl->weight_index == pred)
- chosen_loss = cl->x;
- if (cl->x < min)
- min = cl->x;
- }
- if (chosen_loss == FLT_MAX)
- cerr << "warning: csoaa predicted an invalid class" << endl;
-
- loss = chosen_loss - min;
- }
-
- if(ec.test_only)
- {
- all.sd->weighted_holdout_examples += ec.global_weight;//test weight seen
- all.sd->weighted_holdout_examples_since_last_dump += ec.global_weight;
- all.sd->weighted_holdout_examples_since_last_pass += ec.global_weight;
- all.sd->holdout_sum_loss += loss;
- all.sd->holdout_sum_loss_since_last_dump += loss;
- all.sd->holdout_sum_loss_since_last_pass += loss;//since last pass
- }
- else
- {
- all.sd->weighted_examples += 1.;
- all.sd->total_features += ec.num_features;
- all.sd->sum_loss += loss;
- all.sd->sum_loss_since_last_dump += loss;
- all.sd->example_number++;
- }
-
- for (int* sink = all.final_prediction_sink.begin; sink != all.final_prediction_sink.end; sink++)
- all.print((int)*sink, ec.final_prediction, 0, ec.tag);
-
- if (all.raw_prediction > 0) {
- string outputString;
- stringstream outputStringStream(outputString);
- for (unsigned int i = 0; i < ld->costs.size(); i++) {
- wclass cl = ld->costs[i];
- if (i > 0) outputStringStream << ' ';
- outputStringStream << cl.weight_index << ':' << cl.partial_prediction;
- }
- //outputStringStream << endl;
- all.print_text(all.raw_prediction, outputStringStream.str(), ec.tag);
- }
-
- print_update(all, is_test_label((label*)ec.ld), ec);
- }
-
template <bool is_learn>
void predict_or_learn(csoaa& c, learner& base, example& ec) {
vw* all = c.all;
@@ -382,16 +99,6 @@ namespace CSOAA {
l->set_finish_example<csoaa,finish_example>();
return l;
}
-
- bool example_is_test(example& ec)
- {
- v_array<CSOAA::wclass> costs = ((label*)ec.ld)->costs;
- if (costs.size() == 0) return true;
- for (size_t j=0; j<costs.size(); j++)
- if (costs[j].x != FLT_MAX) return false;
- return true;
- }
-
}
namespace CSOAA_AND_WAP_LDF {
@@ -419,7 +126,7 @@ namespace LabelDict {
bool ec_is_label_definition(example& ec) // label defs look like "___:-1"
{
- v_array<CSOAA::wclass> costs = ((CSOAA::label*)ec.ld)->costs;
+ v_array<COST_SENSITIVE::wclass> costs = ((COST_SENSITIVE::label*)ec.ld)->costs;
for (size_t j=0; j<costs.size(); j++)
if (costs[j].x >= 0.) return false;
if (ec.indices.size() == 0) return false;
@@ -430,7 +137,7 @@ namespace LabelDict {
bool ec_is_example_header(example& ec) // example headers look like "0:-1"
{
- v_array<CSOAA::wclass> costs = ((CSOAA::label*)ec.ld)->costs;
+ v_array<COST_SENSITIVE::wclass> costs = ((COST_SENSITIVE::label*)ec.ld)->costs;
if (costs.size() != 1) return false;
if (costs[0].weight_index != 0) return false;
if (costs[0].x >= 0) return false;
@@ -549,9 +256,9 @@ namespace LabelDict {
}
}
- inline bool cmp_wclass_ptr(const CSOAA::wclass* a, const CSOAA::wclass* b) { return a->x < b->x; }
+ inline bool cmp_wclass_ptr(const COST_SENSITIVE::wclass* a, const COST_SENSITIVE::wclass* b) { return a->x < b->x; }
- void compute_wap_values(vector<CSOAA::wclass*> costs) {
+ void compute_wap_values(vector<COST_SENSITIVE::wclass*> costs) {
std::sort(costs.begin(), costs.end(), cmp_wclass_ptr);
costs[0]->wap_value = 0.;
for (size_t i=1; i<costs.size(); i++)
@@ -622,7 +329,7 @@ namespace LabelDict {
void make_single_prediction(vw& all, ldf& l, learner& base, example& ec, size_t*prediction, float*min_score, float*min_cost, float*max_cost) {
label *ld = (label*)ec.ld;
- v_array<CSOAA::wclass> costs = ld->costs;
+ v_array<COST_SENSITIVE::wclass> costs = ld->costs;
label_data simple_label;
if (costs.size() == 0) {
@@ -667,14 +374,14 @@ namespace LabelDict {
void do_actual_learning_wap(vw& all, ldf& l, learner& base, size_t start_K)
{
size_t K = l.ec_seq.size();
- bool isTest = CSOAA::example_is_test(*l.ec_seq[start_K]);
+ bool isTest = COST_SENSITIVE::example_is_test(*l.ec_seq[start_K]);
size_t prediction = 0;
float min_score = FLT_MAX;
for (size_t k=start_K; k<K; k++) {
example *ec = l.ec_seq.begin[k];
- if (CSOAA::example_is_test(*ec) != isTest) {
+ if (COST_SENSITIVE::example_is_test(*ec) != isTest) {
isTest = true;
cerr << "warning: wap_ldf got mix of train/test data; assuming test" << endl;
}
@@ -687,10 +394,10 @@ namespace LabelDict {
}
// do actual learning
- vector<CSOAA::wclass*> all_costs;
+ vector<COST_SENSITIVE::wclass*> all_costs;
if (is_learn && all.training && !isTest) {
for (size_t k=start_K; k<K; k++) {
- v_array<CSOAA::wclass> this_costs = ((label*)l.ec_seq.begin[k]->ld)->costs;
+ v_array<COST_SENSITIVE::wclass> this_costs = ((label*)l.ec_seq.begin[k]->ld)->costs;
for (size_t j=0; j<this_costs.size(); j++)
all_costs.push_back(&this_costs[j]);
}
@@ -703,7 +410,7 @@ namespace LabelDict {
for (size_t k1=start_K; k1<K; k1++) {
example *ec1 = l.ec_seq.begin[k1];
label *ld1 = (label*)ec1->ld;
- v_array<CSOAA::wclass> costs1 = ld1->costs;
+ v_array<COST_SENSITIVE::wclass> costs1 = ld1->costs;
bool prediction_is_me = false;
ec1->ld = &simple_label;
float example_t1 = ec1->example_t;
@@ -716,7 +423,7 @@ namespace LabelDict {
for (size_t k2=k1+1; k2<K; k2++) {
example *ec2 = l.ec_seq.begin[k2];
label *ld2 = (label*)ec2->ld;
- v_array<CSOAA::wclass> costs2 = ld2->costs;
+ v_array<COST_SENSITIVE::wclass> costs2 = ld2->costs;
for (size_t j2=0; j2<costs2.size(); j2++) {
if (costs2[j2].weight_index == (uint32_t)-1) continue;
@@ -759,7 +466,7 @@ namespace LabelDict {
{
size_t K = l.ec_seq.size();
size_t prediction = 0;
- bool isTest = CSOAA::example_is_test(*l.ec_seq[start_K]);
+ bool isTest = COST_SENSITIVE::example_is_test(*l.ec_seq[start_K]);
float min_score = FLT_MAX;
float min_cost = FLT_MAX;
float max_cost = -FLT_MAX;
@@ -768,7 +475,7 @@ namespace LabelDict {
for (size_t k=start_K; k<K; k++) {
example *ec = l.ec_seq.begin[k];
- if (CSOAA::example_is_test(*ec) != isTest) {
+ if (COST_SENSITIVE::example_is_test(*ec) != isTest) {
isTest = true;
cerr << "warning: ldf got mix of train/test data; assuming test" << endl;
}
@@ -786,7 +493,7 @@ namespace LabelDict {
for (size_t k=start_K; k<K; k++) {
example *ec = l.ec_seq.begin[k];
label *ld = (label*)ec->ld;
- v_array<CSOAA::wclass> costs = ld->costs;
+ v_array<COST_SENSITIVE::wclass> costs = ld->costs;
// learn
label_data simple_label;
@@ -855,7 +562,7 @@ namespace LabelDict {
features.push_back(fnew);
}
- v_array<CSOAA::wclass> costs = ((CSOAA::label*)l.ec_seq[i]->ld)->costs;
+ v_array<COST_SENSITIVE::wclass> costs = ((COST_SENSITIVE::label*)l.ec_seq[i]->ld)->costs;
for (size_t j=0; j<costs.size(); j++) {
size_t lab = costs[j].weight_index;
LabelDict::set_label_features(l, lab, features);
@@ -886,7 +593,7 @@ namespace LabelDict {
void output_example(vw& all, example& ec, bool& hit_loss)
{
label* ld = (label*)ec.ld;
- v_array<CSOAA::wclass> costs = ld->costs;
+ v_array<COST_SENSITIVE::wclass> costs = ld->costs;
if (example_is_newline(ec)) return;
if (LabelDict::ec_is_example_header(ec)) return;
@@ -897,7 +604,7 @@ namespace LabelDict {
float loss = 0.;
size_t final_pred = (size_t)ec.final_prediction;
- if (!CSOAA::example_is_test(ec)) {
+ if (!COST_SENSITIVE::example_is_test(ec)) {
for (size_t j=0; j<costs.size(); j++) {
if (hit_loss) break;
if (final_pred == costs[j].weight_index) {
@@ -926,7 +633,7 @@ namespace LabelDict {
}
- CSOAA::print_update(all, CSOAA::example_is_test(ec), ec);
+ COST_SENSITIVE::print_update(all, COST_SENSITIVE::example_is_test(ec), ec);
}
void output_example_seq(vw& all, ldf& l)
@@ -968,7 +675,7 @@ namespace LabelDict {
vw* all = l->all;
l->base = &base;
- if ((!all->training) || CSOAA::example_is_test(ec)) {
+ if ((!all->training) || COST_SENSITIVE::example_is_test(ec)) {
size_t prediction = 0;
float min_score = FLT_MAX;
make_single_prediction(*all, *l, base, ec, &prediction, &min_score, NULL, NULL);
@@ -992,7 +699,7 @@ namespace LabelDict {
if (l->ec_seq.size() > 0)
cerr << "warning: label definition encountered in data block -- ignoring data!" << endl;
- if (!((!all->training) || CSOAA::example_is_test(ec))) {
+ if (!((!all->training) || COST_SENSITIVE::example_is_test(ec))) {
l->ec_seq.erase();
l->ec_seq.push_back(ec);
do_actual_learning<is_learn>(*all, *l, base);
@@ -1057,7 +764,7 @@ namespace LabelDict {
vw* all = l.all;
l.base = &base;
- bool is_test = CSOAA::example_is_test(ec) || !all->training;
+ bool is_test = COST_SENSITIVE::example_is_test(ec) || !all->training;
if (is_test)
make_single_prediction(*all, l, base, ec, NULL, NULL, NULL, NULL);
@@ -1130,7 +837,7 @@ namespace LabelDict {
all.options_from_file.append(ldf_arg);
}
- all.p->lp = CSOAA::cs_label_parser;
+ all.p->lp = COST_SENSITIVE::cs_label_parser;
all.sd->k = (uint32_t)-1;