diff options
author | John Langford <jl@hunch.net> | 2014-11-10 20:33:14 +0300 |
---|---|---|
committer | John Langford <jl@hunch.net> | 2014-11-10 20:33:14 +0300 |
commit | 55d3b14ac014d17e67dda6ac7e145d2aef619313 (patch) | |
tree | 4d3146c63ed97e92c9cddc3fecc86ddcf43d0b36 /python | |
parent | b53ba7c762506c95dc46853515dba0b56e1f5a41 (diff) | |
parent | cef3e42b9d622c28b11ea8bc31951f139b35740f (diff) |
Merge branch 'master' of /home/jl/programs/vowpal_wabbit into v0
Diffstat (limited to 'python')
-rw-r--r-- | python/covington.py | 4 | ||||
-rw-r--r-- | python/pylibvw.cc | 122 | ||||
-rw-r--r-- | python/pyvw.py | 102 | ||||
-rw-r--r-- | python/test_search.py | 6 | ||||
-rw-r--r-- | python/test_search_ldf.py | 65 |
5 files changed, 239 insertions, 60 deletions
diff --git a/python/covington.py b/python/covington.py index b6eeda53..8508bfad 100644 --- a/python/covington.py +++ b/python/covington.py @@ -103,7 +103,7 @@ print 'training non-LDF' vw = pyvw.vw("--search 2 --search_task hook --ring_size 1024 --quiet") task = vw.init_search_task(CovingtonDepParser) for p in range(2): # do two passes over the training data - task.learn(my_dataset.__iter__) + task.learn(my_dataset) print 'testing non-LDF' print task.predict( [(w,-1) for w in "the monster ate a sandwich".split()] ) print 'should have printed [ 1 2 -1 4 2 ]' @@ -114,7 +114,7 @@ print 'training LDF' vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet") task = vw.init_search_task(CovingtonDepParserLDF) for p in range(2): # do two passes over the training data - task.learn(my_dataset.__iter__) + task.learn(my_dataset) print 'testing LDF' print task.predict( [(w,-1) for w in "the monster ate a sandwich".split()] ) print 'should have printed [ 1 2 -1 4 2 ]' diff --git a/python/pylibvw.cc b/python/pylibvw.cc index 4abff109..70580dbd 100644 --- a/python/pylibvw.cc +++ b/python/pylibvw.cc @@ -51,7 +51,7 @@ predictor_ptr get_predictor(search_ptr sch, ptag my_tag) { label_parser* get_label_parser(vw*all, size_t labelType) { switch (labelType) { - case lDEFAULT: return &all->p->lp; // TODO: check null + case lDEFAULT: return all ? &all->p->lp : NULL; case lBINARY: return &simple_label; case lMULTICLASS: return &MULTICLASS::mc_label; case lCOST_SENSITIVE: return &COST_SENSITIVE::cs_label; @@ -62,9 +62,9 @@ label_parser* get_label_parser(vw*all, size_t labelType) { void my_delete_example(void*voidec) { example* ec = (example*) voidec; - size_t labelType = ec->example_counter; + size_t labelType = (ec->tag.size() == 0) ? lDEFAULT : ec->tag[0]; label_parser* lp = get_label_parser(NULL, labelType); - dealloc_example(lp->delete_label, *ec); + dealloc_example(lp ? lp->delete_label : NULL, *ec); free(ec); } @@ -77,36 +77,28 @@ example* my_empty_example0(vw_ptr vw, size_t labelType) { COST_SENSITIVE::wclass zero = { 0., 1, 0., 0. }; ((COST_SENSITIVE::label*)ec->ld)->costs.push_back(zero); } - ec->example_counter = labelType; // example_counter unused in our own examples, so hide labelType in it! + ec->tag.erase(); + if (labelType != lDEFAULT) + ec->tag.push_back((char)labelType); // hide the label type in the tag return ec; } example_ptr my_empty_example(vw_ptr vw, size_t labelType) { - if (labelType == lDEFAULT) { - example* new_ec = VW::new_unused_example(*vw); - return boost::shared_ptr<example>(new_ec, dont_delete_me); - } else { - example* ec = my_empty_example0(vw, labelType); - return boost::shared_ptr<example>(ec, my_delete_example); - } + example* ec = my_empty_example0(vw, labelType); + return boost::shared_ptr<example>(ec, my_delete_example); } example_ptr my_read_example(vw_ptr all, size_t labelType, char*str) { - if (labelType == lDEFAULT) { - example*ec = VW::read_example(*all, str); - return boost::shared_ptr<example>(ec, dont_delete_me); - } else { - example*ec = my_empty_example0(all, labelType); - read_line(*all, ec, str); - parse_atomic_example(*all, ec, false); - VW::setup_example(*all, ec); - ec->example_counter = labelType; - return boost::shared_ptr<example>(ec, my_delete_example); - } + example*ec = my_empty_example0(all, labelType); + read_line(*all, ec, str); + parse_atomic_example(*all, ec, false); + VW::setup_example(*all, ec); + ec->example_counter = labelType; + return boost::shared_ptr<example>(ec, my_delete_example); } void my_finish_example(vw_ptr all, example_ptr ec) { - VW::finish_example(*all, ec.get()); + // TODO } void my_learn(vw_ptr all, example_ptr ec) { @@ -165,6 +157,46 @@ void ex_push_feature(example_ptr ec, unsigned char ns, uint32_t fid, float v) { ec->total_sum_feat_sq += v * v; } +void ex_push_feature_list(example_ptr ec, vw_ptr vw, unsigned char ns, py::list& a) { + // warning: assumes namespace exists! + char ns_str[2] = { ns, 0 }; + uint32_t ns_hash = VW::hash_space(*vw, ns_str); + size_t count = 0; float sum_sq = 0.; + for (size_t i=0; i<len(a); i++) { + feature f = { 1., 0 }; + py::object ai = a[i]; + py::extract<py::tuple> get_tup(ai); + if (get_tup.check()) { + py::tuple fv = get_tup(); + if (len(fv) != 2) { cerr << "warning: malformed feature in list" << endl; continue; } // TODO str(ai) + py::extract<float> get_val(fv[1]); + if (get_val.check()) + f.x = get_val(); + else { cerr << "warning: malformed feature in list" << endl; continue; } + ai = fv[0]; + } + + bool got = false; + py::extract<uint32_t> get_int(ai); + if (get_int.check()) { f.weight_index = get_int(); got = true; } + else { + py::extract<string> get_str(ai); + if (get_str.check()) { + f.weight_index = VW::hash_feature(*vw, get_str(), ns_hash); + got = true; + } else { cerr << "warning: malformed feature in list" << endl; continue; } + } + if (got && (f.x != 0.)) { + ec->atomics[ns].push_back(f); + count++; + sum_sq += f.x * f.x; + } + } + ec->num_features += count; + ec->sum_feat_sq[ns] += sum_sq; + ec->total_sum_feat_sq += sum_sq; +} + bool ex_pop_feature(example_ptr ec, unsigned char ns) { if (ec->atomics[ns].size() == 0) return false; feature f = ec->atomics[ns].pop(); @@ -199,6 +231,7 @@ void my_setup_example(vw_ptr vw, example_ptr ec) { } void ex_set_label_string(example_ptr ec, vw_ptr vw, string label, size_t labelType) { + // SPEEDUP: if it's already set properly, don't modify label_parser& old_lp = vw->p->lp; vw->p->lp = *get_label_parser(&*vw, labelType); VW::parse_example_label(*vw, *ec, label); @@ -310,17 +343,51 @@ void search_run_fn(Search::search&sch) { } } +void search_setup_fn(Search::search&sch) { + try { + HookTask::task_data* d = sch.get_task_data<HookTask::task_data>(); + py::object run = *(py::object*)d->setup_object; + run.attr("__call__")(); + } catch(...) { + PyErr_Print(); + PyErr_Clear(); + throw exception(); + } +} + +void search_takedown_fn(Search::search&sch) { + try { + HookTask::task_data* d = sch.get_task_data<HookTask::task_data>(); + py::object run = *(py::object*)d->takedown_object; + run.attr("__call__")(); + } catch(...) { + PyErr_Print(); + PyErr_Clear(); + throw exception(); + } +} + void py_delete_run_object(void* pyobj) { py::object* o = (py::object*)pyobj; delete o; } -void set_structured_predict_hook(search_ptr sch, py::object run_object) { +void set_structured_predict_hook(search_ptr sch, py::object run_object, py::object setup_object, py::object takedown_object) { verify_search_set_properly(sch); HookTask::task_data* d = sch->get_task_data<HookTask::task_data>(); d->run_f = &search_run_fn; - py::object* new_obj = new py::object(run_object); // TODO: delete me! - d->run_object = new_obj; + delete (py::object*)d->run_object; d->run_object = NULL; + delete (py::object*)d->setup_object; d->setup_object = NULL; + delete (py::object*)d->takedown_object; d->takedown_object = NULL; + d->run_object = new py::object(run_object); + if (setup_object.ptr() != Py_None) { + d->setup_object = new py::object(setup_object); + d->run_setup_f = &search_setup_fn; + } + if (takedown_object.ptr() != Py_None) { + d->takedown_object = new py::object(takedown_object); + d->run_takedown_f = &search_takedown_fn; + } d->delete_run_object = &py_delete_run_object; } @@ -442,6 +509,7 @@ BOOST_PYTHON_MODULE(pylibvw) { .def("feature_weight", &ex_feature_weight, "The the feature value (weight) per .feature(...)") .def("push_hashed_feature", &ex_push_feature, "Add a hashed feature to a given namespace (id=character-ord)") + .def("push_feature_list", &ex_push_feature_list, "Add a (Python) list of features to a given namespace") .def("pop_feature", &ex_pop_feature, "Remove the top feature from a given namespace; returns True iff the list was non-empty") .def("push_namespace", &ex_push_namespace, "Add a new namespace") .def("ensure_namespace_exists", &ex_ensure_namespace_exists, "Add a new namespace if it doesn't already exist") @@ -498,9 +566,11 @@ BOOST_PYTHON_MODULE(pylibvw) { .def("get_history_length", &Search::search::get_history_length, "Get the value specified by --search_history_length") .def("loss", &Search::search::loss, "Declare a (possibly incremental) loss") .def("should_output", &search_should_output, "Check whether search wants us to output (only happens if you have -p running)") + .def("predict_needs_example", &Search::search::predictNeedsExample, "Check whether a subsequent call to predict is actually going to use the example you pass---i.e., can you skip feature computation?") .def("output", &search_output, "Add a string to the coutput (should only do if should_output returns True)") .def("get_num_actions", &search_get_num_actions, "Return the total number of actions search was initialized with") .def("set_structured_predict_hook", &set_structured_predict_hook, "Set the hook (function pointer) that search should use for structured prediction (you don't want to call this yourself!") + .def("is_ldf", &Search::search::is_ldf, "check whether this search task is running in LDF mode") .def("po_exists", &po_exists, "For program (cmd line) options, check to see if a given option was specified; eg sch.po_exists(\"search\") should be True") .def("po_get", &po_get, "For program (cmd line) options, if an option was specified, get its value; eg sch.po_get(\"search\") should return the # of actions (returns either int or string)") diff --git a/python/pyvw.py b/python/pyvw.py index f16c8472..5b68b2e3 100644 --- a/python/pyvw.py +++ b/python/pyvw.py @@ -16,24 +16,31 @@ class SearchTask(): def _run(self, your_own_input_example): pass - def _call_vw(self, fn, isTest): + def _call_vw(self, my_example, isTest): # run_fn, setup_fn, takedown_fn, isTest): + self._output = None self.bogus_example.set_test_only(isTest) - self.sch.set_structured_predict_hook(fn) + def run(): self._output = self._run(my_example) + setup = None + takedown = None + if callable(getattr(self, "_setup", None)): setup = lambda: self._setup(my_example) + if callable(getattr(self, "_takedown", None)): takedown = lambda: self._takedown(my_example) + self.sch.set_structured_predict_hook(run, setup, takedown) self.vw.learn(self.bogus_example) self.vw.learn(self.blank_line) # this will cause our ._run hook to get called def learn(self, data_iterator): - for my_example in data_iterator(): - self._call_vw(lambda: self._run(my_example), isTest=False) + for my_example in data_iterator.__iter__(): + self._call_vw(my_example, isTest=False); + def example(self, initStringOrDict=None, labelType=pylibvw.vw.lDefault): + """TODO""" + if self.sch.predict_needs_example(): + return self.vw.example(initStringOrDict, labelType) + else: + return self.vw.example(None, labelType) + def predict(self, my_example): - self._output = None - def f(): self._output = self._run(my_example) - self._call_vw(f, isTest=True) - #if self._output is None: - # raise Exception('structured predict hook failed to return anything') - # don't raise this exception because your _run code legitimately - # _could_ return None! + self._call_vw(my_example, isTest=True); return self._output class vw(pylibvw.vw): @@ -52,14 +59,16 @@ class vw(pylibvw.vw): the weight for that position in the (learned) weight vector.""" return pylibvw.vw.get_weight(self, index, offset) - def learn(self, example): - """Perform an online update; example can either be an example + def learn(self, ec): + """Perform an online update; ec can either be an example object or a string (in which case it is parsed and then learned on).""" - if isinstance(example, str): - self.learn_string(example) + if isinstance(ec, str): + self.learn_string(ec) else: - pylibvw.vw.learn(self, example) + if hasattr(ec, 'setup_done') and not ec.setup_done: + ec.setup_example() + pylibvw.vw.learn(self, ec) def finish(self): """stop VW by calling finish (and, eg, write weights to disk)""" @@ -81,8 +90,13 @@ class vw(pylibvw.vw): """The basic (via-reduction) prediction mechanism. Several variants are supported through this overloaded function: - 'examples' can be a single example (interpreted as non-LDF - mode) or a list of examples (interpreted as LDF mode) + 'examples' can be a single example (interpreted as + non-LDF mode) or a list of examples (interpreted as + LDF mode). it can also be a lambda function that + returns a single example or list of examples, and in + that list, each element can also be a lambda function + that returns an example. this is done for lazy + example construction (aka speed). 'my_tag' should be an integer id, specifying this prediction @@ -105,18 +119,42 @@ class vw(pylibvw.vw): 'learner_id' specifies the underlying learner id Returns a single prediction. + """ - if (isinstance(examples, list) and all([isinstance(ex, example) or isinstance(ex, pylibvw.example) for ex in examples])) or \ - isinstance(examples, example) or isinstance(examples, pylibvw.example): - P = sch.get_predictor(my_tag) - if isinstance(examples, list): # LDF - P.set_input_length(len(examples)) + + P = sch.get_predictor(my_tag) + if sch.is_ldf(): + # we need to know how many actions there are, even if we don't know their identities + while hasattr(examples, '__call__'): examples = examples() + if not isinstance(examples, list): raise TypeError('expected example _list_ in LDF mode for SearchTask.predict()') + P.set_input_length(len(examples)) + if sch.predict_needs_example(): for n in range(len(examples)): - P.set_input_at(n, examples[n]) - else: # non-LDF + ec = examples[n] + while hasattr(ec, '__call__'): ec = ec() # unfold the lambdas + if not isinstance(ec, example) and not isinstance(ec, pylibvw.example): raise TypeError('non-example in LDF example list in SearchTask.predict()') + P.set_input_at(n, ec) + else: + pass # TODO: do we need to set the examples even though they're not used? + else: + if sch.predict_needs_example(): + while hasattr(examples, '__call__'): examples = examples() P.set_input(examples) - - if isinstance(oracle, list): P.set_oracles(oracle) + else: + pass # TODO: do we need to set the examples even though they're not used? + + # if (isinstance(examples, list) and all([isinstance(ex, example) or isinstance(ex, pylibvw.example) for ex in examples])) or \ + # isinstance(examples, example) or isinstance(examples, pylibvw.example): + # if isinstance(examples, list): # LDF + # P.set_input_length(len(examples)) + # for n in range(len(examples)): + # P.set_input_at(n, examples[n]) + # else: # non-LDF + # P.set_input(examples) + if True: # TODO: get rid of this + if oracle is None: pass + elif isinstance(oracle, list): + if len(oracle) > 0: P.set_oracles(oracle) elif isinstance(oracle, int): P.set_oracle(oracle) else: raise TypeError('expecting oracle to be a list or an integer') @@ -338,7 +376,10 @@ class example(pylibvw.example): get an "empty" example which you can construct by hand (see, eg, example.push_features). If initString is a string, then this string is parsed as it would be from a VW data file into an - example (and "setup_example" is run).""" + example (and "setup_example" is run). if it is a dict, then we add all features in that dictionary. finally, if it's a function, we (repeatedly) execute it fn() until it's not a function any more (for lazy feature computation).""" + + while hasattr(initStringOrDict, '__call__'): + initStringOrDict = initStringOrDict() if initStringOrDict is None: pylibvw.example.__init__(self, vw, labelType) @@ -502,8 +543,8 @@ class example(pylibvw.example): Fails if setup has run.""" ns = self.get_ns(ns) self.ensure_namespace_exists(ns) - ns_hash = self.vw.hash_space(ns.ns) - + #self.push_feature_list(self.vw, ns.ord_ns, featureList) + ns_hash = self.vw.hash_space( ns.ns ) for feature in featureList: if isinstance(feature, int) or isinstance(feature, str): f = feature @@ -516,6 +557,7 @@ class example(pylibvw.example): self.push_feature(ns, f, v, ns_hash) + def finish(self): """Tell VW that you're done with this example and it can recycle it for later use.""" diff --git a/python/test_search.py b/python/test_search.py index 89134460..aef62d73 100644 --- a/python/test_search.py +++ b/python/test_search.py @@ -56,8 +56,10 @@ sequenceLabeler = vw.init_search_task(SequenceLabeler) # train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above print >>sys.stderr, 'training!' -for curPass in range(10): - sequenceLabeler.learn(my_dataset.__iter__) +i = 0 +while i < 10000000000: + sequenceLabeler.learn(my_dataset) + i += 1 # now see the predictions on a test sentence print >>sys.stderr, 'predicting!' diff --git a/python/test_search_ldf.py b/python/test_search_ldf.py new file mode 100644 index 00000000..b72bb1f7 --- /dev/null +++ b/python/test_search_ldf.py @@ -0,0 +1,65 @@ +import sys +import pyvw + +# wow! your data can be ANY type you want... does NOT have to be VW examples +DET = 1 +NOUN = 2 +VERB = 3 +ADJ = 4 +my_dataset = [ [(DET , 'the'), + (NOUN, 'monster'), + (VERB, 'ate'), + (DET , 'a'), + (ADJ , 'big'), + (NOUN, 'sandwich')], + [(DET , 'the'), + (NOUN, 'sandwich'), + (VERB, 'was'), + (ADJ , 'tasty')], + [(NOUN, 'it'), + (VERB, 'ate'), + (NOUN, 'it'), + (ADJ , 'all')] ] + + +class SequenceLabeler(pyvw.SearchTask): + def __init__(self, vw, sch, num_actions): + # you must must must initialize the parent class + # this will automatically store self.sch <- sch, self.vw <- vw + pyvw.SearchTask.__init__(self, vw, sch, num_actions) + + # set whatever options you want + sch.set_options( sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES | sch.IS_LDF ) + + def makeExample(self, word, p): + ex = self.example({'w': [word + '_' + str(p)]}, labelType=self.vw.lCostSensitive) + ex.set_label_string(str(p) + ':0') + return ex + + def _run(self, sentence): # it's called _run to remind you that you shouldn't call it directly! + output = [] + for n in range(len(sentence)): + pos,word = sentence[n] + # use "with...as..." to guarantee that the example is finished properly + ex = [ self.makeExample(word,p) for p in [DET,NOUN,VERB,ADJ] ] + pred = self.sch.predict(examples=ex, my_tag=n+1, oracle=pos-1, condition=(n,'p')) + output.append(pred + 1) + return output + +# initialize VW as usual, but use 'hook' as the search_task +vw = pyvw.vw("--search 0 --csoaa_ldf m --quiet --search_task hook --ring_size 1024") + +# tell VW to construct your search task object +sequenceLabeler = vw.init_search_task(SequenceLabeler) + +# train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above +print >>sys.stderr, 'training!' +i = 0 +while i < 100000000: + sequenceLabeler.learn(my_dataset) + i += 1 + +# now see the predictions on a test sentence +print >>sys.stderr, 'predicting!' +print sequenceLabeler.predict( [(1,w) for w in "the sandwich ate a monster".split()] ) +print 'should have printed: [1, 2, 3, 1, 2]' |