Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/vowpal_wabbit.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/python
diff options
context:
space:
mode:
author: John Langford <jl@hunch.net> 2014-11-10 20:33:14 +0300
committer: John Langford <jl@hunch.net> 2014-11-10 20:33:14 +0300
commit: 55d3b14ac014d17e67dda6ac7e145d2aef619313 (patch)
tree: 4d3146c63ed97e92c9cddc3fecc86ddcf43d0b36 /python
parent: b53ba7c762506c95dc46853515dba0b56e1f5a41 (diff)
parent: cef3e42b9d622c28b11ea8bc31951f139b35740f (diff)
Merge branch 'master' of /home/jl/programs/vowpal_wabbit into v0
Diffstat (limited to 'python')
-rw-r--r-- python/covington.py 4
-rw-r--r-- python/pylibvw.cc 122
-rw-r--r-- python/pyvw.py 102
-rw-r--r-- python/test_search.py 6
-rw-r--r-- python/test_search_ldf.py 65
5 files changed, 239 insertions, 60 deletions
diff --git a/python/covington.py b/python/covington.py
index b6eeda53..8508bfad 100644
--- a/python/covington.py
+++ b/python/covington.py
@@ -103,7 +103,7 @@ print 'training non-LDF'
vw = pyvw.vw("--search 2 --search_task hook --ring_size 1024 --quiet")
task = vw.init_search_task(CovingtonDepParser)
for p in range(2): # do two passes over the training data
- task.learn(my_dataset.__iter__)
+ task.learn(my_dataset)
print 'testing non-LDF'
print task.predict( [(w,-1) for w in "the monster ate a sandwich".split()] )
print 'should have printed [ 1 2 -1 4 2 ]'
@@ -114,7 +114,7 @@ print 'training LDF'
vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet")
task = vw.init_search_task(CovingtonDepParserLDF)
for p in range(2): # do two passes over the training data
- task.learn(my_dataset.__iter__)
+ task.learn(my_dataset)
print 'testing LDF'
print task.predict( [(w,-1) for w in "the monster ate a sandwich".split()] )
print 'should have printed [ 1 2 -1 4 2 ]'
diff --git a/python/pylibvw.cc b/python/pylibvw.cc
index 4abff109..70580dbd 100644
--- a/python/pylibvw.cc
+++ b/python/pylibvw.cc
@@ -51,7 +51,7 @@ predictor_ptr get_predictor(search_ptr sch, ptag my_tag) {
label_parser* get_label_parser(vw*all, size_t labelType) {
switch (labelType) {
- case lDEFAULT: return &all->p->lp; // TODO: check null
+ case lDEFAULT: return all ? &all->p->lp : NULL;
case lBINARY: return &simple_label;
case lMULTICLASS: return &MULTICLASS::mc_label;
case lCOST_SENSITIVE: return &COST_SENSITIVE::cs_label;
@@ -62,9 +62,9 @@ label_parser* get_label_parser(vw*all, size_t labelType) {
void my_delete_example(void*voidec) {
example* ec = (example*) voidec;
- size_t labelType = ec->example_counter;
+ size_t labelType = (ec->tag.size() == 0) ? lDEFAULT : ec->tag[0];
label_parser* lp = get_label_parser(NULL, labelType);
- dealloc_example(lp->delete_label, *ec);
+ dealloc_example(lp ? lp->delete_label : NULL, *ec);
free(ec);
}
@@ -77,36 +77,28 @@ example* my_empty_example0(vw_ptr vw, size_t labelType) {
COST_SENSITIVE::wclass zero = { 0., 1, 0., 0. };
((COST_SENSITIVE::label*)ec->ld)->costs.push_back(zero);
}
- ec->example_counter = labelType; // example_counter unused in our own examples, so hide labelType in it!
+ ec->tag.erase();
+ if (labelType != lDEFAULT)
+ ec->tag.push_back((char)labelType); // hide the label type in the tag
return ec;
}
example_ptr my_empty_example(vw_ptr vw, size_t labelType) {
- if (labelType == lDEFAULT) {
- example* new_ec = VW::new_unused_example(*vw);
- return boost::shared_ptr<example>(new_ec, dont_delete_me);
- } else {
- example* ec = my_empty_example0(vw, labelType);
- return boost::shared_ptr<example>(ec, my_delete_example);
- }
+ example* ec = my_empty_example0(vw, labelType);
+ return boost::shared_ptr<example>(ec, my_delete_example);
}
example_ptr my_read_example(vw_ptr all, size_t labelType, char*str) {
- if (labelType == lDEFAULT) {
- example*ec = VW::read_example(*all, str);
- return boost::shared_ptr<example>(ec, dont_delete_me);
- } else {
- example*ec = my_empty_example0(all, labelType);
- read_line(*all, ec, str);
- parse_atomic_example(*all, ec, false);
- VW::setup_example(*all, ec);
- ec->example_counter = labelType;
- return boost::shared_ptr<example>(ec, my_delete_example);
- }
+ example*ec = my_empty_example0(all, labelType);
+ read_line(*all, ec, str);
+ parse_atomic_example(*all, ec, false);
+ VW::setup_example(*all, ec);
+ ec->example_counter = labelType;
+ return boost::shared_ptr<example>(ec, my_delete_example);
}
void my_finish_example(vw_ptr all, example_ptr ec) {
- VW::finish_example(*all, ec.get());
+ // TODO
}
void my_learn(vw_ptr all, example_ptr ec) {
@@ -165,6 +157,46 @@ void ex_push_feature(example_ptr ec, unsigned char ns, uint32_t fid, float v) {
ec->total_sum_feat_sq += v * v;
}
+void ex_push_feature_list(example_ptr ec, vw_ptr vw, unsigned char ns, py::list& a) {
+ // warning: assumes namespace exists!
+ char ns_str[2] = { ns, 0 };
+ uint32_t ns_hash = VW::hash_space(*vw, ns_str);
+ size_t count = 0; float sum_sq = 0.;
+ for (size_t i=0; i<len(a); i++) {
+ feature f = { 1., 0 };
+ py::object ai = a[i];
+ py::extract<py::tuple> get_tup(ai);
+ if (get_tup.check()) {
+ py::tuple fv = get_tup();
+ if (len(fv) != 2) { cerr << "warning: malformed feature in list" << endl; continue; } // TODO str(ai)
+ py::extract<float> get_val(fv[1]);
+ if (get_val.check())
+ f.x = get_val();
+ else { cerr << "warning: malformed feature in list" << endl; continue; }
+ ai = fv[0];
+ }
+
+ bool got = false;
+ py::extract<uint32_t> get_int(ai);
+ if (get_int.check()) { f.weight_index = get_int(); got = true; }
+ else {
+ py::extract<string> get_str(ai);
+ if (get_str.check()) {
+ f.weight_index = VW::hash_feature(*vw, get_str(), ns_hash);
+ got = true;
+ } else { cerr << "warning: malformed feature in list" << endl; continue; }
+ }
+ if (got && (f.x != 0.)) {
+ ec->atomics[ns].push_back(f);
+ count++;
+ sum_sq += f.x * f.x;
+ }
+ }
+ ec->num_features += count;
+ ec->sum_feat_sq[ns] += sum_sq;
+ ec->total_sum_feat_sq += sum_sq;
+}
+
bool ex_pop_feature(example_ptr ec, unsigned char ns) {
if (ec->atomics[ns].size() == 0) return false;
feature f = ec->atomics[ns].pop();
@@ -199,6 +231,7 @@ void my_setup_example(vw_ptr vw, example_ptr ec) {
}
void ex_set_label_string(example_ptr ec, vw_ptr vw, string label, size_t labelType) {
+ // SPEEDUP: if it's already set properly, don't modify
label_parser& old_lp = vw->p->lp;
vw->p->lp = *get_label_parser(&*vw, labelType);
VW::parse_example_label(*vw, *ec, label);
@@ -310,17 +343,51 @@ void search_run_fn(Search::search&sch) {
}
}
+void search_setup_fn(Search::search&sch) {
+ try {
+ HookTask::task_data* d = sch.get_task_data<HookTask::task_data>();
+ py::object run = *(py::object*)d->setup_object;
+ run.attr("__call__")();
+ } catch(...) {
+ PyErr_Print();
+ PyErr_Clear();
+ throw exception();
+ }
+}
+
+void search_takedown_fn(Search::search&sch) {
+ try {
+ HookTask::task_data* d = sch.get_task_data<HookTask::task_data>();
+ py::object run = *(py::object*)d->takedown_object;
+ run.attr("__call__")();
+ } catch(...) {
+ PyErr_Print();
+ PyErr_Clear();
+ throw exception();
+ }
+}
+
void py_delete_run_object(void* pyobj) {
py::object* o = (py::object*)pyobj;
delete o;
}
-void set_structured_predict_hook(search_ptr sch, py::object run_object) {
+void set_structured_predict_hook(search_ptr sch, py::object run_object, py::object setup_object, py::object takedown_object) {
verify_search_set_properly(sch);
HookTask::task_data* d = sch->get_task_data<HookTask::task_data>();
d->run_f = &search_run_fn;
- py::object* new_obj = new py::object(run_object); // TODO: delete me!
- d->run_object = new_obj;
+ delete (py::object*)d->run_object; d->run_object = NULL;
+ delete (py::object*)d->setup_object; d->setup_object = NULL;
+ delete (py::object*)d->takedown_object; d->takedown_object = NULL;
+ d->run_object = new py::object(run_object);
+ if (setup_object.ptr() != Py_None) {
+ d->setup_object = new py::object(setup_object);
+ d->run_setup_f = &search_setup_fn;
+ }
+ if (takedown_object.ptr() != Py_None) {
+ d->takedown_object = new py::object(takedown_object);
+ d->run_takedown_f = &search_takedown_fn;
+ }
d->delete_run_object = &py_delete_run_object;
}
@@ -442,6 +509,7 @@ BOOST_PYTHON_MODULE(pylibvw) {
.def("feature_weight", &ex_feature_weight, "The the feature value (weight) per .feature(...)")
.def("push_hashed_feature", &ex_push_feature, "Add a hashed feature to a given namespace (id=character-ord)")
+ .def("push_feature_list", &ex_push_feature_list, "Add a (Python) list of features to a given namespace")
.def("pop_feature", &ex_pop_feature, "Remove the top feature from a given namespace; returns True iff the list was non-empty")
.def("push_namespace", &ex_push_namespace, "Add a new namespace")
.def("ensure_namespace_exists", &ex_ensure_namespace_exists, "Add a new namespace if it doesn't already exist")
@@ -498,9 +566,11 @@ BOOST_PYTHON_MODULE(pylibvw) {
.def("get_history_length", &Search::search::get_history_length, "Get the value specified by --search_history_length")
.def("loss", &Search::search::loss, "Declare a (possibly incremental) loss")
.def("should_output", &search_should_output, "Check whether search wants us to output (only happens if you have -p running)")
+ .def("predict_needs_example", &Search::search::predictNeedsExample, "Check whether a subsequent call to predict is actually going to use the example you pass---i.e., can you skip feature computation?")
.def("output", &search_output, "Add a string to the coutput (should only do if should_output returns True)")
.def("get_num_actions", &search_get_num_actions, "Return the total number of actions search was initialized with")
.def("set_structured_predict_hook", &set_structured_predict_hook, "Set the hook (function pointer) that search should use for structured prediction (you don't want to call this yourself!")
+ .def("is_ldf", &Search::search::is_ldf, "check whether this search task is running in LDF mode")
.def("po_exists", &po_exists, "For program (cmd line) options, check to see if a given option was specified; eg sch.po_exists(\"search\") should be True")
.def("po_get", &po_get, "For program (cmd line) options, if an option was specified, get its value; eg sch.po_get(\"search\") should return the # of actions (returns either int or string)")
diff --git a/python/pyvw.py b/python/pyvw.py
index f16c8472..5b68b2e3 100644
--- a/python/pyvw.py
+++ b/python/pyvw.py
@@ -16,24 +16,31 @@ class SearchTask():
def _run(self, your_own_input_example):
pass
- def _call_vw(self, fn, isTest):
+ def _call_vw(self, my_example, isTest): # run_fn, setup_fn, takedown_fn, isTest):
+ self._output = None
self.bogus_example.set_test_only(isTest)
- self.sch.set_structured_predict_hook(fn)
+ def run(): self._output = self._run(my_example)
+ setup = None
+ takedown = None
+ if callable(getattr(self, "_setup", None)): setup = lambda: self._setup(my_example)
+ if callable(getattr(self, "_takedown", None)): takedown = lambda: self._takedown(my_example)
+ self.sch.set_structured_predict_hook(run, setup, takedown)
self.vw.learn(self.bogus_example)
self.vw.learn(self.blank_line) # this will cause our ._run hook to get called
def learn(self, data_iterator):
- for my_example in data_iterator():
- self._call_vw(lambda: self._run(my_example), isTest=False)
+ for my_example in data_iterator.__iter__():
+ self._call_vw(my_example, isTest=False);
+ def example(self, initStringOrDict=None, labelType=pylibvw.vw.lDefault):
+ """TODO"""
+ if self.sch.predict_needs_example():
+ return self.vw.example(initStringOrDict, labelType)
+ else:
+ return self.vw.example(None, labelType)
+
def predict(self, my_example):
- self._output = None
- def f(): self._output = self._run(my_example)
- self._call_vw(f, isTest=True)
- #if self._output is None:
- # raise Exception('structured predict hook failed to return anything')
- # don't raise this exception because your _run code legitimately
- # _could_ return None!
+ self._call_vw(my_example, isTest=True);
return self._output
class vw(pylibvw.vw):
@@ -52,14 +59,16 @@ class vw(pylibvw.vw):
the weight for that position in the (learned) weight vector."""
return pylibvw.vw.get_weight(self, index, offset)
- def learn(self, example):
- """Perform an online update; example can either be an example
+ def learn(self, ec):
+ """Perform an online update; ec can either be an example
object or a string (in which case it is parsed and then
learned on)."""
- if isinstance(example, str):
- self.learn_string(example)
+ if isinstance(ec, str):
+ self.learn_string(ec)
else:
- pylibvw.vw.learn(self, example)
+ if hasattr(ec, 'setup_done') and not ec.setup_done:
+ ec.setup_example()
+ pylibvw.vw.learn(self, ec)
def finish(self):
"""stop VW by calling finish (and, eg, write weights to disk)"""
@@ -81,8 +90,13 @@ class vw(pylibvw.vw):
"""The basic (via-reduction) prediction mechanism. Several
variants are supported through this overloaded function:
- 'examples' can be a single example (interpreted as non-LDF
- mode) or a list of examples (interpreted as LDF mode)
+ 'examples' can be a single example (interpreted as
+ non-LDF mode) or a list of examples (interpreted as
+ LDF mode). it can also be a lambda function that
+ returns a single example or list of examples, and in
+ that list, each element can also be a lambda function
+ that returns an example. this is done for lazy
+ example construction (aka speed).
'my_tag' should be an integer id, specifying this prediction
@@ -105,18 +119,42 @@ class vw(pylibvw.vw):
'learner_id' specifies the underlying learner id
Returns a single prediction.
+
"""
- if (isinstance(examples, list) and all([isinstance(ex, example) or isinstance(ex, pylibvw.example) for ex in examples])) or \
- isinstance(examples, example) or isinstance(examples, pylibvw.example):
- P = sch.get_predictor(my_tag)
- if isinstance(examples, list): # LDF
- P.set_input_length(len(examples))
+
+ P = sch.get_predictor(my_tag)
+ if sch.is_ldf():
+ # we need to know how many actions there are, even if we don't know their identities
+ while hasattr(examples, '__call__'): examples = examples()
+ if not isinstance(examples, list): raise TypeError('expected example _list_ in LDF mode for SearchTask.predict()')
+ P.set_input_length(len(examples))
+ if sch.predict_needs_example():
for n in range(len(examples)):
- P.set_input_at(n, examples[n])
- else: # non-LDF
+ ec = examples[n]
+ while hasattr(ec, '__call__'): ec = ec() # unfold the lambdas
+ if not isinstance(ec, example) and not isinstance(ec, pylibvw.example): raise TypeError('non-example in LDF example list in SearchTask.predict()')
+ P.set_input_at(n, ec)
+ else:
+ pass # TODO: do we need to set the examples even though they're not used?
+ else:
+ if sch.predict_needs_example():
+ while hasattr(examples, '__call__'): examples = examples()
P.set_input(examples)
-
- if isinstance(oracle, list): P.set_oracles(oracle)
+ else:
+ pass # TODO: do we need to set the examples even though they're not used?
+
+ # if (isinstance(examples, list) and all([isinstance(ex, example) or isinstance(ex, pylibvw.example) for ex in examples])) or \
+ # isinstance(examples, example) or isinstance(examples, pylibvw.example):
+ # if isinstance(examples, list): # LDF
+ # P.set_input_length(len(examples))
+ # for n in range(len(examples)):
+ # P.set_input_at(n, examples[n])
+ # else: # non-LDF
+ # P.set_input(examples)
+ if True: # TODO: get rid of this
+ if oracle is None: pass
+ elif isinstance(oracle, list):
+ if len(oracle) > 0: P.set_oracles(oracle)
elif isinstance(oracle, int): P.set_oracle(oracle)
else: raise TypeError('expecting oracle to be a list or an integer')
@@ -338,7 +376,10 @@ class example(pylibvw.example):
get an "empty" example which you can construct by hand (see, eg,
example.push_features). If initString is a string, then this
string is parsed as it would be from a VW data file into an
- example (and "setup_example" is run)."""
+ example (and "setup_example" is run). if it is a dict, then we add all features in that dictionary. finally, if it's a function, we (repeatedly) execute it fn() until it's not a function any more (for lazy feature computation)."""
+
+ while hasattr(initStringOrDict, '__call__'):
+ initStringOrDict = initStringOrDict()
if initStringOrDict is None:
pylibvw.example.__init__(self, vw, labelType)
@@ -502,8 +543,8 @@ class example(pylibvw.example):
Fails if setup has run."""
ns = self.get_ns(ns)
self.ensure_namespace_exists(ns)
- ns_hash = self.vw.hash_space(ns.ns)
-
+ #self.push_feature_list(self.vw, ns.ord_ns, featureList)
+ ns_hash = self.vw.hash_space( ns.ns )
for feature in featureList:
if isinstance(feature, int) or isinstance(feature, str):
f = feature
@@ -516,6 +557,7 @@ class example(pylibvw.example):
self.push_feature(ns, f, v, ns_hash)
+
def finish(self):
"""Tell VW that you're done with this example and it can
recycle it for later use."""
diff --git a/python/test_search.py b/python/test_search.py
index 89134460..aef62d73 100644
--- a/python/test_search.py
+++ b/python/test_search.py
@@ -56,8 +56,10 @@ sequenceLabeler = vw.init_search_task(SequenceLabeler)
# train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above
print >>sys.stderr, 'training!'
-for curPass in range(10):
- sequenceLabeler.learn(my_dataset.__iter__)
+i = 0
+while i < 10000000000:
+ sequenceLabeler.learn(my_dataset)
+ i += 1
# now see the predictions on a test sentence
print >>sys.stderr, 'predicting!'
diff --git a/python/test_search_ldf.py b/python/test_search_ldf.py
new file mode 100644
index 00000000..b72bb1f7
--- /dev/null
+++ b/python/test_search_ldf.py
@@ -0,0 +1,65 @@
+import sys
+import pyvw
+
+# wow! your data can be ANY type you want... does NOT have to be VW examples
+DET = 1
+NOUN = 2
+VERB = 3
+ADJ = 4
+my_dataset = [ [(DET , 'the'),
+ (NOUN, 'monster'),
+ (VERB, 'ate'),
+ (DET , 'a'),
+ (ADJ , 'big'),
+ (NOUN, 'sandwich')],
+ [(DET , 'the'),
+ (NOUN, 'sandwich'),
+ (VERB, 'was'),
+ (ADJ , 'tasty')],
+ [(NOUN, 'it'),
+ (VERB, 'ate'),
+ (NOUN, 'it'),
+ (ADJ , 'all')] ]
+
+
+class SequenceLabeler(pyvw.SearchTask):
+ def __init__(self, vw, sch, num_actions):
+ # you must must must initialize the parent class
+ # this will automatically store self.sch <- sch, self.vw <- vw
+ pyvw.SearchTask.__init__(self, vw, sch, num_actions)
+
+ # set whatever options you want
+ sch.set_options( sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES | sch.IS_LDF )
+
+ def makeExample(self, word, p):
+ ex = self.example({'w': [word + '_' + str(p)]}, labelType=self.vw.lCostSensitive)
+ ex.set_label_string(str(p) + ':0')
+ return ex
+
+ def _run(self, sentence): # it's called _run to remind you that you shouldn't call it directly!
+ output = []
+ for n in range(len(sentence)):
+ pos,word = sentence[n]
+ # use "with...as..." to guarantee that the example is finished properly
+ ex = [ self.makeExample(word,p) for p in [DET,NOUN,VERB,ADJ] ]
+ pred = self.sch.predict(examples=ex, my_tag=n+1, oracle=pos-1, condition=(n,'p'))
+ output.append(pred + 1)
+ return output
+
+# initialize VW as usual, but use 'hook' as the search_task
+vw = pyvw.vw("--search 0 --csoaa_ldf m --quiet --search_task hook --ring_size 1024")
+
+# tell VW to construct your search task object
+sequenceLabeler = vw.init_search_task(SequenceLabeler)
+
+# train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above
+print >>sys.stderr, 'training!'
+i = 0
+while i < 100000000:
+ sequenceLabeler.learn(my_dataset)
+ i += 1
+
+# now see the predictions on a test sentence
+print >>sys.stderr, 'predicting!'
+print sequenceLabeler.predict( [(1,w) for w in "the sandwich ate a monster".split()] )
+print 'should have printed: [1, 2, 3, 1, 2]'