Merge branch 'master' of /home/jl/programs/vowpal_wabbit into v0

author: John Langford <jl@hunch.net> 2014-11-10 20:33:14 +0300
committer: John Langford <jl@hunch.net> 2014-11-10 20:33:14 +0300
commit: 55d3b14ac014d17e67dda6ac7e145d2aef619313 (patch)
tree: 4d3146c63ed97e92c9cddc3fecc86ddcf43d0b36 /python
parent: b53ba7c762506c95dc46853515dba0b56e1f5a41 (diff)
parent: cef3e42b9d622c28b11ea8bc31951f139b35740f (diff)
5 files changed, 239 insertions, 60 deletions
diff --git a/python/covington.py b/python/covington.py
index b6eeda53..8508bfad 100644
--- a/python/covington.py
+++ b/python/covington.py
@@ -103,7 +103,7 @@ print 'training non-LDF'
 vw = pyvw.vw("--search 2 --search_task hook --ring_size 1024 --quiet")
 task = vw.init_search_task(CovingtonDepParser)
 for p in range(2): # do two passes over the training data
-    task.learn(my_dataset.__iter__)
+    task.learn(my_dataset)
 print 'testing non-LDF'
 print task.predict( [(w,-1) for w in "the monster ate a sandwich".split()] )
 print 'should have printed [ 1 2 -1 4 2 ]'
@@ -114,7 +114,7 @@ print 'training LDF'
 vw = pyvw.vw("--search 0 --csoaa_ldf m --search_task hook --ring_size 1024 --quiet")
 task = vw.init_search_task(CovingtonDepParserLDF)
 for p in range(2): # do two passes over the training data
-    task.learn(my_dataset.__iter__)
+    task.learn(my_dataset)
 print 'testing LDF'
 print task.predict( [(w,-1) for w in "the monster ate a sandwich".split()] )
 print 'should have printed [ 1 2 -1 4 2 ]'
diff --git a/python/pylibvw.cc b/python/pylibvw.cc
index 4abff109..70580dbd 100644
--- a/python/pylibvw.cc
+++ b/python/pylibvw.cc
@@ -51,7 +51,7 @@ predictor_ptr get_predictor(search_ptr sch, ptag my_tag) {
 
 label_parser* get_label_parser(vw*all, size_t labelType) {
   switch (labelType) {
-    case lDEFAULT:           return &all->p->lp; // TODO: check null
+    case lDEFAULT:           return all ? &all->p->lp : NULL;
     case lBINARY:            return &simple_label;
     case lMULTICLASS:        return &MULTICLASS::mc_label;
     case lCOST_SENSITIVE:    return &COST_SENSITIVE::cs_label;
@@ -62,9 +62,9 @@ label_parser* get_label_parser(vw*all, size_t labelType) {
 
 void my_delete_example(void*voidec) {
   example* ec = (example*) voidec;
-  size_t labelType = ec->example_counter;
+  size_t labelType = (ec->tag.size() == 0) ? lDEFAULT : ec->tag[0];
   label_parser* lp = get_label_parser(NULL, labelType);
-  dealloc_example(lp->delete_label, *ec);
+  dealloc_example(lp ? lp->delete_label : NULL, *ec);
   free(ec);
 }
 
@@ -77,36 +77,28 @@ example* my_empty_example0(vw_ptr vw, size_t labelType) {
     COST_SENSITIVE::wclass zero = { 0., 1, 0., 0. };
     ((COST_SENSITIVE::label*)ec->ld)->costs.push_back(zero);
   }
-  ec->example_counter = labelType; // example_counter unused in our own examples, so hide labelType in it!
+  ec->tag.erase();
+  if (labelType != lDEFAULT)
+    ec->tag.push_back((char)labelType);  // hide the label type in the tag
   return ec;
 }
 
 example_ptr my_empty_example(vw_ptr vw, size_t labelType) {
-  if (labelType == lDEFAULT) {
-    example* new_ec = VW::new_unused_example(*vw);
-    return boost::shared_ptr<example>(new_ec, dont_delete_me);
-  } else {
-    example* ec = my_empty_example0(vw, labelType);
-    return boost::shared_ptr<example>(ec, my_delete_example);
-  }
+  example* ec = my_empty_example0(vw, labelType);
+  return boost::shared_ptr<example>(ec, my_delete_example);
 }  
 
 example_ptr my_read_example(vw_ptr all, size_t labelType, char*str) {
-  if (labelType == lDEFAULT) {
-    example*ec = VW::read_example(*all, str);
-    return boost::shared_ptr<example>(ec, dont_delete_me);
-  } else {
-    example*ec = my_empty_example0(all, labelType);
-    read_line(*all, ec, str);
-    parse_atomic_example(*all, ec, false);
-    VW::setup_example(*all, ec);
-    ec->example_counter = labelType;
-    return boost::shared_ptr<example>(ec, my_delete_example);
-  }
+  example*ec = my_empty_example0(all, labelType);
+  read_line(*all, ec, str);
+  parse_atomic_example(*all, ec, false);
+  VW::setup_example(*all, ec);
+  ec->example_counter = labelType;
+  return boost::shared_ptr<example>(ec, my_delete_example);
 }
 
 void my_finish_example(vw_ptr all, example_ptr ec) {
-  VW::finish_example(*all, ec.get());
+  // TODO
 }
 
 void my_learn(vw_ptr all, example_ptr ec) {
@@ -165,6 +157,46 @@ void ex_push_feature(example_ptr ec, unsigned char ns, uint32_t fid, float v) {
   ec->total_sum_feat_sq += v * v;
 }
 
+void ex_push_feature_list(example_ptr ec, vw_ptr vw, unsigned char ns, py::list& a) {
+  // warning: assumes namespace exists!
+  char ns_str[2] = { ns, 0 };
+  uint32_t ns_hash = VW::hash_space(*vw, ns_str);
+  size_t count = 0; float sum_sq = 0.;
+  for (size_t i=0; i<len(a); i++) {
+    feature f = { 1., 0 };
+    py::object ai = a[i];
+    py::extract<py::tuple> get_tup(ai);
+    if (get_tup.check()) {
+      py::tuple fv = get_tup();
+      if (len(fv) != 2) { cerr << "warning: malformed feature in list" << endl; continue; } // TODO str(ai)
+      py::extract<float> get_val(fv[1]);
+      if (get_val.check())
+        f.x = get_val();
+      else { cerr << "warning: malformed feature in list" << endl; continue; }
+      ai = fv[0];
+    }
+    
+    bool got = false;
+    py::extract<uint32_t> get_int(ai);
+    if (get_int.check()) { f.weight_index = get_int(); got = true; }
+    else {
+      py::extract<string> get_str(ai);
+      if (get_str.check()) {
+        f.weight_index = VW::hash_feature(*vw, get_str(), ns_hash);
+        got = true;
+      } else { cerr << "warning: malformed feature in list" << endl; continue; }
+    }
+    if (got && (f.x != 0.)) {
+      ec->atomics[ns].push_back(f);
+      count++;
+      sum_sq += f.x * f.x;
+    }
+  }
+  ec->num_features += count;
+  ec->sum_feat_sq[ns] += sum_sq;
+  ec->total_sum_feat_sq += sum_sq;
+}
+
 bool ex_pop_feature(example_ptr ec, unsigned char ns) {
   if (ec->atomics[ns].size() == 0) return false;
   feature f = ec->atomics[ns].pop();
@@ -199,6 +231,7 @@ void my_setup_example(vw_ptr vw, example_ptr ec) {
 }
 
 void ex_set_label_string(example_ptr ec, vw_ptr vw, string label, size_t labelType) {
+  // SPEEDUP: if it's already set properly, don't modify
   label_parser& old_lp = vw->p->lp;
   vw->p->lp = *get_label_parser(&*vw, labelType);
   VW::parse_example_label(*vw, *ec, label);
@@ -310,17 +343,51 @@ void search_run_fn(Search::search&sch) {
   }
 }
 
+void search_setup_fn(Search::search&sch) {
+  try {
+    HookTask::task_data* d = sch.get_task_data<HookTask::task_data>();
+    py::object run = *(py::object*)d->setup_object;
+    run.attr("__call__")();
+  } catch(...) {
+    PyErr_Print();
+    PyErr_Clear();
+    throw exception();
+  }
+}
+
+void search_takedown_fn(Search::search&sch) {
+  try {
+    HookTask::task_data* d = sch.get_task_data<HookTask::task_data>();
+    py::object run = *(py::object*)d->takedown_object;
+    run.attr("__call__")();
+  } catch(...) {
+    PyErr_Print();
+    PyErr_Clear();
+    throw exception();
+  }
+}
+
 void py_delete_run_object(void* pyobj) {
   py::object* o = (py::object*)pyobj;
   delete o;
 }
 
-void set_structured_predict_hook(search_ptr sch, py::object run_object) {
+void set_structured_predict_hook(search_ptr sch, py::object run_object, py::object setup_object, py::object takedown_object) {
   verify_search_set_properly(sch);
   HookTask::task_data* d = sch->get_task_data<HookTask::task_data>();
   d->run_f = &search_run_fn;
-  py::object* new_obj = new py::object(run_object);  // TODO: delete me!
-  d->run_object = new_obj;
+  delete (py::object*)d->run_object; d->run_object = NULL;
+  delete (py::object*)d->setup_object; d->setup_object = NULL;
+  delete (py::object*)d->takedown_object; d->takedown_object = NULL;
+  d->run_object = new py::object(run_object);
+  if (setup_object.ptr() != Py_None) {
+    d->setup_object = new py::object(setup_object);
+    d->run_setup_f = &search_setup_fn;
+  }
+  if (takedown_object.ptr() != Py_None) {
+    d->takedown_object = new py::object(takedown_object);
+    d->run_takedown_f = &search_takedown_fn;
+  }
   d->delete_run_object = &py_delete_run_object;
 }
 
@@ -442,6 +509,7 @@ BOOST_PYTHON_MODULE(pylibvw) {
       .def("feature_weight", &ex_feature_weight, "The the feature value (weight) per .feature(...)")
 
       .def("push_hashed_feature", &ex_push_feature, "Add a hashed feature to a given namespace (id=character-ord)")
+      .def("push_feature_list", &ex_push_feature_list, "Add a (Python) list of features to a given namespace")
       .def("pop_feature", &ex_pop_feature, "Remove the top feature from a given namespace; returns True iff the list was non-empty")
       .def("push_namespace", &ex_push_namespace, "Add a new namespace")
       .def("ensure_namespace_exists", &ex_ensure_namespace_exists, "Add a new namespace if it doesn't already exist")
@@ -498,9 +566,11 @@ BOOST_PYTHON_MODULE(pylibvw) {
       .def("get_history_length", &Search::search::get_history_length, "Get the value specified by --search_history_length")
       .def("loss", &Search::search::loss, "Declare a (possibly incremental) loss")
       .def("should_output", &search_should_output, "Check whether search wants us to output (only happens if you have -p running)")
+      .def("predict_needs_example", &Search::search::predictNeedsExample, "Check whether a subsequent call to predict is actually going to use the example you pass---i.e., can you skip feature computation?")
       .def("output", &search_output, "Add a string to the coutput (should only do if should_output returns True)")
       .def("get_num_actions", &search_get_num_actions, "Return the total number of actions search was initialized with")
       .def("set_structured_predict_hook", &set_structured_predict_hook, "Set the hook (function pointer) that search should use for structured prediction (you don't want to call this yourself!")
+      .def("is_ldf", &Search::search::is_ldf, "check whether this search task is running in LDF mode")
 
       .def("po_exists", &po_exists, "For program (cmd line) options, check to see if a given option was specified; eg sch.po_exists(\"search\") should be True")
       .def("po_get", &po_get, "For program (cmd line) options, if an option was specified, get its value; eg sch.po_get(\"search\") should return the # of actions (returns either int or string)")
diff --git a/python/pyvw.py b/python/pyvw.py
index f16c8472..5b68b2e3 100644
--- a/python/pyvw.py
+++ b/python/pyvw.py
@@ -16,24 +16,31 @@ class SearchTask():
     def _run(self, your_own_input_example):
         pass
 
-    def _call_vw(self, fn, isTest):
+    def _call_vw(self, my_example, isTest): # run_fn, setup_fn, takedown_fn, isTest):
+        self._output = None
         self.bogus_example.set_test_only(isTest)
-        self.sch.set_structured_predict_hook(fn)
+        def run(): self._output = self._run(my_example)
+        setup = None
+        takedown = None
+        if callable(getattr(self, "_setup", None)): setup = lambda: self._setup(my_example)
+        if callable(getattr(self, "_takedown", None)): takedown = lambda: self._takedown(my_example)
+        self.sch.set_structured_predict_hook(run, setup, takedown)
         self.vw.learn(self.bogus_example)
         self.vw.learn(self.blank_line) # this will cause our ._run hook to get called
         
     def learn(self, data_iterator):
-        for my_example in data_iterator():
-            self._call_vw(lambda: self._run(my_example), isTest=False)
+        for my_example in data_iterator.__iter__():
+            self._call_vw(my_example, isTest=False);
 
+    def example(self, initStringOrDict=None, labelType=pylibvw.vw.lDefault):
+        """TODO"""
+        if self.sch.predict_needs_example():
+            return self.vw.example(initStringOrDict, labelType)
+        else:
+            return self.vw.example(None, labelType)
+            
     def predict(self, my_example):
-        self._output = None
-        def f(): self._output = self._run(my_example)
-        self._call_vw(f, isTest=True)
-        #if self._output is None:
-        #    raise Exception('structured predict hook failed to return anything')
-        # don't raise this exception because your _run code legitimately
-        # _could_ return None!
+        self._call_vw(my_example, isTest=True);
         return self._output
 
 class vw(pylibvw.vw):
@@ -52,14 +59,16 @@ class vw(pylibvw.vw):
         the weight for that position in the (learned) weight vector."""
         return pylibvw.vw.get_weight(self, index, offset)
 
-    def learn(self, example):
-        """Perform an online update; example can either be an example
+    def learn(self, ec):
+        """Perform an online update; ec can either be an example
         object or a string (in which case it is parsed and then
         learned on)."""
-        if isinstance(example, str):
-            self.learn_string(example)
+        if isinstance(ec, str):
+            self.learn_string(ec)
         else:
-            pylibvw.vw.learn(self, example)
+            if hasattr(ec, 'setup_done') and not ec.setup_done:
+                ec.setup_example()
+            pylibvw.vw.learn(self, ec)
 
     def finish(self):
         """stop VW by calling finish (and, eg, write weights to disk)"""
@@ -81,8 +90,13 @@ class vw(pylibvw.vw):
             """The basic (via-reduction) prediction mechanism. Several
             variants are supported through this overloaded function:
             
-              'examples' can be a single example (interpreted as non-LDF
-                 mode) or a list of examples (interpreted as LDF mode)
+              'examples' can be a single example (interpreted as
+                 non-LDF mode) or a list of examples (interpreted as
+                 LDF mode).  it can also be a lambda function that
+                 returns a single example or list of examples, and in
+                 that list, each element can also be a lambda function
+                 that returns an example. this is done for lazy
+                 example construction (aka speed).
 
               'my_tag' should be an integer id, specifying this prediction
                  
@@ -105,18 +119,42 @@ class vw(pylibvw.vw):
               'learner_id' specifies the underlying learner id
 
             Returns a single prediction.
+
             """
-            if (isinstance(examples, list) and all([isinstance(ex, example) or isinstance(ex, pylibvw.example) for ex in examples])) or \
-               isinstance(examples, example) or isinstance(examples, pylibvw.example):
-                P = sch.get_predictor(my_tag)
-                if isinstance(examples, list): # LDF
-                    P.set_input_length(len(examples))
+
+            P = sch.get_predictor(my_tag)
+            if sch.is_ldf():
+                # we need to know how many actions there are, even if we don't know their identities
+                while hasattr(examples, '__call__'): examples = examples()
+                if not isinstance(examples, list): raise TypeError('expected example _list_ in LDF mode for SearchTask.predict()')
+                P.set_input_length(len(examples))
+                if sch.predict_needs_example():
                     for n in range(len(examples)):
-                        P.set_input_at(n, examples[n])
-                else: # non-LDF
+                        ec = examples[n]
+                        while hasattr(ec, '__call__'): ec = ec()   # unfold the lambdas
+                        if not isinstance(ec, example) and not isinstance(ec, pylibvw.example): raise TypeError('non-example in LDF example list in SearchTask.predict()')
+                        P.set_input_at(n, ec)
+                else:
+                    pass # TODO: do we need to set the examples even though they're not used?
+            else:
+                if sch.predict_needs_example():
+                    while hasattr(examples, '__call__'): examples = examples()
                     P.set_input(examples)
-                
-                if isinstance(oracle, list): P.set_oracles(oracle)
+                else:
+                    pass # TODO: do we need to set the examples even though they're not used?
+            
+            # if (isinstance(examples, list) and all([isinstance(ex, example) or isinstance(ex, pylibvw.example) for ex in examples])) or \
+            #    isinstance(examples, example) or isinstance(examples, pylibvw.example):
+            #     if isinstance(examples, list): # LDF
+            #         P.set_input_length(len(examples))
+            #         for n in range(len(examples)):
+            #             P.set_input_at(n, examples[n])
+            #     else: # non-LDF
+            #         P.set_input(examples)
+            if True:   # TODO: get rid of this
+                if oracle is None: pass
+                elif isinstance(oracle, list):
+                    if len(oracle) > 0: P.set_oracles(oracle)
                 elif isinstance(oracle, int): P.set_oracle(oracle)
                 else: raise TypeError('expecting oracle to be a list or an integer')
 
@@ -338,7 +376,10 @@ class example(pylibvw.example):
         get an "empty" example which you can construct by hand (see, eg,
         example.push_features). If initString is a string, then this
         string is parsed as it would be from a VW data file into an
-        example (and "setup_example" is run)."""
+        example (and "setup_example" is run). if it is a dict, then we add all features in that dictionary. finally, if it's a function, we (repeatedly) execute it fn() until it's not a function any more (for lazy feature computation)."""
+
+        while hasattr(initStringOrDict, '__call__'):
+            initStringOrDict = initStringOrDict()
 
         if initStringOrDict is None:
             pylibvw.example.__init__(self, vw, labelType)
@@ -502,8 +543,8 @@ class example(pylibvw.example):
         Fails if setup has run."""
         ns = self.get_ns(ns)
         self.ensure_namespace_exists(ns)
-        ns_hash = self.vw.hash_space(ns.ns)
-        
+        #self.push_feature_list(self.vw, ns.ord_ns, featureList)
+        ns_hash = self.vw.hash_space( ns.ns )
         for feature in featureList:
             if isinstance(feature, int) or isinstance(feature, str):
                 f = feature
@@ -516,6 +557,7 @@ class example(pylibvw.example):
 
             self.push_feature(ns, f, v, ns_hash)
 
+
     def finish(self):
         """Tell VW that you're done with this example and it can
         recycle it for later use."""
diff --git a/python/test_search.py b/python/test_search.py
index 89134460..aef62d73 100644
--- a/python/test_search.py
+++ b/python/test_search.py
@@ -56,8 +56,10 @@ sequenceLabeler = vw.init_search_task(SequenceLabeler)
 
 # train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above
 print >>sys.stderr, 'training!'
-for curPass in range(10):
-    sequenceLabeler.learn(my_dataset.__iter__)
+i = 0
+while i < 10000000000:
+    sequenceLabeler.learn(my_dataset)
+    i += 1
 
 # now see the predictions on a test sentence
 print >>sys.stderr, 'predicting!'
diff --git a/python/test_search_ldf.py b/python/test_search_ldf.py
new file mode 100644
index 00000000..b72bb1f7
--- /dev/null
+++ b/python/test_search_ldf.py
@@ -0,0 +1,65 @@
+import sys
+import pyvw
+
+# wow! your data can be ANY type you want... does NOT have to be VW examples
+DET  = 1
+NOUN = 2
+VERB = 3
+ADJ  = 4
+my_dataset = [ [(DET , 'the'),
+                (NOUN, 'monster'),
+                (VERB, 'ate'),
+                (DET , 'a'),
+                (ADJ , 'big'),
+                (NOUN, 'sandwich')],
+               [(DET , 'the'),
+                (NOUN, 'sandwich'),
+                (VERB, 'was'),
+                (ADJ , 'tasty')],
+               [(NOUN, 'it'),
+                (VERB, 'ate'),
+                (NOUN, 'it'),
+                (ADJ , 'all')] ]
+
+
+class SequenceLabeler(pyvw.SearchTask):
+    def __init__(self, vw, sch, num_actions):
+        # you must must must initialize the parent class
+        # this will automatically store self.sch <- sch, self.vw <- vw
+        pyvw.SearchTask.__init__(self, vw, sch, num_actions)
+        
+        # set whatever options you want
+        sch.set_options( sch.AUTO_HAMMING_LOSS | sch.AUTO_CONDITION_FEATURES | sch.IS_LDF )
+
+    def makeExample(self, word, p):
+        ex = self.example({'w': [word + '_' + str(p)]}, labelType=self.vw.lCostSensitive)
+        ex.set_label_string(str(p) + ':0')
+        return ex
+        
+    def _run(self, sentence):   # it's called _run to remind you that you shouldn't call it directly!
+        output = []
+        for n in range(len(sentence)):
+            pos,word = sentence[n]
+            # use "with...as..." to guarantee that the example is finished properly
+            ex = [ self.makeExample(word,p) for p in [DET,NOUN,VERB,ADJ] ]
+            pred = self.sch.predict(examples=ex, my_tag=n+1, oracle=pos-1, condition=(n,'p'))
+            output.append(pred + 1)
+        return output
+
+# initialize VW as usual, but use 'hook' as the search_task
+vw = pyvw.vw("--search 0 --csoaa_ldf m --quiet --search_task hook --ring_size 1024")
+
+# tell VW to construct your search task object
+sequenceLabeler = vw.init_search_task(SequenceLabeler)
+
+# train it on the above dataset ten times; the my_dataset.__iter__ feeds into _run above
+print >>sys.stderr, 'training!'
+i = 0
+while i < 100000000:
+    sequenceLabeler.learn(my_dataset)
+    i += 1
+
+# now see the predictions on a test sentence
+print >>sys.stderr, 'predicting!'
+print sequenceLabeler.predict( [(1,w) for w in "the sandwich ate a monster".split()] )
+print 'should have printed: [1, 2, 3, 1, 2]'
author	John Langford <jl@hunch.net>	2014-11-10 20:33:14 +0300
committer	John Langford <jl@hunch.net>	2014-11-10 20:33:14 +0300
commit	55d3b14ac014d17e67dda6ac7e145d2aef619313 (patch)
tree	4d3146c63ed97e92c9cddc3fecc86ddcf43d0b36 /python
parent	b53ba7c762506c95dc46853515dba0b56e1f5a41 (diff)
parent	cef3e42b9d622c28b11ea8bc31951f139b35740f (diff)