github.com/moses-smt/vowpal_wabbit.git
author    Hal Daume III <me@hal3.name>  2012-06-01 00:44:23 +0400
committer Hal Daume III <me@hal3.name>  2012-06-01 00:44:23 +0400
commit    a7151261249ec358e574d23fed13064a7281fd99 (patch)
tree      99a85a6d74d162d3c7c0e41bacdaf774d28c4bcf
parent    da64b998b59e7e8e45d049d79dc70da4618363fe (diff)
more tests for searn, more refactoring
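
This commit extends the searn (search-based structured prediction) test suite and refactors its command-line interface: --searn now takes the maximum action id directly (replacing --searn_max_action), the task is selected with --searn_task, and task-specific flags move behind a searn_sequencetask_ prefix that the task parses for itself. New NER/chunking test assets and updated reference outputs are added along the way.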
-rw-r--r--  Makefile                                     |   2
-rwxr-xr-x  test/RunTests                                |   6
-rw-r--r--  test/test-sets/ref/0001.stderr               |   7
-rw-r--r--  test/test-sets/ref/0002b.stderr              |   6
-rw-r--r--  test/test-sets/ref/0002c.stderr              |   6
-rwxr-xr-x  test/train-sets/chunk_to_features.pl         |  78
-rw-r--r--  test/train-sets/cs_test.pred                 | 300
-rwxr-xr-x  test/train-sets/eval_chunking.pl             | 110
-rw-r--r--  test/train-sets/ner.cdict                    |   9
-rw-r--r--  test/train-sets/ner.pred                     |   0
-rw-r--r--  test/train-sets/ner.test.gz                  | bin 0 -> 4817758 bytes
-rw-r--r--  test/train-sets/ner.tm                       |  11
-rw-r--r--  test/train-sets/ner.train.gz                 | bin 0 -> 26912415 bytes
-rw-r--r--  test/train-sets/ref/0001.stderr              |   6
-rw-r--r--  test/train-sets/ref/0002.stderr              |   6
-rw-r--r--  test/train-sets/ref/0002a.stderr             |   7
-rw-r--r--  test/train-sets/ref/0002c.stderr             |   7
-rw-r--r--  test/train-sets/ref/cs_test.ldf.csoaa.stderr |   6
-rw-r--r--  test/train-sets/ref/cs_test.ldf.wap.stderr   |  19
-rw-r--r--  test/train-sets/ref/rcv1_small.stderr        |  24
-rw-r--r--  test/train-sets/ref/searn_small.stderr       |  23
-rw-r--r--  test/train-sets/ref/searn_small.stdout       |   0
-rw-r--r--  test/train-sets/ref/searn_wsj.stderr         |  29
-rw-r--r--  test/train-sets/ref/searn_wsj.stdout         |   0
-rw-r--r--  test/train-sets/ref/searn_wsj2.dat.stderr    |  26
-rw-r--r--  test/train-sets/ref/searn_wsj2.dat.stdout    |   0
-rw-r--r--  test/train-sets/ref/seq_small.stderr         |  20
-rw-r--r--  test/train-sets/ref/wiki1K.stderr            |  25
-rw-r--r--  test/train-sets/ref/wsj_small-tm.dat.stderr  |  33
-rw-r--r--  test/train-sets/ref/wsj_small.dat.stderr     |  34
-rw-r--r--  test/train-sets/ref/zero.stderr              |  10
-rw-r--r--  vowpalwabbit/searn.cc                        |   6
-rw-r--r--  vowpalwabbit/searn.h                         |   2
-rw-r--r--  vowpalwabbit/searn_sequencetask.cc           |  20
-rw-r--r--  vowpalwabbit/searn_sequencetask.h            |   2
35 files changed, 723 insertions(+), 117 deletions(-)
diff --git a/Makefile b/Makefile
index 64d4ca07..0a085a4c 100644
--- a/Makefile
+++ b/Makefile
@@ -30,7 +30,7 @@ FLAGS = $(ARCH) $(WARN_FLAGS) $(OPTIM_FLAGS) -D_FILE_OFFSET_BITS=64 -I $(BOOST_I
#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -pg -g
# for valgrind
-FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O0
+#FLAGS = -Wall $(ARCH) -ffast-math -D_FILE_OFFSET_BITS=64 -I $(BOOST_INCLUDE) -g -O0
BINARIES = vw active_interactor
MANPAGES = vw.1
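
The Makefile hunk simply re-comments the valgrind/debug FLAGS override (-g -O0), restoring the optimized flags defined above it as the default build.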
diff --git a/test/RunTests b/test/RunTests
index 048503df..720d56d5 100755
--- a/test/RunTests
+++ b/test/RunTests
@@ -459,17 +459,17 @@ __DATA__
train-sets/ref/wsj_small-tm.dat.stderr
# Test 14: Run searn on seq_small for 12 passes, 4 passes per policy
-{VW} -c -d train-sets/seq_small --passes 12 --searn_passes_per_policy 4 --searn sequence --searn_max_action 4 && rm -f train-sets/seq_small.cache
+{VW} -c -d train-sets/seq_small --passes 12 --searn_passes_per_policy 4 --searn 4 --searn_task sequence && rm -f train-sets/seq_small.cache
train-sets/ref/searn_small.stdout
train-sets/ref/searn_small.stderr
# Test 15: Run searn on wsj_small for 12 passes, 4 passes per policy, extra features
-{VW} -c -d train-sets/wsj_small.dat.gz --passes 12 --searn_passes_per_policy 4 --searn sequence --searn_max_action 45 --searn_sequencetask_history 2 --searn_sequencetask_bigrams --searn_sequencetask_features 1 && rm -f train-sets/wsj_small.dat.gz.cache
+{VW} -c -d train-sets/wsj_small.dat.gz --passes 12 --searn_passes_per_policy 4 --searn_task sequence --searn 45 --searn_sequencetask_history 2 --searn_sequencetask_bigrams --searn_sequencetask_features 1 && rm -f train-sets/wsj_small.dat.gz.cache
train-sets/ref/searn_wsj.stdout
train-sets/ref/searn_wsj.stderr
# Test 16: Run searn (wap) on wsj_small for 2 passes, 1 pass per policy, extra features
-{VW} -c -d train-sets/wsj_small.dat.gz --passes 2 --searn_passes_per_policy 1 --searn sequence --searn_max_action 45 --wap 45 --searn_history 2 --searn_bigrams --searn_features 1 && rm -f train-sets/wsj_small.dat.gz.cache
+{VW} -c -d train-sets/wsj_small.dat.gz --passes 2 --searn_passes_per_policy 1 --searn_task sequence --searn 45 --wap 45 --searn_sequencetask_history 2 --searn_sequencetask_bigrams --searn_sequencetask_features 1 && rm -f train-sets/wsj_small.dat.gz.cache
train-sets/ref/searn_wsj2.dat.stdout
train-sets/ref/searn_wsj2.dat.stderr
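
In the updated tests 14-16 the old invocation style

  --searn sequence --searn_max_action 4

becomes

  --searn 4 --searn_task sequence

and the sequence-task feature flags (--searn_history, --searn_bigrams, --searn_features in test 16) pick up the searn_sequencetask_ prefix, matching the options registered in searn_sequencetask.cc further down.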
diff --git a/test/test-sets/ref/0001.stderr b/test/test-sets/ref/0001.stderr
index 8a62ef04..149aeac8 100644
--- a/test/test-sets/ref/0001.stderr
+++ b/test/test-sets/ref/0001.stderr
@@ -1,12 +1,13 @@
-using no cache
-Reading from train-sets/0001.dat
-num sources = 1
Num weight bits = 17
learning rate = 10
initial_t = 1
power_t = 0.5
predictions = 001.predict.tmp
only testing
+warning: final argument 'train-sets/0001.dat' assumed to be input file; in the future, please use -d
+using no cache
+Reading from train-sets/0001.dat
+num sources = 1
average since example example current current current
loss last counter weight label predict features
0.000000 0.000000 3 3.0 0.0000 0.0000 326
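
The reference stderr updates in this commit share two causes: VW now prints its parameter block (weight bits, learning rate, etc.) before opening the input source, so the "using no cache" / "Reading from ..." / "num sources" lines move below it, and passing the data file as a trailing positional argument now emits a warning recommending -d.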
diff --git a/test/test-sets/ref/0002b.stderr b/test/test-sets/ref/0002b.stderr
index 4765f0a7..3ae7448e 100644
--- a/test/test-sets/ref/0002b.stderr
+++ b/test/test-sets/ref/0002b.stderr
@@ -1,12 +1,12 @@
-using no cache
-Reading from train-sets/0002.dat
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
predictions = 0002b.predict
only testing
+using no cache
+Reading from train-sets/0002.dat
+num sources = 1
average since example example current current current
loss last counter weight label predict features
0.005280 0.005280 3 3.0 0.5498 0.4980 15
diff --git a/test/test-sets/ref/0002c.stderr b/test/test-sets/ref/0002c.stderr
index ad7a2197..92233c36 100644
--- a/test/test-sets/ref/0002c.stderr
+++ b/test/test-sets/ref/0002c.stderr
@@ -1,12 +1,12 @@
-using no cache
-Reading from train-sets/0002.dat
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
predictions = 0002c.predict
only testing
+using no cache
+Reading from train-sets/0002.dat
+num sources = 1
average since example example current current current
loss last counter weight label predict features
0.002276 0.002276 3 3.0 0.5498 0.5361 184
diff --git a/test/train-sets/chunk_to_features.pl b/test/train-sets/chunk_to_features.pl
new file mode 100755
index 00000000..24ba8039
--- /dev/null
+++ b/test/train-sets/chunk_to_features.pl
@@ -0,0 +1,78 @@
+#!/usr/bin/perl -w
+use strict;
+
+my %cdict = (); my $cdictNum = 1;
+while (1) {
+ my $cdictFile = shift or last;
+ open F, $cdictFile or die;
+ while (<F>) {
+ chomp;
+ my ($c, $num) = split;
+ $cdict{$c} = $num;
+ if ($num+1 > $cdictNum) { $cdictNum = $num + 1; }
+ }
+ close F or die;
+}
+
+my @w = (); my @t = (); my @c = ();
+while (<>) {
+ chomp;
+ if (/^[\s]*$/) { dumpit(); print "\n"; @w = (); @t = (); @c = (); next; }
+
+ my ($w,$t,$c) = split;
+ #if ($c =~ /-NP/) { push @c, "1"; } else { push @c, "-1"; }
+ if (not exists $cdict{$c}) {
+ $cdict{$c} = $cdictNum;
+ $cdictNum++;
+ print STDERR "$c\t$cdict{$c}\n";
+ }
+
+ push @c, $cdict{$c};
+ push @t, $t;
+ push @w, $w;
+}
+
+sub dumpit {
+ for (my $n=0; $n<@c; $n++) {
+ my %f = ();
+ for (my $m=-2; $m<=+2; $m++) {
+ computef(\%f, '_'.$m, $n+$m);
+ }
+ print $c[$n] . ' |';
+ foreach my $f (keys %f) {
+ $f =~ s/:/-COL-/g;
+ $f =~ s/\|/-PIP-/g;
+ print ' ' . $f;
+ }
+ print "\n";
+ }
+}
+
+sub computef {
+ my ($f, $s0, $i) = @_;
+
+ if ($i < 0) { $f->{"w".$s0."=<s>" } = 1; return; }
+ if ($i >= @c) { $f->{"w".$s0."=</s>"} = 1; return; }
+
+ my $w = $w[$i]; my $p = $t[$i]; my $l = lc($w[$i]);
+
+ $f->{"w".$s0."=".$w} = 1;
+# $f->"p:=".$p} = 1;
+ $f->{"l".$s0."=".$l} = 1;
+
+ my $c = $w;
+ $c =~ s/[A-Z]+/A/g;
+ $c =~ s/[a-z]+/a/g;
+ $c =~ s/[0-9]+/0/g;
+ $c =~ s/[^\.Aa0]+/\#/g;
+ $f->{"c".$s0."=".$c} = 1;
+ $f->{"c".$s0."=".$c."_fw=".(($i==0) ? "y" : "n")} = 1;
+
+ my $N = length($l);
+ $f->{"pre1".$s0."=".substr($l,0,1)} = 1;
+ $f->{"pre2".$s0."=".substr($l,0,2)} = 1;
+ $f->{"pre3".$s0."=".substr($l,0,3)} = 1;
+ $f->{"suf1".$s0."=".substr($l,$N-1,1)} = 1;
+ $f->{"suf2".$s0."=".substr($l,$N-2,2)} = 1;
+ $f->{"suf3".$s0."=".substr($l,$N-3,3)} = 1;
+}
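
chunk_to_features.pl converts whitespace-separated (word, tag, chunk) triples into VW multiclass examples: each token becomes one line whose label is a numeric chunk id (accumulated in %cdict) and whose features, computed over a +/-2 token window, include the word, its lowercase form, a character-shape pattern (A/a/0/#), a first-word indicator, and 1-3 character prefixes and suffixes; ':' and '|' are escaped since both are special in VW's input format.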
diff --git a/test/train-sets/cs_test.pred b/test/train-sets/cs_test.pred
new file mode 100644
index 00000000..df8df931
--- /dev/null
+++ b/test/train-sets/cs_test.pred
@@ -0,0 +1,300 @@
+1.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+1.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
+2.000000
diff --git a/test/train-sets/eval_chunking.pl b/test/train-sets/eval_chunking.pl
new file mode 100755
index 00000000..78f7b92c
--- /dev/null
+++ b/test/train-sets/eval_chunking.pl
@@ -0,0 +1,110 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $rdictFile = shift or die;
+my $truthFile = shift or die;
+
+my %rdict = (); my $rdictNum = 1;
+open F, $rdictFile or die;
+while (<F>) {
+ chomp;
+ my ($c, $num) = split;
+ if ($c =~ /^[BI]-O$/) { $c = 'O'; }
+ $rdict{$num} = $c;
+ if ($num+1 > $rdictNum) { $rdictNum = $num + 1; }
+}
+close F or die;
+
+my $np = 0;
+my $nt = 0;
+my $ni = 0;
+my $nil = 0;
+
+my $nc = 0;
+my $ncl = 0;
+my $na = 0;
+
+my @truth = ();
+if ($truthFile =~ /.gz$/ ) { open T, "zcat $truthFile |" or die; }
+elsif ($truthFile =~ /.bz2$/) { open T, "bzcat $truthFile |" or die; }
+else { open T, $truthFile or die; }
+while (<T>) {
+ chomp;
+ if (/^[\s]*$/) { runit(); @truth = (); next; }
+ my ($c) = split;
+ if (not defined $rdict{$c}) { die $c; }
+ push @truth, $rdict{$c};
+}
+close T;
+
+my $p = $ni / (($np > 0) ? $np : 1);
+my $r = $ni / (($nt > 0) ? $nt : 1);
+my $f = 2 * $p * $r / ($p + $r);
+my $a = $nc / (($na > 0) ? $na : 1);
+my $pl = $nil / (($np > 0) ? $np : 1);
+my $rl = $nil / (($nt > 0) ? $nt : 1);
+my $fl = 2 * $pl * $rl / ($pl + $rl);
+my $al = $ncl / (($na > 0) ? $na : 1);
+
+$p = int($p * 1000)/10; $r = int($r * 1000)/10; $f = int($f * 1000)/10; $a = int ($a * 1000)/10;
+$pl = int($pl * 1000)/10; $rl = int($rl * 1000)/10; $fl = int($fl * 1000)/10; $al = int ($al * 1000)/10;
+
+print "unlabeled: p=$p\tr=$r\tf=$f\tacc=$a\n";
+print " labeled: p=$pl\tr=$rl\tf=$fl\tacc=$al\n";
+
+
+sub runit {
+ my $N = scalar @truth;
+ my @pred = ();
+ for (my $n=0; $n<$N; $n++) {
+ $_ = <>;
+ chomp;
+ $_ = int($_);
+ if (not defined $rdict{$_}) { die $_; }
+ push @pred, $rdict{$_};
+ }
+ $_ = <>; chomp;
+ if (not /^\s*$/) { die; }
+
+ $na += $N;
+ for (my $n=0; $n<$N; $n++) {
+ if ($pred[$n] eq $truth[$n]) { $ncl++; }
+ if (substr($pred[$n],0,1) eq substr($truth[$n],0,1)) { $nc++; }
+ }
+
+ my %c1 = chunksof(@truth);
+ my %c2 = chunksof(@pred);
+
+ $np += scalar keys %c1;
+ $nt += scalar keys %c2;
+ foreach my $c (keys %c1) {
+ if (exists $c2{$c}) {
+ $ni++;
+ if ($c2{$c} eq $c1{$c}) {
+ $nil++;
+ }
+ }
+ }
+}
+
+sub chunksof {
+ my @l = @_;
+ my $i = 0;
+ my %c = ();
+ while ($i < @l) {
+ if ($l[$i] =~ /^B-(.+)$/) {
+ my $lab = $1;
+ if ($lab eq 'O') { $i++; next; }
+ my $j = $i+1;
+ while ($j < @l) {
+ if ($l[$j] eq "I-$lab") { $j++; }
+ else { last; }
+ }
+ $c{"$i $j"} = $lab;
+ $i = $j;
+ } else {
+ $i++;
+ }
+ }
+ return (%c);
+}
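
eval_chunking.pl reads a label dictionary and a gold file, then consumes predictions from stdin one sentence at a time, reporting token accuracy plus chunk-level precision, recall, and F-measure (f = 2pr/(p+r)) in both unlabeled (span-only) and labeled variants; a chunk is a maximal B-X label followed by a run of I-X labels.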
diff --git a/test/train-sets/ner.cdict b/test/train-sets/ner.cdict
new file mode 100644
index 00000000..669cc7bc
--- /dev/null
+++ b/test/train-sets/ner.cdict
@@ -0,0 +1,9 @@
+B-ORG 1
+B-O 2
+B-MISC 3
+B-PER 4
+I-PER 5
+B-LOC 6
+I-ORG 7
+I-MISC 8
+I-LOC 9
diff --git a/test/train-sets/ner.pred b/test/train-sets/ner.pred
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/test/train-sets/ner.pred
diff --git a/test/train-sets/ner.test.gz b/test/train-sets/ner.test.gz
new file mode 100644
index 00000000..499b74e0
--- /dev/null
+++ b/test/train-sets/ner.test.gz
Binary files differ
diff --git a/test/train-sets/ner.tm b/test/train-sets/ner.tm
new file mode 100644
index 00000000..43a9fe14
--- /dev/null
+++ b/test/train-sets/ner.tm
@@ -0,0 +1,11 @@
+9
+1 1 1 1 0 1 0 0 0
+1 1 1 1 0 1 1 0 0
+1 1 1 1 0 1 0 0 0
+1 1 1 1 0 1 0 1 0
+0 1 1 0 1 1 0 0 0
+0 1 0 0 1 0 0 0 0
+1 1 1 1 0 1 0 0 1
+1 1 1 1 0 1 1 0 0
+1 1 1 1 0 1 0 1 0
+1 1 1 0 0 1 0 0 1
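
ner.tm appears to be the transition matrix consumed via --sequence_transition_file (see the wsj_small-tm reference output below): the first line gives the label count (9, matching ner.cdict), and each subsequent row — ten of them, presumably including an initial/start row — marks allowed transitions with 1 and forbidden ones with 0. The exact row/column orientation is not documented in this patch.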
diff --git a/test/train-sets/ner.train.gz b/test/train-sets/ner.train.gz
new file mode 100644
index 00000000..ba01acd5
--- /dev/null
+++ b/test/train-sets/ner.train.gz
Binary files differ
diff --git a/test/train-sets/ref/0001.stderr b/test/train-sets/ref/0001.stderr
index edef747e..5e1d3426 100644
--- a/test/train-sets/ref/0001.stderr
+++ b/test/train-sets/ref/0001.stderr
@@ -1,14 +1,14 @@
You have chosen to generate 3-grams
You have chosen to generate 1-skip-3-grams
final_regressor = models/0001.model
-creating cache_file = train-sets/0001.dat.cache
-Reading from train-sets/0001.dat
-num sources = 1
Num weight bits = 17
learning rate = 2.56e+06
initial_t = 128000
power_t = 1
decay_learning_rate = 1
+creating cache_file = train-sets/0001.dat.cache
+Reading from train-sets/0001.dat
+num sources = 1
average since example example current current current
loss last counter weight label predict features
1.000000 1.000000 3 3.0 0.0000 1.0000 326
diff --git a/test/train-sets/ref/0002.stderr b/test/train-sets/ref/0002.stderr
index 7d14e85c..a3a5c9a9 100644
--- a/test/train-sets/ref/0002.stderr
+++ b/test/train-sets/ref/0002.stderr
@@ -1,11 +1,11 @@
final_regressor = models/0002.model
-using no cache
-Reading from train-sets/0002.dat
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
+using no cache
+Reading from train-sets/0002.dat
+num sources = 1
average since example example current current current
loss last counter weight label predict features
0.110447 0.110447 3 3.0 0.5498 0.3591 15
diff --git a/test/train-sets/ref/0002a.stderr b/test/train-sets/ref/0002a.stderr
index 6cc621c3..55e54a58 100644
--- a/test/train-sets/ref/0002a.stderr
+++ b/test/train-sets/ref/0002a.stderr
@@ -1,12 +1,13 @@
creating quadratic features for pairs: Tf ff
final_regressor = models/0002a.model
-using no cache
-Reading from train-sets/0002.dat
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0
+warning: final argument 'train-sets/0002.dat' assumed to be input file; in the future, please use -d
+using no cache
+Reading from train-sets/0002.dat
+num sources = 1
average since example example current current current
loss last counter weight label predict features
0.146961 0.146961 3 3.0 0.5498 0.2139 197
diff --git a/test/train-sets/ref/0002c.stderr b/test/train-sets/ref/0002c.stderr
index 2d9f312b..b7fe4901 100644
--- a/test/train-sets/ref/0002c.stderr
+++ b/test/train-sets/ref/0002c.stderr
@@ -1,12 +1,13 @@
creating quadratic features for pairs: ff
final_regressor = models/0002c.model
-using no cache
-Reading from train-sets/0002.dat
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0
+warning: final argument 'train-sets/0002.dat' assumed to be input file; in the future, please use -d
+using no cache
+Reading from train-sets/0002.dat
+num sources = 1
average since example example current current current
loss last counter weight label predict features
0.134680 0.134680 3 3.0 0.5498 0.2361 184
diff --git a/test/train-sets/ref/cs_test.ldf.csoaa.stderr b/test/train-sets/ref/cs_test.ldf.csoaa.stderr
index f3c6d06a..d6a9c73b 100644
--- a/test/train-sets/ref/cs_test.ldf.csoaa.stderr
+++ b/test/train-sets/ref/cs_test.ldf.csoaa.stderr
@@ -1,6 +1,3 @@
-creating cache_file = train-sets/cs_test.ldf.cache
-Reading from train-sets/cs_test.ldf
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
@@ -8,6 +5,9 @@ power_t = 0.5
decay_learning_rate = 1
predictions = cs_test.ldf.csoaa.predict
warning: turning off constant for label dependent features; use --noconstant
+creating cache_file = train-sets/cs_test.ldf.cache
+Reading from train-sets/cs_test.ldf
+num sources = 1
average since example example current current current
loss last counter weight label predict features
0.000000 0.000000 3 3.0 known 0 3
diff --git a/test/train-sets/ref/cs_test.ldf.wap.stderr b/test/train-sets/ref/cs_test.ldf.wap.stderr
index 1a06a442..5405ca82 100644
--- a/test/train-sets/ref/cs_test.ldf.wap.stderr
+++ b/test/train-sets/ref/cs_test.ldf.wap.stderr
@@ -1,20 +1,19 @@
-creating cache_file = train-sets/cs_test.ldf.cache
-Reading from train-sets/cs_test.ldf
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
decay_learning_rate = 1
predictions = cs_test.ldf.wap.predict
-warning: turning off constant for label dependent features; use --noconstant
+creating cache_file = train-sets/cs_test.ldf.cache
+Reading from train-sets/cs_test.ldf
+num sources = 1
average since example example current current current
loss last counter weight label predict features
-0.000000 0.000000 3 3.0 known 0 3
-0.000000 0.000000 6 6.0 known 0 3
-0.000000 0.000000 11 11.0 known 0 3
-0.000000 0.000000 22 22.0 known 0 3
-0.000000 0.000000 44 44.0 known 1 3
+0.000000 0.000000 3 3.0 known 0 4
+0.000000 0.000000 6 6.0 known 0 4
+0.000000 0.000000 11 11.0 known 0 4
+0.000000 0.000000 22 22.0 known 0 4
+0.000000 0.000000 44 44.0 known 1 4
finished run
number of examples = 70
@@ -22,4 +21,4 @@ weighted example sum = 70
weighted label sum = 0
average loss = 0
best constant = -0.01449
-total feature number = 210
+total feature number = 280
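
In the wap reference output the per-example feature count rises from 3 to 4 and the total from 210 to 280 (70 examples x 4), consistent with the constant feature no longer being stripped for label-dependent features in the wap reduction; the --noconstant warning line disappears accordingly.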
diff --git a/test/train-sets/ref/rcv1_small.stderr b/test/train-sets/ref/rcv1_small.stderr
index fe58cb4a..8f5e861c 100644
--- a/test/train-sets/ref/rcv1_small.stderr
+++ b/test/train-sets/ref/rcv1_small.stderr
@@ -1,7 +1,4 @@
enabling BFGS based optimization **without** curvature calculation
-creating cache_file = train-sets/rcv1_small.dat.cache
-Reading from train-sets/rcv1_small.dat
-num sources = 1
Num weight bits = 20
learning rate = 10
initial_t = 1
@@ -11,15 +8,18 @@ using l2 regularization
m = 7
Allocated 72M for weights and mem
## avg. loss der. mag. d. m. cond. wolfe1 wolfe2 mix fraction curvature dir. magnitude step size time
- 1 6.931472e-01 1.859805e-03 3.768599e+00 3.414409e+01 1.977478e+04 1.103734e-01 2.119
- 3 4.624629e-01 1.085868e-02 1.936935e+00 0.554592 0.194583 2.208660e+02 1.000000e+00 2.656
- 4 3.384483e-01 4.246157e-04 1.726574e-01 0.520733 0.133178 3.415247e+01 1.000000e+00 3.323
- 5 3.184994e-01 7.930477e-05 6.343691e-02 0.751656 0.517330 6.775453e+01 1.000000e+00 4.025
- 6 3.024582e-01 2.887344e-06 1.460970e-02 0.657158 0.328843 3.618752e+01 1.000000e+00 5.114
- 7 2.967627e-01 4.279802e-06 3.517537e-03 0.645592 0.293725 1.585137e+01 1.000000e+00 6.356
- 8 2.952703e-01 2.241755e-06 1.631614e-03 0.527831 0.052654 5.802390e+00 1.000000e+00 7.734
- 9 2.950591e-01 7.095166e-07 1.546886e-03 0.199872 -0.608863 7.389087e-01 1.000000e+00 9.263
-10 2.948347e-01 2.803435e-07 1.223611e-04 0.585419 0.171988 1.453111e-01 1.000000e+00 12.281
+creating cache_file = train-sets/rcv1_small.dat.cache
+Reading from train-sets/rcv1_small.dat
+num sources = 1
+ 1 6.931472e-01 1.859805e-03 3.768599e+00 3.414409e+01 1.977478e+04 1.103734e-01 0.769
+ 3 4.624629e-01 1.085868e-02 1.936935e+00 0.554592 0.194583 2.208660e+02 1.000000e+00 0.934
+ 4 3.384483e-01 4.246157e-04 1.726574e-01 0.520733 0.133178 3.415247e+01 1.000000e+00 1.142
+ 5 3.184994e-01 7.930477e-05 6.343691e-02 0.751656 0.517330 6.775453e+01 1.000000e+00 1.394
+ 6 3.024582e-01 2.887344e-06 1.460970e-02 0.657158 0.328843 3.618752e+01 1.000000e+00 1.673
+ 7 2.967627e-01 4.279802e-06 3.517537e-03 0.645592 0.293725 1.585137e+01 1.000000e+00 1.984
+ 8 2.952703e-01 2.241755e-06 1.631614e-03 0.527831 0.052654 5.802390e+00 1.000000e+00 2.332
+ 9 2.950591e-01 7.095166e-07 1.546886e-03 0.199872 -0.608863 7.389087e-01 1.000000e+00 2.712
+10 2.948347e-01 2.803435e-07 1.223611e-04 0.585419 0.171988 1.453111e-01 1.000000e+00 3.432
finished run
diff --git a/test/train-sets/ref/searn_small.stderr b/test/train-sets/ref/searn_small.stderr
new file mode 100644
index 00000000..0d2a3a1e
--- /dev/null
+++ b/test/train-sets/ref/searn_small.stderr
@@ -0,0 +1,23 @@
+Num weight bits = 18
+learning rate = 10
+initial_t = 1
+power_t = 0.5
+decay_learning_rate = 1
+creating cache_file = train-sets/seq_small.cache
+Reading from train-sets/seq_small
+num sources = 1
+average since example example current current current
+loss last counter weight label predict features
+#pol average since sequence example current label current predicted current cur cur predic. examples
+chng loss last counter weight sequence prefix sequence prefix features pass pol made gener.
+ 0 1.333333 1.333333 3 3.000000 [1 3 2 1 4 3 ] [1 3 2 1 4 3 ] 18 2 0 18 12
+ 1 1.000000 0.666667 6 6.000000 [1 3 2 1 4 3 ] [1 3 2 1 4 3 ] 18 5 1 49 30
+ 1 0.727273 0.400000 11 11.000000 [1 3 2 1 4 3 ] [1 3 2 1 4 3 ] 18 10 2 162 60
+
+finished run
+number of examples = 12
+weighted example sum = 12
+weighted label sum = 0
+average loss = 0.6667
+best constant = -0.09091
+total feature number = 552
diff --git a/test/train-sets/ref/searn_small.stdout b/test/train-sets/ref/searn_small.stdout
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/test/train-sets/ref/searn_small.stdout
diff --git a/test/train-sets/ref/searn_wsj.stderr b/test/train-sets/ref/searn_wsj.stderr
new file mode 100644
index 00000000..3d156c0e
--- /dev/null
+++ b/test/train-sets/ref/searn_wsj.stderr
@@ -0,0 +1,29 @@
+Num weight bits = 18
+learning rate = 10
+initial_t = 1
+power_t = 0.5
+decay_learning_rate = 1
+creating cache_file = train-sets/wsj_small.dat.gz.cache
+Reading from train-sets/wsj_small.dat.gz
+num sources = 1
+average since example example current current current
+loss last counter weight label predict features
+#pol average since sequence example current label current predicted current cur cur predic. examples
+chng loss last counter weight sequence prefix sequence prefix features pass pol made gener.
+ 0 22.000000 22.000000 3 3.000000 [14 10 13 9 1 2 1 4..] [11 11 11 1 2 1 2 1..] 2659 0 0 93 64
+ 0 21.000000 20.000000 6 6.000000 [19 2 22 4 3 9 1 1 ..] [1 2 3 9 1 2 1 1 12..] 3324 0 0 196 160
+ 0 18.363636 15.200000 11 11.000000 [29 4 3 9 1 1 23 8 ..] [1 2 3 9 1 2 1 10 7..] 1424 0 0 328 312
+ 0 15.000000 11.636364 22 22.000000 [11 11 21 3 10 13 3..] [11 11 21 3 1 2 3 1..] 3419 0 0 613 576
+ 0 12.681818 10.363636 44 44.000000 [3 26 9 1 4 3 1 2 5..] [3 1 1 1 2 3 1 2 11..] 1139 0 0 1120 1107
+ 0 9.988506 7.232558 87 87.000000 [11 11 12 9 1 2 11 ..] [11 11 12 9 1 2 11 ..] 2564 1 0 2220 2192
+ 0 6.316092 2.643678 174 174.000000 [11 1 10 13 2 17 30..] [11 1 10 13 2 17 30..] 1044 2 0 4370 4358
+ 1 4.597701 2.879310 348 348.000000 [2 11 2 11 12 3 11 ..] [2 25 1 1 12 3 11 1..] 2279 4 1 49100 8673
+ 1 4.612069 4.626437 696 696.000000 [19 22 4 5 3 1 2 1 ..] [19 22 34 9 12 1 2 ..] 2754 8 2 459337 17212
+
+finished run
+number of examples = 936
+weighted example sum = 936
+weighted label sum = 0
+average loss = 4.706
+best constant = -0.00107
+total feature number = 78376268
diff --git a/test/train-sets/ref/searn_wsj.stdout b/test/train-sets/ref/searn_wsj.stdout
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/test/train-sets/ref/searn_wsj.stdout
diff --git a/test/train-sets/ref/searn_wsj2.dat.stderr b/test/train-sets/ref/searn_wsj2.dat.stderr
new file mode 100644
index 00000000..5cf68832
--- /dev/null
+++ b/test/train-sets/ref/searn_wsj2.dat.stderr
@@ -0,0 +1,26 @@
+Num weight bits = 18
+learning rate = 10
+initial_t = 1
+power_t = 0.5
+decay_learning_rate = 1
+creating cache_file = train-sets/wsj_small.dat.gz.cache
+Reading from train-sets/wsj_small.dat.gz
+num sources = 1
+average since example example current current current
+loss last counter weight label predict features
+#pol average since sequence example current label current predicted current cur cur predic. examples
+chng loss last counter weight sequence prefix sequence prefix features pass pol made gener.
+ 0 21.666667 21.666667 3 3.000000 [14 10 13 9 1 2 1 4..] [11 11 11 15 9 9 1 ..] 2659 0 0 93 64
+ 0 23.666667 25.666667 6 6.000000 [19 2 22 4 3 9 1 1 ..] [19 2 11 11 11 11 1..] 3324 0 0 196 160
+ 0 20.909091 17.600000 11 11.000000 [29 4 3 9 1 1 23 8 ..] [19 2 3 9 1 6 28 29..] 1424 0 0 328 312
+ 0 16.318182 11.727273 22 22.000000 [11 11 21 3 10 13 3..] [11 11 21 3 1 2 3 1..] 3419 0 0 613 576
+ 0 12.727273 9.136364 44 44.000000 [3 26 9 1 4 3 1 2 5..] [3 11 11 1 2 3 1 2 ..] 1139 0 0 1120 1107
+ 1 11.137931 9.511628 87 87.000000 [11 11 12 9 1 2 11 ..] [11 11 12 11 11 11 ..] 2564 1 1 13460 2192
+
+finished run
+number of examples = 156
+weighted example sum = 156
+weighted label sum = 0
+average loss = 8.532
+best constant = -0.006452
+total feature number = 7819789
diff --git a/test/train-sets/ref/searn_wsj2.dat.stdout b/test/train-sets/ref/searn_wsj2.dat.stdout
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/test/train-sets/ref/searn_wsj2.dat.stdout
diff --git a/test/train-sets/ref/seq_small.stderr b/test/train-sets/ref/seq_small.stderr
index 67c07e03..6fd97a8b 100644
--- a/test/train-sets/ref/seq_small.stderr
+++ b/test/train-sets/ref/seq_small.stderr
@@ -1,18 +1,18 @@
-creating cache_file = train-sets/seq_small.cache
-Reading from train-sets/seq_small
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
decay_learning_rate = 1
-average since sequence example current label current predicted current cur cur predic. examples
-loss last counter weight sequence prefix sequence prefix features pass pol made gener.
-0.666667 0.666667 1 6.0 [ 1 3 2 1 4 3 ] [ 1 1 1 1 1 1 ] 12 0 0 6 0
-0.333333 0.000000 2 12.0 [ 1 3 2 1 4 3 ] [ 1 3 2 1 4 3 ] 12 1 0 12 6
-0.222222 0.000000 3 18.0 [ 1 3 2 1 4 3 ] [ 1 3 2 1 4 3 ] 12 2 0 18 12
-0.166667 0.000000 4 24.0 [ 1 3 2 1 4 3 ] [ 1 3 2 1 4 3 ] 12 3 0 24 18
-0.083333 0.000000 8 48.0 [ 1 3 2 1 4 3 ] [ 1 3 2 1 4 3 ] 12 7 1 72 42
+creating cache_file = train-sets/seq_small.cache
+Reading from train-sets/seq_small
+num sources = 1
+average since sequence example current label current predicted current cur cur predic. examples
+loss last counter weight sequence prefix sequence prefix features pass pol made gener.
+0.666667 0.666667 1 6.000000 [ 1 3 2 1 4 3 ] [ 1 1 1 1 1 1 ] 12 0 0 6 0
+0.333333 0.000000 2 12.000000 [ 1 3 2 1 4 3 ] [ 1 3 2 1 4 3 ] 12 1 0 12 6
+0.222222 0.000000 3 18.000000 [ 1 3 2 1 4 3 ] [ 1 3 2 1 4 3 ] 12 2 0 18 12
+0.166667 0.000000 4 24.000000 [ 1 3 2 1 4 3 ] [ 1 3 2 1 4 3 ] 12 3 0 24 18
+0.083333 0.000000 8 48.000000 [ 1 3 2 1 4 3 ] [ 1 3 2 1 4 3 ] 12 7 1 51 42
finished run
number of examples = 12
diff --git a/test/train-sets/ref/wiki1K.stderr b/test/train-sets/ref/wiki1K.stderr
index 9531eb9c..4f28322b 100644
--- a/test/train-sets/ref/wiki1K.stderr
+++ b/test/train-sets/ref/wiki1K.stderr
@@ -1,22 +1,23 @@
your learning rate is too high, setting it to 1
-using no cache
-Reading from train-sets/wiki1K.dat
-num sources = 1
Num weight bits = 13
learning rate = 1
initial_t = 1
power_t = 0.5
+warning: final argument 'train-sets/wiki1K.dat' assumed to be input file; in the future, please use -d
+using no cache
+Reading from train-sets/wiki1K.dat
+num sources = 1
average since example example current current current
loss last counter weight label predict features
-10.276575 10.276575 3 3.0 unknown 0.0000 37
-10.341718 10.406862 6 6.0 unknown 0.0000 13
-10.311285 10.274764 11 11.0 unknown 0.0000 31
-10.452045 10.592805 22 22.0 unknown 0.0000 1
-10.439284 10.426523 44 44.0 unknown 0.0000 165
-10.459843 10.480881 87 87.0 unknown 0.0000 28
-10.093540 9.727236 174 174.0 unknown 0.0000 16
-9.566517 9.039494 348 348.0 unknown 0.0000 1
-9.066497 8.566477 696 696.0 unknown 0.0000 142
+10.276562 10.276562 3 3.0 unknown 0.0000 37
+10.341712 10.406861 6 6.0 unknown 0.0000 13
+10.311279 10.274760 11 11.0 unknown 0.0000 31
+10.452043 10.592806 22 22.0 unknown 0.0000 1
+10.439283 10.426523 44 44.0 unknown 0.0000 165
+10.459842 10.480879 87 87.0 unknown 0.0000 28
+10.093538 9.727235 174 174.0 unknown 0.0000 16
+9.566512 9.039486 348 348.0 unknown 0.0000 1
+9.066552 8.566591 696 696.0 unknown 0.0000 142
finished run
number of examples = 1000
diff --git a/test/train-sets/ref/wsj_small-tm.dat.stderr b/test/train-sets/ref/wsj_small-tm.dat.stderr
index 9da3f9bc..6693ac96 100644
--- a/test/train-sets/ref/wsj_small-tm.dat.stderr
+++ b/test/train-sets/ref/wsj_small-tm.dat.stderr
@@ -1,27 +1,28 @@
-using cache_file = train-sets/wsj_small.dat.gz.cache
-ignoring text input in favor of cache input
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
-average since sequence example current label current predicted current cur cur predic. examples
-loss last counter weight sequence prefix sequence prefix features pass pol made gener.
-0.810811 0.810811 1 37.0 [ 1 2 3 1 4 ] [ 1 1 1 1 1 ] 1654 0 0 37 0
-0.781250 0.740741 2 64.0 [ 11 2 3 11 11 ] [ 11 26 9 11 26 ] 1194 0 0 705 37
-0.731183 0.620690 3 93.0 [ 14 10 13 9 1 ] [ 11 15 16 1 1 ] 1286 0 0 1105 64
-0.720930 0.694444 4 129.0 [ 3 4 6 3 1 ] [ 11 11 2 3 11 ] 1608 0 0 1494 93
-0.706250 0.645161 5 160.0 [ 19 3 10 2 1 ] [ 2 3 1 2 1 ] 1378 0 0 2170 129
-0.678571 0.555556 6 196.0 [ 19 2 22 4 3 ] [ 11 2 11 11 11 ] 1608 0 0 2462 160
-0.676596 0.666667 7 235.0 [ 10 2 3 1 10 ] [ 1 2 11 1 1 ] 1746 0 0 3061 196
-0.614731 0.491525 12 353.0 [ 5 12 11 11 21 ] [ 11 12 9 1 21 ] 1102 0 0 5473 328
-0.482955 0.350427 25 704.0 [ 10 13 22 4 9 ] [ 10 2 1 4 1 ] 1148 0 0 12574 678
-0.398449 0.315126 57 1418.0 [ 19 1 4 6 36 ] [ 19 1 4 6 5 ] 2252 0 0 25497 1368
+cannot have --sequence_transition_file and zero history length, setting history length to 1
+creating cache_file = train-sets/wsj_small.dat.gz.cache
+Reading from train-sets/wsj_small.dat.gz
+num sources = 1
+average since sequence example current label current predicted current cur cur predic. examples
+loss last counter weight sequence prefix sequence prefix features pass pol made gener.
+0.810811 0.810811 1 37.000000 [ 1 2 3 1 4 ] [ 1 1 1 1 1 ] 1654 0 0 37 0
+0.750000 0.666667 2 64.000000 [ 11 2 3 11 11 ] [ 9 9 9 11 9 ] 1194 0 0 837 37
+0.698925 0.586207 3 93.000000 [ 14 10 13 9 1 ] [ 11 15 11 1 9 ] 1286 0 0 1457 64
+0.689922 0.666667 4 129.000000 [ 3 4 6 3 1 ] [ 11 11 2 3 11 ] 1608 0 0 2088 93
+0.675000 0.612903 5 160.000000 [ 19 3 10 2 1 ] [ 2 3 1 2 1 ] 1378 0 0 2892 129
+0.642857 0.500000 6 196.000000 [ 19 2 22 4 3 ] [ 19 2 11 11 11 ] 1608 0 0 3611 160
+0.634043 0.589744 7 235.000000 [ 10 2 3 1 10 ] [ 19 2 3 1 11 ] 1746 0 0 4423 196
+0.575071 0.457627 12 353.000000 [ 5 12 11 11 21 ] [ 11 12 9 1 21 ] 1102 0 0 7489 328
+0.451705 0.327635 25 704.000000 [ 10 13 22 4 9 ] [ 10 13 22 4 3 ] 1148 0 0 15598 678
+0.375882 0.301120 57 1418.000000 [ 19 1 4 6 36 ] [ 19 14 4 6 5 ] 2252 0 0 31169 1368
finished run
number of examples = 78
weighted example sum = 1932
weighted label sum = 0
-average loss = 0.367
+average loss = 0.3437
best constant = -0.0005179
total feature number = 85128
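
This run now rebuilds the cache rather than reusing it, and gains a new diagnostic: a transition file with zero history length is contradictory, so the history length is forced to 1, which shifts the per-sequence predictions and improves the average loss from 0.367 to 0.3437.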
diff --git a/test/train-sets/ref/wsj_small.dat.stderr b/test/train-sets/ref/wsj_small.dat.stderr
index f2c6101f..5d6b2738 100644
--- a/test/train-sets/ref/wsj_small.dat.stderr
+++ b/test/train-sets/ref/wsj_small.dat.stderr
@@ -1,29 +1,29 @@
-creating cache_file = train-sets/wsj_small.dat.gz.cache
-Reading from train-sets/wsj_small.dat.gz
-num sources = 1
Num weight bits = 18
learning rate = 10
initial_t = 1
power_t = 0.5
decay_learning_rate = 1
-average since sequence example current label current predicted current cur cur predic. examples
-loss last counter weight sequence prefix sequence prefix features pass pol made gener.
-0.810811 0.810811 1 37.0 [ 1 2 3 1 4 ] [ 1 1 1 1 1 ] 1654 0 0 37 0
-0.750000 0.666667 2 64.0 [ 11 2 3 11 11 ] [ 1 2 11 12 9 ] 1194 0 0 64 37
-0.698925 0.586207 3 93.0 [ 14 10 13 9 1 ] [ 11 11 11 15 9 ] 1286 0 0 93 64
-0.775194 0.972222 4 129.0 [ 3 4 6 3 1 ] [ 11 11 11 11 11 ] 1608 0 0 129 93
-0.756250 0.677419 5 160.0 [ 19 3 10 2 1 ] [ 14 10 1 2 1 ] 1378 0 0 160 129
-0.724490 0.583333 6 196.0 [ 19 2 22 4 3 ] [ 19 2 11 11 11 ] 1608 0 0 196 160
-0.744681 0.846154 7 235.0 [ 10 2 3 1 10 ] [ 19 2 11 11 11 ] 1746 0 0 235 196
-0.705382 0.627119 12 353.0 [ 5 12 11 11 21 ] [ 11 12 9 1 2 ] 1102 0 0 353 328
-0.575284 0.444444 25 704.0 [ 10 13 22 4 9 ] [ 10 13 3 9 1 ] 1148 0 0 704 678
-0.482370 0.390756 57 1418.0 [ 19 1 4 6 36 ] [ 19 3 4 6 5 ] 2252 0 0 1418 1368
-0.309345 0.130909 110 2793.0 [ 9 1 10 21 2 ] [ 9 1 10 21 2 ] 1792 1 1 21055 2753
+creating cache_file = train-sets/wsj_small.dat.gz.cache
+Reading from train-sets/wsj_small.dat.gz
+num sources = 1
+average since sequence example current label current predicted current cur cur predic. examples
+loss last counter weight sequence prefix sequence prefix features pass pol made gener.
+0.810811 0.810811 1 37.000000 [ 1 2 3 1 4 ] [ 1 1 1 1 1 ] 1654 0 0 37 0
+0.750000 0.666667 2 64.000000 [ 11 2 3 11 11 ] [ 1 2 11 12 9 ] 1194 0 0 64 37
+0.698925 0.586207 3 93.000000 [ 14 10 13 9 1 ] [ 11 11 11 15 9 ] 1286 0 0 93 64
+0.775194 0.972222 4 129.000000 [ 3 4 6 3 1 ] [ 11 11 11 11 11 ] 1608 0 0 129 93
+0.756250 0.677419 5 160.000000 [ 19 3 10 2 1 ] [ 14 10 1 2 1 ] 1378 0 0 160 129
+0.724490 0.583333 6 196.000000 [ 19 2 22 4 3 ] [ 19 2 11 11 11 ] 1608 0 0 196 160
+0.744681 0.846154 7 235.000000 [ 10 2 3 1 10 ] [ 19 2 11 11 11 ] 1746 0 0 235 196
+0.705382 0.627119 12 353.000000 [ 5 12 11 11 21 ] [ 11 12 9 1 2 ] 1102 0 0 353 328
+0.575284 0.444444 25 704.000000 [ 10 13 22 4 9 ] [ 10 13 3 9 1 ] 1148 0 0 704 678
+0.482370 0.390756 57 1418.000000 [ 19 1 4 6 36 ] [ 19 3 4 6 5 ] 2252 0 0 1418 1368
+0.309345 0.130909 110 2793.000000 [ 9 1 10 21 2 ] [ 9 1 10 21 2 ] 1792 1 1 37389 2753
finished run
number of examples = 156
weighted example sum = 3864
weighted label sum = 0
-average loss = 0.2345
+average loss = 0.2347
best constant = -0.0002589
total feature number = 170256
diff --git a/test/train-sets/ref/zero.stderr b/test/train-sets/ref/zero.stderr
index 44922058..1db31510 100644
--- a/test/train-sets/ref/zero.stderr
+++ b/test/train-sets/ref/zero.stderr
@@ -1,7 +1,4 @@
enabling BFGS based optimization **without** curvature calculation
-creating cache_file = train-sets/zero.dat.cache
-Reading from train-sets/zero.dat
-num sources = 1
Num weight bits = 20
learning rate = 10
initial_t = 1
@@ -11,8 +8,11 @@ using l2 regularization
m = 7
Allocated 72M for weights and mem
## avg. loss der. mag. d. m. cond. wolfe1 wolfe2 mix fraction curvature dir. magnitude step size time
- 1 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.258
- 3 0.000000e+00 0.000000e+00 0.000000e+00 nan nan
+creating cache_file = train-sets/zero.dat.cache
+Reading from train-sets/zero.dat
+num sources = 1
+ 1 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.130
+ 3 0.000000e+00 0.000000e+00 0.000000e+00 -nan -nan
finished run
diff --git a/vowpalwabbit/searn.cc b/vowpalwabbit/searn.cc
index 06984d2a..e1012bd4 100644
--- a/vowpalwabbit/searn.cc
+++ b/vowpalwabbit/searn.cc
@@ -426,7 +426,7 @@ namespace Searn
void parse_flags(vw&all, std::vector<std::string>&opts, po::variables_map& vm, void (*base_l)(vw&,example*), void (*base_f)(vw&))
{
- po::options_description desc("Sequence options");
+ po::options_description desc("Searn options");
desc.add_options()
("searn_task", po::value<string>(), "the searn task")
("searn_rollout", po::value<size_t>(), "maximum rollout length")
@@ -480,7 +480,7 @@ namespace Searn
if (vm.count("searn_passes_per_policy")) passes_per_policy = vm["searn_passes_per_policy"].as<size_t>();
if (vm.count("searn_beta")) beta = vm["searn_beta"].as<float>();
if (vm.count("searn_gamma")) gamma = vm["searn_gamma"].as<float>();
- if (vm.count("searn_recombine")) do_recombination = true;
+ if (vm.count("searn_norecombine")) do_recombination = false;
if (vm.count("searn_allow_current_policy")) allow_current_policy = true;
if (beta <= 0 || beta >= 1) {
@@ -494,7 +494,7 @@ namespace Searn
}
if (task.initialize != NULL)
- if (!task.initialize(vm)) {
+ if (!task.initialize(opts, vm)) {
std::cerr << "error: task did not initialize properly" << std::endl;
exit(-1);
}
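
Three changes land in searn.cc: the options block is relabeled "Searn options" (it previously reused the "Sequence options" title); recombination is now controlled by --searn_norecombine, which disables it, instead of --searn_recombine, which enabled it — implying the default flipped to on; and task initialization now receives the raw option vector so each task can register and parse its own flags, as the signature change in searn.h below reflects.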
diff --git a/vowpalwabbit/searn.h b/vowpalwabbit/searn.h
index a6835e6f..e0eb88e0 100644
--- a/vowpalwabbit/searn.h
+++ b/vowpalwabbit/searn.h
@@ -126,7 +126,7 @@ namespace Searn
// your task might need to initialize some memory at startup or
// parse command line arguments: do that in initialize
- bool (*initialize)(po::variables_map& vm);
+ bool (*initialize)(std::vector<std::string>&opts, po::variables_map& vm);
// your task might need to free some memory at the end of running:
// do that in finalize
diff --git a/vowpalwabbit/searn_sequencetask.cc b/vowpalwabbit/searn_sequencetask.cc
index 9dcac60e..87e2c6ee 100644
--- a/vowpalwabbit/searn_sequencetask.cc
+++ b/vowpalwabbit/searn_sequencetask.cc
@@ -34,17 +34,33 @@ namespace SequenceTask {
// done.
};
- bool initialize(po::variables_map& vm)
+ bool initialize(std::vector<std::string>&opts, po::variables_map& vm)
{
SearnUtil::default_info(&hinfo);
+ po::options_description desc("Searn[sequence] options");
+ desc.add_options()
+ ("searn_sequencetask_history", po::value<size_t>(), "length of history to use")
+ ("searn_sequencetask_features", po::value<size_t>(), "length of history to pair with observed features")
+ ("searn_sequencetask_bigrams", "use bigrams from history")
+ ("searn_sequencetask_bigram_features", "use bigrams from history paired with observed features")
+ ("searn_sequencetask_fake_ldf", "pretend like we're an LDF model even though we need not be");
+
+ po::parsed_options parsed = po::command_line_parser(opts).
+ style(po::command_line_style::default_style ^ po::command_line_style::allow_guessing).
+ options(desc).allow_unregistered().run();
+ opts = po::collect_unrecognized(parsed.options, po::include_positional);
+ po::store(parsed, vm);
+ po::notify(vm);
+
+
if (vm.count("searn_sequencetask_bigrams")) hinfo.bigrams = true;
if (vm.count("searn_sequencetask_history")) hinfo.length = vm["searn_sequencetask_history"].as<size_t>();
if (vm.count("searn_sequencetask_bigram_features")) hinfo.bigram_features = true;
if (vm.count("searn_sequencetask_features")) hinfo.features = vm["searn_sequencetask_features"].as<size_t>();
if (vm.count("searn_sequencetask_fake_ldf")) fake_as_ldf = true;
- seq_max_action = vm["searn_max_action"].as<size_t>();
+ seq_max_action = vm["searn"].as<size_t>();
constant_pow_length = 1;
for (size_t i=0; i < hinfo.length; i++)
constant_pow_length *= quadratic_constant;
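
The new initialize() body illustrates a two-stage Boost.Program_options pattern: each module parses only the options it owns, tolerates everything else via allow_unregistered(), and writes the unrecognized remainder back into opts for whichever parser runs next. Below is a minimal self-contained sketch of that pattern, assuming only Boost.Program_options; parse_module_options and the --history flag are illustrative names, not part of the vw codebase.

  // Sketch: a module claims its own options and returns the rest.
  #include <boost/program_options.hpp>
  #include <cstddef>
  #include <iostream>
  #include <string>
  #include <vector>

  namespace po = boost::program_options;

  // Parse only the options this module owns; everything unrecognized is
  // written back into opts for whichever parser runs next.
  void parse_module_options(std::vector<std::string>& opts, po::variables_map& vm)
  {
    po::options_description desc("module options");
    desc.add_options()
      ("history", po::value<std::size_t>(), "length of history to use");

    po::parsed_options parsed = po::command_line_parser(opts)
      .style(po::command_line_style::default_style
             ^ po::command_line_style::allow_guessing)
      .options(desc)
      .allow_unregistered()   // ignore flags owned by other modules
      .run();

    // Hand the leftover tokens back so later modules can claim them.
    opts = po::collect_unrecognized(parsed.options, po::include_positional);
    po::store(parsed, vm);
    po::notify(vm);
  }

  int main(int argc, char* argv[])
  {
    std::vector<std::string> opts(argv + 1, argv + argc);
    po::variables_map vm;
    parse_module_options(opts, vm);
    if (vm.count("history"))
      std::cout << "history = " << vm["history"].as<std::size_t>() << "\n";
    std::cout << opts.size() << " token(s) left for other parsers\n";
    return 0;
  }

Disabling allow_guessing matters in such a setup: it keeps Boost from matching one module's flag as an abbreviation of another's longer flag, which is relevant here where --searn coexists with --searn_task, --searn_rollout, and the searn_sequencetask_* family.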
diff --git a/vowpalwabbit/searn_sequencetask.h b/vowpalwabbit/searn_sequencetask.h
index a8653aac..d489ff66 100644
--- a/vowpalwabbit/searn_sequencetask.h
+++ b/vowpalwabbit/searn_sequencetask.h
@@ -14,7 +14,7 @@ namespace SequenceTask {
void finish(state);
void start_state_multiline(example**, size_t, state*);
void cs_example(vw&, state, example*&, bool);
- bool initialize(po::variables_map& vm);
+ bool initialize(std::vector<std::string>&opts, po::variables_map& vm);
size_t hash(state);
bool equivalent(state, state);
std::string to_string(state, bool, std::vector<action>);