#!/usr/bin/env perl # # Test suite for vw: # # You may add arbitrary (train/test/varying-options) tests # by adding data files and their expected reference STDOUT and STDERR # # See __DATA__ below for how to add more tests # require 5.014; use warnings; use Getopt::Std; use vars qw($opt_d $opt_D $opt_c $opt_e $opt_f $opt_E $opt_o $opt_w $opt_y $opt_t $opt_v $opt_V); my $Epsilon = 1e-4; my $VW; my $LDA; # External utilities we use. See init() for Windows specific actions. my $Diff = 'diff'; my $Cat = 'cat'; $ENV{'PATH'} .= ':test:.:vowpalwabbit:../vowpalwabbit'; # -V prefixes valgrind like this, we should adjust the default # options over time to what looks most useful. my $Valgrind = 'valgrind --quiet --error-exitcode=100 --track-origins=yes'; # -- timeout is part of GNU coreutils, some systems may not have it my $TimeOut = ''; my $TimeOutSec = 20; # max allowed time for single vw command run # By default, we run all tests in the list my $FullRun = 1; my $ErrorCount = 0; # These --side-by-side diff opts are used to make the # fuzzy-compare easier: just split on '|' and compare numeric values # word by word: my $DiffOpts = '-N --minimal --suppress-common-lines --ignore-all-space --strip-trailing-cr --side-by-side -W 160'; # These diff options are used for the diff we want to show the user # The intent is to make them easier to parse (and compare values) by a human my $DisplayDiffOpts = '-u --minimal'; my @PathAdd = qw(. .. ../vowpalwabbit); my @ToTest = (); # __DATA__ test counter my $TestNo = 0; sub v($;@) { my $verbose_level = shift @_; return unless ($opt_v >= $verbose_level); if (@_ == 1) { print STDERR @_; } else { printf STDERR @_; } } sub usage(@) { print STDERR @_, "\n" if (@_); die "Usage: $0 [options] [testno...] [vw-executable] By default will run against the 1st 'vw' executable found in: @PathAdd \$PATH Options: -c print test-suite commands before running them -d print diff output on significant diff failure -D print diff output even if it is not significant -e exit with non-zero status on first error -w Ignore white-space differences (diff --ignore-space-change) -f Ignore small (< $Epsilon) floating-point differences (fuzzy compare) -E Tolerance epsilon for fuzzy compares (default $Epsilon) -o Overwrite reference file with new/different result -y On error, copy bad files to (eg stderr.test21) for later comparison -v Verbosity (small integer) is verbosity level -V apply valgrind to vw commands -t Apply timeout (default $TimeOutSec) secs to individual tests (will only work where GNU coreutils 'timeout' is present) [testno...] Optional integer args: explicit test numbers (skip others) "; } sub mysystem { my $cmd = shift; v(1, "%s\n", $cmd); system($cmd); } sub command_failed($) { # Deal with cases where vw crashes, exits prematurely etc. # print a message to distinguish between all cases # return non-zero status if anything is bad my $cmd = shift; my $exitcode = 0; if ($?) { $exitcode = $? >> 8; my $signal = $? & 127; my $core = ''; if ($? & 128) { $core = ' (core dumped)'; } if ($signal) { printf STDERR "$0: test $TestNo: '%s' died from signal $signal$core\n", $cmd; $exitcode = 1; } elsif ($exitcode) { printf STDERR "$0: test $TestNo: '%s' failed (exitcode=$exitcode)\n", $cmd; } } # This is non-zero only if $cmd failed $exitcode; } sub valgrind_errfile($) { my $testno = shift; "Test-$testno.valgrind-err"; } # # which vw executable to test against # sub which_vw() { if (@ARGV == 1 || @ARGV == 2) { my $exe = $ARGV[0]; if (-f $exe && -x $exe) { printf STDERR "Testing vw: %s\n", $exe; return $exe; } else { usage("$0: argument $exe: not an executable file"); } } elsif (@ARGV == 0) { foreach my $dir (@PathAdd, split(':', $ENV{PATH})) { my $exe = "$dir/vw"; if (-x $exe) { printf STDERR "Testing vw: %s\n", $exe; return $exe; } } } usage("can't find a 'vw' executable to test on"); } sub which_lda() { if (@ARGV == 2) { my $exe = $ARGV[1]; if (-f $exe && -x $exe) { printf STDERR "Testing lda: %s\n", $exe; return $exe; } else { usage("$0: argument $exe: not an executable file"); } } elsif (@ARGV == 0 || @ARGV == 1) { foreach my $dir (@PathAdd, split(':', $ENV{PATH})) { my $exe = "$dir/vw"; if (-x $exe) { printf STDERR "Testing lda: %s\n", $exe; return $exe; } } } usage("can't find a 'lda' executable to test on"); } sub init() { $0 =~ s{.*/}{}; getopts('wcdDefyE:ov:Vt:') || usage(); $opt_v = 0 unless (defined $opt_v and $opt_v); print STDERR "testing $^O "; if ($^O =~ /MSWin/i) { v(1, "OS is $^O\n"); # On MS Windows we need to change paths to external executables # Assumes cygwin is installed $ENV{'PATH'} .= ':/cygdrive/c/cygwin/bin'; # And just to be safe (probably not needed): $Diff = 'c:\cygwin\bin\diff.exe'; $Cat = 'c:\cygwin\bin\cat.exe'; } elsif ($^O =~ /cygwin/i){ v(1,"OS is $^O\n"); # On MS Windows we need to change paths to external executables # Assumes cygwin is installed $ENV{'PATH'} .= ':/cygdrive/c/cygwin/bin'; # And just to be safe (probably not needed): $Diff = 'c:/cygwin/bin/diff.exe'; $Cat = 'c:/cygwin/bin/cat.exe'; } $Epsilon = $opt_E if ($opt_E); $Diff .= ' --ignore-space-change' if ($opt_w); my @num_args = (); my @exe_args = (); foreach my $arg (@ARGV) { if ($arg =~ /^\d+$/) { # a test number push(@num_args, $arg); next; } push(@exe_args, $arg); } if (@num_args) { @ToTest = sort { $a <=> $b } @num_args; # add dummy element so we don't become empty on last test push(@ToTest, -1); $FullRun = 0; } @ARGV = @exe_args; $VW = which_vw(); $LDA = which_lda(); my $timeout = `which timeout 2>/dev/null`; if ($timeout =~ /timeout$/) { chomp($timeout); $TimeOut = $timeout; v(1,"timeout is: %s\n", $TimeOut); } if ($opt_t) { if ($opt_t =~ /^\d+$/) { $TimeOutSec = $opt_t; } else { usage("-t $opt_t: -t can only accept integer seconds"); } warn "-t passed but this env doesn't have timeout installed\n" unless ($TimeOut); } } sub copy_file { my ($src_file, $dst_file) = @_; use File::Copy; print STDERR "\t\t-> copying output to $dst_file\n"; copy($src_file, $dst_file); } sub trim_spaces($) { my $str = shift; $str =~ s/^\s+//; $str =~ s/\s+$//; $str; } # # ref_file($default_name) # Reference file existence: if we're on Windows, AND # an alternate reference-file exists, give precedence # to the alternate file (file with a '-mswin' suffix.) # sub ref_file($) { my $file = shift; if ($^O =~ /MSWin/i) { my $win_reffile = "$file-mswin"; if (-e $win_reffile) { return $win_reffile; } } $file; } sub next_test() { my ($cmd, $out_ref, $err_ref, $pred_ref, $pred); $TestNo++; while (! eof(DATA)) { my $line = ; last if (defined($line) && $line =~ /^\s*$/); next if ($line =~ /^\s*#/); # skip comment lines if ($line =~ /{VW}/) { # The command line $cmd = trim_spaces($line); $cmd =~ s/{VW}/$VW/; if ($cmd =~ /\s-p\s+(\S+)/) { # -p predict_file $pred = $1; } next; } if ($line =~ /{LDA}/) { # The command line $cmd = trim_spaces($line); $cmd =~ s/{LDA}/$LDA/; if ($cmd =~ /\s-p\s+(\S+)/) { # -p predict_file $pred = $1; } next; } if ($line =~ m/\.stdout\b/) { $out_ref = ref_file(trim_spaces($line)); next; } if ($line =~ /\.stderr\b/) { $err_ref = ref_file(trim_spaces($line)); next; } if ($line =~ /\.predict\b/) { $pred_ref = ref_file(trim_spaces($line)); next; } # A test can also be an arbitrary (shell) command line # if the 1st non-space sequence is executable my ($first_word) = ($line =~ /^\s*(\S+)/); if (defined $first_word && -x $first_word) { $cmd = trim_spaces($line); next; } # if we get here it is some unrecognized pattern printf STDERR "Unrecognized test spec line:\n\t%s\n", $line; print STDERR "Test lines must match one of the following patterns:\n"; print STDERR "\tshell command to run\n"; print STDERR "\tvw command to run: {VW} ...\n"; print STDERR "\tstdout reference: *.stdout\n"; print STDERR "\tstderr reference: *.stderr\n"; print STDERR "\tpredict reference: *.predict\n"; } if (eof(DATA) && !defined $cmd) { return (undef, undef, undef, undef); } unless (defined $cmd) { die "$0: test $TestNo: command is undefined\n"; } unless (defined $err_ref) { v(2, "%s: test %s: stderr ref: undefined\n", $0, $TestNo); $err_ref = '/dev/null'; } # print STDERR "next_test: (\$cmd, $out_ref, $err_ref, $pred_ref, $pred)\n"; if ($opt_V) { $cmd = sprintf("%s --log-file='%s' %s", $Valgrind, valgrind_errfile($TestNo), $cmd); } elsif ($TimeOut) { $cmd = sprintf("%s %u %s", $TimeOut, $TimeOutSec, $cmd); } ($cmd, $out_ref, $err_ref, $pred_ref, $pred); } # # If the difference is small (least significant digits of numbers) # treat it as ok. It may be a result of 32 vs 64 bit calculations. # use Scalar::Util qw(looks_like_number); sub lenient_array_compare($$) { my ($w1_ref, $w2_ref) = @_; my (@w1) = @$w1_ref; my (@w2) = @$w2_ref; # print STDERR "lenient_array_compare: (@w1) (@w2)\n"; return 1 if ($#w1 != $#w2); # arrays not of same size my $nelem = scalar @w1; for (my $i = 0; $i < $nelem; $i++) { my ($word1, $word2) = ($w1[$i], $w2[$i]); # print STDERR "\t$word1 == $word2 ?\n"; next if ($word1 eq $word2); # There's some difference, is it significant? unless (looks_like_number($word1)) { v(4, "$word1 vs $word2: word1=$word1 is not a number!\n"); return 1; } unless (looks_like_number($word2)) { v(4, "$word1 vs $word2: word2=$word2 is not a number!\n"); return 1; } my $delta = abs($word1 - $word2); if ($delta > $Epsilon) { # We have a 'big enough' difference, but this difference # may still not be meaningful in all contexts: # Big numbers should be compared by ratio rather than # by difference # Must ensure we can divide (avoid div-by-0) if (abs($word2) <= 1.0) { # If numbers are so small (close to zero), # ($delta > $Epsilon) suffices for deciding that # the numbers are meaningfully different v(4, "$word1 vs $word2: delta=$delta > Epsilon=$Epsilon\n"); return 1; } # Now we can safely divide (since abs($word2) > 0) # and determine the ratio difference from 1.0 my $ratio_delta = abs($word1/$word2 - 1.0); if ($ratio_delta > $Epsilon) { v(4, "$word1 vs $word2: ratio_delta=$ratio_delta > Epsilon=$Epsilon\n"); return 1; } } } # print STDERR "lenient_array_compare: no meaningful difference\n"; return 0; # no meaningful difference } sub diff_lenient_float($$) { my ($reffile, $outfile) = @_; my $status = 0; my $tmpf = 'lenient-diff.tmp'; mysystem("$Diff $DiffOpts $reffile $outfile >$tmpf"); $status = $? >> 8; v(2, "diff produced $tmpf: status=$status\n"); if (-s $tmpf) { # The diff has something in it. my $fuzzy_status = 0; # assume innocent till proven guilty open(my $sdiff, $tmpf) || die "$0: diff_lenient_float: $tmpf: $!\n"; while (<$sdiff>) { chomp; my ($line1, $line2) = split(/\s*\|\s*/, $_); unless (defined($line1) && defined($line2)) { my $save_diff_file = "test-$TestNo.lenient-diff"; warn "$0: test $TestNo: $tmpf: line $.: fuzzy-match missing data on one of the sides. Can't compare\n$_\n"; warn "$0: test $TestNo: saving lenient diff in '$save_diff_file' for later inspection\n"; close $sdiff; rename($tmpf, $save_diff_file); return 1; } v(3, "line1: %s\n", $line1); v(3, "line2: %s\n", $line2); # Break lines into tokens/words my (@w1) = split(' ', $line1); my (@w2) = split(' ', $line2); if (lenient_array_compare(\@w1, \@w2) != 0) { $fuzzy_status = 1; last; } } close $sdiff; $status = $fuzzy_status; } $status; } # # perl internal way to emulate 'touch' # sub touch(@) { my $now = time; utime $now, $now, @_; } sub display_diff($$) { my ($reference_file, $actual_file) = @_; my $diff_cmd = "$Diff $DisplayDiffOpts $reference_file $actual_file"; printf STDERR "--- %s\n", $diff_cmd; mysystem($diff_cmd); } sub diff($$) { my ($reffile, $outfile) = @_; my $status = 0; $reffile = '' unless (defined $reffile); # Special case, empty file w/o reference is not considered a failure. # This is a most common case with stdout. unless (-e $reffile) { if (-s $outfile > 0) { warn "$0: test $TestNo: stdout ref: $reffile: $!\n"; exit 1 if ($opt_e); return 2 unless ($opt_o); } else { # Empty output without a ref is not considered a failure v(1, "$0: test $TestNo: empty output w/o reference: ignored.\n"); return 0; } } # Actually run the diff my $diff_cmd = "$Diff $DiffOpts $reffile $outfile"; mysystem("$diff_cmd >diff.tmp"); $status = $? >> 8; v(2, "$diff_cmd >diff.tmp: status=$status\n"); if (-s 'diff.tmp') { # There's some difference v(2, "diff.tmp has something in it. Is it meaningful?\n"); if ($opt_f && -e $reffile && -e $outfile && diff_lenient_float($reffile, $outfile) == 0) { print STDERR "$0: test $TestNo: minor (<$Epsilon) precision differences ignored\n"; $status = 0; } if ($opt_D or ($opt_d && $status)) { # Print the diff only iff: # 1) -D is in effect OR # 2) -d is in effect and diff is significant display_diff($reffile, $outfile); } if ($opt_o) { print STDERR "-o: overwriting reference:\n"; if (-e $reffile) { print STDERR "\t$reffile -> $reffile.prev\n"; rename($reffile, "$reffile.prev") || die "FATAL: rename($reffile, $reffile.prev): $!\n"; } print STDERR "\t$outfile -> $reffile\n"; rename($outfile, $reffile) || die "FATAL: rename($outfile, $reffile): $!\n"; unless ($opt_e) { $status = 0; } } } $status; } # # check_for_time_regression() # Compare last overall time to run to current to catch # performance regressions # my $LastTimeFile = 'RunTests.last.times'; sub write_times($@) { my ($file, @times) = @_; open(my $fh, ">$file") || die "$0: can't open(>$file): $!\n"; print $fh join(' ', @times), "\n"; close $fh; } sub read_times($) { my ($file) = @_; open(my $fh, $file) || die "$0: can't open($file): $!\n"; my $line = <$fh>; chomp $line; close $fh; return (split(' ', $line)); } sub check_for_time_regression() { my $tolerate_regress = 1.02; my $pct_change = 0.0; my ($overall_time0, $overall_time1); my ($user0, $system0, $cuser0, $csystem0); my ($user1, $system1, $cuser1, $csystem1) = times; $overall_time1 = $cuser1 + $csystem1; if (-e $LastTimeFile) { ($user0, $system0, $cuser0, $csystem0) = read_times($LastTimeFile); if (!(defined $csystem0) or !(defined $cuser0)) { die "$0: undefined times in saved times file: $LastTimeFile," . " try removing it\n" } $overall_time0 = $cuser0 + $csystem0; $pct_change = 100 * ($overall_time1 - $overall_time0) / (1e-4+$overall_time0); if ($overall_time0 == 0) { die "$0: Bad times in saved times file: $LastTimeFile," . " try removing it\n" } elsif ($overall_time1/$overall_time0 > $tolerate_regress) { printf STDERR "$0: RUNTIME REGRESSION: " . "%.2f sec vs last time %.2f sec. (%.2f%% worse)\n", $overall_time1, $overall_time0, $pct_change; } } write_times($LastTimeFile, $user1, $system1, $cuser1, $csystem1); printf STDERR "$0 runtime: user %g, system %g, total %g sec (%+.2f%% vs. last)\n", $cuser1, $csystem1, $overall_time1, $pct_change; } sub run_tests() { print STDERR "$0: '-D' to see any diff output\n" unless ($opt_D); print STDERR "$0: '-d' to see only significant diff output\n" unless ($opt_d); print STDERR "$0: '-o' to force overwrite references\n" unless ($opt_o); print STDERR "$0: '-e' to abort/exit on first failure\n" unless ($opt_e); my ($cmd, $out_ref, $err_ref, $pred_ref); my ($outf, $errf, $predf); mkdir('models', 0755) unless (-d 'models'); unlink(glob('*.tmp')); unlink(glob('*.cache')); unlink(glob('*/*.cache')); while (($cmd, $out_ref, $err_ref, $pred_ref, $predf) = next_test()) { last unless (defined $cmd); if (@ToTest) { if ($ToTest[0] != $TestNo) { # warn "$0: test $TestNo: skipped\n"; next; } else { shift(@ToTest); } } ($outf, $errf) = ('stdout.tmp', 'stderr.tmp'); unlink $outf, $errf; # just in case, to be safer # run the test print STDERR "Test $TestNo: ($cmd) >$outf 2>$errf\n" if ($opt_c); mysystem("($cmd) >$outf 2>$errf"); my $full_status = $?; my $status = $full_status >> 8; unless ($opt_V) { if (my $failure = command_failed($cmd)) { print STDERR `$Cat $errf`; if ($opt_e) { printf STDERR "$0: exiting with status=$failure\n"; exit $failure; } next; } } if ($status) { $ErrorCount++; if ($opt_V && $status == 100) { my $errfile = valgrind_errfile($TestNo); warn "$0: test $TestNo: FAILED: valgrind errors in $errfile\n"; } elsif ($TimeOut && $status == 124) { warn "$0: test $TestNo: FAILED: timeout $TimeOutSec exceeded\n"; } else { warn "$0: test $TestNo: '$cmd' failed: status=$status\n"; } exit $full_status if ($opt_e); next; } # command succeded # -- compare stdout $status = diff($out_ref, $outf); if ($status) { $ErrorCount++; printf STDERR "%s: test %d: FAILED: ref(%s) != stdout(%s)\n", $0, $TestNo, $out_ref, $outf; if ($opt_y) { copy_file($outf, $outf . '.test' . $TestNo); } exit $status if ($opt_e); } else { if (defined $out_ref) { print STDERR "$0: test $TestNo: stdout OK\n"; } else { v(1, "$0: test $TestNo: stdout OK (no reference)\n"); } } # -- compare stderr if (! -e $err_ref and ! $opt_o) { $ErrorCount++; print STDERR "$0: test $TestNo: FAILED: stderr ref: $err_ref: $!\n"; exit 1 if ($opt_e); next; } $status = diff($err_ref, $errf); if ($status) { $ErrorCount++; printf STDERR "%s: test %d: FAILED: ref(%s) != stderr(%s)\n", $0, $TestNo, $err_ref, $errf; if ($opt_y) { copy_file($errf, "$errf.test.$TestNo"); } exit $status if ($opt_e); } else { print STDERR "$0: test $TestNo: stderr OK\n"; } # -- compare predict next unless (defined $pred_ref); $predf = 'predict.tmp' unless (defined $predf); $status = diff($pred_ref, $predf); if ($status) { $ErrorCount++; printf STDERR "%s: test %d: FAILED: ref(%s) != predict(%s)\n", $0, $TestNo, $pred_ref, $predf; if ($opt_y) { copy_file($predf, "$predf.test.$TestNo"); } exit $status if ($opt_e); } else { print STDERR "$0: test $TestNo: predict OK\n"; } } if ($FullRun == 0) { v(1, "Partial run: not recording overall time\n"); } elsif ($ErrorCount > 0) { v(1, "Errors found: not recording overall time\n"); } elsif ($opt_V) { v(1, "valgrind run: not recording overall time\n"); } else { check_for_time_regression(); } } # --- main init(); run_tests(); exit $ErrorCount; # # Add tests below the __DATA__ line # Each test is a series of lines, terminated by an empty line (or EOF) # # Each test is comprised of: # 1st line-item is the command to run, {VW} represents the vw # executable. # # By default, 'vw' in the parent dir (../vw) is tested. # To run against a different reference executable, just pass the # executable as an argument to RunTests # # The next (output) line-items are reference files to compare outputs to: # The expected (reference file) standard output # The expected (reference file) standard error # The expected (reference file) for predictions (-p ...) # [The above reference files can come in any order. # Their 'type' is determined by their extensions: # .stdout .stderr .predict # ] # # All filenames are relative to this (test) directory # # The temporary output file-names (as opposed to the reference ones) # are implicit: # (stdout.tmp stderr.tmp predict.tmp) # Except: if -p ... appears in the command, it will be used as the # (explicit) predictions file. # # Windows note: # # Due to differences in Random-Number-Generators in Windows, # floating-point outputs may differ in some tests (not all). # # To minimize the need for changes (leverage existing tests and # reference files as much as possible), on Windows we check for # existance of files with '-mswin' suffix: # *.stderr-mswin # *.stdout-mswin # *.predict-mswin # and if any of them exists, we use it instead. # __DATA__ # Test 1: {VW} -k -l 20 --initial_t 128000 --power_t 1 -d train-sets/0001.dat -f models/0001.model -c --passes 8 --invariant --ngram 3 --skips 1 --holdout_off train-sets/ref/0001.stderr # Test 2: checking predictions as well {VW} -k -t train-sets/0001.dat -i models/0001.model -p 001.predict.tmp --invariant test-sets/ref/0001.stderr pred-sets/ref/0001.predict # Test 3: without -d, training only {VW} -k train-sets/0002.dat -f models/0002.model --invariant train-sets/ref/0002.stderr # Test 4: same, with -d {VW} -k -d train-sets/0002.dat -f models/0002.model --invariant train-sets/ref/0002.stdout train-sets/ref/0002.stderr # Test 5: add -q .., adaptive, and more (same input, different outputs) {VW} -k --initial_t 1 --adaptive --invariant -q Tf -q ff -f models/0002a.model train-sets/0002.dat train-sets/ref/0002a.stderr # Test 6: run predictions on Test 4 model # Pretending the labels aren't there {VW} -k -t -i models/0002.model -d train-sets/0002.dat -p 0002b.predict test-sets/ref/0002b.stderr pred-sets/ref/0002b.predict # Test 7: using normalized adaptive updates and a low --power_t {VW} -k --power_t 0.45 -f models/0002c.model train-sets/0002.dat train-sets/ref/0002c.stderr # Test 8: predicts on test 7 model {VW} -k -t -i models/0002c.model -d train-sets/0002.dat -p 0002c.predict test-sets/ref/0002c.stderr pred-sets/ref/0002c.predict # Test 9: label-dependent features with csoaa_ldf {VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.csoaa.predict --passes 10 --invariant --csoaa_ldf multiline --holdout_off train-sets/ref/cs_test.ldf.csoaa.stderr train-sets/ref/cs_test.ldf.csoaa.predict # Test 10: label-dependent features with wap_ldf {VW} -k -c -d train-sets/cs_test.ldf -p cs_test.ldf.wap.predict --passes 10 --invariant --wap_ldf multiline --holdout_off train-sets/ref/cs_test.ldf.wap.stderr train-sets/ref/cs_test.ldf.wap.predict # Test 11: one-against-all {VW} -k --oaa 10 -c --passes 10 train-sets/multiclass --holdout_off train-sets/ref/oaa.stderr # Test 12: Error Correcting Tournament {VW} -k --ect 10 --error 3 -c --passes 10 --invariant train-sets/multiclass --holdout_off train-sets/ref/multiclass.stderr # Test 13: Run searn on wsj_small for 12 passes, 4 passes per policy, extra features {VW} -k -c -d train-sets/wsj_small.dat.gz --passes 12 --invariant --search_passes_per_policy 4 --search_task sequence --search 5 --search_history 2 --search_bigrams --search_features 1 --quiet --holdout_off train-sets/ref/searn_wsj.stderr # Test 14: Run searn (wap) on wsj_small for 2 passes, 1 pass per policy, extra features {VW} -k -b 19 -c -d train-sets/wsj_small.dat.gz --passes 2 --invariant --search_passes_per_policy 1 --search_task sequence --search 5 --wap 5 --search_history 2 --search_bigrams --search_features 1 --quiet --holdout_off train-sets/ref/searn_wsj2.dat.stdout train-sets/ref/searn_wsj2.dat.stderr # Test 15: LBFGS on zero derivative input {VW} -k -c -d train-sets/zero.dat --loss_function=squared -b 20 --bfgs --mem 7 --passes 5 --l2 1.0 --holdout_off train-sets/ref/zero.stdout train-sets/ref/zero.stderr # Test 16: LBFGS early termination {VW} -k -c -d train-sets/rcv1_small.dat --loss_function=logistic -b 20 --bfgs --mem 7 --passes 20 --termination 0.001 --l2 1.0 --holdout_off train-sets/ref/rcv1_small.stdout train-sets/ref/rcv1_small.stderr # Test 17: Run LDA with 100 topics on 1000 Wikipedia articles {LDA} -k --lda 100 --lda_alpha 0.01 --lda_rho 0.01 --lda_D 1000 -l 1 -b 13 --minibatch 128 --invariant train-sets/wiki1K.dat train-sets/ref/wiki1K.stderr # Test 18: Run searn on seq_small for 12 passes, 4 passes per policy {VW} -k -c -d train-sets/seq_small --passes 12 --invariant --search_passes_per_policy 4 --search 4 --search_task sequence --quiet --holdout_off train-sets/ref/searn_small.stderr # Test 19: neural network 3-parity with 2 hidden units {VW} -k -c -d train-sets/3parity --hash all --passes 3000 -b 16 --nn 2 -l 10 --invariant -f models/0021.model --random_seed 15 --quiet --holdout_off train-sets/ref/3parity.stderr # Test 20: neural network 3-parity with 2 hidden units (predict) {VW} -d train-sets/3parity --hash all -t -i models/0021.model -p 0022.predict.tmp pred-sets/ref/0022.stderr pred-sets/ref/0022.predict # Test 21: cubic features -- on a parity test case {VW} -k -c -f models/xxor.model train-sets/xxor.dat --cubic abc --passes 100 --holdout_off --progress 1.33333 train-sets/ref/xxor.stderr # Test 22: matrix factorization -- training {VW} -k -d train-sets/ml100k_small_train -b 16 -q ui --rank 10 --l2 2e-6 --learning_rate 0.05 --passes 2 --decay_learning_rate 0.97 --power_t 0 -f movielens.reg --cache_file movielens.cache --loss_function classic --holdout_off train-sets/ref/ml100k_small.stdout train-sets/ref/ml100k_small.stderr # Test 23: matrix factorization -- testing {VW} -i movielens.reg -t -d test-sets/ml100k_small_test test-sets/ref/ml100k_small.stdout test-sets/ref/ml100k_small.stderr # Test 24: active-learning -- training {VW} -k --active --simulation --mellowness 0.000001 -d train-sets/rcv1_small.dat -l 10 --initial_t 10 train-sets/ref/active-simulation.t24.stderr # Test 25: bagging -- training regressor {VW} -k -d train-sets/0002.dat -f models/bs.reg.model --bootstrap 4 -p bs.reg.predict train-sets/ref/bs.reg.stderr train-sets/ref/bs.reg.predict # Test 26: bagging -- predicting with bagged regressor {VW} -d train-sets/0002.dat -i models/bs.reg.model -p bs.prreg.predict -t train-sets/ref/bs.prreg.stderr train-sets/ref/bs.prreg.predict # Test 27: bagging -- binary classifiers {VW} -d train-sets/0001.dat -f models/bs.vote.model --bootstrap 4 --bs_type vote -p bs.vote.predict train-sets/ref/bs.vote.stderr train-sets/ref/bs.vote.predict # Test 28: bagging -- predict with bagged classifier {VW} -d train-sets/0001.dat -i models/bs.vote.model -p bs.prvote.predict -t train-sets/ref/bs.prvote.stderr train-sets/ref/bs.prvote.predict # Test 29: affix features {VW} -d train-sets/affix_test.dat -k -c --passes 10 --holdout_off --affix -2 train-sets/ref/affix_test.stderr # Test 30: train --l1 regularized model {VW} -d train-sets/0001.dat -f models/mask.model --invert_hash mask.predict --l1 0.01 train-sets/ref/mask.stderr # Test 31: train model using --feature_mask {VW} -d train-sets/0001.dat --invert_hash remask.predict --feature_mask models/mask.model -f models/remask.model train-sets/ref/remask.stderr # Test 32: train model using --feature_mask and --initial_regressor {VW} -d train-sets/0001.dat --feature_mask models/mask.model -i models/remask.model train-sets/ref/remask.final.stderr # Test 33: train model for topk recommender {VW} -d train-sets/topk.vw -f topk.model -q MF --passes 100 --cache_file topk-train.cache -k --holdout_off train-sets/ref/topk-train.stderr # Test 34: train model for topk recommender {VW} -P 1 -d train-sets/topk.vw -i topk.model --top 2 -p topk.predict train-sets/ref/topk-rec.stderr train-sets/ref/topk-rec.predict # Test 35: non-centered data-set where constant >> 0 # To test the new --constant option without which performance is very weak {VW} -k --passes 100 -c --holdout_off --constant 1000 -d train-sets/big-constant.dat train-sets/ref/big-constant.stderr # Test 36: new option: --progress w/ integer arg {VW} -k -d train-sets/0001.dat --progress 10 train-sets/ref/progress-10.stderr # Test 37: new-option: --progress w/ floating-point arg # + alternate short form (-P) {VW} -k -d train-sets/0001.dat -P 0.5 train-sets/ref/progress-0.5.stderr # Test 38: --nn without --quiet to avoid nn regressions # (Needs to be a simple test, not one sensitive to symmetry breaking) {VW} -k -d train-sets/0001.dat --nn 1 train-sets/ref/nn-1-noquiet.stderr # Test 39: cb with dr {VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dr --ngram 2 --skips 4 -b 24 -l 0.25 train-sets/ref/rcv1_raw_cb_dr.stderr # Test 40: cb with ips {VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type ips --ngram 2 --skips 4 -b 24 -l 0.125 train-sets/ref/rcv1_raw_cb_ips.stderr # Test 41: cb with dm {VW} -d train-sets/rcv1_raw_cb_small.vw --cb 2 --cb_type dm --ngram 2 --skips 4 -b 24 -l 0.125 train-sets/ref/rcv1_raw_cb_dm.stderr # Test 42: --lda --passes 2 hang regression {VW} -k -d train-sets/lda-2pass-hang.dat --lda 10 -c --passes 2 --holdout_off train-sets/ref/lda-2pass-hang.stderr # Test 43: searn sequence labeling, non-ldf train {VW} -k -c -d train-sets/sequence_data --passes 20 --invariant --search_rollout oracle --search_alpha 1e-8 --search_task sequence --search 5 --holdout_off -f models/sequence_data.model train-sets/ref/sequence_data.nonldf.train.stderr # Test 44: searn sequence labeling, non-ldf test {VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data.predict train-sets/ref/sequence_data.nonldf.test.stderr train-sets/ref/sequence_data.nonldf.test.predict # Test 45: searn sequence labeling, non-ldf test, beam 1 {VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data.predict --search_beam 1 train-sets/ref/sequence_data.nonldf.test-beam1.stderr train-sets/ref/sequence_data.nonldf.test-beam1.predict # Test 46: searn sequence labeling, non-ldf test, beam 20 {VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data.predict --search_beam 20 --search_kbest 20 train-sets/ref/sequence_data.nonldf.test-beam20.stderr train-sets/ref/sequence_data.nonldf.test-beam20.predict # Test 47: searn sequence labeling, ldf train {VW} -k -c -d train-sets/sequence_data --passes 20 --invariant --search_rollout oracle --search_alpha 1e-8 --search_task sequence_demoldf --csoaa_ldf m --search 5 --holdout_off -f models/sequence_data.model train-sets/ref/sequence_data.ldf.train.stderr # Test 48: searn sequence labeling, ldf test {VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data.predict train-sets/ref/sequence_data.ldf.test.stderr train-sets/ref/sequence_data.ldf.test.predict # Test 49: searn sequence labeling, ldf test, beam 1 {VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data.predict --search_beam 1 train-sets/ref/sequence_data.ldf.test-beam1.stderr train-sets/ref/sequence_data.ldf.test-beam1.predict # Test 50: searn sequence labeling, ldf test, beam 20 {VW} -d train-sets/sequence_data -t -i models/sequence_data.model -p sequence_data.predict --search_beam 20 --search_kbest 20 train-sets/ref/sequence_data.ldf.test-beam20.stderr train-sets/ref/sequence_data.ldf.test-beam20.predict # Test 51: searn sequence SPAN labeling BIO, non-ldf train {VW} -k -c -d train-sets/sequencespan_data --passes 20 --invariant --search_rollout oracle --search_alpha 1e-8 --search_task sequencespan --search 7 --holdout_off -f models/sequencespan_data.model train-sets/ref/sequencespan_data.nonldf.train.stderr # Test 52: searn sequence SPAN labeling BIO, non-ldf test {VW} -d train-sets/sequencespan_data -t -i models/sequencespan_data.model -p sequencespan_data.predict train-sets/ref/sequencespan_data.nonldf.test.stderr train-sets/ref/sequencespan_data.nonldf.test.predict # Test 53: searn sequence SPAN labeling BIO, non-ldf test, beam 1 {VW} -d train-sets/sequencespan_data -t -i models/sequencespan_data.model -p sequencespan_data.predict --search_beam 1 train-sets/ref/sequencespan_data.nonldf.test-beam1.stderr train-sets/ref/sequencespan_data.nonldf.test-beam1.predict # Test 54: searn sequence SPAN labeling BIO, non-ldf test, beam 20 {VW} -d train-sets/sequencespan_data -t --search_span_bilou -i models/sequencespan_data.model --search_beam 20 --search_kbest 20 --quiet train-sets/ref/sequencespan_data.nonldf.test-beam20.stderr # Test 55: searn sequence SPAN labeling BILOU, non-ldf train {VW} -k -c -d train-sets/sequencespan_data --passes 20 --invariant --search_rollout oracle --search_alpha 1e-8 --search_task sequencespan --search_span_bilou --search 7 --holdout_off -f models/sequencespan_data.model train-sets/ref/sequencespan_data.nonldf-bilou.train.stderr # Test 56: searn sequence SPAN labeling BILOU, non-ldf test {VW} -d train-sets/sequencespan_data -t --search_span_bilou -i models/sequencespan_data.model -p sequencespan_data.predict train-sets/ref/sequencespan_data.nonldf-bilou.test.stderr train-sets/ref/sequencespan_data.nonldf-bilou.test.predict # Test 57: searn sequence SPAN labeling BILOU, non-ldf test, beam 1 {VW} -d train-sets/sequencespan_data -t --search_span_bilou -i models/sequencespan_data.model -p sequencespan_data.predict --search_beam 1 train-sets/ref/sequencespan_data.nonldf-bilou.test-beam1.stderr train-sets/ref/sequencespan_data.nonldf-bilou.test-beam1.predict # Test 58: searn sequence SPAN labeling BILOU, non-ldf test, beam 20 {VW} -d train-sets/sequencespan_data -t --search_span_bilou -i models/sequencespan_data.model -p sequencespan_data.predict --search_beam 20 --search_kbest 20 train-sets/ref/sequencespan_data.nonldf-bilou.test-beam20.stderr train-sets/ref/sequencespan_data.nonldf-bilou.test-beam20.predict # Test 59: silly test for "argmax" task {VW} -d train-sets/argmax_data -k -c --passes 20 --search_rollout oracle --search_alpha 1e-8 --search_task argmax --search 2 --holdout_off train-sets/ref/argmax_data.stderr # Test 60: (holdout-broken regression) # ensure we have no holdout loss of '0 h' {VW} -k -c --passes 2 train-sets/0001.dat train-sets/ref/holdout-loss-not-zero.stderr # Test 61: stagewise poly with exponent 0.25 ####in the following stage_poly tests, there are minute differences in losses, which are not being fuzzy-diffed; ####thus the stderr is cleared (--quiet) and only comparing (fuzzy-diffed) predictions. {VW} --stage_poly --sched_exponent 0.25 --batch_sz 1000 --batch_sz_no_doubling -d train-sets/rcv1_small.dat -p stage_poly.s025.predict --quiet train-sets/ref/stage_poly.s025.stderr train-sets/ref/stage_poly.s025.predict # Test 62: stagewise poly with exponent 1.0 {VW} --stage_poly --sched_exponent 1.0 --batch_sz 1000 --batch_sz_no_doubling -d train-sets/rcv1_small.dat --quiet train-sets/ref/stage_poly.s100.stderr # Test 63: stagewise poly with exponent 0.25 and doubling batches {VW} --stage_poly --sched_exponent 0.25 --batch_sz 1000 -d train-sets/rcv1_small.dat -p stage_poly.s025.doubling.predict --quiet train-sets/ref/stage_poly.s025.doubling.stderr train-sets/ref/stage_poly.s025.doubling.predict # Test 64: stagewise poly with exponent 1.0 and doubling batches {VW} --stage_poly --sched_exponent 1.0 --batch_sz 1000 -d train-sets/rcv1_small.dat -p stage_poly.s100.doubling.predict --quiet train-sets/ref/stage_poly.s100.doubling.stderr train-sets/ref/stage_poly.s100.doubling.predict # Test 65: library test, train the initial model {VW} -c -k -d train-sets/library_train -f models/library_train.w -q st --passes 100 --hash all --noconstant --csoaa_ldf m --holdout_off train-sets/ref/library_train.stdout train-sets/ref/library_train.stderr # Test 66: library test, run ezexample_predict ../library/ezexample_predict models/library_train.w train-sets/ref/ezexample_predict.stdout train-sets/ref/ezexample_predict.stderr # Test 67: empty test, bad builds (without make clean) # sometimes cause a SEGV even on empty input {VW} /dev/null train-sets/ref/empty-set.stderr # Test 68: daemon test ./daemon-test.sh test-sets/ref/vw-daemon.stdout