diff options
author | ariel faigon <github.2009@yendor.com> | 2014-07-18 20:51:38 +0400 |
---|---|---|
committer | ariel faigon <github.2009@yendor.com> | 2014-07-18 20:51:38 +0400 |
commit | 4d7021eb6b2b307702e6c3f3e93dfd93a762eefe (patch) | |
tree | d3e1c11984bdb09ec7b0ae18efa9916abcde5813 /utl | |
parent | 3b4356af0a4c816d17c7a77a408e93a286f47893 (diff) |
vw-top-errors: add comments + make output wide-char safe + more sensible defaults for topN examples/features
Diffstat (limited to 'utl')
-rwxr-xr-x | utl/vw-top-errors | 53 |
1 files changed, 43 insertions, 10 deletions
diff --git a/utl/vw-top-errors b/utl/vw-top-errors index 16567f7d..864e10dc 100755 --- a/utl/vw-top-errors +++ b/utl/vw-top-errors @@ -50,8 +50,8 @@ my $TopWeights; my %ExampleNos; my @ExampleNos; -my $DefaultTopN = 5; -my $DefaultTopWeights = 10; +my $DefaultTopN = 10; +my $DefaultTopWeights = 5; my $VWCmd; @@ -135,9 +135,19 @@ sub get_args { @ARGV = @our_args; getopts('vsa'); -} + # Make wide-char output safe from warnings + binmode STDOUT, ":utf8"; +} +# +# collect_errors +# Reads vw progress output and collects delta-loss magnitudes +# in the hash %ExampleDelta. +# +# We're looking for positive deltas, i.e. examples where the +# loss went up instead of down. +# sub collect_errors($) { my $vw_stderr = shift; @@ -146,20 +156,21 @@ sub collect_errors($) { my $stderr = ''; while (<$vw_stderr>) { unless (/^([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s/) { + # Not a progress line, may be needed if 'vw' crashes etc. $stderr .= "\t$_"; next; } $good_lines++; - my ($avgloss, $sincelast, $example) = ($1, $2, $3); + my ($avgloss, $sincelast, $example_no) = ($1, $2, $3); if (defined $prev_sincelast) { my $delta_since_last = $sincelast - $prev_sincelast; if ($opt_a) { - # absolute error - $ExampleDelta{$example} = $delta_since_last; + # absolute error (rare need) + $ExampleDelta{$example_no} = $delta_since_last; } else { - # relative error + # relative error (default) my $relative_error = $delta_since_last / $avgloss; - $ExampleDelta{$example} = $relative_error; + $ExampleDelta{$example_no} = $relative_error; } } @@ -304,11 +315,18 @@ sub audit_top_weights($$@) { } } +# +# sort delta loss numerically, descending +# sub by_delta() { $ExampleDelta{$b} <=> $ExampleDelta{$a}; } -sub biggest_errors($) { +# +# biggest_error_examples(N) +# list of top N loss example_numbers +# +sub biggest_error_examples($) { my $howmany = shift; my @sorted_examples = sort by_delta keys %ExampleDelta; @@ -316,6 +334,11 @@ sub biggest_errors($) { @sorted_examples[0 .. $howmany-1]; } +# +# print_errors(@examples) +# Print the top N loss example numbers and their absolute or +# relative loss. +# sub print_errors(@) { printf "=== Top-%d (highest delta loss) diverging examples:\n", scalar(@_); printf "Example\t%s-Loss\n", ($opt_a ? 'Absolute' : 'Relative'); @@ -324,6 +347,11 @@ sub print_errors(@) { } print "\n"; } + +# +# first_pass(N) +# First training pass to capture progressive loss numbers. +# sub first_pass($) { my $top_n = shift; @@ -337,13 +365,18 @@ sub first_pass($) { open(my $vwh, "$vw_cmd 2>&1 |"); collect_errors($vwh); close $vwh; - my @top_error_examples = biggest_errors($top_n); + my @top_error_examples = biggest_error_examples($top_n); print_errors(@top_error_examples); v("+--- 1st pass: done!\n\n"); @top_error_examples; } +# +# second_pass($top_weights, @example_nos) +# 2nd training pass with --audit to capture individual features +# causing the biggest loss jumps. +# sub second_pass($@) { my ($top_weights, @example_nos) = @_; |