Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/vowpal_wabbit.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/utl
diff options
context:
space:
mode:
authorariel faigon <github.2009@yendor.com>2014-07-18 20:51:38 +0400
committerariel faigon <github.2009@yendor.com>2014-07-18 20:51:38 +0400
commit4d7021eb6b2b307702e6c3f3e93dfd93a762eefe (patch)
treed3e1c11984bdb09ec7b0ae18efa9916abcde5813 /utl
parent3b4356af0a4c816d17c7a77a408e93a286f47893 (diff)
vw-top-errors: add comments + make output wide-char safe + more sensible defaults for topN examples/features
Diffstat (limited to 'utl')
-rwxr-xr-xutl/vw-top-errors53
1 files changed, 43 insertions, 10 deletions
diff --git a/utl/vw-top-errors b/utl/vw-top-errors
index 16567f7d..864e10dc 100755
--- a/utl/vw-top-errors
+++ b/utl/vw-top-errors
@@ -50,8 +50,8 @@ my $TopWeights;
my %ExampleNos;
my @ExampleNos;
-my $DefaultTopN = 5;
-my $DefaultTopWeights = 10;
+my $DefaultTopN = 10;
+my $DefaultTopWeights = 5;
my $VWCmd;
@@ -135,9 +135,19 @@ sub get_args {
@ARGV = @our_args;
getopts('vsa');
-}
+ # Make wide-char output safe from warnings
+ binmode STDOUT, ":utf8";
+}
+#
+# collect_errors
+# Reads vw progress output and collects delta-loss magnitudes
+# in the hash %ExampleDelta.
+#
+# We're looking for positive deltas, i.e. examples where the
+# loss went up instead of down.
+#
sub collect_errors($) {
my $vw_stderr = shift;
@@ -146,20 +156,21 @@ sub collect_errors($) {
my $stderr = '';
while (<$vw_stderr>) {
unless (/^([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s/) {
+ # Not a progress line; save it, as it may be needed if 'vw' crashes etc.
$stderr .= "\t$_";
next;
}
$good_lines++;
- my ($avgloss, $sincelast, $example) = ($1, $2, $3);
+ my ($avgloss, $sincelast, $example_no) = ($1, $2, $3);
if (defined $prev_sincelast) {
my $delta_since_last = $sincelast - $prev_sincelast;
if ($opt_a) {
- # absolute error
- $ExampleDelta{$example} = $delta_since_last;
+ # absolute error (rare need)
+ $ExampleDelta{$example_no} = $delta_since_last;
} else {
- # relative error
+ # relative error (default)
my $relative_error = $delta_since_last / $avgloss;
- $ExampleDelta{$example} = $relative_error;
+ $ExampleDelta{$example_no} = $relative_error;
}
}
@@ -304,11 +315,18 @@ sub audit_top_weights($$@) {
}
}
+#
+# sort delta loss numerically, descending
+#
sub by_delta() {
$ExampleDelta{$b} <=> $ExampleDelta{$a};
}
-sub biggest_errors($) {
+#
+# biggest_error_examples(N)
+# Returns the example numbers of the N examples with the highest delta loss
+#
+sub biggest_error_examples($) {
my $howmany = shift;
my @sorted_examples = sort by_delta keys %ExampleDelta;
@@ -316,6 +334,11 @@ sub biggest_errors($) {
@sorted_examples[0 .. $howmany-1];
}
+#
+# print_errors(@examples)
+# Print the top N loss example numbers and their absolute or
+# relative loss.
+#
sub print_errors(@) {
printf "=== Top-%d (highest delta loss) diverging examples:\n", scalar(@_);
printf "Example\t%s-Loss\n", ($opt_a ? 'Absolute' : 'Relative');
@@ -324,6 +347,11 @@ sub print_errors(@) {
}
print "\n";
}
+
+#
+# first_pass(N)
+# First training pass to capture progressive loss numbers.
+#
sub first_pass($) {
my $top_n = shift;
@@ -337,13 +365,18 @@ sub first_pass($) {
open(my $vwh, "$vw_cmd 2>&1 |");
collect_errors($vwh);
close $vwh;
- my @top_error_examples = biggest_errors($top_n);
+ my @top_error_examples = biggest_error_examples($top_n);
print_errors(@top_error_examples);
v("+--- 1st pass: done!\n\n");
@top_error_examples;
}
+#
+# second_pass($top_weights, @example_nos)
+# 2nd training pass with --audit to capture individual features
+# causing the biggest loss jumps.
+#
sub second_pass($@) {
my ($top_weights, @example_nos) = @_;