vw-varinfo: Add multi-class support. Make invocation friendlier.

author: ariel faigon <ariel.git@yendor.com> 2012-08-01 05:06:22 +0400
committer: ariel faigon <ariel.git@yendor.com> 2012-08-01 05:06:22 +0400
commit: d85b6521daa9c048266dc882b166aee7238b8dd7 (patch)
tree: 2ccacac99068fac645bdc5c1ea21f99a503700fe /utl
parent: 551d33f3559fd729f5787d15af971891935a674d (diff)
1 files changed, 296 insertions, 112 deletions
diff --git a/utl/vw-varinfo b/utl/vw-varinfo
index 8c175747..19081b73 100755
--- a/utl/vw-varinfo
+++ b/utl/vw-varinfo
@@ -34,13 +34,13 @@
 # (c) 2012 - ariel faigon for vowpal-wabbit
 # This software may be distributed under the same terms as vowpal-wabbit
 #
-use Getopt::Std;
-use vars (qw($opt_v $opt_V $opt_P $opt_O $opt_k));
+# use Getopt::Std;
+use vars (qw($opt_v $opt_V $opt_O $opt_K));
 
 my $VW = 'vw';
 my $VWARGS = '--exact_adaptive_norm --sort_features';
 
-my ($TrainSet, $Model, $FullExample, $AuditFile);
+my ($TrainFile, $ModelFile, $RModelFile, $ExampleFile, $AuditFile);
 my (%FeatureMax, %FeatureMin);
 
 my (%Feature2Hash, %Feature2Weight);
@@ -52,7 +52,12 @@ my @QPairs = ();    # list of pairs ([a, b], [c, d] ...) for namespace pairing
 my %Ignore;         # support for --ignore X
 my %Keep;           # support for --keep X
 my $DoKeep;         # flag for whether we need to use --keep or not
-my $DoLabels = 0;   # flag for multi-class (--oaa --csoaa --wap* --sequence?)
+my $MultiClass = 0; # flag for multi-class (--oaa --csoaa --wap* --sequence?)
+my %Labels;
+my @Labels = (1);   # List of labels for super example generation
+
+my %Label2FW;       # for multi-class: every label has feature->weight
+my %Prediction;     # prediction of each isolated multi-class label
 
 my @TmpFiles;
 
@@ -79,19 +84,19 @@ sub V(@) {
 
 sub usage(@) {
     print STDERR @_, "\n" if (@_);
-    die "Usage: $0 [options] <training-set-file>
+    die "Usage: $0 [options] [vw-options] <training-set-file>
     Options:
         -v          verbose
         -V          more verbose
-        -k          keep temporary files
-        -P<Opts>    Pass-through <Opts> as-is to the vw training step
-                    To have paired cross-features in name-spaces starting
-                    with X and Y, add -q XY ... to the -P option arguments,
-                    just like you do for vw.
+        -K          keep temporary files
         -O<which>   Use order/ranking metric <which>
                     Supported metrics:
                         ... not implemented yet ...
 
+    vw-options:
+        Note that all the above options do not clash with vw options
+        All other options will be passed as-is to the vw training step.
+
     See the script source head comments for more details.
 ";
 }
@@ -99,24 +104,67 @@ sub usage(@) {
 sub get_args {
     $0 =~ s{.*/}{};
 
-    getopts('VvkP:O:') || usage();
-    $opt_v = 1 if ($opt_V);
-    $opt_O = '' unless (defined $opt_O);
+    if (-f $ARGV[-1]) {
+        $TrainFile = pop(@ARGV);
+    } else {
+        usage("last command line arg must be a training-set file");
+    }
 
+    my @vw_opts_and_args = ();
     foreach my $arg (@ARGV) {
-        if (-f $arg) {
-            $TrainSet = $arg;
+        if ($arg =~ /^-[vVKOP]+$/) {
+            # These options are for us, not for vw
+            $opt_v = 1 if ($arg =~ /v/);
+            $opt_V = 1 if ($arg =~ /V/);
+            $opt_K = 1 if ($arg =~ /K/);
+            $opt_v = 1 if ($opt_V);
+            if ($arg =~ /O/) {
+                ($opt_O) = ("@ARGV" =~ /$arg\s+(\S+)\b/);
+            }
+            if ($arg =~ /P/) {
+                usage("-P: option no longer supported.\n" .
+                       "Please pass VW options directly.\n");
+            }
             next;
         }
+        if (-f $arg) {
+            my $skip_ts = 0;
+            if ($vw_opts_and_args[-1] =~ /^(?:-d|--data)$/) {   # exact option only, not any arg containing "-d"
+                pop(@vw_opts_and_args);
+                $skip_ts = 1;
+            } elsif ($vw_opts_and_args[-1] !~
+                        /^(?:
+                            -p
+                            |--predictions
+                            |-i
+                            |--initial_regressor
+                            |-f
+                            |--final_regressor
+                            |-r
+                            |--raw_predictions
+                            |--cache_file
+                            |--pid_file
+                            |--readable_model
+                            |--output_feature_\S+
+                        )$/x) {
+                $skip_ts = 1;
+            }
+            if ($skip_ts) {
+                warning("ignoring trainset: $arg in vw-args\n");
+                next;
+            }
+        }
+        push(@vw_opts_and_args, $arg);
     }
+    $opt_O = '' unless (defined $opt_O);
     usage("You must supply a training-set file")
-        unless (defined $TrainSet);
+        unless (defined $TrainFile);
 
-    usage("training-set file: $TrainSet: $!")
-        unless (-f $TrainSet);
+    usage("training-set file: $TrainFile: $!")
+        unless (-f $TrainFile);
 
-    if ($opt_P) {
-        $VWARGS = $opt_P;
+    if (@vw_opts_and_args) {
+        $VWARGS = "@vw_opts_and_args";
     }
     while ($VWARGS =~ /-q\s*(\S)(\S)/g) {
         push(@QPairs, [$1, $2]);
@@ -129,18 +177,23 @@ sub get_args {
     while ($VWARGS =~ /--ignore\s*(\S)/g) {
         $Ignore{$1} = 1;
     }
-    if ($VWARGS =~ /--(?:(?:cs)?oaa|wap|sequence)/) {
-        $DoLabels = 1;
+    if ($VWARGS =~ /--(?:(?:cs)?oaa|wap|ect|sequence)/) {
+        if ($VWARGS =~ /--(?:wap|ect)/) {
+            # Please send a patch when/if you can figure these out
+            die "$0: --wap, --ect multi-class is not supported - sorry\n";
+        }
+        $MultiClass = 1;
     }
-    $Model = "$TrainSet.model";
-    $FullExample = "$TrainSet.full-example";
-    $AuditFile = "$TrainSet.audit";
+    $ModelFile = "$TrainFile.model";
+    $RModelFile = "$TrainFile.rmodel";
+    $ExampleFile = "$TrainFile.examples";
+    $AuditFile = "$TrainFile.audit";
 
-    @TmpFiles = ($Model, $FullExample, $AuditFile);
+    @TmpFiles = ($ModelFile, $RModelFile, $ExampleFile, $AuditFile);
 }
 
 sub cleanup {
-    if ($opt_k) {
+    if ($opt_K) {
         v("keeping temporary files: @TmpFiles\n");
         return;
     }
@@ -204,7 +257,12 @@ sub pair_features {
 
 sub parse_labels($) {
     my $labels = shift;
-    # FINISH ME
+    $labels =~ s/\s+\S+$//; # trim optional tag (touching the '|')
+    while ($labels =~ /([^:\s]+):?(\S+)?/g) {
+        # match labels and optional weights
+        $Labels{$1} = (defined $2) ? $2 : 1;
+    }
+    sort {$a <=> $b} keys %Labels;
 }
 
 #
@@ -230,8 +288,8 @@ sub read_features($) {
         die "$0: $trainset line $.: malformed example: missing '|'\n"
             unless (defined $input_features);
 
-        if ($DoLabels) {
-            parse_labels($labels);
+        if ($MultiClass) {
+            @Labels = parse_labels($labels);
         }
 
         my @name_space_region = split('\|', $input_features);
@@ -298,153 +356,243 @@ sub read_features($) {
 #
 sub do_train($$;$) {
     my ($trainset, $model, $rmodel) = @_;
-    my $cmd = "$VW --quiet $VWARGS -d $trainset -f $model";
+    my $cmd = "$VW --quiet $VWARGS -f $model";
+    if ($opt_v) {
+        $cmd =~ s/ --quiet / /;
+    }
     if (defined $rmodel) {
         $cmd .= " --readable_model $rmodel";
     }
+    $cmd .= " $trainset";
     v("training: %s\n", $cmd);
     system($cmd);
+    die "$0: vw training failed (see details above)\n"
+        unless ($? == 0);
 }
 
-sub generate_full_example($) {
-    my ($full_example) = @_;
-    open(my $fe, ">$full_example") ||
-        die "$0: can't write full_example file: '$full_example': $!\n";
-    print $fe "1";
-    
+sub generate_one_example($$) {
+    my ($fd, $label) = @_;
+
+    if ($MultiClass) {
+        printf $fd "%s:1", $label;
+        # foreach $label2 (@Labels) {
+        #    next if ($label eq $label2);
+        #    printf $fd " %s:0", $label2;
+        # }
+    } else {
+        # simple, non multi-class case
+        print $fd $label;
+    }
+    # print all possible input features, with a weight of 1
     foreach my $ns (keys %NameSpaces) {
         my $nsref = $NameSpaces{$ns};
-        printf $fe ' |%s', $ns;
+        printf $fd ' |%s', $ns;
         foreach my $key (sort keys %$nsref) {
             my $weight = 1;
-            printf $fe ' %s:%s', $key, $weight;
+            printf $fd ' %s:%s', $key, $weight;
         }
     }
-    print $fe "\n";
-    close $fe;
+    print $fd "\n";
 }
 
-#
-# audit_features()
-#   read the output of vw -a (audit) on the all-feature example
-#   to extract hash values and weights
-#   Return the list of all feature-names
-#
-sub audit_features {
-    generate_full_example($FullExample);
+sub generate_examples($) {
+    my ($example_file) = shift;
+    open(my $fd, ">$example_file") ||
+        die "$0: can't write full_example file: '$example_file': $!\n";
 
-    my $audit_cmd = "$VW --quiet -t --audit -i $Model -d $FullExample";
-    $audit_cmd .= "|tee $AuditFile" if ($opt_k);
+    v("Labels: @Labels\n");
+    foreach my $label (@Labels) {
+        # One line per label:
+        # the non multi-class case degenerates to a singleton: label=1
+        generate_one_example($fd, $label);
+    }
+    close $fd;
+}
 
-    open(my $audit_stream, "$audit_cmd |")
-        || die "$0: can't run \$audit_cmd: '$audit_cmd |'\n";
+my %SeenFeatureNames;
+my $MCLabel;
+my $MCLabelIndex = -1;
+
+sub audit_one_example($) {
+    my $audit_stream = shift;
+
+    # skip the prediction line
+    # we're only interested in the audit line
     my $prediction = <$audit_stream>;
     my $features_data = <$audit_stream>;
-    close $audit_stream;
 
-    my %seen_feature_names;
+    my $weight_href;
+    if ($MultiClass) {
+        if (++$MCLabelIndex >= @Labels) {
+            $MCLabelIndex =0;
+        }
+        $MCLabel = $Labels[$MCLabelIndex];
+        $weight_href = $Label2FW{$MCLabel} = {};
+        chomp($prediction);
+        $Prediction{$MCLabel} = $prediction;
+    }
+
+    chomp($features_data);
+    my @features_list = split(' ', $features_data);
+
+    while (@features_list) {
+        my $audited_item = shift @features_list;
+        next unless ($audited_item);
+
+        # Audited feature format:   namespace^varname:142703:1:0.0435613 ...
+        my (@fields) = split(':', $audited_item);
 
-    # Audited feature format:   namespace^varname:142703:1:0.0435613 ...
-    while ($features_data =~ /\s+([^:]+):(\d+):([^:]+):([-+e.0-9]+)/g) {
+        my ($feature, $hashval, $value, $weight) = @fields[-4 .. -1];
 
-        my ($feature, $hash, $value, $weight) = ($1, $2, $3, $4);
+        unless ($feature) {
+            if ($MultiClass) {
+                $feature = "Constant_$MCLabel";
+                $FeatureMax{$feature} = 0;
+                $FeatureMin{$feature} = 0;
+            }
+        }
 
-        $seen_feature_names{$feature} = 1;
-        $Feature2Hash{$feature} = $hash;
-        $Feature2Weight{$feature} = $weight;
+        $SeenFeatureNames{$feature} = 1;
+        $Feature2Hash{$feature} = $hashval;
 
-        V("%s\t%s\t%s\t%s\n", $feature, $hash, $value, $weight);
+        if ($MultiClass) {
+            # v("audit_one_example: MC=$MultiClass Label=$MCLabel {$feature} = $weight\n");
+            $weight_href->{$feature} = $weight;
+        } else {
+            $Feature2Weight{$feature} = $weight;
+        }
+        V("%s\t%s\t%s\t%s\n", $feature, $hashval, $value, $weight);
     }
+}
+
+#
+# audit_features()
+#   read the output of vw -a (audit) on the all-feature example
+#   to extract hash values and weights
+#   Return the list of all feature-names
+#
+sub audit_features {
+    generate_examples($ExampleFile);
+
+    # Bug in vw multiclass, looks like we need to pass the multiclass
+    # params to --audit even though they should be in the model
+    my $vw_audit_args = "--quiet -t --audit -i $ModelFile -d $ExampleFile";
+    my $vw_mcargs = '';
+    if (${VWARGS} =~ /--(?:(?:cs)?oaa|wap|sequence)(?:_ldf)?\s+\d+/) {
+        $vw_mcargs = $&;
+    }
+    my $audit_cmd = "$VW $vw_mcargs $vw_audit_args";
+    $audit_cmd .= "|tee $AuditFile" if ($opt_K);
+
+    open(my $audit_stream, "$audit_cmd |") ||
+        die "$0: can't run \$audit_cmd: '$audit_cmd |'\n";
+
+    while (!eof($audit_stream)) {
+        audit_one_example($audit_stream);
+    }
+
+    close $audit_stream;
 
     # Return the list of features actually seen in the audit
-    sort keys %seen_feature_names;
+    sort keys %SeenFeatureNames;
 }
 
+
 #
 # return 'score' metric for a given feature
 #
-my %Score;
+sub score($$;$) {
+    my ($class_label, $feature, $metric) = @_;
 
-sub by_score {
-    $Score{$b} <=> $Score{$a}
-}
-
-sub score($;$) {
-    my ($feature, $metric) = @_;
+    my $f2w_hashref = $MultiClass
+                        ? $Label2FW{$class_label}
+                        : \%Feature2Weight;
 
-    my $fweight = $Feature2Weight{$feature};
+    my $fweight = $f2w_hashref->{$feature};
     unless (defined $fweight) {
-        warn "$0: BUG?: score($feature): feature has no weight!\n";
+        warn "$0: BUG?: score($class_label, $feature): undef weight!\n";
         return undef;
     }
 
-    # Support more metrics, are the any?
+    # Support more metrics, are there any others that make sense?
     # if ($metric eq '...') ...
 
     $fweight;
 }
 
 #
-# summarize_features
-#   Output what we know about all features + relative score
+# Find maximum feature-name length and max/min values
 #
-sub summarize_features {
-    my ($min_weight, $max_weight) = (0, 0);
-    my $max_len = 0;
+sub feature_flen_min_max($@) {
+    my $w_href = shift @_;
+
+    my ($max_flen, $min_weight, $max_weight) = (10, 0, 0);
 
-    # 1st pass - determine maximum feature-name length
-    # and max/min values
-    foreach my $f (@Features) {
-        my $w = $Feature2Weight{$f};
+    foreach my $f (@_) {
+        my $w = $w_href->{$f};
         unless (defined $w) {
             # Should already be caught in score() above,
             # so warn only in verbose mode
-            v("%s: summarize_features: %s: undefined weight\n", $0, $f);
+            v("%s: feature_flen_min_max: %s: undefined weight\n", $0, $f);
             next;
         }
-        my $slen = length($f);
-        $max_len = $slen if ($slen > $max_len);
+        my $flen = length($f);
+        $max_flen = $flen if ($flen > $max_flen);
 
         $max_weight = $w if ($w > $max_weight);
         $min_weight = $w if ($w < $min_weight);
     }
-    my $range_weight = $max_weight - $min_weight;
- 
-    # 2nd pass - calculate scores of all features
-    my ($score, $min_score, $max_score) = (0, 0, 0);
-    foreach my $f (@Features) {
-        next if ($f eq 'Constant');
-
-        $score = score($f, $opt_O);
-        next unless (defined $score);
+    ($max_flen, $min_weight, $max_weight);
+}
+
+#
+# Find min/max score and zero the score of the constant feature
+#
+sub feature_min_max_score($@) {
+    my $class_label = shift @_;
 
-        $Score{$f} = $score;
+    my ($min_score, $max_score) = (0, 0);
+    my $score_href = {};    # feature->score
+
+    foreach my $f (@_) {            # features loop
+        if ($f =~ /^Constant/) {
+            my $constant_feature_name = $f;
+            $score_href->{$f} = 0;
+            next;
+        }
+
+        my $score = score($class_label, $f, $opt_O);
+        next unless (defined $score);
+        $score_href->{$f} = $score;
 
         $max_score = $score if ($score > $max_score);
         $min_score = $score if ($score < $min_score);
     }
-    my $score_range = $max_score - $min_score;
-    $Score{'Constant'} = 0;
-    # my $score_0 = score('Constant');
+    ($min_score, $max_score, $score_href);
+}
 
-    my $upper_range = abs($max_score);
-    my $lower_range = abs($min_score);
+sub print_feature_report($$$$@) {
+    my ($max_flen, $max_distance_from_0,
+        $class_label, $score_href, @features) = @_;
 
-    my $max_distance_from_0 = ($upper_range > $lower_range)
-                                    ? $upper_range
-                                    : $lower_range;
+    my $weight_href = $MultiClass
+                        ? $Label2FW{$class_label}
+                        : \%Feature2Weight;
 
     printf "%-*s\t%+10s %8s %8s %+9s %+10s\n",
-                $max_len, 'FeatureName', 'HashVal',
+                $max_flen, 'FeatureName', 'HashVal',
                 'MinVal', 'MaxVal',
                 'Weight', 'RelScore';
 
-    # FIXME: support different orders: by weight,
-    # by feature-range-normalized weights?
+    # TODO? support different orders:
+    # by weight, by feature-range-normalized weights?
+    foreach my $f (sort {
+                     $score_href->{$b} <=> $score_href->{$a}
+                  } @features) {
 
-    foreach my $f (sort by_score @Features) {
-        my $score = $Score{$f};
+        my $score = $score_href->{$f};
         my $distance_from_0 = $score;
+        $max_distance_from_0 = 1e-10 if ($max_distance_from_0 == 0);
         my $normalized_score = $distance_from_0 / $max_distance_from_0;
 
         # FIXME: support different normalization schemes
@@ -452,19 +600,55 @@ sub summarize_features {
             $normalized_score = abs($normalized_score);
         }
         printf "%-*s\t%10u %8.2f %8.2f %+9.4f %9.2f%%\n",
-                $max_len, $f,
+                $max_flen, $f,
                 $Feature2Hash{$f},
                 $FeatureMin{$f},
                 $FeatureMax{$f},
-                $Feature2Weight{$f},
+                $weight_href->{$f},
                 (100.0 * $normalized_score);
     }
+    print "\n" if ($MultiClass);
+}
+
+
+#
+# summarize_features
+#   Output what we know about all features + relative score
+#
+sub summarize_features {
+    # Per-class loop for multi-class,
+    # only one loop for non multi-class
+    foreach my $label (@Labels) {
+        my @features = ($MultiClass) ? (keys %{$Label2FW{$label}}) : @Features;
+
+        my ($min_score, $max_score, $score_href) =
+                            feature_min_max_score($label, @features);
+        my $score_range = $max_score - $min_score;
+
+        my ($max_flen, $min_wt, $max_wt) =
+                            feature_flen_min_max($score_href, @features);
+        my $range_weight = $max_wt - $min_wt;
+
+        my $upper_range = abs($max_score);
+        my $lower_range = abs($min_score);
+
+        my $max_distance_from_0 = ($upper_range > $lower_range)
+                                        ? $upper_range
+                                        : $lower_range;
+
+        printf "=== Class Label: %s\tPrediction: %s\n",
+                $label, $Prediction{$label}
+                    if ($MultiClass);
+
+        print_feature_report($max_flen, $max_distance_from_0,
+                             $label, $score_href, @features);
+    }
 }
 
 # -- main
 get_args();
-read_features($TrainSet);
-do_train($TrainSet, $Model);
+read_features($TrainFile);
+do_train($TrainFile, $ModelFile, $RModelFile);
 @Features = audit_features();
 summarize_features();
 cleanup();
author	ariel faigon <ariel.git@yendor.com>	2012-08-01 05:06:22 +0400
committer	ariel faigon <ariel.git@yendor.com>	2012-08-01 05:06:22 +0400
commit	d85b6521daa9c048266dc882b166aee7238b8dd7 (patch)
tree	2ccacac99068fac645bdc5c1ea21f99a503700fe /utl
parent	551d33f3559fd729f5787d15af971891935a674d (diff)