diff options
author | ariel faigon <ariel.git@yendor.com> | 2012-08-01 05:06:22 +0400 |
---|---|---|
committer | ariel faigon <ariel.git@yendor.com> | 2012-08-01 05:06:22 +0400 |
commit | d85b6521daa9c048266dc882b166aee7238b8dd7 (patch) | |
tree | 2ccacac99068fac645bdc5c1ea21f99a503700fe /utl | |
parent | 551d33f3559fd729f5787d15af971891935a674d (diff) |
vw-varinfo: Add multi-class support. Make invocation friendlier.
Diffstat (limited to 'utl')
-rwxr-xr-x | utl/vw-varinfo | 408 |
1 files changed, 296 insertions, 112 deletions
diff --git a/utl/vw-varinfo b/utl/vw-varinfo index 8c175747..19081b73 100755 --- a/utl/vw-varinfo +++ b/utl/vw-varinfo @@ -34,13 +34,13 @@ # (c) 2012 - ariel faigon for vowpal-wabbit # This software may be distributed under the same terms as vowpal-wabbit # -use Getopt::Std; -use vars (qw($opt_v $opt_V $opt_P $opt_O $opt_k)); +# use Getopt::Std; +use vars (qw($opt_v $opt_V $opt_O $opt_K)); my $VW = 'vw'; my $VWARGS = '--exact_adaptive_norm --sort_features'; -my ($TrainSet, $Model, $FullExample, $AuditFile); +my ($TrainFile, $ModelFile, $RmodelFile, $ExampleFile, $AuditFile); my (%FeatureMax, %FeatureMin); my (%Feature2Hash, %Feature2Weight); @@ -52,7 +52,12 @@ my @QPairs = (); # list of pairs ([a, b], [c, d] ...) for namespace pairing my %Ignore; # support for --ignore X my %Keep; # support for --keep X my $DoKeep; # flag for whether we need to use --keep or not -my $DoLabels = 0; # flag for multi-class (--oaa --csoaa --wap* --sequence?) +my $MultiClass = 0; # flag for multi-class (--oaa --csoaa --wap* --sequence?) +my %Labels; +my @Labels = (1); # List of labels for super example generation + +my %Label2FW; # for multi-class: every label has feature->weight +my %Prediction; # prediction of each isolated multi-class label my @TmpFiles; @@ -79,19 +84,19 @@ sub V(@) { sub usage(@) { print STDERR @_, "\n" if (@_); - die "Usage: $0 [options] <training-set-file> + die "Usage: $0 [options] [vw-options] <training-set-file> Options: -v verbose -V more verbose - -k keep temporary files - -P<Opts> Pass-through <Opts> as-is to the vw training step - To have paired cross-features in name-spaces starting - with X and Y, add -q XY ... to the -P option arguments, - just like you do for vw. + -K keep temporary files -O<which> Use order/ranking metric <which> Supported metrics: ... not implemented yet ... + vw-options: + Note that all the above options do not clash with vw options + All other options will be passed as-is to the vw training step. + See the script source head comments for more details. "; } @@ -99,24 +104,67 @@ sub usage(@) { sub get_args { $0 =~ s{.*/}{}; - getopts('VvkP:O:') || usage(); - $opt_v = 1 if ($opt_V); - $opt_O = '' unless (defined $opt_O); + if (-f $ARGV[-1]) { + $TrainFile = pop(@ARGV); + } else { + usage("last command line arg must be a training-set file"); + } + my @vw_opts_and_args = (); foreach my $arg (@ARGV) { - if (-f $arg) { - $TrainSet = $arg; + if ($arg =~ /^-[vVKOP]+$/) { + # These options are for us, not for vw + $opt_v = 1 if ($arg =~ /v/); + $opt_V = 1 if ($arg =~ /V/); + $opt_K = 1 if ($arg =~ /K/); + $opt_v = 1 if ($opt_V); + if ($arg =~ /O/) { + ($opt_O) = ("@ARGV" =~ /$arg\s+(\S+)\b/); + } + if ($arg =~ /P/) { + usage("-P: option no longer supported.\n" . + "Please pass VW options directly.\n"); + } next; } + if (-f $arg) { + my $skip_ts = 0; + if ($vw_opts_and_args[-1] =~ '-d|--data') { + pop(@vw_opts_and_args); + $skip_ts = 1; + } elsif ($vw_opts_and_args[-1] !~ + /^(?: + -p + |--predictions + |-i + |--initial_regressor + |-f + |--final_regressor + |-r + |--raw_predictions + |--cache_file + |--pid_file + |--readable_model + |--output_feature_\S+ + )$/x) { + $skip_ts = 1; + } + if ($skip_ts) { + warning("ignoring trainset: $arg in vw-args\n"); + next; + } + } + push(@vw_opts_and_args, $arg); } + $opt_O = '' unless (defined $opt_O); usage("You must supply a training-set file") - unless (defined $TrainSet); + unless (defined $TrainFile); - usage("training-set file: $TrainSet: $!") - unless (-f $TrainSet); + usage("training-set file: $TrainFile: $!") + unless (-f $TrainFile); - if ($opt_P) { - $VWARGS = $opt_P; + if (@vw_opts_and_args) { + $VWARGS = "@vw_opts_and_args"; } while ($VWARGS =~ /-q\s*(\S)(\S)/g) { push(@QPairs, [$1, $2]); @@ -129,18 +177,23 @@ sub get_args { while ($VWARGS =~ /--ignore\s*(\S)/g) { $Ignore{$1} = 1; } - if ($VWARGS =~ /--(?:(?:cs)?oaa|wap|sequence)/) { - $DoLabels = 1; + if ($VWARGS =~ /--(?:(?:cs)?oaa|wap|ect|sequence)/) { + if ($VWARGS =~ /--(?:wap|ect)/) { + # Please send a patch when/if you can figure these out + die "$0: --wap, --ect multi-class is not supported - sorry\n"; + } + $MultiClass = 1; } - $Model = "$TrainSet.model"; - $FullExample = "$TrainSet.full-example"; - $AuditFile = "$TrainSet.audit"; + $ModelFile = "$TrainFile.model"; + $RModelFile = "$TrainFile.rmodel"; + $ExampleFile = "$TrainFile.examples"; + $AuditFile = "$TrainFile.audit"; - @TmpFiles = ($Model, $FullExample, $AuditFile); + @TmpFiles = ($ModelFile, $RModelFile, $ExampleFile, $AuditFile); } sub cleanup { - if ($opt_k) { + if ($opt_K) { v("keeping temporary files: @TmpFiles\n"); return; } @@ -204,7 +257,12 @@ sub pair_features { sub parse_labels($) { my $labels = shift; - # FINISH ME + $labels =~ s/\s+\S+$//; # trim optional tag (touching the '|') + while ($labels =~ /([^:\s]+):?(\S+)?/g) { + # match labels and optional weights + $Labels{$1} = (defined $2) ? $2 : 1; + } + sort {$a <=> $b} keys %Labels; } # @@ -230,8 +288,8 @@ sub read_features($) { die "$0: $trainset line $.: malformed example: missing '|'\n" unless (defined $input_features); - if ($DoLabels) { - parse_labels($labels); + if ($MultiClass) { + @Labels = parse_labels($labels); } my @name_space_region = split('\|', $input_features); @@ -298,153 +356,243 @@ sub read_features($) { # sub do_train($$;$) { my ($trainset, $model, $rmodel) = @_; - my $cmd = "$VW --quiet $VWARGS -d $trainset -f $model"; + my $cmd = "$VW --quiet $VWARGS -f $model"; + if ($opt_v) { + $cmd =~ s/ --quiet / /; + } if (defined $rmodel) { $cmd .= " --readable_model $rmodel"; } + $cmd .= " $trainset"; v("training: %s\n", $cmd); system($cmd); + die "$0: vw training failed (see details above)\n" + unless ($? == 0); } -sub generate_full_example($) { - my ($full_example) = @_; - open(my $fe, ">$full_example") || - die "$0: can't write full_example file: '$full_example': $!\n"; - print $fe "1"; - +sub generate_one_example($$) { + my ($fd, $label) = @_; + + if ($MultiClass) { + printf $fd "%s:1", $label; + # foreach $label2 (@Labels) { + # next if ($label eq $label2); + # printf $fd " %s:0", $label2; + # } + } else { + # simple, non multi-class case + print $fd $label; + } + # print all possible input features, with a weight of 1 foreach my $ns (keys %NameSpaces) { my $nsref = $NameSpaces{$ns}; - printf $fe ' |%s', $ns; + printf $fd ' |%s', $ns; foreach my $key (sort keys %$nsref) { my $weight = 1; - printf $fe ' %s:%s', $key, $weight; + printf $fd ' %s:%s', $key, $weight; } } - print $fe "\n"; - close $fe; + print $fd "\n"; } -# -# audit_features() -# read the output of vw -a (audit) on the all-feature example -# to extract hash values and weights -# Return the list of all feature-names -# -sub audit_features { - generate_full_example($FullExample); +sub generate_examples($) { + my ($example_file) = shift; + open(my $fd, ">$example_file") || + die "$0: can't write full_example file: '$example_file': $!\n"; - my $audit_cmd = "$VW --quiet -t --audit -i $Model -d $FullExample"; - $audit_cmd .= "|tee $AuditFile" if ($opt_k); + v("Labels: @Labels\n"); + foreach my $label (@Labels) { + # One line per label: + # multiclass deprecates to singleton: label=1 + generate_one_example($fd, $label); + } + close $fd; +} - open(my $audit_stream, "$audit_cmd |") - || die "$0: can't run \$audit_cmd: '$audit_cmd |'\n"; +my %SeenFeatureNames; +my $MCLabel; +my $MCLabelIndex = -1; + +sub audit_one_example($) { + my $audit_stream = shift; + + # skip the prediction line + # we're only interested in the audit line my $prediction = <$audit_stream>; my $features_data = <$audit_stream>; - close $audit_stream; - my %seen_feature_names; + my $weight_href; + if ($MultiClass) { + if (++$MCLabelIndex >= @Labels) { + $MCLabelIndex =0; + } + $MCLabel = $Labels[$MCLabelIndex]; + $weight_href = $Label2FW{$MCLabel} = {}; + chomp($prediction); + $Prediction{$MCLabel} = $prediction; + } + + chomp($features_data); + my @features_list = split(' ', $features_data); + + while (@features_list) { + my $audited_item = shift @features_list; + next unless ($audited_item); + + # Audited feature format: namespace^varname:142703:1:0.0435613 ... + my (@fields) = split(':', $audited_item); - # Audited feature format: namespace^varname:142703:1:0.0435613 ... - while ($features_data =~ /\s+([^:]+):(\d+):([^:]+):([-+e.0-9]+)/g) { + my ($feature, $hashval, $value, $weight) = @fields[-4 .. -1]; - my ($feature, $hash, $value, $weight) = ($1, $2, $3, $4); + unless ($feature) { + if ($MultiClass) { + $feature = "Constant_$MCLabel"; + $FeatureMax{$feature} = 0; + $FeatureMin{$feature} = 0; + } + } - $seen_feature_names{$feature} = 1; - $Feature2Hash{$feature} = $hash; - $Feature2Weight{$feature} = $weight; + $SeenFeatureNames{$feature} = 1; + $Feature2Hash{$feature} = $hashval; - V("%s\t%s\t%s\t%s\n", $feature, $hash, $value, $weight); + if ($MultiClass) { + # v("audit_one_example: MC=$MultiClass Label=$MCLabel {$feature} = $weight\n"); + $weight_href->{$feature} = $weight; + } else { + $Feature2Weight{$feature} = $weight; + } + V("%s\t%s\t%s\t%s\n", $feature, $hashval, $value, $weight); } +} + +# +# audit_features() +# read the output of vw -a (audit) on the all-feature example +# to extract hash values and weights +# Return the list of all feature-names +# +sub audit_features { + generate_examples($ExampleFile); + + # Bug in vw multiclass, looks like we need to pass the multiclass + # params to --audit even though they should be in the model + my $vw_audit_args = "--quiet -t --audit -i $ModelFile -d $ExampleFile"; + my $vw_mcargs = ''; + if (${VWARGS} =~ /--(?:(?:cs)?oaa|wap|sequence)(?:_ldf)?\s+\d+/) { + $vw_mcargs = $&; + } + my $audit_cmd = "$VW $vw_mcargs $vw_audit_args"; + $audit_cmd .= "|tee $AuditFile" if ($opt_K); + + open(my $audit_stream, "$audit_cmd |") || + die "$0: can't run \$audit_cmd: '$audit_cmd |'\n"; + + while (!eof($audit_stream)) { + audit_one_example($audit_stream); + } + + close $audit_stream; # Return the list of features actually seen in the audit - sort keys %seen_feature_names; + sort keys %SeenFeatureNames; } + # # return 'score' metric for a given feature # -my %Score; +sub score($$;$) { + my ($class_label, $feature, $metric) = @_; -sub by_score { - $Score{$b} <=> $Score{$a} -} - -sub score($;$) { - my ($feature, $metric) = @_; + my $f2w_hashref = $MultiClass + ? $Label2FW{$class_label} + : \%Feature2Weight; - my $fweight = $Feature2Weight{$feature}; + my $fweight = $f2w_hashref->{$feature}; unless (defined $fweight) { - warn "$0: BUG?: score($feature): feature has no weight!\n"; + warn "$0: BUG?: score($class_label, $feature): undef weight!\n"; return undef; } - # Support more metrics, are the any? + # Support more metrics, are there any others that make sense? # if ($metric eq '...') ... $fweight; } # -# summarize_features -# Output what we know about all features + relative score +# Find maximum feature-name length and max/min values # -sub summarize_features { - my ($min_weight, $max_weight) = (0, 0); - my $max_len = 0; +sub feature_flen_min_max($@) { + my $w_href = shift @_; + + my ($max_flen, $min_weight, $max_weight) = (10, 0, 0); - # 1st pass - determine maximum feature-name length - # and max/min values - foreach my $f (@Features) { - my $w = $Feature2Weight{$f}; + foreach my $f (@_) { + my $w = $w_href->{$f}; unless (defined $w) { # Should already be caught in score() above, # so warn only in verbose mode - v("%s: summarize_features: %s: undefined weight\n", $0, $f); + v("%s: feature_flen_min_max: %s: undefined weight\n", $0, $f); next; } - my $slen = length($f); - $max_len = $slen if ($slen > $max_len); + my $flen = length($f); + $max_flen = $flen if ($flen > $max_flen); $max_weight = $w if ($w > $max_weight); $min_weight = $w if ($w < $min_weight); } - my $range_weight = $max_weight - $min_weight; - - # 2nd pass - calculate scores of all features - my ($score, $min_score, $max_score) = (0, 0, 0); - foreach my $f (@Features) { - next if ($f eq 'Constant'); - - $score = score($f, $opt_O); - next unless (defined $score); + ($max_flen, $min_weight, $max_weight); +} + +# +# Find min/max score and zero the score of the constant feature +# +sub feature_min_max_score($@) { + my $class_label = shift @_; - $Score{$f} = $score; + my ($min_score, $max_score) = (0, 0); + my $score_href = {}; # feature->score + + foreach my $f (@_) { # features loop + if ($f =~ /^Constant/) { + my $constant_feature_name = $f; + $score_href->{$f} = 0; + next; + } + + my $score = score($class_label, $f, $opt_O); + next unless (defined $score); + $score_href->{$f} = $score; $max_score = $score if ($score > $max_score); $min_score = $score if ($score < $min_score); } - my $score_range = $max_score - $min_score; - $Score{'Constant'} = 0; - # my $score_0 = score('Constant'); + ($min_score, $max_score, $score_href); +} - my $upper_range = abs($max_score); - my $lower_range = abs($min_score); +sub print_feature_report($$$$@) { + my ($max_flen, $max_distance_from_0, + $class_label, $score_href, @features) = @_; - my $max_distance_from_0 = ($upper_range > $lower_range) - ? $upper_range - : $lower_range; + my $weight_href = $MultiClass + ? $Label2FW{$class_label} + : \%Feature2Weight; printf "%-*s\t%+10s %8s %8s %+9s %+10s\n", - $max_len, 'FeatureName', 'HashVal', + $max_flen, 'FeatureName', 'HashVal', 'MinVal', 'MaxVal', 'Weight', 'RelScore'; - # FIXME: support different orders: by weight, - # by feature-range-normalized weights? + # TODO? support different orders: + # by weight, by feature-range-normalized weights? + foreach my $f (sort { + $score_href->{$b} <=> $score_href->{$a} + } @features) { - foreach my $f (sort by_score @Features) { - my $score = $Score{$f}; + my $score = $score_href->{$f}; my $distance_from_0 = $score; + $max_distance_from_0 = 1e-10 if ($max_distance_from_0 == 0); my $normalized_score = $distance_from_0 / $max_distance_from_0; # FIXME: support different normalization schemes @@ -452,19 +600,55 @@ sub summarize_features { $normalized_score = abs($normalized_score); } printf "%-*s\t%10u %8.2f %8.2f %+9.4f %9.2f%%\n", - $max_len, $f, + $max_flen, $f, $Feature2Hash{$f}, $FeatureMin{$f}, $FeatureMax{$f}, - $Feature2Weight{$f}, + $weight_href->{$f}, (100.0 * $normalized_score); } + print "\n" if ($MultiClass); +} + + +# +# summarize_features +# Output what we know about all features + relative score +# +sub summarize_features { + # Per-class loop for multi-class, + # only one loop for non multi-class + foreach my $label (@Labels) { + my @features = ($MultiClass) ? (keys %{$Label2FW{$label}}) : @Features; + + my ($min_score, $max_score, $score_href) = + feature_min_max_score($label, @features); + my $score_range = $max_score - $min_score; + + my ($max_flen, $min_wt, $max_wt) = + feature_flen_min_max($score_href, @features); + my $range_weight = $max_wt - $min_wt; + + my $upper_range = abs($max_score); + my $lower_range = abs($min_score); + + my $max_distance_from_0 = ($upper_range > $lower_range) + ? $upper_range + : $lower_range; + + printf "=== Class Label: %s\tPrediction: %s\n", + $label, $Prediction{$label} + if ($MultiClass); + + print_feature_report($max_flen, $max_distance_from_0, + $label, $score_href, @features); + } } # -- main get_args(); -read_features($TrainSet); -do_train($TrainSet, $Model); +read_features($TrainFile); +do_train($TrainFile, $ModelFile, $RModelFile); @Features = audit_features(); summarize_features(); cleanup(); |