Ondrej's little tools to examine weight settings

not quite fit for public use, esp. the -summarize.sh one... git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4071 1f5c12ca-751b-0410-a591-d2e778427230
author: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2011-07-08 04:11:10 +0400
committer: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2011-07-08 04:11:10 +0400
commit: 66b71a7f5cdae8afff02e8ff37e19ad6a2be43c1 (patch)
tree: c769daff8bcc3ceaecd638f877a20737e22bab8f /scripts/analysis
parent: 8ffbe2389ec76febc4c8f00c7b84076a8d9a3e5e (diff)
2 files changed, 334 insertions, 0 deletions
diff --git a/scripts/analysis/weight-scan-summarize.sh b/scripts/analysis/weight-scan-summarize.sh
new file mode 100755
index 000000000..237182736
--- /dev/null
+++ b/scripts/analysis/weight-scan-summarize.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Hackish summarization of weight-scan.pl results, heavily relies on tools by
+# Ondrej Bojar (bojar@ufal.mff.cuni.cz), some of which need Mercury; beware.
+
+function die() { echo "$@" >&2; exit 1; }
+set -o pipefail  # safer pipes
+
+refs="$1"
+dir="$2"
+
+[ -d "$dir" ] && [ -e "$refs" ] \
+  || die "usage: $0 ref-file weight-scan-working-dir"
+
+testbleu=$HOME/tools/src/obotools/testbleu
+projectbleu=$HOME/tools/src/obotools/projectbleu
+
+[ -x "$testbleu" ] || die "Can't run $testbleu"
+[ -x "$projectbleu" ] || die "Can't run $projectbleu"
+
+# create exact bleus and put them to bleu.*
+for f in "$dir"/out.*; do
+  bleuf="$dir/bleu.${f##*/out.}"  # rename the file name only: $dir itself may contain 'out.'
+  [ -e "$bleuf" ] \
+    || "$testbleu" "$refs" < "$f" | pickre --re='BLEU...([0-9.]*)' > "$bleuf" \
+    || die "Failed to construct $bleuf"
+done
+
+# create bleu projections from each best* and put them to corresponding pbleu*
+# first collect all weights
+lcat $dir/weights.* \
+| tr ' ' , \
+| pickre --re='weights.([-0-9.]*)' \
+| cut -f 1,3 \
+| numsort 1 \
+> $dir/allweights
+allwparam=$(cut -f2 $dir/allweights | prefix -- '-w ' | tr '\n' ' ')
+for f in "$dir"/best*.*; do
+  base=${f##*/}  # rename the file name only: $dir itself may contain 'best'
+  pbleuf="$dir/pbleu.${base#best*.}"
+  if [ ! -e "$pbleuf" ] || [ $(wc -l < "$pbleuf") -ne $(wc -l < "$dir/allweights") ]; then
+    # need to regenerate the projection; $allwparam stays unquoted, it holds many '-w ...' words
+    "$projectbleu" "$refs" $allwparam < "$f" \
+    | paste "$dir/allweights" - \
+    | cut -f1,3 > "$pbleuf" \
+    || die "Failed to construct $pbleuf"
+  fi
+done
+
+# summarize bleu projections
+echo "goal	proj/real	from	was" > $dir/graph.data
+for f in $dir/bleu.*; do
+  obs=$(echo $f | sed 's/^.*bleu\.//')
+  cat $dir/pbleu.$obs \
+  | pickre --re='F: ([0-9.]*)' \
+  | recut 2,1 \
+  | prefix --tab -- "$obs\tproj" \
+  >> $dir/graph.data
+  lcat $dir/bleu.$obs \
+  | pickre --re='bleu\.([-0-9.]*)' \
+  | prefix --tab -- "$obs\treal" \
+  | recut 1,2,3,5 \
+  >> $dir/graph.data
+done
+
+
+exit 0
+
+## COMMANDS TO PLOT IT:
+# plot 'walkable' graph of projections at various points
+g=weight-scan-tm_2/graph.data; cat $g | skip 1 | grep real | cut -f2- | numsort 2 | sed 's/real/all/' > cliprealall; skip 1 < $g | numsort 1,3 | split_at_colchange 1 | blockwise "(prefix --tab x cliprealall; cat -) | labelledxychart --data=3,4,0,'',linespoints --blockpivot=2" > clip
+
+# plot a combination of projections along with the individual projections and
+# the real scores
+cat best100.-0.100000 best100.-0.500000 best100.-0.300000 best100.-0.200000 | /home/obo/tools/src/obotools/projectbleu ../tune.ref $allwparam | paste allweights - > comb.-0.5_-0.3_-0.2_-0.1
+(lcat pbleu.-0.100000 pbleu.-0.500000 pbleu.-0.300000 pbleu.-0.200000 comb.-0.5_-0.3_-0.2_-0.1 | pickre --re='F: ([0-9.]*)' | recut 2,3,1 ; cat graph.data | skip 1 | grep real | cut -f2- | numsort 2 ) | tee delme | labelledxychart --blockpivot=1 --data=2,3,0,'',linespoints | gpsandbox
diff --git a/scripts/analysis/weight-scan.pl b/scripts/analysis/weight-scan.pl
new file mode 100755
index 000000000..0ed4dbe09
--- /dev/null
+++ b/scripts/analysis/weight-scan.pl
@@ -0,0 +1,259 @@
+#!/usr/bin/perl
+# runs Moses many times changing the values of one weight, all others fixed
+# nbest lists are always produced to allow for comparison of real and
+# 'projected' BLEU (BLEU estimated from n-best lists collected at a neighbouring
+# node)
+# usage: weight-scan.pl <input> <moses> <moses.ini> tm_2 --range=0.0,0.1,1.0
+
+use strict;
+use warnings;
+use Getopt::Long;
+use FindBin qw($Bin);
+use File::Basename;
+use File::Path;
+my $SCRIPTS_ROOTDIR = $Bin;
+$SCRIPTS_ROOTDIR =~ s/\/(?:training|analysis)$//; # this script sits in scripts/analysis, a sibling of scripts/generic
+$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
+
+my $prec = 3; # precision of weightvalue within filename
+my $jobs = 0;
+my $workdir = "weight-scan";
+my $range = "0.0,0.1,1.0";
+my $input_type = 0;
+my $normalize = 0; # normalize 
+my $nbestsize = 100;
+my $decoderflags = "";
+my $moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl";
+my $qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl";
+my $queue_flags = "-hard";  # extra parameters for parallelizer
+GetOptions(
+  "jobs=i" => \$jobs,
+  "range=s" => \$range,
+  "working-dir=s" => \$workdir,
+  "normalize!" => \$normalize,
+  "nbest=i" => \$nbestsize,
+  "decoderflags=s" => \$decoderflags,
+) or exit 1;
+
+my $inf = shift;
+my $decoder = shift;
+my $config = shift;
+my $weightspec = shift;
+
+if (!defined $inf || ! defined $decoder || !defined $config || !defined $weightspec) {
+  print STDERR "usage: $0 <input> <moses> <moses.ini> tm_2 --range=0.0,0.1,1.0
+Options:
+  --working-dir=weight-scan
+  --jobs=0
+  --range=0.0,0.1,1.0
+";
+  exit 1;
+}
+
+print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";
+# The parallel wrappers are needed only with --jobs=N; $jobs defaults to 0 and is always defined, so test its value.
+die "Not executable: $moses_parallel_cmd" if $jobs && ! -x $moses_parallel_cmd;
+die "Not executable: $qsubwrapper" if $jobs && ! -x $qsubwrapper;
+die "Not executable: $decoder" if ! -x $decoder;
+
+my $inf_abs = ensure_full_path($inf);
+die "File not found: $inf (interpreted as $inf_abs)."
+  if ! -e $inf_abs;
+$inf = $inf_abs;
+
+my $decoder_abs = ensure_full_path($decoder);
+die "File not executable: $decoder (interpreted as $decoder_abs)."
+  if ! -x $decoder_abs;
+$decoder = $decoder_abs;
+
+my $config_abs = ensure_full_path($config);
+die "File not found: $config (interpreted as $config_abs)."
+  if ! -e $config_abs;
+$config = $config_abs;
+
+
+my ($startvalue, $step, $stopvalue) = split /,/, $range;
+die "Bad range: $range; expected start,step,stop"
+  if !defined $startvalue || !defined $step || !defined $stopvalue;
+# a zero, negative or non-numeric step would never carry the scan past the stop value
+die "Bad range: $range; step must be a positive number" if $step !~ /^\+?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/ || $step == 0;
+my $featlist = get_featlist_from_moses($config);
+
+# $weightidx is within features of the name $weightname
+# $weightindex is global
+my ($weightname, $weightidx) = split /_/, $weightspec;
+my $weightindex;
+
+# scan the weights, find the one we'll test and remember values of all of the
+# given name
+my $only_one_expected = 0;
+if (!defined $weightidx) {
+  $only_one_expected = 1;
+  $weightidx = 0;
+}
+my @weightvalues = ();
+my $idx = 0;
+for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) {
+  my $name = $featlist->{"names"}->[$i];
+  if ($name eq $weightname) {
+    push @weightvalues, $featlist->{"values"}->[$i];
+    $weightindex = $i if $idx == $weightidx; # remember the global index of the weight
+    $idx++;
+  }
+}
+
+die "You specified only '$weightspec' but there are $idx features of the given name.\nUse e.g.: ${weightspec}_0\n"
+  if $only_one_expected && $idx > 1;
+die "Failed to find weights of the name '$weightname' in moses config."
+  if !defined $weightindex;
+
+
+
+#store current directory and create the working directory (if needed)
+my $cwd = `pawd 2>/dev/null`; 
+if(!$cwd){$cwd = `pwd`;}
+chomp($cwd);
+
+mkpath($workdir);
+{
+# open local scope
+
+#chdir to the working directory
+chdir($workdir) or die "Can't chdir to $workdir";
+
+## MAIN LOOP
+# Each value is computed from its index: adding $step repeatedly accumulates
+# rounding error (0.1+0.1+0.1 > 0.3), which could make the scan skip the stop value.
+# The small epsilon absorbs the same kind of error in the division.
+my $nsteps = $stopvalue < $startvalue ? -1 : int(($stopvalue - $startvalue) / $step + 1e-6);
+for my $i (0 .. $nsteps) { run_decoder($featlist, $startvalue + $i * $step); }
+#chdir back to the original directory # useless, just to remind we were not there
+chdir($cwd);
+} # end of local scope
+
+# Decode the input once with the scanned weight set to $weightvalue.
+# Writes weights.<value> (the full weight vector), out.<value> (decoder stdout)
+# and best<N>.<value> (n-best list) into the current directory; dies if the
+# decoder fails.  Returns the n-best list file name.
+sub run_decoder {
+    my ($featlist, $weightvalue) = @_;
+    # NOTE: "%${prec}f" sets the field *width*, not the precision, so names
+    # carry six decimals (e.g. out.0.100000); kept so existing runs still match.
+    my $filebase = sprintf("%${prec}f", $weightvalue);
+    my $nbestfilename = "best$nbestsize.$filebase";
+    my $filename = "out.$filebase";
+    print STDERR "params = $decoderflags\n"; # user-supplied parameters
+
+    # parameters to set all model weights (to override moses.ini)
+    my @vals = @{$featlist->{"values"}};
+    $vals[$weightindex] = $weightvalue; # set the one we're scanning
+    if ($normalize) {
+      print STDERR "Normalizing lambdas: @vals\n";
+      my $totlambda = 0;
+      $totlambda += abs($_) for @vals;
+      die "Can't normalize lambdas: their absolute values sum to zero\n" if $totlambda == 0;
+      $_ /= $totlambda for @vals;
+    }
+    # moses now does not seem to accept "-tm X -tm Y" but needs "-tm X Y"
+    my %model_weights;
+    for my $i (0 .. $#{$featlist->{"names"}}) {
+      my $name = $featlist->{"names"}->[$i];
+      $model_weights{$name} = "-$name" if !defined $model_weights{$name};
+      $model_weights{$name} .= sprintf " %.6f", $vals[$i];
+    }
+    # sorted by feature name so that the logged command line is the same on every run
+    my $decoder_config = join(" ", map { $model_weights{$_} } sort keys %model_weights);
+    print STDERR "DECODER_CFG = $decoder_config\n";
+
+    # write the weights for future use
+    my $weightsfn = "weights.$filebase";
+    open(my $outf, ">", $weightsfn) or die "Can't write $weightsfn: $!";
+    print {$outf} join(" ", map { sprintf("%.6f", $_) } @vals)."\n";
+    close($outf) or die "Can't finish writing $weightsfn: $!";
+
+    # run the decoder, through moses-parallel when --jobs is non-zero
+    my $decoder_cmd = $jobs
+      ? "$moses_parallel_cmd -config $config -inputtype $input_type -qsub-prefix scan$weightvalue -queue-parameters \"$queue_flags\" -decoder-parameters \"$decoderflags $decoder_config\" -n-best-list \"$nbestfilename $nbestsize\" -input-file $inf -jobs $jobs -decoder $decoder > $filename"
+      : "$decoder $decoderflags  -config $config -inputtype $input_type $decoder_config -n-best-list $nbestfilename $nbestsize -input-file $inf > $filename";
+    safesystem($decoder_cmd) or die "The decoder died. CONFIG WAS $decoder_config \n";
+    return $nbestfilename;
+}
+
+# Return the decoder's features as {"names" => [...], "values" => [...]},
+# two parallel arrays in the order the lines appear in the listing.  The
+# listing comes from running $decoder with -show-weights on $configfn and is
+# cached in ./features.list; delete that file to force a refresh.  Exits with
+# status 1 if any line of the listing cannot be parsed.
+sub get_featlist_from_moses {
+  my $configfn = shift;
+  my $featlistfn = "./features.list";
+  if (-e $featlistfn) {
+    print STDERR "Using cached features list: $featlistfn\n";
+  } else {
+    print STDERR "Asking moses for feature names and values from $configfn\n";
+    # write to a temporary name so that a failed run never leaves a broken cache behind
+    my $cmd = "$decoder $decoderflags -config $configfn  -inputtype $input_type -show-weights > $featlistfn.tmp";
+    safesystem($cmd) or die "Failed to run moses with the config $configfn";
+    rename("$featlistfn.tmp", $featlistfn) or die "Can't create $featlistfn: $!";
+  }
+  # read feature list; each line is expected to hold "longname feature value"
+  my (@names, @startvalues, @errs);
+  open(my $ini, "<", $featlistfn) or die "Can't read $featlistfn: $!";
+  while (my $line = <$ini>) {
+    chomp $line;
+    my ($longname, $feature, $value) = split ' ', $line;
+    if (!defined $value) {
+      push @errs, "$featlistfn:$.:Expected 'longname feature value': $line\n";
+      next;
+    }
+    # a full float syntax check, so that signed exponents (e.g. 1e-05) pass too
+    push @errs, "$featlistfn:$.:Bad initial value of $feature: $value\n"
+      if $value !~ /^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/;
+    push @names, $feature;
+    push @startvalues, $value;
+  }
+  close $ini;
+  if (@errs) { print STDERR join("", @errs); exit 1; }
+  return {"names"=>\@names, "values"=>\@startvalues};
+}
+
+# Run a command via system(), echoing it to STDERR first.  Returns true iff it
+# exited with status 0; exits the script if it could not start or was killed.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+      print STDERR "Failed to execute: @_\n  $!\n";
+      exit(1);
+  }
+  if ($? & 127) {
+      # pass the command as data: a '%' inside it must not act as a format directive
+      printf STDERR "Execution of: %s\n  died with signal %d, %s coredump\n",
+          "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
+      exit(1);
+  }
+  my $exitcode = $? >> 8;
+  print STDERR "Exit code: $exitcode\n" if $exitcode;
+  return ! $exitcode;
+}
+
+# Make a path absolute, relative to the directory reported by pawd (or pwd when pawd
+# prints nothing). Absolute input is returned unresolved; a "/nfsmnt" substring is stripped.
+sub ensure_full_path {
+    my $PATH = shift;
+    $PATH =~ s{/nfsmnt}{};
+    return $PATH if $PATH =~ m{^/};
+    chomp(my $here = `pawd 2>/dev/null` || `pwd`);
+    $PATH = "$here/$PATH";
+    $PATH =~ s{[\r\n]}{}g;
+    $PATH =~ s{/\./}{/}g;
+    $PATH =~ s{/+}{/}g;
+    # fold "dir/../" pairs away, giving up after 10 rounds
+    for (my $rounds = 0; $PATH =~ m{/\.\./} && $rounds++ < 10; ) {
+        $PATH =~ s{/+}{/}g;
+        $PATH =~ s{/[^/]+/\.\./}{/}g;
+    }
+    $PATH =~ s{/[^/]+/\.\.$}{};
+    $PATH =~ s{/+$}{};
+    $PATH =~ s{/nfsmnt}{};
+    return $PATH;
+}
author	bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>	2011-07-08 04:11:10 +0400
committer	bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>	2011-07-08 04:11:10 +0400
commit	66b71a7f5cdae8afff02e8ff37e19ad6a2be43c1 (patch)
tree	c769daff8bcc3ceaecd638f877a20737e22bab8f /scripts/analysis
parent	8ffbe2389ec76febc4c8f00c7b84076a8d9a3e5e (diff)