diff options
author | servan <servan@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-10-05 17:36:17 +0400 |
---|---|---|
committer | servan <servan@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-10-05 17:36:17 +0400 |
commit | f223f5a2765944ddf54d889c750b9dfc535bfb66 (patch) | |
tree | 54d42a58cd7d509970102ef651ae40461c7180ae /scripts | |
parent | 568a8cc0f49e04919ec70edde1a9bd91786f51ec (diff) |
M mert/TerScorer.cpp
M mert/BleuScorer.h
M mert/ScorerFactory.h
M mert/Scorer.h
M mert/PerScorer.h
M mert/TerScorer.h
M mert/Makefile.am
AM scripts/training/mert-moses-multi.pl
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4299 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/training/mert-moses-multi.pl | 1376 |
1 files changed, 1376 insertions, 0 deletions
diff --git a/scripts/training/mert-moses-multi.pl b/scripts/training/mert-moses-multi.pl new file mode 100755 index 000000000..56ee9a91b --- /dev/null +++ b/scripts/training/mert-moses-multi.pl @@ -0,0 +1,1376 @@ +#!/usr/bin/perl -w +# $Id$ +# Usage: +# mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config> +# For other options see below or run 'mert-moses.pl --help' + +# Notes: +# <foreign> and <english> should be raw text files, one sentence per line +# <english> can be a prefix, in which case the files are <english>0, <english>1, etc. are used + +# Excerpts from revision history + +# Sept 2011 multi-threaded mert (Barry Haddow) +# 3 Aug 2011 Added random directions, historic best, pairwise ranked (PK) +# Jul 2011 simplifications (Ondrej Bojar) +# -- rely on moses' -show-weights instead of parsing moses.ini +# ... so moses is also run once *before* mert starts, checking +# the model to some extent +# -- got rid of the 'triples' mess; +# use --range to supply bounds for random starting values: +# --range tm:-3..3 --range lm:-3..3 +# 5 Aug 2009 Handling with different reference length policies (shortest, average, closest) for BLEU +# and case-sensistive/insensitive evaluation (Nicola Bertoldi) +# 5 Jun 2008 Forked previous version to support new mert implementation. +# 13 Feb 2007 Better handling of default values for lambda, now works with multiple +# models and lexicalized reordering +# 11 Oct 2006 Handle different input types through parameter --inputype=[0|1] +# (0 for text, 1 for confusion network, default is 0) (Nicola Bertoldi) +# 10 Oct 2006 Allow skip of filtering of phrase tables (--no-filter-phrase-table) +# useful if binary phrase tables are used (Nicola Bertoldi) +# 28 Aug 2006 Use either closest or average or shortest (default) reference +# length as effective reference length +# Use either normalization or not (default) of texts (Nicola Bertoldi) +# 31 Jul 2006 move gzip run*.out to avoid failure wit restartings +# adding default paths +# 29 Jul 2006 run-filter, score-nbest and mert run on the queue (Nicola; Ondrej had to type it in again) +# 28 Jul 2006 attempt at foolproof usage, strong checking of input validity, merged the parallel and nonparallel version (Ondrej Bojar) +# 27 Jul 2006 adding the safesystem() function to handle with process failure +# 22 Jul 2006 fixed a bug about handling relative path of configuration file (Nicola Bertoldi) +# 21 Jul 2006 adapted for Moses-in-parallel (Nicola Bertoldi) +# 18 Jul 2006 adapted for Moses and cleaned up (PK) +# 21 Jan 2005 unified various versions, thorough cleanup (DWC) +# now indexing accumulated n-best list solely by feature vectors +# 14 Dec 2004 reimplemented find_threshold_points in C (NMD) +# 25 Oct 2004 Use either average or shortest (default) reference +# length as effective reference length (DWC) +# 13 Oct 2004 Use alternative decoders (DWC) +# Original version by Philipp Koehn + +use FindBin qw($Bin); +use File::Basename; +use File::Path; +my $SCRIPTS_ROOTDIR = $Bin; +$SCRIPTS_ROOTDIR =~ s/\/training$//; +$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"}); + +## We preserve this bit of comments to keep the traditional weight ranges. +# "w" => [ [ 0.0, -1.0, 1.0 ] ], # word penalty +# "d" => [ [ 1.0, 0.0, 2.0 ] ], # lexicalized reordering model +# "lm" => [ [ 1.0, 0.0, 2.0 ] ], # language model +# "g" => [ [ 1.0, 0.0, 2.0 ], # generation model +# [ 1.0, 0.0, 2.0 ] ], +# "tm" => [ [ 0.3, 0.0, 0.5 ], # translation model +# [ 0.2, 0.0, 0.5 ], +# [ 0.3, 0.0, 0.5 ], +# [ 0.2, 0.0, 0.5 ], +# [ 0.0,-1.0, 1.0 ] ], # ... last weight is phrase penalty +# "lex"=> [ [ 0.1, 0.0, 0.2 ] ], # global lexical model +# "I" => [ [ 0.0,-1.0, 1.0 ] ], # input lattice scores + + + +# moses.ini file uses FULL names for lambdas, while this training script +# internally (and on the command line) uses ABBR names. +my @ABBR_FULL_MAP = qw(d=weight-d lm=weight-l tm=weight-t w=weight-w + g=weight-generation lex=weight-lex I=weight-i); +my %ABBR2FULL = map {split/=/,$_,2} @ABBR_FULL_MAP; +my %FULL2ABBR = map {my ($a, $b) = split/=/,$_,2; ($b, $a);} @ABBR_FULL_MAP; + +my $minimum_required_change_in_weights = 0.00001; + # stop if no lambda changes more than this + +my $verbose = 0; +my $usage = 0; # request for --help +my $___WORKING_DIR = "mert-work"; +my $___DEV_F = undef; # required, input text to decode +my $___DEV_E = undef; # required, basename of files with references +my $___DECODER = undef; # required, pathname to the decoder executable +my $___CONFIG = undef; # required, pathname to startup ini file +my $___N_BEST_LIST_SIZE = 100; +my $queue_flags = "-hard"; # extra parameters for parallelizer + # the -l ws0ssmt was relevant only to JHU 2006 workshop +my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial) +my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder +my $continue = 0; # should we try to continue from the last saved step? +my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert) +my $___FILTER_PHRASE_TABLE = 1; # filter phrase table +my $___PREDICTABLE_SEEDS = 0; +my $___START_WITH_HISTORIC_BESTS = 0; # use best settings from all previous iterations as starting points [Foster&Kuhn,2009] +my $___RANDOM_DIRECTIONS = 0; # search in random directions only +my $___NUM_RANDOM_DIRECTIONS = 0; # number of random directions, also works with default optimizer [Cer&al.,2008] +my $___PAIRWISE_RANKED_OPTIMIZER = 0; # use Hopkins&May[2011] +my $___PRO_STARTING_POINT = 0; # get a starting point from pairwise ranked optimizer +my $___RANDOM_RESTARTS = 20; +my $___HISTORIC_INTERPOLATION = 0; # interpolate optimize weights with previous iteration's weights [Hopkins&May,2011,5.4.3] +my $__THREADS = 0; + +# Parameter for effective reference length when computing BLEU score +# Default is to use shortest reference +# Use "--shortest" to use shortest reference length +# Use "--average" to use average reference length +# Use "--closest" to use closest reference length +# Only one between --shortest, --average and --closest can be set +# If more than one choice the defualt (--shortest) is used +my $___SHORTEST = 0; +my $___AVERAGE = 0; +my $___CLOSEST = 0; + +# Use "--nocase" to compute case-insensitive scores +my $___NOCASE = 0; + +# Use "--nonorm" to non normalize translation before computing scores +my $___NONORM = 0; + +# set 0 if input type is text, set 1 if input type is confusion network +my $___INPUTTYPE = 0; + + +my $mertdir = undef; # path to new mert directory +my $mertargs = undef; # args to pass through to mert & extractor +my $mertmertargs = undef; # args to pass through to mert only +my $filtercmd = undef; # path to filter-model-given-input.pl +my $filterfile = undef; +my $qsubwrapper = undef; +my $moses_parallel_cmd = undef; +my $scorer_config = "BLEU:1"; +my $old_sge = 0; # assume sge<6.0 +my $___CONFIG_ORIG = undef; # pathname to startup ini file before filtering +my $___ACTIVATE_FEATURES = undef; # comma-separated (or blank-separated) list of features to work on + # if undef work on all features + # (others are fixed to the starting values) +my $___RANGES = undef; +my $prev_aggregate_nbl_size = -1; # number of previous step to consider when loading data (default =-1) + # -1 means all previous, i.e. from iteration 1 + # 0 means no previous data, i.e. from actual iteration + # 1 means 1 previous data , i.e. from the actual iteration and from the previous one + # and so on +my $maximum_iterations = 25; + +use strict; +use Getopt::Long; +GetOptions( + "working-dir=s" => \$___WORKING_DIR, + "input=s" => \$___DEV_F, + "inputtype=i" => \$___INPUTTYPE, + "refs=s" => \$___DEV_E, + "decoder=s" => \$___DECODER, + "config=s" => \$___CONFIG, + "nbest=i" => \$___N_BEST_LIST_SIZE, + "queue-flags=s" => \$queue_flags, + "jobs=i" => \$___JOBS, + "decoder-flags=s" => \$___DECODER_FLAGS, + "continue" => \$continue, + "skip-decoder" => \$skip_decoder, + "shortest" => \$___SHORTEST, + "average" => \$___AVERAGE, + "closest" => \$___CLOSEST, + "nocase" => \$___NOCASE, + "nonorm" => \$___NONORM, + "help" => \$usage, + "verbose" => \$verbose, + "mertdir=s" => \$mertdir, + "mertargs=s" => \$mertargs, + "mertmertargs=s" => \$mertmertargs, + "rootdir=s" => \$SCRIPTS_ROOTDIR, + "filtercmd=s" => \$filtercmd, # allow to override the default location + "filterfile=s" => \$filterfile, # input to filtering script (useful for lattices/confnets) + "qsubwrapper=s" => \$qsubwrapper, # allow to override the default location + "mosesparallelcmd=s" => \$moses_parallel_cmd, # allow to override the default location + "old-sge" => \$old_sge, #passed to moses-parallel + "filter-phrase-table!" => \$___FILTER_PHRASE_TABLE, # (dis)allow of phrase tables + "predictable-seeds" => \$___PREDICTABLE_SEEDS, # make random restarts deterministic + "historic-bests" => \$___START_WITH_HISTORIC_BESTS, # use best settings from all previous iterations as starting points + "random-directions" => \$___RANDOM_DIRECTIONS, # search only in random directions + "number-of-random-directions=i" => \$___NUM_RANDOM_DIRECTIONS, # number of random directions + "random-restarts=i" => \$___RANDOM_RESTARTS, # number of random restarts + "activate-features=s" => \$___ACTIVATE_FEATURES, #comma-separated (or blank-separated) list of features to work on (others are fixed to the starting values) + "range=s@" => \$___RANGES, + "prev-aggregate-nbestlist=i" => \$prev_aggregate_nbl_size, #number of previous step to consider when loading data (default =-1, i.e. all previous) + "maximum-iterations=i" => \$maximum_iterations, + "pairwise-ranked" => \$___PAIRWISE_RANKED_OPTIMIZER, + "pro-starting-point" => \$___PRO_STARTING_POINT, + "historic-interpolation=f" => \$___HISTORIC_INTERPOLATION, + "threads=i" => \$__THREADS, + "sc-config=s" => \$scorer_config +) or exit(1); + +# the 4 required parameters can be supplied on the command line directly +# or using the --options +if (scalar @ARGV == 4) { + # required parameters: input_file references_basename decoder_executable + $___DEV_F = shift; + $___DEV_E = shift; + $___DECODER = shift; + $___CONFIG = shift; +} + +if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER || !defined $___CONFIG) { + print STDERR "usage: $0 input-text references decoder-executable decoder.ini +Options: + --working-dir=mert-dir ... where all the files are created + --nbest=100 ... how big nbestlist to generate + --jobs=N ... set this to anything to run moses in parallel + --mosesparallelcmd=STR ... use a different script instead of moses-parallel + --queue-flags=STRING ... anything you with to pass to qsub, eg. + '-l ws06osssmt=true'. The default is: '-hard' + To reset the parameters, please use + --queue-flags=' ' + (i.e. a space between the quotes). + --decoder-flags=STRING ... extra parameters for the decoder + --continue ... continue from the last successful iteration + --skip-decoder ... skip the decoder run for the first time, + assuming that we got interrupted during + optimization + --shortest --average --closest + ... Use shortest/average/closest reference length + as effective reference length (mutually exclusive) + --nocase ... Do not preserve case information; i.e. + case-insensitive evaluation (default is false). + --nonorm ... Do not use text normalization (flag is not active, + i.e. text is NOT normalized) + --filtercmd=STRING ... path to filter-model-given-input.pl + --filterfile=STRING ... path to alternative to input-text for filtering + model. useful for lattice decoding + --rootdir=STRING ... where do helpers reside (if not given explicitly) + --mertdir=STRING ... path to new mert implementation + --mertargs=STRING ... extra args for mert, eg. to specify scorer + --mertmertargs=STRING ... extra args for mert only, + --scorenbestcmd=STRING ... path to score-nbest.py + --old-sge ... passed to parallelizers, assume Grid Engine < 6.0 + --inputtype=[0|1|2] ... Handle different input types: (0 for text, + 1 for confusion network, 2 for lattices, + default is 0) + --no-filter-phrase-table ... disallow filtering of phrase tables + (useful if binary phrase tables are available) + --random-restarts=INT ... number of random restarts (default: 20) + --predictable-seeds ... provide predictable seeds to mert so that random + restarts are the same on every run + --range=tm:0..1,-1..1 ... specify min and max value for some features + --range can be repeated as needed. + The order of the various --range specifications + is important only within a feature name. + E.g.: + --range=tm:0..1,-1..1 --range=tm:0..2 + is identical to: + --range=tm:0..1,-1..1,0..2 + but not to: + --range=tm:0..2 --range=tm:0..1,-1..1 + --activate-features=STRING ... comma-separated list of features to optimize, + others are fixed to the starting values + default: optimize all features + example: tm_0,tm_4,d_0 + --prev-aggregate-nbestlist=INT ... number of previous step to consider when + loading data (default = $prev_aggregate_nbl_size) + -1 means all previous, i.e. from iteration 1 + 0 means no previous data, i.e. only the + current iteration + N means this and N previous iterations + + --maximum-iterations=ITERS ... Maximum number of iterations. Default: $maximum_iterations + --random-directions ... search only in random directions + --number-of-random-directions=int ... number of random directions + (also works with regular optimizer, default: 0) + --pairwise-ranked ... Use PRO for optimisation (Hopkins and May, emnlp 2011) + --pro-starting-point ... Use PRO to get a starting point for MERT + --threads=NUMBER ... Use multi-threaded mert (must be compiled in). + --historic-interpolation ... Interpolate optimized weights with prior iterations' weight + (parameter sets factor [0;1] given to current weights) + --sc-config=STRING ... extra option to specify multiscoring. +"; + exit 1; +} + + +# Check validity of input parameters and set defaults if needed + +print STDERR "Using WORKING_DIR: $___WORKING_DIR\n"; +print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n"; + +# path of script for filtering phrase tables and running the decoder +$filtercmd="$SCRIPTS_ROOTDIR/training/filter-model-given-input.pl" if !defined $filtercmd; + +if ( ! -x $filtercmd && ! $___FILTER_PHRASE_TABLE) { + print STDERR "Filtering command not found: $filtercmd.\n"; + print STDERR "Use --filtercmd=PATH to specify a valid one or --no-filter-phrase-table\n"; + exit 1; +} + +$qsubwrapper="$SCRIPTS_ROOTDIR/generic/qsub-wrapper.pl" if !defined $qsubwrapper; + +$moses_parallel_cmd = "$SCRIPTS_ROOTDIR/generic/moses-parallel.pl" + if !defined $moses_parallel_cmd; + +if (!defined $mertdir) { + $mertdir = "$SCRIPTS_ROOTDIR/../mert"; + print STDERR "Assuming --mertdir=$mertdir\n"; +} + +my $mert_extract_cmd = "$mertdir/extractor"; +my $mert_mert_cmd = "$mertdir/mert"; + +die "Not executable: $mert_extract_cmd" if ! -x $mert_extract_cmd; +die "Not executable: $mert_mert_cmd" if ! -x $mert_mert_cmd; + +my $pro_optimizer = "$mertdir/megam_i686.opt"; # or set to your installation +if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) { + print "did not find $pro_optimizer, installing it in $mertdir\n"; + `cd $mertdir; wget http://www.cs.utah.edu/~hal/megam/megam_i686.opt.gz;`; + `gunzip $pro_optimizer.gz`; + `chmod +x $pro_optimizer`; + die("ERROR: Installation of megam_i686.opt failed! Install by hand from http://www.cs.utah.edu/~hal/megam/") unless -x $pro_optimizer; +} + +$mertargs = "" if !defined $mertargs; + +my $scconfig = undef; +if ($mertargs =~ /\-\-scconfig\s+(.+?)(\s|$)/){ + $scconfig=$1; + $scconfig =~ s/\,/ /g; + $mertargs =~ s/\-\-scconfig\s+(.+?)(\s|$)//; +} + +# handling reference lengh strategy +if (($___CLOSEST + $___AVERAGE + $___SHORTEST) > 1){ + die "You can specify just ONE reference length strategy (closest or shortest or average) not both\n"; +} + +if ($___SHORTEST){ + $scconfig .= " reflen:shortest"; +}elsif ($___AVERAGE){ + $scconfig .= " reflen:average"; +}elsif ($___CLOSEST){ + $scconfig .= " reflen:closest"; +} + +# handling case-insensitive flag +if ($___NOCASE) { + $scconfig .= " case:false"; +}else{ + $scconfig .= " case:true"; +} +$scconfig =~ s/^\s+//; +$scconfig =~ s/\s+$//; +$scconfig =~ s/\s+/,/g; + +$scconfig = "--scconfig $scconfig" if ($scconfig); + +my $mert_extract_args=$mertargs; +$mert_extract_args .=" $scconfig"; + +$mertmertargs = "" if !defined $mertmertargs; + +my $mert_mert_args="$mertargs $mertmertargs"; +$mert_mert_args =~ s/\-+(binary|b)\b//; +$mert_mert_args .=" $scconfig"; +if ($___ACTIVATE_FEATURES){ $mert_mert_args .=" -o \"$___ACTIVATE_FEATURES\""; } + +my ($just_cmd_filtercmd,$x) = split(/ /,$filtercmd); +die "Not executable: $just_cmd_filtercmd" if ! -x $just_cmd_filtercmd; +die "Not executable: $moses_parallel_cmd" if defined $___JOBS && ! -x $moses_parallel_cmd; +die "Not executable: $qsubwrapper" if defined $___JOBS && ! -x $qsubwrapper; +die "Not executable: $___DECODER" if ! -x $___DECODER; + +my $input_abs = ensure_full_path($___DEV_F); +die "File not found: $___DEV_F (interpreted as $input_abs)." + if ! -e $input_abs; +$___DEV_F = $input_abs; + +# Option to pass to qsubwrapper and moses-parallel +my $pass_old_sge = $old_sge ? "-old-sge" : ""; + +my $decoder_abs = ensure_full_path($___DECODER); +die "File not executable: $___DECODER (interpreted as $decoder_abs)." + if ! -x $decoder_abs; +$___DECODER = $decoder_abs; + +my $ref_abs = ensure_full_path($___DEV_E); +# check if English dev set (reference translations) exist and store a list of all references +my @references; +if (-e $ref_abs) { + push @references, $ref_abs; +} +else { + # if multiple file, get a full list of the files + my $part = 0; + while (-e $ref_abs.$part) { + push @references, $ref_abs.$part; + $part++; + } + die("Reference translations not found: $___DEV_E (interpreted as $ref_abs)") unless $part; +} + +my $config_abs = ensure_full_path($___CONFIG); +die "File not found: $___CONFIG (interpreted as $config_abs)." + if ! -e $config_abs; +$___CONFIG = $config_abs; + +# moses should use our config +if ($___DECODER_FLAGS =~ /(^|\s)-(config|f) / +|| $___DECODER_FLAGS =~ /(^|\s)-(ttable-file|t) / +|| $___DECODER_FLAGS =~ /(^|\s)-(distortion-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(generation-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(lmodel-file) / +|| $___DECODER_FLAGS =~ /(^|\s)-(global-lexical-file) / +) { + die "It is forbidden to supply any of -config, -ttable-file, -distortion-file, -generation-file or -lmodel-file in the --decoder-flags.\nPlease use only the --config option to give the config file that lists all the supplementary files."; +} + +# as weights are normalized in the next steps (by cmert) +# normalize initial LAMBDAs, too +my $need_to_normalize = 1; + +#store current directory and create the working directory (if needed) +my $cwd = `pawd 2>/dev/null`; +if(!$cwd){$cwd = `pwd`;} +chomp($cwd); + +mkpath($___WORKING_DIR); + +{ +# open local scope + +#chdir to the working directory +chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR"; + +# fixed file names +my $mert_outfile = "mert.out"; +my $mert_logfile = "mert.log"; +my $weights_in_file = "init.opt"; +my $weights_out_file = "weights.txt"; + +# set start run +my $start_run = 1; +my $bestpoint = undef; +my $devbleu = undef; +my $sparse_weights_file = undef; + +my $prev_feature_file = undef; +my $prev_score_file = undef; +my $prev_init_file = undef; + +if ($___FILTER_PHRASE_TABLE) { + my $outdir = "filtered"; + if (-e "$outdir/moses.ini") { + print STDERR "Assuming the tables are already filtered, reusing $outdir/moses.ini\n"; + } + else { + # filter the phrase tables with respect to input, use --decoder-flags + print STDERR "filtering the phrase tables... ".`date`; + my $___FILTER_F = $___DEV_F; + $___FILTER_F = $filterfile if (defined $filterfile); + my $cmd = "$filtercmd ./$outdir $___CONFIG $___FILTER_F"; + &submit_or_exec($cmd,"filterphrases.out","filterphrases.err"); + } + + # make a backup copy of startup ini filepath + $___CONFIG_ORIG = $___CONFIG; + # the decoder should now use the filtered model + $___CONFIG = "$outdir/moses.ini"; +} +else{ + # do not filter phrase tables (useful if binary phrase tables are available) + # use the original configuration file + $___CONFIG_ORIG = $___CONFIG; +} + +# we run moses to check validity of moses.ini and to obtain all the feature +# names +my $featlist = get_featlist_from_moses($___CONFIG); +$featlist = insert_ranges_to_featlist($featlist, $___RANGES); + +# Mark which features are disabled: +if (defined $___ACTIVATE_FEATURES) { + my %enabled = map { ($_, 1) } split /[, ]+/, $___ACTIVATE_FEATURES; + my %cnt; + for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) { + my $name = $featlist->{"names"}->[$i]; + $cnt{$name} = 0 if !defined $cnt{$name}; + $featlist->{"enabled"}->[$i] = $enabled{$name."_".$cnt{$name}}; + $cnt{$name}++; + } +} else { + # all enabled + for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) { + $featlist->{"enabled"}->[$i] = 1; + } +} + +print STDERR "MERT starting values and ranges for random generation:\n"; +for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) { + my $name = $featlist->{"names"}->[$i]; + my $val = $featlist->{"values"}->[$i]; + my $min = $featlist->{"mins"}->[$i]; + my $max = $featlist->{"maxs"}->[$i]; + my $enabled = $featlist->{"enabled"}->[$i]; + printf STDERR " %5s = %7.3f", $name, $val; + if ($enabled) { + printf STDERR " (%5.2f .. %5.2f)\n", $min, $max; + } else { + print STDERR " --- inactive, not optimized ---\n"; + } +} + +if ($continue) { + # getting the last finished step + print STDERR "Trying to continue an interrupted optimization.\n"; + open IN, "finished_step.txt" or die "Failed to find the step number, failed to read finished_step.txt"; + my $step = <IN>; + chomp $step; + close IN; + + print STDERR "Last finished step is $step\n"; + + # getting the first needed step + my $firststep; + if ($prev_aggregate_nbl_size==-1){ + $firststep=1; + } + else{ + $firststep=$step-$prev_aggregate_nbl_size+1; + $firststep=($firststep>0)?$firststep:1; + } + +#checking if all needed data are available + if ($firststep<=$step){ + print STDERR "First previous needed data index is $firststep\n"; + print STDERR "Checking whether all needed data (from step $firststep to step $step) are available\n"; + + for (my $prevstep=$firststep; $prevstep<=$step;$prevstep++){ + print STDERR "Checking whether data of step $prevstep are available\n"; + if (! -e "run$prevstep.features.dat"){ + die "Can't start from step $step, because run$prevstep.features.dat was not found!"; + }else{ + if (defined $prev_feature_file){ + $prev_feature_file = "${prev_feature_file},run$prevstep.features.dat"; + } + else{ + $prev_feature_file = "run$prevstep.features.dat"; + } + } + if (! -e "run$prevstep.scores.dat"){ + die "Can't start from step $step, because run$prevstep.scores.dat was not found!"; + }else{ + if (defined $prev_score_file){ + $prev_score_file = "${prev_score_file},run$prevstep.scores.dat"; + } + else{ + $prev_score_file = "run$prevstep.scores.dat"; + } + } + if (! -e "run$prevstep.${weights_in_file}"){ + die "Can't start from step $step, because run$prevstep.${weights_in_file} was not found!"; + }else{ + if (defined $prev_init_file){ + $prev_init_file = "${prev_init_file},run$prevstep.${weights_in_file}"; + } + else{ + $prev_init_file = "run$prevstep.${weights_in_file}"; + } + } + } + if (! -e "run$step.weights.txt"){ + die "Can't start from step $step, because run$step.weights.txt was not found!"; + } + if (! -e "run$step.$mert_logfile"){ + die "Can't start from step $step, because run$step.$mert_logfile was not found!"; + } + if (! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz"){ + die "Can't start from step $step, because run$step.best$___N_BEST_LIST_SIZE.out.gz was not found!"; + } + print STDERR "All needed data are available\n"; + + print STDERR "Loading information from last step ($step)\n"; + my %dummy; # sparse features + ($bestpoint,$devbleu) = &get_weights_from_mert("run$step.$mert_outfile","run$step.$mert_logfile",scalar @{$featlist->{"names"}},\%dummy); + die "Failed to parse mert.log, missed Best point there." + if !defined $bestpoint || !defined $devbleu; + print "($step) BEST at $step $bestpoint => $devbleu at ".`date`; + my @newweights = split /\s+/, $bestpoint; + + # Sanity check: order of lambdas must match + sanity_check_order_of_lambdas($featlist, + "gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |"); + + # update my cache of lambda values + $featlist->{"values"} = \@newweights; + } + else{ + print STDERR "No previous data are needed\n"; + } + + $start_run = $step +1; +} + +###### MERT MAIN LOOP + +my $run=$start_run-1; + +my $oldallsorted = undef; +my $allsorted = undef; + +my $nbest_file=undef; + +while(1) { + $run++; + if ($maximum_iterations && $run > $maximum_iterations) { + print "Maximum number of iterations exceeded - stopping\n"; + last; + } + # run beamdecoder with option to output nbestlists + # the end result should be (1) @NBEST_LIST, a list of lists; (2) @SCORE, a list of lists of lists + + print "run $run start at ".`date`; + + # In case something dies later, we might wish to have a copy + create_config($___CONFIG, "./run$run.moses.ini", $featlist, $run, (defined$devbleu?$devbleu:"--not-estimated--"),$sparse_weights_file); + + + # skip running the decoder if the user wanted + if (!$skip_decoder) { + print "($run) run decoder to produce n-best lists\n"; + $nbest_file = run_decoder($featlist, $run, $need_to_normalize); + $need_to_normalize = 0; + safesystem("gzip -f $nbest_file") or die "Failed to gzip run*out"; + $nbest_file = $nbest_file.".gz"; + } + else { + $nbest_file="run$run.best$___N_BEST_LIST_SIZE.out.gz"; + print "skipped decoder run $run\n"; + $skip_decoder = 0; + $need_to_normalize = 0; + } + + # extract score statistics and features from the nbest lists + print STDERR "Scoring the nbestlist.\n"; + + my $base_feature_file = "features.dat"; + my $base_score_file = "scores.dat"; + my $feature_file = "run$run.${base_feature_file}"; + my $score_file = "run$run.${base_score_file}"; + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # + my $cmd = ""; + my $scorer_name; + my $scorer_weight; + $scorer_config=~s/ //g; + my @lists_scorer_config=split(",",$scorer_config); + $mert_mert_args=$mert_mert_args." --sctype MERGE "; + my $scorer_config_spec; + foreach $scorer_config_spec(@lists_scorer_config) + { +# print STDERR $scorer_config_spec."\n"; + my @lists_scorer_config_spec=split(":",$scorer_config_spec); + $scorer_name=$lists_scorer_config_spec[0]; + $scorer_weight=$lists_scorer_config_spec[1]; +# print STDERR $scorer_name."\n"; +# print STDERR $scorer_weight."\n"; + $cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file.$scorer_name --ffile $feature_file.$scorer_name --sctype $scorer_name -r ".join(",", @references)." -n $nbest_file"; +# print STDERR "LANCEMENT $scorer_name ********************************************\n"; + &submit_or_exec($cmd,"extract.out.$scorer_name","extract.err.$scorer_name"); +# print STDERR "FIN $scorer_name ************************************************** \n"; +# print STDERR "executing $cmd\n"; + +# print STDERR "\n"; +# safesystem("date"); +# print STDERR "\n"; + +# if (defined $___JOBS) { +# safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=extract.out.$scorer_name -stderr=extract.err.$scorer_name" ) +# or die "$scorer_name Failed to submit extraction to queue (via $qsubwrapper)"; +# } else { +# safesystem("$cmd > extract.out.$scorer_name 2> extract.err.$scorer_name") or die "$scorer_name Failed to do extraction of statistics."; +# } + +# print FILE "$scorer_name $scorer_weight $score_file.$scorer_name $feature_file.$scorer_name\n"; + } +# print STDERR "CREATION INI\n"; + my @scorer_content; + my $fileIncrement=0; + open(FILE,">merge.init") || die ("File creation ERROR : merge.init"); + foreach $scorer_config_spec(@lists_scorer_config) + { + my @lists_scorer_config_spec=split(":",$scorer_config_spec); + $scorer_name=$lists_scorer_config_spec[0]; + $scorer_weight=$lists_scorer_config_spec[1]; + print FILE "$scorer_name $scorer_weight $score_file.$scorer_name $feature_file.$scorer_name\n"; + my @tmp_content=`/bin/cat $score_file.$scorer_name`; + $scorer_content[$fileIncrement] = [ @tmp_content ]; + if ($fileIncrement==0) + { + `/bin/cp $feature_file.$scorer_name $feature_file`; + } + $fileIncrement++; + } + close(FILE); +# print STDERR "\n"; +# safesystem("date"); +# print STDERR "\n"; + +# print STDERR "ON VA RASSEMBLER dans $score_file\n"; + open(SCOREFILE,">$score_file") || die ("File creation ERROR : $score_file"); + my $newFileIncrement=0; + my $contentIncrement=0; + my $contentSize=scalar(@{$scorer_content[0]}); +# print STDERR "TAILLE : ".$contentSize."|".$fileIncrement."\n"; + while ($contentIncrement< $contentSize) + { + my $line=""; + $newFileIncrement=0; + while($newFileIncrement< $fileIncrement) + { + if (rindex($scorer_content[$newFileIncrement][$contentIncrement],"BEGIN")<0) + { + $line=$line." ".$scorer_content[$newFileIncrement][$contentIncrement]; + chomp($line); + } + else + { + my @split_line_input=split(" ",$scorer_content[$newFileIncrement][$contentIncrement]); + my @split_line=split(" ",$line); + if (scalar(@split_line)>0) + { + $split_line_input[3]=$split_line[3]+$split_line_input[3]; + } + $line=$split_line_input[0]." ".$split_line_input[1]." ".$split_line_input[2]." ".$split_line_input[3]." MERGE"; + } + $newFileIncrement++; + } + $line=~s/^[ ]+//g; + $line=~s/[ ]+$//g; + $line=~s/[ ]+/ /g; +# print STDERR $line."\n"; + print SCOREFILE $line."\n"; + $contentIncrement++; + } + close(SCOREFILE); +# `/bin/cp ` + +# $cmd="$mertdir/mergeWeights -c merge.init -s $score_file -f $feature_file"; +# print STDERR "executing : $cmd\n"; + +# if (defined $___JOBS) { +# safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=mergeWeight.out.MERGE -stderr=mergeWeight.err.MERGE" ) +# or die "MERGE Failed to submit extraction to queue (via $qsubwrapper)"; +# } else { +# safesystem("$cmd > mergeWeight.out.MERGE 2> mergeWeight.err.MERGE") or die "MERGE Failed to do extraction of statistics."; +# } + +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # + +# my $cmd = "$mert_extract_cmd $mert_extract_args --scfile $score_file --ffile $feature_file -r ".join(",", @references)." -n $nbest_file"; +# &submit_or_exec($cmd,"extract.out","extract.err"); + + # Create the initial weights file for mert: init.opt + + my @MIN = @{$featlist->{"mins"}}; + my @MAX = @{$featlist->{"maxs"}}; + my @CURR = @{$featlist->{"values"}}; + my @NAME = @{$featlist->{"names"}}; + + open(OUT,"> $weights_in_file") + or die "Can't write $weights_in_file (WD now $___WORKING_DIR)"; + print OUT join(" ", @CURR)."\n"; + print OUT join(" ", @MIN)."\n"; # this is where we could pass MINS + print OUT join(" ", @MAX)."\n"; # this is where we could pass MAXS + close(OUT); + # print join(" ", @NAME)."\n"; + + # make a backup copy labelled with this run number + safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die; + + my $DIM = scalar(@CURR); # number of lambdas + + # run mert + $cmd = "$mert_mert_cmd -d $DIM $mert_mert_args"; + + my $mert_settings = " -n $___RANDOM_RESTARTS"; + if ($___PREDICTABLE_SEEDS) { + my $seed = $run * 1000; + $mert_settings .= " -r $seed"; + } + if ($___RANDOM_DIRECTIONS) { + if ($___NUM_RANDOM_DIRECTIONS == 0) { + $mert_settings .= " -m 50"; + } + $mert_settings .= " -t random-direction"; + } + if ($___NUM_RANDOM_DIRECTIONS) { + $mert_settings .= " -m $___NUM_RANDOM_DIRECTIONS"; + } + if ($__THREADS) { + $mert_settings .= " --threads $__THREADS"; + } + + my $file_settings = ""; + if (defined $prev_feature_file) { + $file_settings .= " --ffile $prev_feature_file,$feature_file"; + } + else{ + $file_settings .= " --ffile $feature_file"; + } + if (defined $prev_score_file) { + $file_settings .= " --scfile $prev_score_file,$score_file"; + } + else{ + $file_settings .= " --scfile $score_file"; + } + if ($___START_WITH_HISTORIC_BESTS && defined $prev_init_file) { + $file_settings .= " --ifile $prev_init_file,run$run.$weights_in_file"; + } + else{ + $file_settings .= " --ifile run$run.$weights_in_file"; + } + + $cmd .= $file_settings; + + # pro optimization + if ($___PAIRWISE_RANKED_OPTIMIZER) { + $cmd .= " --pro run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data"; + &submit_or_exec($cmd,$mert_outfile,$mert_logfile); + } + # first pro, then mert + elsif ($___PRO_STARTING_POINT) { + # run pro... + my $pro_cmd = $cmd." --pro run$run.pro.data ; $pro_optimizer -fvals -maxi 30 -nobias binary run$run.pro.data"; + &submit_or_exec($pro_cmd,"run$run.pro.out","run$run.pro.err"); + # ... get results ... + my %dummy; + ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.pro.out","run$run.pro.err",scalar @{$featlist->{"names"}},\%dummy); + open(PRO_START,">run$run.init.pro"); + print PRO_START $bestpoint."\n"; + close(PRO_START); + # ... and run mert + $cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/; + &submit_or_exec($cmd.$mert_settings,$mert_outfile,$mert_logfile); + } + # just mert + else { + &submit_or_exec($cmd.$mert_settings,$mert_outfile,$mert_logfile); + } + + die "Optimization failed, file $weights_out_file does not exist or is empty" + if ! -s $weights_out_file; + + # backup copies + foreach my $extractFiles(`/bin/ls extract.*`) + { + chomp $extractFiles; + safesystem ("\\cp -f $extractFiles run$run.$extractFiles") or die; + } + +# safesystem ("\\cp -f extract.err run$run.extract.err") or die; +# safesystem ("\\cp -f extract.out run$run.extract.out") or die; + safesystem ("\\cp -f $mert_outfile run$run.$mert_outfile") or die; + safesystem ("\\cp -f $mert_logfile run$run.$mert_logfile") or die; + safesystem ("touch $mert_logfile run$run.$mert_logfile") or die; + safesystem ("\\cp -f $weights_out_file run$run.$weights_out_file") or die; # this one is needed for restarts, too + + print "run $run end at ".`date`; + + my %sparse_weights; # sparse features + ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.$mert_outfile","run$run.$mert_logfile",scalar @{$featlist->{"names"}},\%sparse_weights); + + die "Failed to parse mert.log, missed Best point there." + if !defined $bestpoint || !defined $devbleu; + + print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`; + + # update my cache of lambda values + my @newweights = split /\s+/, $bestpoint; + + # interpolate with prior's interation weight, if historic-interpolation is specified + if ($___HISTORIC_INTERPOLATION>0 && $run>3) { + my %historic_sparse_weights; + if (-e "run$run.sparse-weights") { + open(SPARSE,"run$run.sparse-weights"); + while(<SPARSE>) { + chop; + my ($feature,$weight) = split; + $historic_sparse_weights{$feature} = $weight; + } + } + my $prev = $run-1; + my @historic_weights = split /\s+/, `cat run$prev.$weights_out_file`; + for(my $i=0;$i<scalar(@newweights);$i++) { + $newweights[$i] = $___HISTORIC_INTERPOLATION * $newweights[$i] + (1-$___HISTORIC_INTERPOLATION) * $historic_weights[$i]; + } + print "interpolate with ".join(",",@historic_weights)." to ".join(",",@newweights); + foreach (keys %sparse_weights) { + $sparse_weights{$_} *= $___HISTORIC_INTERPOLATION; + #print STDERR "sparse_weights{$_} *= $___HISTORIC_INTERPOLATION -> $sparse_weights{$_}\n"; + } + foreach (keys %historic_sparse_weights) { + $sparse_weights{$_} += (1-$___HISTORIC_INTERPOLATION) * $historic_sparse_weights{$_}; + #print STDERR "sparse_weights{$_} += (1-$___HISTORIC_INTERPOLATION) * $historic_sparse_weights{$_} -> $sparse_weights{$_}\n"; + } + } + if ($___HISTORIC_INTERPOLATION>0) { + open(WEIGHTS,">run$run.$weights_out_file"); + print WEIGHTS join(" ",@newweights); + close(WEIGHTS); + } + + $featlist->{"values"} = \@newweights; + + if (scalar keys %sparse_weights) { + $sparse_weights_file = "run".($run+1).".sparse-weights"; + open(SPARSE,">".$sparse_weights_file); + foreach my $feature (keys %sparse_weights) { + print SPARSE "$feature $sparse_weights{$feature}\n"; + } + close(SPARSE); + } + + ## additional stopping criterion: weights have not changed + my $shouldstop = 1; + for(my $i=0; $i<@CURR; $i++) { + die "Lost weight! mert reported fewer weights (@newweights) than we gave it (@CURR)" + if !defined $newweights[$i]; + if (abs($CURR[$i] - $newweights[$i]) >= $minimum_required_change_in_weights) { + $shouldstop = 0; + last; + } + } + + open F, "> finished_step.txt" or die "Can't mark finished step"; + print F $run."\n"; + close F; + + if ($shouldstop) { + print STDERR "None of the weights changed more than $minimum_required_change_in_weights. Stopping.\n"; + last; + } + + my $firstrun; + if ($prev_aggregate_nbl_size==-1){ + $firstrun=1; + } + else{ + $firstrun=$run-$prev_aggregate_nbl_size+1; + $firstrun=($firstrun>0)?$firstrun:1; + } + print "loading data from $firstrun to $run (prev_aggregate_nbl_size=$prev_aggregate_nbl_size)\n"; + $prev_feature_file = undef; + $prev_score_file = undef; + $prev_init_file = undef; + for (my $i=$firstrun;$i<=$run;$i++){ + if (defined $prev_feature_file){ + $prev_feature_file = "${prev_feature_file},run${i}.${base_feature_file}"; + } + else{ + $prev_feature_file = "run${i}.${base_feature_file}"; + } + if (defined $prev_score_file){ + $prev_score_file = "${prev_score_file},run${i}.${base_score_file}"; + } + else{ + $prev_score_file = "run${i}.${base_score_file}"; + } + if (defined $prev_init_file){ + $prev_init_file = "${prev_init_file},run${i}.${weights_in_file}"; + } + else{ + $prev_init_file = "run${i}.${weights_in_file}"; + } + } + print "loading data from $prev_feature_file\n" if defined($prev_feature_file); + print "loading data from $prev_score_file\n" if defined($prev_score_file); + print "loading data from $prev_init_file\n" if defined($prev_init_file); +} +print "Training finished at ".`date`; + +if (defined $allsorted){ safesystem ("\\rm -f $allsorted") or die; }; + +safesystem("\\cp -f $weights_in_file run$run.$weights_in_file") or die; +safesystem("\\cp -f $mert_logfile run$run.$mert_logfile") or die; + +create_config($___CONFIG_ORIG, "./moses.ini", $featlist, $run, $devbleu); + +# just to be sure that we have the really last finished step marked +open F, "> finished_step.txt" or die "Can't mark finished step"; +print F $run."\n"; +close F; + + +#chdir back to the original directory # useless, just to remind we were not there +chdir($cwd); + +} # end of local scope + +sub get_weights_from_mert { + my ($outfile,$logfile,$weight_count,$sparse_weights) = @_; + my ($bestpoint,$devbleu); + if ($___PAIRWISE_RANKED_OPTIMIZER || ($___PRO_STARTING_POINT && $logfile =~ /pro/)) { + open(IN,$outfile) or die "Can't open $outfile"; + my (@WEIGHT,$sum); + for(my $i=0;$i<$weight_count;$i++) { push @WEIGHT, 0; } + while(<IN>) { + # regular features + if (/^F(\d+) ([\-\.\de]+)/) { + $WEIGHT[$1] = $2; + $sum += abs($2); + } + # sparse features + elsif(/^(.+_.+) ([\-\.\de]+)/) { + $$sparse_weights{$1} = $2; + } + } + $devbleu = "unknown"; + foreach (@WEIGHT) { $_ /= $sum; } + foreach (keys %{$sparse_weights}) { $$sparse_weights{$_} /= $sum; } + $bestpoint = join(" ",@WEIGHT); + close IN; + } + else { + open(IN,$logfile) or die "Can't open $logfile"; + while (<IN>) { + if (/Best point:\s*([\s\d\.\-e]+?)\s*=> ([\-\d\.]+)/) { + $bestpoint = $1; + $devbleu = $2; + last; + } + } + close IN; + } + return ($bestpoint,$devbleu); +} + +sub run_decoder { + my ($featlist, $run, $need_to_normalize) = @_; + my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out"; + my $filename = sprintf($filename_template, $run); + + # user-supplied parameters + print "params = $___DECODER_FLAGS\n"; + + # parameters to set all model weights (to override moses.ini) + my @vals = @{$featlist->{"values"}}; + if ($need_to_normalize) { + print STDERR "Normalizing lambdas: @vals\n"; + my $totlambda=0; + grep($totlambda+=abs($_),@vals); + grep($_/=$totlambda,@vals); + } + # moses now does not seem accept "-tm X -tm Y" but needs "-tm X Y" + my %model_weights; + for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) { + my $name = $featlist->{"names"}->[$i]; + $model_weights{$name} = "-$name" if !defined $model_weights{$name}; + $model_weights{$name} .= sprintf " %.6f", $vals[$i]; + } + my $decoder_config = join(" ", values %model_weights); + print STDERR "DECODER_CFG = $decoder_config\n"; + print "decoder_config = $decoder_config\n"; + + # run the decoder + my $nBest_cmd = "-n-best-size $___N_BEST_LIST_SIZE"; + my $decoder_cmd; + + if (defined $___JOBS && $___JOBS > 0) { + $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; + } else { + $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out"; + } + + safesystem($decoder_cmd) or die "The decoder died. CONFIG WAS $decoder_config \n"; + + sanity_check_order_of_lambdas($featlist, $filename); + return $filename; +} + + +sub insert_ranges_to_featlist { + my $featlist = shift; + my $ranges = shift; + + $ranges = [] if !defined $ranges; + + # first collect the ranges from options + my $niceranges; + foreach my $range (@$ranges) { + my $name = undef; + foreach my $namedpair (split /,/, $range) { + if ($namedpair =~ /^(.*?):/) { + $name = $1; + $namedpair =~ s/^.*?://; + die "Unrecognized name '$name' in --range=$range" + if !defined $ABBR2FULL{$name}; + } + my ($min, $max) = split /\.\./, $namedpair; + die "Bad min '$min' in --range=$range" if $min !~ /^-?[0-9.]+$/; + die "Bad max '$max' in --range=$range" if $min !~ /^-?[0-9.]+$/; + die "No name given in --range=$range" if !defined $name; + push @{$niceranges->{$name}}, [$min, $max]; + } + } + + # now populate featlist + my $seen = undef; + for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) { + my $name = $featlist->{"names"}->[$i]; + $seen->{$name} ++; + my $min = 0.0; + my $max = 1.0; + if (defined $niceranges->{$name}) { + my $minmax = shift @{$niceranges->{$name}}; + ($min, $max) = @$minmax if defined $minmax; + } + $featlist->{"mins"}->[$i] = $min; + $featlist->{"maxs"}->[$i] = $max; + } + return $featlist; +} + +sub sanity_check_order_of_lambdas { + my $featlist = shift; + my $filename_or_stream = shift; + + my @expected_lambdas = @{$featlist->{"names"}}; + my @got = get_order_of_scores_from_nbestlist($filename_or_stream); + die "Mismatched lambdas. Decoder returned @got, we expected @expected_lambdas" + if "@got" ne "@expected_lambdas"; +} + + +sub get_featlist_from_moses { + # run moses with the given config file and return the list of features and + # their initial values + my $configfn = shift; + my $featlistfn = "./features.list"; + if (-e $featlistfn) { + print STDERR "Using cached features list: $featlistfn\n"; + } else { + print STDERR "Asking moses for feature names and values from $___CONFIG\n"; + my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn -inputtype $___INPUTTYPE -show-weights > $featlistfn"; + safesystem($cmd) or die "Failed to run moses with the config $configfn"; + } + + # read feature list + my @names = (); + my @startvalues = (); + open(INI,$featlistfn) or die "Can't read $featlistfn"; + my $nr = 0; + my @errs = (); + while (<INI>) { + $nr++; + chomp; + /^(.+) (\S+) (\S+)$/ || die("invalid feature: $_"); + my ($longname, $feature, $value) = ($1,$2,$3); + next if $value eq "sparse"; + push @errs, "$featlistfn:$nr:Bad initial value of $feature: $value\n" + if $value !~ /^[+-]?[0-9.e]+$/; + push @errs, "$featlistfn:$nr:Unknown feature '$feature', please add it to \@ABBR_FULL_MAP\n" + if !defined $ABBR2FULL{$feature}; + push @names, $feature; + push @startvalues, $value; + } + close INI; + if (scalar @errs) { + print STDERR join("", @errs); + exit 1; + } + return {"names"=>\@names, "values"=>\@startvalues}; +} + + +sub get_order_of_scores_from_nbestlist { + # read the first line and interpret the ||| label: num num num label2: num ||| column in nbestlist + # return the score labels in order + my $fname_or_source = shift; + # print STDERR "Peeking at the beginning of nbestlist to get order of scores: $fname_or_source\n"; + open IN, $fname_or_source or die "Failed to get order of scores from nbestlist '$fname_or_source'"; + my $line = <IN>; + close IN; + die "Line empty in nbestlist '$fname_or_source'" if !defined $line; + my ($sent, $hypo, $scores, $total) = split /\|\|\|/, $line; + $scores =~ s/^\s*|\s*$//g; + die "No scores in line: $line" if $scores eq ""; + + my @order = (); + my $label = undef; + my $sparse = 0; # we ignore sparse features here + foreach my $tok (split /\s+/, $scores) { + if ($tok =~ /.+_.+:/) { + $sparse = 1; + } elsif ($tok =~ /^([a-z][0-9a-z]*):/i) { + $label = $1; + } elsif ($tok =~ /^-?[-0-9.e]+$/) { + if (!$sparse) { + # a score found, remember it + die "Found a score but no label before it! Bad nbestlist '$fname_or_source'!" + if !defined $label; + push @order, $label; + } + $sparse = 0; + } else { + die "Not a label, not a score '$tok'. Failed to parse the scores string: '$scores' of nbestlist '$fname_or_source'"; + } + } + print STDERR "The decoder returns the scores in this order: @order\n"; + return @order; +} + +sub create_config { + my $infn = shift; # source config + my $outfn = shift; # where to save the config + my $featlist = shift; # the lambdas we should write + my $iteration = shift; # just for verbosity + my $bleu_achieved = shift; # just for verbosity + my $sparse_weights_file = shift; # only defined when optimizing sparse features + + my %P; # the hash of all parameters we wish to override + + # first convert the command line parameters to the hash + { # ensure local scope of vars + my $parameter=undef; + print "Parsing --decoder-flags: |$___DECODER_FLAGS|\n"; + $___DECODER_FLAGS =~ s/^\s*|\s*$//; + $___DECODER_FLAGS =~ s/\s+/ /; + foreach (split(/ /,$___DECODER_FLAGS)) { + if (/^\-([^\d].*)$/) { + $parameter = $1; + $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter}); + } + else { + die "Found value with no -paramname before it: $_" + if !defined $parameter; + push @{$P{$parameter}},$_; + } + } + } + + # First delete all weights params from the input, we're overwriting them. + # Delete both short and long-named version. + for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) { + my $name = $featlist->{"names"}->[$i]; + delete($P{$name}); + delete($P{$ABBR2FULL{$name}}); + } + + # Convert weights to elements in P + for(my $i=0; $i<scalar(@{$featlist->{"names"}}); $i++) { + my $name = $featlist->{"names"}->[$i]; + my $val = $featlist->{"values"}->[$i]; + $name = defined $ABBR2FULL{$name} ? $ABBR2FULL{$name} : $name; + # ensure long name + push @{$P{$name}}, $val; + } + + if (defined($sparse_weights_file)) { + push @{$P{"weights-file"}}, $___WORKING_DIR."/".$sparse_weights_file; + } + + # create new moses.ini decoder config file by cloning and overriding the original one + open(INI,$infn) or die "Can't read $infn"; + delete($P{"config"}); # never output + print "Saving new config to: $outfn\n"; + open(OUT,"> $outfn") or die "Can't write $outfn"; + print OUT "# MERT optimized configuration\n"; + print OUT "# decoder $___DECODER\n"; + print OUT "# BLEU $bleu_achieved on dev $___DEV_F\n"; + print OUT "# We were before running iteration $iteration\n"; + print OUT "# finished ".`date`; + my $line = <INI>; + while(1) { + last unless $line; + + # skip until hit [parameter] + if ($line !~ /^\[(.+)\]\s*$/) { + $line = <INI>; + print OUT $line if $line =~ /^\#/ || $line =~ /^\s+$/; + next; + } + + # parameter name + my $parameter = $1; + $parameter = $ABBR2FULL{$parameter} if defined($ABBR2FULL{$parameter}); + print OUT "[$parameter]\n"; + + # change parameter, if new values + if (defined($P{$parameter})) { + # write new values + foreach (@{$P{$parameter}}) { + print OUT $_."\n"; + } + delete($P{$parameter}); + # skip until new parameter, only write comments + while($line = <INI>) { + print OUT $line if $line =~ /^\#/ || $line =~ /^\s+$/; + last if $line =~ /^\[/; + last unless $line; + } + next; + } + + # unchanged parameter, write old + while($line = <INI>) { + last if $line =~ /^\[/; + print OUT $line; + } + } + + # write all additional parameters + foreach my $parameter (keys %P) { + print OUT "\n[$parameter]\n"; + foreach (@{$P{$parameter}}) { + print OUT $_."\n"; + } + } + + close(INI); + close(OUT); + print STDERR "Saved: $outfn\n"; +} + +sub safesystem { + print STDERR "Executing: @_\n"; + system(@_); + if ($? == -1) { + print STDERR "Failed to execute: @_\n $!\n"; + exit(1); + } + elsif ($? & 127) { + printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + } + else { + my $exitcode = $? >> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return ! $exitcode; + } +} +sub ensure_full_path { + my $PATH = shift; +$PATH =~ s/\/nfsmnt//; + return $PATH if $PATH =~ /^\//; + my $dir = `pawd 2>/dev/null`; + if(!$dir){$dir = `pwd`;} + chomp($dir); + $PATH = $dir."/".$PATH; + $PATH =~ s/[\r\n]//g; + $PATH =~ s/\/\.\//\//g; + $PATH =~ s/\/+/\//g; + my $sanity = 0; + while($PATH =~ /\/\.\.\// && $sanity++<10) { + $PATH =~ s/\/+/\//g; + $PATH =~ s/\/[^\/]+\/\.\.\//\//g; + } + $PATH =~ s/\/[^\/]+\/\.\.$//; + $PATH =~ s/\/+$//; +$PATH =~ s/\/nfsmnt//; + return $PATH; +} + +sub submit_or_exec { + my ($cmd,$stdout,$stderr) = @_; + print STDERR "exec: $cmd\n"; + if (defined $___JOBS && $___JOBS > 0) { + safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=$stdout -stderr=$stderr" ) + or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)"; + } + else { + safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'."; + } +} |