diff options
author | bhaddow <bhaddow@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-10-05 00:45:47 +0400 |
---|---|---|
committer | bhaddow <bhaddow@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-10-05 00:45:47 +0400 |
commit | 210f87bebddc2ca3a4cae66f365cd78126a3dff0 (patch) | |
tree | a56979269cefe98227d1f705090add97af730d38 | |
parent | 84d73700af580918a4e4ddef7c08950590e546bb (diff) |
Support for lattice sampling (use -lattice-samples n)
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4296 1f5c12ca-751b-0410-a591-d2e778427230
-rwxr-xr-x | scripts/training/mert-moses.pl | 35 |
1 files changed, 30 insertions, 5 deletions
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index ee0cc02d8..f7bb7b8b6 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -88,6 +88,7 @@ my $___DEV_E = undef; # required, basename of files with references my $___DECODER = undef; # required, pathname to the decoder executable my $___CONFIG = undef; # required, pathname to startup ini file my $___N_BEST_LIST_SIZE = 100; +my $___LATTICE_SAMPLES = 0; my $queue_flags = "-hard"; # extra parameters for parallelizer # the -l ws0ssmt was relevant only to JHU 2006 workshop my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial) @@ -156,6 +157,7 @@ GetOptions( "decoder=s" => \$___DECODER, "config=s" => \$___CONFIG, "nbest=i" => \$___N_BEST_LIST_SIZE, + "lattice-samples=i" => \$___LATTICE_SAMPLES, "queue-flags=s" => \$queue_flags, "jobs=i" => \$___JOBS, "decoder-flags=s" => \$___DECODER_FLAGS, @@ -208,6 +210,7 @@ if ($usage || !defined $___DEV_F || !defined $___DEV_E || !defined $___DECODER | Options: --working-dir=mert-dir ... where all the files are created --nbest=100 ... how big nbestlist to generate + --lattice-samples ... how many lattice samples (Chatterjee & Cancedda, emnlp 2010) --jobs=N ... set this to anything to run moses in parallel --mosesparallelcmd=STR ... use a different script instead of moses-parallel --queue-flags=STRING ... anything you with to pass to qsub, eg. @@ -606,6 +609,8 @@ my $oldallsorted = undef; my $allsorted = undef; my $nbest_file=undef; +my $lsamp_file=undef; #Lattice samples +my $orig_nbest_file=undef; # replaced if lattice sampling while(1) { $run++; @@ -625,8 +630,20 @@ while(1) { # skip running the decoder if the user wanted if (!$skip_decoder) { print "($run) run decoder to produce n-best lists\n"; - $nbest_file = run_decoder($featlist, $run, $need_to_normalize); + ($nbest_file,$lsamp_file) = run_decoder($featlist, $run, $need_to_normalize); $need_to_normalize = 0; + if ($___LATTICE_SAMPLES) { + my $combined_file = "$nbest_file.comb"; + safesystem("sort -k1,1n $nbest_file $lsamp_file > $combined_file") or + die("failed to merge nbest and lattice samples"); + safesystem("gzip -f $nbest_file; gzip -f $lsamp_file") or + die "Failed to gzip nbests and lattice samples"; + $orig_nbest_file = "$nbest_file.gz"; + $orig_nbest_file = "$nbest_file.gz"; + $lsamp_file = "$lsamp_file.gz"; + $lsamp_file = "$lsamp_file.gz"; + $nbest_file = "$combined_file"; + } safesystem("gzip -f $nbest_file") or die "Failed to gzip run*out"; $nbest_file = $nbest_file.".gz"; } @@ -921,6 +938,11 @@ sub run_decoder { my ($featlist, $run, $need_to_normalize) = @_; my $filename_template = "run%d.best$___N_BEST_LIST_SIZE.out"; my $filename = sprintf($filename_template, $run); + my $lsamp_filename = undef; + if ($___LATTICE_SAMPLES) { + my $lsamp_filename_template = "run%d.lsamp$___LATTICE_SAMPLES.out"; + $lsamp_filename = sprintf($lsamp_filename_template, $run); + } # user-supplied parameters print "params = $___DECODER_FLAGS\n"; @@ -945,19 +967,22 @@ sub run_decoder { print "decoder_config = $decoder_config\n"; # run the decoder - my $nBest_cmd = "-n-best-size $___N_BEST_LIST_SIZE"; my $decoder_cmd; + my $lsamp_cmd = ""; + if ($___LATTICE_SAMPLES) { + $lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES "; + } if (defined $___JOBS && $___JOBS > 0) { - $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; + $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG -inputtype $___INPUTTYPE -qsub-prefix mert$run -queue-parameters \"$queue_flags\" -decoder-parameters \"$___DECODER_FLAGS $decoder_config\" $lsamp_cmd -n-best-list \"$filename $___N_BEST_LIST_SIZE\" -input-file $___DEV_F -jobs $___JOBS -decoder $___DECODER > run$run.out"; } else { - $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out"; + $decoder_cmd = "$___DECODER $___DECODER_FLAGS -config $___CONFIG -inputtype $___INPUTTYPE $decoder_config $lsamp_cmd -n-best-list $filename $___N_BEST_LIST_SIZE -input-file $___DEV_F > run$run.out"; } safesystem($decoder_cmd) or die "The decoder died. CONFIG WAS $decoder_config \n"; sanity_check_order_of_lambdas($featlist, $filename); - return $filename; + return ($filename, $lsamp_filename); } |