Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <hieuhoang@gmail.com>2015-05-03 10:50:31 +0300
committerHieu Hoang <hieuhoang@gmail.com>2015-05-03 10:50:31 +0300
commite5f76ee99e1da3f9ef32338243ef046154f282d9 (patch)
treec8dc0eb33e8301a8d24c658161a737407dfe20f0 /scripts
parent73ae7d7e209b78c40a4106d94c62d35ff98bf83b (diff)
parent402b958d9069ff11df3e603f473ade9487bac481 (diff)
Merge branch 'master' of github.com:moses-smt/mosesdecoder
Diffstat (limited to 'scripts')
-rw-r--r--scripts/ems/experiment.meta28
-rwxr-xr-xscripts/ems/experiment.perl62
-rwxr-xr-xscripts/ems/support/build-sparse-features.perl16
-rwxr-xr-xscripts/ems/support/fast-align-in-parts.perl91
-rwxr-xr-xscripts/ems/support/generic-parallelizer.perl4
-rwxr-xr-xscripts/ems/support/lmplz-wrapper.perl10
-rwxr-xr-xscripts/training/mert-moses.pl38
-rwxr-xr-xscripts/training/wrappers/make-factor-brown-cluster-mkcls.perl12
-rwxr-xr-xscripts/training/wrappers/make-factor-de-lemma.perl33
-rwxr-xr-xscripts/training/wrappers/make-factor-de-morph.perl62
-rwxr-xr-xscripts/training/wrappers/make-factor-en-porter.perl10
11 files changed, 288 insertions, 78 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 57ef4f9d6..d1448ef44 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -233,6 +233,8 @@ train
template: $lm-training -order $order $settings -text IN -lm OUT
error: cannot execute binary file
error: unrecognised option
+ not-error: BadDiscountException
+ not-error: To override this error
randomize
in: lm
out: rlm
@@ -309,8 +311,14 @@ split-tuning
default-name: lm/interpolate-tuning.split
pass-unless: output-splitter
template: $output-splitter -model IN1.$output-extension < IN > OUT
+strip-tuning
+ in: split-tuning
+ out: stripped-tuning
+ default-name: lm/interpolate-tuning.stripped
+ pass-unless: mock-output-parser-lm
+ template: $moses-script-dir/training/strip-xml.perl < IN > OUT && $moses-script-dir/training/wrappers/mosesxml2brackets.py < IN > OUT.trees
interpolate
- in: script split-tuning LM:lm
+ in: script stripped-tuning LM:lm
rerun-on-change: srilm-dir group weights
out: lm
default-name: lm/interpolated-lm
@@ -466,14 +474,32 @@ fast-align
in: prepared-data-fast-align
out: fast-alignment
rerun-on-change: fast-align-settings
+ ignore-if: fast-align-max-lines
template: $external-bin-dir/fast_align -i IN $fast-align-settings > OUT
default-name: fast-align
fast-align-inverse
in: prepared-data-fast-align
out: fast-alignment-inverse
rerun-on-change: fast-align-settings
+ ignore-if: fast-align-max-lines
template: $external-bin-dir/fast_align -i IN -r $fast-align-settings > OUT
default-name: fast-align-inverse
+fast-align-in-parts
+ in: prepared-data-fast-align
+ out: fast-alignment
+ rerun-on-change: fast-align-settings fast-align-max-lines
+ ignore-unless: fast-align-max-lines
+ tmp-name: training/tmp.fast-align
+ template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
+ default-name: fast-align
+fast-align-in-parts-inverse
+ in: prepared-data-fast-align
+ out: fast-alignment-inverse
+ rerun-on-change: fast-align-settings fast-align-max-lines
+ ignore-unless: fast-align-max-lines
+ tmp-name: training/tmp.fast-align-inverse
+ template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
+ default-name: fast-align-inverse
symmetrize-fast-align
in: fast-alignment fast-alignment-inverse corpus-mml-prefilter=OR=corpus
out: word-alignment
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 7070a7c9e..8e23b7b18 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -312,10 +312,10 @@ sub read_meta {
$ONLY_FACTOR_0{"$module:$step"}++;
}
elsif ($1 eq "error") {
- @{$ERROR{"$module:$step"}} = split(/,/,$2);
+ push @{$ERROR{"$module:$step"}}, $2;
}
elsif ($1 eq "not-error") {
- @{$NOT_ERROR{"$module:$step"}} = split(/,/,$2);
+ push @{$NOT_ERROR{"$module:$step"}}, $2;
}
else {
die("META ERROR unknown parameter: $1");
@@ -1282,10 +1282,10 @@ sub execute_steps {
&write_info($i);
# cluster job submission
- if ($CLUSTER && ! &is_qsub_script($i)) {
+ if ($CLUSTER && (!&is_qsub_script($i) || (&backoff_and_get($DO_STEP[$i].":jobs") && (&backoff_and_get($DO_STEP[$i].":jobs")==1)))) {
$DO{$i}++;
my $qsub_args = &get_qsub_args($DO_STEP[$i]);
- print "\texecuting $step via qsub ($active active)\n";
+ print "\texecuting $step via qsub $qsub_args ($active active)\n";
my $qsub_command="qsub $qsub_args -S /bin/bash -e $step.STDERR -o $step.STDOUT $step";
print "\t$qsub_command\n" if $VERBOSE;
`$qsub_command`;
@@ -1338,15 +1338,15 @@ sub execute_steps {
sub get_qsub_args {
my ($step) = @_;
- my $qsub_args = &get("$step:qsub-settings");
- $qsub_args = &get("GENERAL:qsub-settings") unless defined($qsub_args);
+ my $qsub_args = &backoff_and_get("$step:qsub-settings");
$qsub_args = "" unless defined($qsub_args);
my $memory = &get("$step:qsub-memory");
$qsub_args .= " -pe memory $memory" if defined($memory);
my $hours = &get("$step:qsub-hours");
$qsub_args .= " -l h_rt=$hours:0:0" if defined($hours);
my $project = &backoff_and_get("$step:qsub-project");
- $qsub_args = "-P $project" if defined($project);
+ $qsub_args .= " -P $project" if defined($project);
+ $qsub_args =~ s/^ //;
print "qsub args: $qsub_args\n" if $VERBOSE;
return $qsub_args;
}
@@ -1880,7 +1880,7 @@ sub define_tuning_tune {
my $decoder_settings = &backoff_and_get("TUNING:decoder-settings");
$decoder_settings = "" unless $decoder_settings;
- $decoder_settings .= " -v 0 " unless $CLUSTER && $jobs;
+ $decoder_settings .= " -v 0 " unless $CLUSTER && $jobs && $jobs>1;
my $tuning_settings = &backoff_and_get("TUNING:tuning-settings");
$tuning_settings = "" unless $tuning_settings;
@@ -1891,9 +1891,9 @@ sub define_tuning_tune {
$cmd .= " --skip-decoder" if $skip_decoder;
$cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype);
- my $qsub_args = &get_qsub_args("TUNING");
+ my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
- $cmd .= " --jobs $jobs" if $CLUSTER && $jobs;
+ $cmd .= " --jobs $jobs" if $CLUSTER && $jobs && $jobs>1;
my $tuning_dir = $tuned_config;
$tuning_dir =~ s/\/[^\/]+$//;
$cmd .= "\nmkdir -p $tuning_dir";
@@ -2576,6 +2576,7 @@ sub define_training_create_config {
my $set = shift @LM_SETS;
next if defined($INTERPOLATED_AWAY{$set});
my $order = &check_backoff_and_get("LM:$set:order");
+
my $lm_file = "$lm";
my $type = 0; # default: SRILM
@@ -2591,6 +2592,13 @@ sub define_training_create_config {
# manually set type
$type = &backoff_and_get("LM:$set:type") if (&backoff_and_get("LM:$set:type"));
+ # binarized by INTERPOLATED-LM
+ if (&get("INTERPOLATED-LM:lm-binarizer")) {
+ $lm_file =~ s/\.lm/\.binlm/;
+ $type = 1;
+ $type = &get("INTERPOLATED-LM:type") if &get("INTERPOLATED-LM:type");
+ }
+
# which factor is the model trained on?
my $factor = 0;
if (&backoff_and_get("TRAINING:output-factors") &&
@@ -2696,7 +2704,7 @@ sub define_interpolated_lm_interpolate {
sub define_interpolated_lm_process {
my ($step_id) = @_;
- my ($processed_lm, $interpolatd_lm) = &get_output_and_input($step_id);
+ my ($processed_lm, $interpolated_lm) = &get_output_and_input($step_id);
my ($module,$set,$stepname) = &deconstruct_name($DO_STEP[$step_id]);
my $tool = &check_backoff_and_get("INTERPOLATED-LM:lm-${stepname}r");
my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
@@ -2706,11 +2714,23 @@ sub define_interpolated_lm_process {
my $cmd = "";
foreach my $factor (keys %{$ILM_SETS}) {
foreach my $order (keys %{$$ILM_SETS{$factor}}) {
- next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1;
- my $suffix = "";
- $suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
- $suffix .= ".order$order" if $icount > 1;
- $cmd .= "$tool $interpolatd_lm$suffix $processed_lm$suffix\n";
+ my ($name,$name_processed);
+ if (scalar(@{$$ILM_SETS{$factor}{$order}}) == 1) {
+ # not interpolated -> get name from LM version of these steps
+ my($id,$set) = split(/ /,$$ILM_SETS{$factor}{$order}[0]);
+ $name = &get_default_file("LM",$set,"train"); # well... works for now;
+ $name_processed = $STEP_OUTNAME{"LM:$stepname"};
+ $name_processed =~ s/^(.+\/)([^\/]+)$/$1$set.$2/;
+ $name_processed = &versionize(&long_file_name($name_processed,"lm",""));
+ }
+ else {
+ my $suffix = "";
+ $suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
+ $suffix .= ".order$order" if $icount > 1;
+ $name = "$interpolated_lm$suffix";
+ $name_processed = "$processed_lm$suffix";
+ }
+ $cmd .= "$tool $name $name_processed\n";
}
}
@@ -3072,7 +3092,7 @@ sub define_evaluation_decode {
my $nbest_size;
$nbest_size = $nbest if $nbest;
$nbest_size =~ s/[^\d]//g if $nbest;
- if ($jobs && $CLUSTER) {
+ if ($jobs && $jobs>1 && $CLUSTER) {
$cmd .= "mkdir -p $dir/evaluation/tmp.$set.$VERSION\n";
$cmd .= "cd $dir/evaluation/tmp.$set.$VERSION\n";
if (defined $moses_parallel) {
@@ -3496,9 +3516,15 @@ sub check_backoff_and_get_array {
return $CONFIG{$parameter} if defined($CONFIG{$parameter});
# remove set -> find setting for module
- $parameter =~ s/:.*:/:/;
+ $parameter =~ s/:[^:]+:/:/;
return $CONFIG{$parameter} if defined($CONFIG{$parameter});
+ # remove step (if exists)
+ if ($parameter =~ /:[^:]+:/) {
+ $parameter =~ s/:[^:]+:/:/;
+ return $CONFIG{$parameter} if defined($CONFIG{$parameter});
+ }
+
# remove model -> find global setting
$parameter =~ s/^[^:]+:/GENERAL:/;
return $CONFIG{$parameter} if defined($CONFIG{$parameter});
diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl
index 5d9b786ad..3f4b505d5 100755
--- a/scripts/ems/support/build-sparse-features.perl
+++ b/scripts/ems/support/build-sparse-features.perl
@@ -12,15 +12,17 @@ use strict;
my ($corpus,$input_extension,$output_extension,$outfile_prefix,$specification) = @ARGV;
my $ini = "[feature]\n";
my %ALREADY;
+my %ID;
foreach my $feature_spec (split(/,\s*/,$specification)) {
my @SPEC = split(/\s+/,$feature_spec);
my $factor = ($SPEC[0] eq 'word-translation') ? "0-0" : "0";
$factor = $1 if $feature_spec =~ / factor ([\d\-]+)/;
+ $feature_spec =~ s/ factor ([\d\-]+)//;
if ($SPEC[0] eq 'target-word-insertion') {
- $ini .= "TargetWordInsertionFeature name=TWI factor=$factor";
+ $ini .= "TargetWordInsertionFeature name=TWI".&get_id($SPEC[0])." factor=$factor";
if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
my $file = &create_top_words($output_extension, $SPEC[2]);
@@ -34,7 +36,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
$ini .= "\n";
}
elsif ($SPEC[0] eq 'source-word-deletion') {
- $ini .= "SourceWordDeletionFeature name=SWD factor=$factor";
+ $ini .= "SourceWordDeletionFeature name=SWD".&get_id($SPEC[0])." factor=$factor";
if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
my $file = &create_top_words($input_extension, $SPEC[2]);
$ini .= " path=$file";
@@ -60,7 +62,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
die("ERROR: Unknown parameter specification in '$SPEC[1]'\n");
}
my ($input_factor,$output_factor) = split(/\-/,$factor);
- $ini .= "WordTranslationFeature name=WT input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
+ $ini .= "WordTranslationFeature name=WT".&get_id($SPEC[0])." input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
}
elsif ($SPEC[0] eq 'phrase-length') {
$ini .= "PhraseLengthFeature name=PL\n";
@@ -111,3 +113,11 @@ sub create_top_words {
return $file;
}
+
+# Numbers repeated uses of one feature type: "" for the first use, then 2, 3, ... (appended to the feature name by callers).
+sub get_id {
+  my ($name) = @_;
+  my $count = ++$ID{$name};
+  return $count == 1 ? "" : $count;
+}
+
diff --git a/scripts/ems/support/fast-align-in-parts.perl b/scripts/ems/support/fast-align-in-parts.perl
new file mode 100755
index 000000000..fa501b454
--- /dev/null
+++ b/scripts/ems/support/fast-align-in-parts.perl
@@ -0,0 +1,91 @@
+#!/usr/bin/env perl
+
+#######################
+# Revision history
+#
+# 28 Apr 2015 first version
+
+use warnings;
+use strict;
+use Getopt::Long qw(:config pass_through no_ignore_case permute);
+
+my ($BIN,$IN,$MAX_LINES,$SETTINGS,$REVERSE,$TMP);  # filled in by GetOptions below
+
+GetOptions('bin=s' => \$BIN,              # path to the fast_align binary
+           'i=s' => \$IN,                 # parallel corpus to align
+           'max-lines=i' => \$MAX_LINES,  # maximum number of lines per part
+           'settings=s' => \$SETTINGS,    # extra fast_align options, passed through verbatim
+           'r' => \$REVERSE,              # forwarded to fast_align as -r
+           'tmp=s' => \$TMP,              # scratch directory holding the parts
+          ) or exit(1);
+
+die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR")
+  unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP) && defined($MAX_LINES)
+      && $MAX_LINES > 0;
+die("ERROR - input file does not exist: $IN") unless -e $IN;
+die("ERROR - fast_align binary does not exist: $BIN") unless -e $BIN;
+
+chomp(my $line_count = `cat $IN | wc -l`);
+
+# fewer lines than the limit -> no need to split, just run it regularly
+if ($MAX_LINES > $line_count) {
+  my $cmd = "$BIN -i $IN $SETTINGS";
+  $cmd .= " -r" if defined($REVERSE);
+  safesystem($cmd) or die;
+  exit(0);
+}
+
+my $cmd = "mkdir -p $TMP && rm -f $TMP/prepared-* $TMP/aligned-*";  # also clear parts left behind by an aborted run
+safesystem($cmd) or die;
+
+# split input
+$cmd = "split -a 2 -l $MAX_LINES $IN $TMP/prepared-";
+safesystem($cmd) or die;
+
+# process
+my @INPUT_FILES = `ls $TMP/prepared-*`;
+chomp(@INPUT_FILES);  # strip only the trailing newline of each listed name
+foreach my $input_file (@INPUT_FILES) {
+  # create output file name (keep the two-letter split suffix in a named variable instead of relying on $1)
+  my ($part) = $input_file =~ /prepared-(..)$/ or die("ERROR");
+  my $output_file = "$TMP/aligned-$part";
+
+  # process part
+  my $cmd = "$BIN -i $input_file $SETTINGS";
+  $cmd .= " -r" if defined($REVERSE);
+  $cmd .= " >$output_file";
+  safesystem($cmd) or die;
+  die("ERROR: no output produced from command $cmd") unless -e $output_file;
+
+  # check line count: every input line must yield exactly one output line
+  chomp(my $input_line_count = `cat $input_file | wc -l`);
+  chomp(my $output_line_count = `cat $output_file | wc -l`);
+  die("ERROR: mismatched number of lines in part $part\n\t$input_line_count\t$input_file\n\t$output_line_count\t$output_file\n") unless $input_line_count == $output_line_count;
+}
+
+# join output (written to STDOUT; the shell expands the glob in sorted suffix order)
+$cmd = "cat $TMP/aligned-*";
+safesystem($cmd) or die;
+
+$cmd = "rm -r $TMP/* ; rmdir $TMP";
+safesystem($cmd);
+
+sub safesystem {
+  # Runs a command via system(), logging it to STDERR; returns true iff it exited with status 0.
+  print STDERR "Executing: @_\n";
+  system(@_);
+  my $status = $?;
+  if ($status == -1) {  # the command could not be started at all
+    print STDERR "Failed to execute: @_\n $!\n";
+    exit(1);
+  }
+  if (my $signal = $status & 127) {  # low 7 bits: signal that terminated the child
+    printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
+      $signal, ($status & 128) ? 'with' : 'without';
+    exit 1;
+  }
+  my $exitcode = $status >> 8;
+  print STDERR "Exit code: $exitcode\n" if $exitcode;
+  return ! $exitcode;
+}
+
diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl
index 0b248be7e..fd7fb2552 100755
--- a/scripts/ems/support/generic-parallelizer.perl
+++ b/scripts/ems/support/generic-parallelizer.perl
@@ -4,7 +4,7 @@ use warnings;
use strict;
my $jobs = 20;
-my ($infile,$outfile,$cmd,$tmpdir);
+my ($infile,$outfile,$cmd,$tmpdir,$qflags);
use Getopt::Long qw(:config pass_through no_ignore_case);
GetOptions('jobs=i' => \$jobs,
@@ -12,7 +12,7 @@ GetOptions('jobs=i' => \$jobs,
'in=s' => \$infile,
'out=s' => \$outfile,
'cmd=s' => \$cmd,
- 'queue-flags=s' => \$qflags,
+ 'queue-flags=s' => \$qflags,
) or exit(1);
die("ERROR: specify infile with -in") unless $infile;
diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl
index eadca6263..0f1e03d15 100755
--- a/scripts/ems/support/lmplz-wrapper.perl
+++ b/scripts/ems/support/lmplz-wrapper.perl
@@ -7,11 +7,12 @@ use Getopt::Long "GetOptions";
Getopt::Long::config("no_auto_abbrev");
Getopt::Long::config("pass_through");
-
-my ($TEXT,$ORDER,$BIN,$LM);
+my ($TEXT,$ORDER,$BIN,$LM,$MEMORY,$TMPDIR);
&GetOptions('text=s' => \$TEXT,
'lm=s' => \$LM,
+ 'S=s' => \$MEMORY,
+ 'T=s' => \$TMPDIR,
'bin=s' => \$BIN,
'order=i' => \$ORDER);
@@ -19,8 +20,9 @@ die("ERROR: specify at least --bin BIN --text CORPUS --lm LM and --order N!")
unless defined($BIN) && defined($TEXT) && defined($LM) && defined($ORDER);
my $settings = join(' ', @ARGV);
-#print STDERR "settngs=$settings \n";
-
my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM $settings";
+$cmd .= " -T $TMPDIR" if defined($TMPDIR);  # -T and -S are consumed by GetOptions above, so re-add them explicitly
+$cmd .= " -S $MEMORY" if defined($MEMORY);
+# Remaining pass-through args are already part of $cmd via $settings; appending @ARGV again would pass each one twice.
print "exec: $cmd\n";
`$cmd`;
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 86084abbf..a7263d4bd 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -76,7 +76,7 @@ my $___N_BEST_LIST_SIZE = 100;
my $___LATTICE_SAMPLES = 0;
my $queue_flags = "-hard"; # extra parameters for parallelizer
# the -l ws0ssmt was relevant only to JHU 2006 workshop
-my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial)
+my $___JOBS = undef; # if parallel, number of jobs to use (undef or <= 1 -> serial; run_decoder and submit_or_exec both test > 1)
my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder
my $continue = 0; # should we try to continue from the last saved step?
my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert)
@@ -544,7 +544,7 @@ if ($__PROMIX_TRAINING) {
my $___FILTER_F = $___DEV_F;
$___FILTER_F = $filterfile if (defined $filterfile);
my $cmd = "$filtercmd ./$filtered_path $filtered_config $___FILTER_F";
- &submit_or_exec($cmd, "filterphrases_$i.out", "filterphrases_$i.err");
+ &submit_or_exec($cmd, "filterphrases_$i.out", "filterphrases_$i.err", 1);
push (@_PROMIX_TABLES_BIN,"$filtered_path/phrase-table.0-0.1.1");
}
}
@@ -559,7 +559,7 @@ if ($___FILTER_PHRASE_TABLE) {
my $___FILTER_F = $___DEV_F;
$___FILTER_F = $filterfile if (defined $filterfile);
my $cmd = "$filtercmd ./$outdir $___CONFIG $___FILTER_F";
- &submit_or_exec($cmd, "filterphrases.out", "filterphrases.err");
+ &submit_or_exec($cmd, "filterphrases.out", "filterphrases.err", 1);
}
# make a backup copy of startup ini filepath
@@ -829,7 +829,7 @@ while (1) {
# remove segmentation
$cmd .= " -l $__REMOVE_SEGMENTATION" if $__PROMIX_TRAINING;
$cmd = &create_extractor_script($cmd, $___WORKING_DIR);
- &submit_or_exec($cmd, "extract.out","extract.err");
+ &submit_or_exec($cmd, "extract.out","extract.err", 1);
}
# Create the initial weights file for mert: init.opt
@@ -919,11 +919,11 @@ while (1) {
my $pro_optimizer_cmd = "$pro_optimizer $megam_default_options run$run.pro.data";
if ($___PAIRWISE_RANKED_OPTIMIZER) { # pro optimization
$cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer_cmd";
- &submit_or_exec($cmd, $mert_outfile, $mert_logfile);
+ &submit_or_exec($cmd, $mert_outfile, $mert_logfile, 1);
} elsif ($___PRO_STARTING_POINT) { # First, run pro, then mert
# run pro...
my $pro_cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer_cmd";
- &submit_or_exec($pro_cmd, "run$run.pro.out", "run$run.pro.err");
+ &submit_or_exec($pro_cmd, "run$run.pro.out", "run$run.pro.err", 1);
# ... get results ...
($bestpoint,$devbleu) = &get_weights_from_mert("run$run.pro.out","run$run.pro.err",scalar @{$featlist->{"names"}},\%sparse_weights, \@promix_weights);
# Get the pro outputs ready for mert. Add the weight ranges,
@@ -951,11 +951,11 @@ while (1) {
# ... and run mert
$cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/;
- &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile);
+ &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile, ($__THREADS ? $__THREADS : 1) );
} elsif ($___BATCH_MIRA) { # batch MIRA optimization
safesystem("echo 'not used' > $weights_out_file") or die;
$cmd = "$mert_mira_cmd $mira_settings $seed_settings $pro_file_settings -o $mert_outfile";
- &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile);
+ &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile, 1);
} elsif ($___HG_MIRA) {
safesystem("echo 'not used' > $weights_out_file") or die;
$mira_settings .= " --type hypergraph ";
@@ -963,7 +963,7 @@ while (1) {
$mira_settings .= " --hgdir $hypergraph_dir ";
#$mira_settings .= "--verbose ";
$cmd = "$mert_mira_cmd $mira_settings $seed_settings -o $mert_outfile";
- &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile);
+ &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile, 1);
} elsif ($__PROMIX_TRAINING) {
# PRO trained mixture model
safesystem("echo 'not used' > $weights_out_file") or die;
@@ -972,10 +972,10 @@ while (1) {
$cmd .= join(" ", map {"-p $_"} @_PROMIX_TABLES_BIN);
$cmd .= " -i $___DEV_F";
print "Starting promix optimisation at " . `date`;
- &submit_or_exec($cmd, "$mert_outfile", $mert_logfile);
+ &submit_or_exec($cmd, "$mert_outfile", $mert_logfile, 1);
print "Finished promix optimisation at " . `date`;
} else { # just mert
- &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile);
+ &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile, ($__THREADS ? $__THREADS : 1) );
}
die "Optimization failed, file $weights_out_file does not exist or is empty"
@@ -1283,7 +1283,7 @@ sub run_decoder {
$lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES ";
}
- if (defined $___JOBS && $___JOBS > 0) {
+ if (defined $___JOBS && $___JOBS > 1) {
die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
$decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
$decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
@@ -1378,9 +1378,9 @@ sub get_featlist_from_moses {
print STDERR "Asking moses for feature names and values from $___CONFIG\n";
my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
$cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
- $cmd .= " -show-weights > $featlistfn";
+ $cmd .= " -show-weights";
print STDERR "Executing: $cmd\n";
- safesystem($cmd) or die "Failed to run moses with the config $configfn";
+ &submit_or_exec($cmd, $featlistfn, "/dev/null", 1);
}
return get_featlist_from_file($featlistfn);
}
@@ -1706,10 +1706,14 @@ sub ensure_full_path {
}
sub submit_or_exec {
- my ($cmd, $stdout, $stderr) = @_;
+ my ($cmd, $stdout, $stderr, $threads) = @_;
print STDERR "exec: $cmd\n";
- if (defined $___JOBS && $___JOBS > 0) {
- safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=$stdout -stderr=$stderr" )
+ if (defined $___JOBS && $___JOBS > 1) {
+ # request fewer CPU slots, if not needed
+ my $queue_flags_for_this_command = $queue_flags;
+ $threads = 1 unless defined($threads);
+ $queue_flags_for_this_command =~ s/(\-pe smp) \d+/$1 $threads/;
+ safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags_for_this_command\" -stdout=$stdout -stderr=$stderr" )
or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)";
} else {
safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'.";
diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
index 88d16b3f6..35714271c 100755
--- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
+++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
@@ -3,11 +3,18 @@
use warnings;
use strict;
-my ($lowercase, $cluster_file,$in,$out,$tmp) = @ARGV;
+my ($lowercase,$cluster_file,$in,$out,$tmp) = @ARGV;
my $CLUSTER = &read_cluster_from_mkcls($cluster_file);
-open(IN,$in) || die("ERROR: could not open input");
+# is $lowercase a script?
+if ($lowercase =~ /\//) {
+ open(IN,"$lowercase < $in|") || die("ERROR: could not open input");
+ $lowercase = 0;
+}
+else {
+ open(IN,$in) || die("ERROR: could not open input");
+}
binmode(IN, ":utf8");
open(OUT,">$out");
binmode(OUT, ":utf8");
@@ -18,6 +25,7 @@ while(<IN>) {
s/ $//;
my $first = 1;
foreach my $word (split) {
+ # if lowercase is a flag
if ($lowercase) {
$word = lc($word);
}
diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl
new file mode 100755
index 000000000..db978317e
--- /dev/null
+++ b/scripts/training/wrappers/make-factor-de-lemma.perl
@@ -0,0 +1,33 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Encode;
+use FindBin qw($RealBin);
+
+my ($in,$out,$tmpdir) = @ARGV;  # input corpus, output factor file, scratch directory
+
+`mkdir -p $tmpdir`;
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`;  # NOTE(review): site-specific absolute path
+`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`;  # NOTE(review): site-specific absolute path
+
+open(LOPAR,'<',"$tmpdir/lopar.$$") || die("ERROR: could not open tagger output $tmpdir/lopar.$$: $!");
+open(OUT,'|-',"$RealBin/../../tokenizer/escape-special-chars.perl > $out") || die("ERROR: could not start escape-special-chars.perl: $!");
+while(<LOPAR>) {
+  chomp;
+  s/ +/ /g;
+  s/^ //;
+  s/ $//;
+  my $first = 1;
+  foreach (split) {
+    die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/;  # token shape: first_middle_last
+    my ($word,$lemma) = ($1,$3);  # the middle field is not needed for the lemma factor
+    print OUT " " unless $first;
+    $first = 0;
+    $lemma =~ s/\|.+$//;  # keep only the text before the first '|'
+    $lemma = $word if $lemma =~ /^\<.+\>$/;  # fall back to the surface word for <...> placeholders
+    print OUT encode('utf8', decode('iso-8859-1', $lemma));  # tagger side is latin1, output is UTF-8
+  }
+  print OUT "\n";
+}
+close(LOPAR);
+close(OUT) || die("ERROR: escape-special-chars.perl failed");  # closing a pipe reports the child's failure
diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl
index 1cc917bce..366a5a76d 100755
--- a/scripts/training/wrappers/make-factor-de-morph.perl
+++ b/scripts/training/wrappers/make-factor-de-morph.perl
@@ -1,31 +1,31 @@
-#!/usr/bin/env perl
-
-use warnings;
-use strict;
-use Encode;
-use FindBin qw($RealBin);
-my ($in,$out,$tmpdir) = @ARGV;
-
-`mkdir -p $tmpdir`;
-`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`;
-`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`;
-
-open(LOPAR,"$tmpdir/lopar.$$");
-open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl >$out");
-while(<LOPAR>) {
- chomp;
- s/ +/ /g;
- s/^ //;
- s/ $//;
- my $first = 1;
- foreach (split) {
- die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/;
- my ($word,$morph,$lemma) = ($1,$2,$3);
- print OUT " " unless $first;
- $first = 0;
- print OUT encode('utf8', decode('iso-8859-1', $morph));
- }
- print OUT "\n";
-}
-close(LOPAR);
-close(OUT);
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+use Encode;
+use FindBin qw($RealBin);
+my ($in,$out,$tmpdir) = @ARGV;
+
+`mkdir -p $tmpdir`;
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`;
+`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`;
+
+open(LOPAR,"$tmpdir/lopar.$$");
+open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl >$out");
+while(<LOPAR>) {
+ chomp;
+ s/ +/ /g;
+ s/^ //;
+ s/ $//;
+ my $first = 1;
+ foreach (split) {
+ die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/;
+ my ($word,$morph,$lemma) = ($1,$2,$3);
+ print OUT " " unless $first;
+ $first = 0;
+ print OUT encode('utf8', decode('iso-8859-1', $morph));
+ }
+ print OUT "\n";
+}
+close(LOPAR);
+close(OUT);
diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl
new file mode 100755
index 000000000..749dc1318
--- /dev/null
+++ b/scripts/training/wrappers/make-factor-en-porter.perl
@@ -0,0 +1,10 @@
+#!/usr/bin/perl -w
+
+use strict;
+use FindBin qw($RealBin);
+
+my ($in,$out,$tmpdir) = @ARGV;  # input corpus, output factor file, scratch directory
+`mkdir -p $tmpdir`;  # create the scratch directory first, as the make-factor-de-* wrappers do
+my $porter_in = "$tmpdir/porter-in.$$";  # $$ (process id) keeps concurrent runs apart
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in > $porter_in`;
+`/home/pkoehn/statmt/bin/porter-stemmer $porter_in | $RealBin/../../tokenizer/escape-special-chars.perl > $out`;  # NOTE(review): site-specific absolute path