diff options
author | Barry Haddow <barry.haddow@gmail.com> | 2015-05-08 11:16:55 +0300 |
---|---|---|
committer | Barry Haddow <barry.haddow@gmail.com> | 2015-05-08 11:16:55 +0300 |
commit | 85c1af4d72686d2bc95960040cd8407b22f3df53 (patch) | |
tree | 67725a66e32fc9bcd109597b5854067aac01107e /scripts | |
parent | f403f5e4785361487969ad4865adea14651bfa15 (diff) | |
parent | 8e6eb067bca1ee4f9d36cb2c305f7ac60b81f230 (diff) |
Merge branch 'master' of github.com:moses-smt/mosesdecoder
Diffstat (limited to 'scripts')
-rw-r--r-- | scripts/ems/experiment.meta | 40 | ||||
-rwxr-xr-x | scripts/ems/experiment.perl | 62 | ||||
-rwxr-xr-x | scripts/ems/support/build-sparse-features.perl | 16 | ||||
-rwxr-xr-x | scripts/ems/support/fast-align-in-parts.perl | 91 | ||||
-rwxr-xr-x | scripts/ems/support/generic-parallelizer.perl | 4 | ||||
-rwxr-xr-x | scripts/ems/support/lmplz-wrapper.perl | 10 | ||||
-rwxr-xr-x | scripts/ems/support/report-experiment-scores.perl | 6 | ||||
-rwxr-xr-x | scripts/generic/extract-parallel.perl | 84 | ||||
-rwxr-xr-x | scripts/training/mert-moses.pl | 38 | ||||
-rwxr-xr-x | scripts/training/wrappers/madamira-tok.perl | 43 | ||||
-rwxr-xr-x | scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl | 12 | ||||
-rwxr-xr-x | scripts/training/wrappers/make-factor-de-lemma.perl | 33 | ||||
-rwxr-xr-x | scripts/training/wrappers/make-factor-de-morph.perl | 62 | ||||
-rwxr-xr-x | scripts/training/wrappers/make-factor-en-porter.perl | 10 |
14 files changed, 379 insertions, 132 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 92e88c0f7..aa9a457bb 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -233,6 +233,8 @@ train template: $lm-training -order $order $settings -text IN -lm OUT error: cannot execute binary file error: unrecognised option + not-error: BadDiscountException + not-error: To override this error randomize in: lm out: rlm @@ -472,14 +474,32 @@ fast-align in: prepared-data-fast-align out: fast-alignment rerun-on-change: fast-align-settings + ignore-if: fast-align-max-lines template: $external-bin-dir/fast_align -i IN $fast-align-settings > OUT default-name: fast-align fast-align-inverse in: prepared-data-fast-align out: fast-alignment-inverse rerun-on-change: fast-align-settings + ignore-if: fast-align-max-lines template: $external-bin-dir/fast_align -i IN -r $fast-align-settings > OUT default-name: fast-align-inverse +fast-align-in-parts + in: prepared-data-fast-align + out: fast-alignment + rerun-on-change: fast-align-settings fast-align-max-lines + ignore-unless: fast-align-max-lines + tmp-name: training/tmp.fast-align + template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT + default-name: fast-align +fast-align-in-parts-inverse + in: prepared-data-fast-align + out: fast-alignment-inverse + rerun-on-change: fast-align-settings fast-align-max-lines + ignore-unless: fast-align-max-lines + tmp-name: training/tmp.fast-align-inverse + template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT + default-name: fast-align symmetrize-fast-align in: fast-alignment fast-alignment-inverse corpus-mml-prefilter=OR=corpus out: word-alignment @@ -1330,6 +1350,24 @@ multi-bleu-c rerun-on-change: multi-bleu-c template: $multi-bleu-c IN1 < IN > OUT final-model: yes + +multi-bleu-detok + in: detokenized-output tokenized-reference + out: multi-bleu-detok-score + default-name: evaluation/multi-bleu-detok + ignore-unless: multi-bleu-detok + rerun-on-change: multi-bleu-detok + template: $multi-bleu-detok IN1 < IN > OUT + final-model: yes +multi-bleu-c-detok + in: detokenized-output tokenized-reference + out: multi-bleu-c-detok-score + default-name: evaluation/multi-bleu-c-detok + ignore-unless: multi-bleu-c-detok + rerun-on-change: multi-bleu-c-detok + template: $multi-bleu-c-detok IN1 < IN > OUT + final-model: yes + ter in: wrapped-output reference-sgm out: ter-score @@ -1377,6 +1415,6 @@ analysis-precision [REPORTING] single report - in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model EVALUATION:wade-analysis + in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:multi-bleu-detok-score EVALUATION:multi-bleu-c-detok-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model EVALUATION:wade-analysis out: report default-name: evaluation/report diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index def5b9a82..5d68e409c 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -312,10 +312,10 @@ sub read_meta { $ONLY_FACTOR_0{"$module:$step"}++; } elsif ($1 eq "error") { - @{$ERROR{"$module:$step"}} = split(/,/,$2); + push @{$ERROR{"$module:$step"}}, $2; } elsif ($1 eq "not-error") { - @{$NOT_ERROR{"$module:$step"}} = split(/,/,$2); + push @{$NOT_ERROR{"$module:$step"}}, $2; } else { die("META ERROR unknown parameter: $1"); @@ -1282,10 +1282,10 @@ sub execute_steps { &write_info($i); # cluster job submission - if ($CLUSTER && ! &is_qsub_script($i)) { + if ($CLUSTER && (!&is_qsub_script($i) || (&backoff_and_get($DO_STEP[$i].":jobs") && (&backoff_and_get($DO_STEP[$i].":jobs")==1)))) { $DO{$i}++; my $qsub_args = &get_qsub_args($DO_STEP[$i]); - print "\texecuting $step via qsub ($active active)\n"; + print "\texecuting $step via qsub $qsub_args ($active active)\n"; my $qsub_command="qsub $qsub_args -S /bin/bash -e $step.STDERR -o $step.STDOUT $step"; print "\t$qsub_command\n" if $VERBOSE; `$qsub_command`; @@ -1338,15 +1338,15 @@ sub execute_steps { sub get_qsub_args { my ($step) = @_; - my $qsub_args = &get("$step:qsub-settings"); - $qsub_args = &get("GENERAL:qsub-settings") unless defined($qsub_args); + my $qsub_args = &backoff_and_get("$step:qsub-settings"); $qsub_args = "" unless defined($qsub_args); my $memory = &get("$step:qsub-memory"); $qsub_args .= " -pe memory $memory" if defined($memory); my $hours = &get("$step:qsub-hours"); $qsub_args .= " -l h_rt=$hours:0:0" if defined($hours); my $project = &backoff_and_get("$step:qsub-project"); - $qsub_args = "-P $project" if defined($project); + $qsub_args .= " -P $project" if defined($project); + $qsub_args =~ s/^ //; print "qsub args: $qsub_args\n" if $VERBOSE; return $qsub_args; } @@ -1880,7 +1880,7 @@ sub define_tuning_tune { my $decoder_settings = &backoff_and_get("TUNING:decoder-settings"); $decoder_settings = "" unless $decoder_settings; - $decoder_settings .= " -v 0 " unless $CLUSTER && $jobs; + $decoder_settings .= " -v 0 " unless $CLUSTER && $jobs && $jobs>1; my $tuning_settings = &backoff_and_get("TUNING:tuning-settings"); $tuning_settings = "" unless $tuning_settings; @@ -1891,9 +1891,9 @@ sub define_tuning_tune { $cmd .= " --skip-decoder" if $skip_decoder; $cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype); - my $qsub_args = &get_qsub_args("TUNING"); + my $qsub_args = &get_qsub_args($DO_STEP[$step_id]); $cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args); - $cmd .= " --jobs $jobs" if $CLUSTER && $jobs; + $cmd .= " --jobs $jobs" if $CLUSTER && $jobs && $jobs>1; my $tuning_dir = $tuned_config; $tuning_dir =~ s/\/[^\/]+$//; $cmd .= "\nmkdir -p $tuning_dir"; @@ -2575,6 +2575,7 @@ sub define_training_create_config { my $set = shift @LM_SETS; next if defined($INTERPOLATED_AWAY{$set}); my $order = &check_backoff_and_get("LM:$set:order"); + my $lm_file = "$lm"; my $type = 0; # default: SRILM @@ -2590,6 +2591,13 @@ sub define_training_create_config { # manually set type $type = &backoff_and_get("LM:$set:type") if (&backoff_and_get("LM:$set:type")); + # binarized by INTERPOLATED-LM + if (&get("INTERPOLATED-LM:lm-binarizer")) { + $lm_file =~ s/\.lm/\.binlm/; + $type = 1; + $type = &get("INTERPOLATED-LM:type") if &get("INTERPOLATED-LM:type"); + } + # which factor is the model trained on? my $factor = 0; if (&backoff_and_get("TRAINING:output-factors") && @@ -2695,7 +2703,7 @@ sub define_interpolated_lm_interpolate { sub define_interpolated_lm_process { my ($step_id) = @_; - my ($processed_lm, $interpolatd_lm) = &get_output_and_input($step_id); + my ($processed_lm, $interpolated_lm) = &get_output_and_input($step_id); my ($module,$set,$stepname) = &deconstruct_name($DO_STEP[$step_id]); my $tool = &check_backoff_and_get("INTERPOLATED-LM:lm-${stepname}r"); my $FACTOR = &backoff_and_get_array("TRAINING:output-factors"); @@ -2705,11 +2713,23 @@ sub define_interpolated_lm_process { my $cmd = ""; foreach my $factor (keys %{$ILM_SETS}) { foreach my $order (keys %{$$ILM_SETS{$factor}}) { - next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1; - my $suffix = ""; - $suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR); - $suffix .= ".order$order" if $icount > 1; - $cmd .= "$tool $interpolatd_lm$suffix $processed_lm$suffix\n"; + my ($name,$name_processed); + if (scalar(@{$$ILM_SETS{$factor}{$order}}) == 1) { + # not interpolated -> get name from LM version of these steps + my($id,$set) = split(/ /,$$ILM_SETS{$factor}{$order}[0]); + $name = &get_default_file("LM",$set,"train"); # well... works for now; + $name_processed = $STEP_OUTNAME{"LM:$stepname"}; + $name_processed =~ s/^(.+\/)([^\/]+)$/$1$set.$2/; + $name_processed = &versionize(&long_file_name($name_processed,"lm","")); + } + else { + my $suffix = ""; + $suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR); + $suffix .= ".order$order" if $icount > 1; + $name = "$interpolated_lm$suffix"; + $name_processed = "$processed_lm$suffix"; + } + $cmd .= "$tool $name $name_processed\n"; } } @@ -3071,7 +3091,7 @@ sub define_evaluation_decode { my $nbest_size; $nbest_size = $nbest if $nbest; $nbest_size =~ s/[^\d]//g if $nbest; - if ($jobs && $CLUSTER) { + if ($jobs && $jobs>1 && $CLUSTER) { $cmd .= "mkdir -p $dir/evaluation/tmp.$set.$VERSION\n"; $cmd .= "cd $dir/evaluation/tmp.$set.$VERSION\n"; if (defined $moses_parallel) { @@ -3495,9 +3515,15 @@ sub check_backoff_and_get_array { return $CONFIG{$parameter} if defined($CONFIG{$parameter}); # remove set -> find setting for module - $parameter =~ s/:.*:/:/; + $parameter =~ s/:[^:]+:/:/; return $CONFIG{$parameter} if defined($CONFIG{$parameter}); + # remove step (if exists) + if ($parameter =~ /:[^:]+:/) { + $parameter =~ s/:[^:]+:/:/; + return $CONFIG{$parameter} if defined($CONFIG{$parameter}); + } + # remove model -> find global setting $parameter =~ s/^[^:]+:/GENERAL:/; return $CONFIG{$parameter} if defined($CONFIG{$parameter}); diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl index 5d9b786ad..3f4b505d5 100755 --- a/scripts/ems/support/build-sparse-features.perl +++ b/scripts/ems/support/build-sparse-features.perl @@ -12,15 +12,17 @@ use strict; my ($corpus,$input_extension,$output_extension,$outfile_prefix,$specification) = @ARGV; my $ini = "[feature]\n"; my %ALREADY; +my %ID; foreach my $feature_spec (split(/,\s*/,$specification)) { my @SPEC = split(/\s+/,$feature_spec); my $factor = ($SPEC[0] eq 'word-translation') ? "0-0" : "0"; $factor = $1 if $feature_spec =~ / factor ([\d\-]+)/; + $feature_spec =~ s/ factor ([\d\-]+)//; if ($SPEC[0] eq 'target-word-insertion') { - $ini .= "TargetWordInsertionFeature name=TWI factor=$factor"; + $ini .= "TargetWordInsertionFeature name=TWI".&get_id($SPEC[0])." factor=$factor"; if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) { my $file = &create_top_words($output_extension, $SPEC[2]); @@ -34,7 +36,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) { $ini .= "\n"; } elsif ($SPEC[0] eq 'source-word-deletion') { - $ini .= "SourceWordDeletionFeature name=SWD factor=$factor"; + $ini .= "SourceWordDeletionFeature name=SWD".&get_id($SPEC[0])." factor=$factor"; if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) { my $file = &create_top_words($input_extension, $SPEC[2]); $ini .= " path=$file"; @@ -60,7 +62,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) { die("ERROR: Unknown parameter specification in '$SPEC[1]'\n"); } my ($input_factor,$output_factor) = split(/\-/,$factor); - $ini .= "WordTranslationFeature name=WT input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n"; + $ini .= "WordTranslationFeature name=WT".&get_id($SPEC[0])." input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n"; } elsif ($SPEC[0] eq 'phrase-length') { $ini .= "PhraseLengthFeature name=PL\n"; @@ -111,3 +113,11 @@ sub create_top_words { return $file; } + +sub get_id { + my ($name) = @_; + $ID{$name}++; + return "" if $ID{$name} == 1; + return $ID{$name}; +} + diff --git a/scripts/ems/support/fast-align-in-parts.perl b/scripts/ems/support/fast-align-in-parts.perl new file mode 100755 index 000000000..fa501b454 --- /dev/null +++ b/scripts/ems/support/fast-align-in-parts.perl @@ -0,0 +1,91 @@ +#!/usr/bin/env perl + +####################### +# Revision history +# +# 28 Apr 2015 first version + +use warnings; +use strict; +use Getopt::Long qw(:config pass_through no_ignore_case permute); + +my ($BIN,$IN,$MAX_LINES,$SETTINGS,$REVERSE,$TMP); + +GetOptions('bin=s' => \$BIN, + 'i=s' => \$IN, + 'max-lines=i' => \$MAX_LINES, + 'settings=s' => \$SETTINGS, + 'r' => \$REVERSE, + 'tmp=s' => \$TMP, + ) or exit(1); + +die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR") + unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP) && defined($MAX_LINES) + && $MAX_LINES > 0; +die("ERROR - input file does not exist: $IN") unless -e $IN; +die("ERROR - fast_align binary does not exist: $BIN") unless -e $BIN; + +chomp(my $line_count = `cat $IN | wc -l`); + +# not more than maximal number of lines -> just run it regulary +if ($MAX_LINES > $line_count) { + my $cmd = "$BIN -i $IN $SETTINGS"; + $cmd .= " -r" if defined($REVERSE); + safesystem($cmd) or die; + exit(0); +} + +my $cmd = "mkdir -p $TMP"; +safesystem($cmd) or die; + +# split input +$cmd = "split -a 2 -l $MAX_LINES $IN $TMP/prepared-"; +safesystem($cmd) or die; + +# process +my @INPUT_FILES = `ls $TMP/prepared-*`; +chop(@INPUT_FILES); +foreach my $input_file (@INPUT_FILES) { + # create output file name + die("ERROR") unless $input_file =~ /prepared-(..)$/; + my $output_file = "$TMP/aligned-$1"; + + # process part + my $cmd = "$BIN -i $input_file $SETTINGS"; + $cmd .= " -r" if defined($REVERSE); + $cmd .= " >$output_file"; + safesystem($cmd) or die; + die("ERROR: no output produced from command $cmd") unless -e $output_file; + + # check line count + chomp(my $input_line_count = `cat $input_file | wc -l`); + chomp(my $output_line_count = `cat $output_file | wc -l`); + die("ERROR: mismatched number of lines in part $1\n\t$input_line_count\t$input_file\n\t$output_line_count\t$output_file\n") unless $input_line_count == $output_line_count; +} + +# join output +$cmd = "cat $TMP/aligned-*"; +safesystem($cmd) or die; + +$cmd = "rm -r $TMP/* ; rmdir $TMP"; +safesystem($cmd); + +sub safesystem { + print STDERR "Executing: @_\n"; + system(@_); + if ($? == -1) { + print STDERR "Failed to execute: @_\n $!\n"; + exit(1); + } + elsif ($? & 127) { + printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit 1; + } + else { + my $exitcode = $? >> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return ! $exitcode; + } +} + diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl index 0b248be7e..fd7fb2552 100755 --- a/scripts/ems/support/generic-parallelizer.perl +++ b/scripts/ems/support/generic-parallelizer.perl @@ -4,7 +4,7 @@ use warnings; use strict; my $jobs = 20; -my ($infile,$outfile,$cmd,$tmpdir); +my ($infile,$outfile,$cmd,$tmpdir,$qflags); use Getopt::Long qw(:config pass_through no_ignore_case); GetOptions('jobs=i' => \$jobs, @@ -12,7 +12,7 @@ GetOptions('jobs=i' => \$jobs, 'in=s' => \$infile, 'out=s' => \$outfile, 'cmd=s' => \$cmd, - 'queue-flags=s' => \$qflags, + 'queue-flags=s' => \$qflags, ) or exit(1); die("ERROR: specify infile with -in") unless $infile; diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl index eadca6263..f36d2d9e0 100755 --- a/scripts/ems/support/lmplz-wrapper.perl +++ b/scripts/ems/support/lmplz-wrapper.perl @@ -7,11 +7,12 @@ use Getopt::Long "GetOptions"; Getopt::Long::config("no_auto_abbrev"); Getopt::Long::config("pass_through"); - -my ($TEXT,$ORDER,$BIN,$LM); +my ($TEXT,$ORDER,$BIN,$LM,$MEMORY,$TMPDIR); &GetOptions('text=s' => \$TEXT, 'lm=s' => \$LM, + 'S=s' => \$MEMORY, + 'T=s' => \$TMPDIR, 'bin=s' => \$BIN, 'order=i' => \$ORDER); @@ -19,8 +20,9 @@ die("ERROR: specify at least --bin BIN --text CORPUS --lm LM and --order N!") unless defined($BIN) && defined($TEXT) && defined($LM) && defined($ORDER); my $settings = join(' ', @ARGV); -#print STDERR "settngs=$settings \n"; my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM $settings"; -print "exec: $cmd\n"; +$cmd .= " -T $TMPDIR" if defined($TMPDIR); +$cmd .= " -S $MEMORY" if defined($MEMORY); +print STDERR "Executing: $cmd\n"; `$cmd`; diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl index 2e433f291..ef64d4c2d 100755 --- a/scripts/ems/support/report-experiment-scores.perl +++ b/scripts/ems/support/report-experiment-scores.perl @@ -20,6 +20,9 @@ $TYPE{"bolt-bleu-c"} = "BLEU-c"; $TYPE{"bolt-ter"} = "TER"; $TYPE{"bolt-ter-c"} = "TER-c"; +$TYPE{"multi-bleu-detok"} = "BLEU"; +$TYPE{"multi-bleu-c-detok"}= "BLEU-c"; + my %SCORE; my %AVERAGE; foreach (@ARGV) { @@ -59,7 +62,8 @@ sub process { elsif ($type eq 'ibm-bleu' || $type eq 'ibm-bleu-c') { $SCORE{$set} .= &extract_ibm_bleu($file,$type)." "; } - elsif ($type eq 'multi-bleu' || $type eq 'multi-bleu-c') { + elsif ($type eq 'multi-bleu' || $type eq 'multi-bleu-c' + || $type eq 'multi-bleu-detok' || $type eq 'multi-bleu-c-detok') { $SCORE{$set} .= &extract_multi_bleu($file,$type)." "; } elsif ($type eq 'meteor') { diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index ebfe6639a..fe5666a8b 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -6,11 +6,17 @@ use warnings; use strict; use File::Basename; -use Cwd 'abs_path'; sub RunFork($); sub systemCheck($); sub NumStr($); +sub DigitStr($); +sub CharStr($); + +my $is_osx = ($^O eq "darwin"); + +my $alph = "abcdefghijklmnopqrstuvwxyz"; +my @alph = (split(//,$alph)); print "Started ".localtime() ."\n"; @@ -33,6 +39,7 @@ my $baselineExtract; my $glueFile; my $phraseOrientation = 0; my $phraseOrientationPriorsFile; +my $splitCmdOption="-d"; my $GZIP_EXEC; if(`which pigz`) { @@ -63,13 +70,15 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i) $phraseOrientationPriorsFile = $ARGV[++$i]; next; } + $splitCmdOption="",next if $ARGV[$i] eq "--NoNumericSuffix"; $otherExtractArgs .= $ARGV[$i] ." "; } my $cmd; my $TMPDIR=dirname($extract) ."/tmp.$$"; -$cmd = "mkdir -p $TMPDIR"; +$cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR"; +print STDERR "Executing: $cmd \n"; `$cmd`; my $totalLines = int(`cat $align | wc -l`); @@ -82,20 +91,20 @@ my $pid; if ($numParallel > 1) { - $cmd = "$splitCmd -d -l $linesPerSplit -a 7 $target $TMPDIR/target."; + $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $target $TMPDIR/target."; $pid = RunFork($cmd); push(@children, $pid); - $cmd = "$splitCmd -d -l $linesPerSplit -a 7 $source $TMPDIR/source."; + $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $source $TMPDIR/source."; $pid = RunFork($cmd); push(@children, $pid); - $cmd = "$splitCmd -d -l $linesPerSplit -a 7 $align $TMPDIR/align."; + $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $align $TMPDIR/align."; $pid = RunFork($cmd); push(@children, $pid); if ($weights) { - $cmd = "$splitCmd -d -l $linesPerSplit -a 7 $weights $TMPDIR/weights."; + $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $weights $TMPDIR/weights."; $pid = RunFork($cmd); push(@children, $pid); } @@ -110,21 +119,17 @@ else { my $numStr = NumStr(0); - $cmd = "ln -s ".abs_path($target)." $TMPDIR/target.$numStr"; - print STDERR "Executing: $cmd \n"; + $cmd = "ln -s $target $TMPDIR/target.$numStr"; `$cmd`; - $cmd = "ln -s ".abs_path($source)." $TMPDIR/source.$numStr"; - print STDERR "Executing: $cmd \n"; + $cmd = "ln -s $source $TMPDIR/source.$numStr"; `$cmd`; - $cmd = "ln -s ".abs_path($align)." $TMPDIR/align.$numStr"; - print STDERR "Executing: $cmd \n"; + $cmd = "ln -s $align $TMPDIR/align.$numStr"; `$cmd`; if ($weights) { - $cmd = "ln -s ".abs_path($weights)." $TMPDIR/weights.$numStr"; - print STDERR "Executing: $cmd \n"; + $cmd = "ln -s $weights $TMPDIR/weights.$numStr"; `$cmd`; } } @@ -150,8 +155,8 @@ for (my $i = 0; $i < $numParallel; ++$i) print "glueArg=$glueArg \n"; my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n"; + `$cmd`; - safesystem($cmd) or die; exit(); } else @@ -163,10 +168,6 @@ for (my $i = 0; $i < $numParallel; ++$i) # wait for everything is finished foreach (@children) { waitpid($_, 0); - if($? != 0) { - print STDERR "ERROR: Failed to execute: @_\n $!\n"; - exit(1); - } } # merge @@ -268,7 +269,6 @@ if ($phraseOrientation && defined($phraseOrientationPriorsFile)) { # delete temporary files $cmd = "rm -rf $TMPDIR \n"; -print STDERR $cmd; `$cmd`; print STDERR "Finished ".localtime() ."\n"; @@ -301,7 +301,7 @@ sub systemCheck($) } } -sub NumStr($) +sub DigitStr($) { my $i = shift; my $numStr; @@ -329,22 +329,30 @@ sub NumStr($) return $numStr; } -sub safesystem { - print STDERR "Executing: @_\n"; - system(@_); - if ($? == -1) { - print STDERR "ERROR: Failed to execute: @_\n $!\n"; - exit(1); - } - elsif ($? & 127) { - printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", - ($? & 127), ($? & 128) ? 'with' : 'without'; - exit(1); - } - else { - my $exitcode = $? >> 8; - print STDERR "Exit code: $exitcode\n" if $exitcode; - return ! $exitcode; - } +sub CharStr($) +{ + my $i = shift; + my $charStr; + my @bit=(); + + while ($i>0){ + push @bit, $i%26; + $i=int($i/26); + } + my $offset=scalar(@bit); + my $h; + for ($h=6;$h>=$offset;--$h) { $charStr.="a"; } + for ($h=$offset-1;$h>=0;--$h) { $charStr.="$alph[$bit[$h]]"; } + return $charStr; +} + +sub NumStr($) +{ + my $i = shift; + if ($is_osx){ + return CharStr($i); + }else{ + return DigitStr($i); + } } diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 86084abbf..a7263d4bd 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -76,7 +76,7 @@ my $___N_BEST_LIST_SIZE = 100; my $___LATTICE_SAMPLES = 0; my $queue_flags = "-hard"; # extra parameters for parallelizer # the -l ws0ssmt was relevant only to JHU 2006 workshop -my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial) +my $___JOBS = undef; # if parallel, number of jobs to use (undef or <= 0 -> serial) my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder my $continue = 0; # should we try to continue from the last saved step? my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert) @@ -544,7 +544,7 @@ if ($__PROMIX_TRAINING) { my $___FILTER_F = $___DEV_F; $___FILTER_F = $filterfile if (defined $filterfile); my $cmd = "$filtercmd ./$filtered_path $filtered_config $___FILTER_F"; - &submit_or_exec($cmd, "filterphrases_$i.out", "filterphrases_$i.err"); + &submit_or_exec($cmd, "filterphrases_$i.out", "filterphrases_$i.err", 1); push (@_PROMIX_TABLES_BIN,"$filtered_path/phrase-table.0-0.1.1"); } } @@ -559,7 +559,7 @@ if ($___FILTER_PHRASE_TABLE) { my $___FILTER_F = $___DEV_F; $___FILTER_F = $filterfile if (defined $filterfile); my $cmd = "$filtercmd ./$outdir $___CONFIG $___FILTER_F"; - &submit_or_exec($cmd, "filterphrases.out", "filterphrases.err"); + &submit_or_exec($cmd, "filterphrases.out", "filterphrases.err", 1); } # make a backup copy of startup ini filepath @@ -829,7 +829,7 @@ while (1) { # remove segmentation $cmd .= " -l $__REMOVE_SEGMENTATION" if $__PROMIX_TRAINING; $cmd = &create_extractor_script($cmd, $___WORKING_DIR); - &submit_or_exec($cmd, "extract.out","extract.err"); + &submit_or_exec($cmd, "extract.out","extract.err", 1); } # Create the initial weights file for mert: init.opt @@ -919,11 +919,11 @@ while (1) { my $pro_optimizer_cmd = "$pro_optimizer $megam_default_options run$run.pro.data"; if ($___PAIRWISE_RANKED_OPTIMIZER) { # pro optimization $cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer_cmd"; - &submit_or_exec($cmd, $mert_outfile, $mert_logfile); + &submit_or_exec($cmd, $mert_outfile, $mert_logfile, 1); } elsif ($___PRO_STARTING_POINT) { # First, run pro, then mert # run pro... my $pro_cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer_cmd"; - &submit_or_exec($pro_cmd, "run$run.pro.out", "run$run.pro.err"); + &submit_or_exec($pro_cmd, "run$run.pro.out", "run$run.pro.err", 1); # ... get results ... ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.pro.out","run$run.pro.err",scalar @{$featlist->{"names"}},\%sparse_weights, \@promix_weights); # Get the pro outputs ready for mert. Add the weight ranges, @@ -951,11 +951,11 @@ while (1) { # ... and run mert $cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/; - &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile); + &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile, ($__THREADS ? $__THREADS : 1) ); } elsif ($___BATCH_MIRA) { # batch MIRA optimization safesystem("echo 'not used' > $weights_out_file") or die; $cmd = "$mert_mira_cmd $mira_settings $seed_settings $pro_file_settings -o $mert_outfile"; - &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile); + &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile, 1); } elsif ($___HG_MIRA) { safesystem("echo 'not used' > $weights_out_file") or die; $mira_settings .= " --type hypergraph "; @@ -963,7 +963,7 @@ while (1) { $mira_settings .= " --hgdir $hypergraph_dir "; #$mira_settings .= "--verbose "; $cmd = "$mert_mira_cmd $mira_settings $seed_settings -o $mert_outfile"; - &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile); + &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile, 1); } elsif ($__PROMIX_TRAINING) { # PRO trained mixture model safesystem("echo 'not used' > $weights_out_file") or die; @@ -972,10 +972,10 @@ while (1) { $cmd .= join(" ", map {"-p $_"} @_PROMIX_TABLES_BIN); $cmd .= " -i $___DEV_F"; print "Starting promix optimisation at " . `date`; - &submit_or_exec($cmd, "$mert_outfile", $mert_logfile); + &submit_or_exec($cmd, "$mert_outfile", $mert_logfile, 1); print "Finished promix optimisation at " . `date`; } else { # just mert - &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile); + &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile, ($__THREADS ? $__THREADS : 1) ); } die "Optimization failed, file $weights_out_file does not exist or is empty" @@ -1283,7 +1283,7 @@ sub run_decoder { $lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES "; } - if (defined $___JOBS && $___JOBS > 0) { + if (defined $___JOBS && $___JOBS > 1) { die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA; $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG"; $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); @@ -1378,9 +1378,9 @@ sub get_featlist_from_moses { print STDERR "Asking moses for feature names and values from $___CONFIG\n"; my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn"; $cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); - $cmd .= " -show-weights > $featlistfn"; + $cmd .= " -show-weights"; print STDERR "Executing: $cmd\n"; - safesystem($cmd) or die "Failed to run moses with the config $configfn"; + &submit_or_exec($cmd, $featlistfn, "/dev/null", 1); } return get_featlist_from_file($featlistfn); } @@ -1706,10 +1706,14 @@ sub ensure_full_path { } sub submit_or_exec { - my ($cmd, $stdout, $stderr) = @_; + my ($cmd, $stdout, $stderr, $threads) = @_; print STDERR "exec: $cmd\n"; - if (defined $___JOBS && $___JOBS > 0) { - safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=$stdout -stderr=$stderr" ) + if (defined $___JOBS && $___JOBS > 1) { + # request fewer CPU slots, if not needed + my $queue_flags_for_this_command = $queue_flags; + $threads = 1 unless defined($threads); + $queue_flags_for_this_command =~ s/(\-pe smp) \d+/$1 $threads/; + safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags_for_this_command\" -stdout=$stdout -stderr=$stderr" ) or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)"; } else { safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'."; diff --git a/scripts/training/wrappers/madamira-tok.perl b/scripts/training/wrappers/madamira-tok.perl index bc7e55d43..00639b7a7 100755 --- a/scripts/training/wrappers/madamira-tok.perl +++ b/scripts/training/wrappers/madamira-tok.perl @@ -16,6 +16,7 @@ my $KEEP_TMP = 0; my $MADA_DIR; my $CONFIG; my $SCHEME; +my $USE_PARALLEL = 1; my $FACTORS_STR; my @FACTORS; @@ -26,7 +27,8 @@ GetOptions( "mada-dir=s" => \$MADA_DIR, "factors=s" => \$FACTORS_STR, "config=s" => \$CONFIG, - "scheme=s" => \$SCHEME + "scheme=s" => \$SCHEME, + "use-parallel=i" => \$USE_PARALLEL ) or die("ERROR: unknown options"); die("must have -scheme arg") unless defined($SCHEME); @@ -61,25 +63,36 @@ close(TMP); my $cmd; -# split input file -my $SPLIT_EXEC = `gsplit --help 2>/dev/null`; -if($SPLIT_EXEC) { +if ($USE_PARALLEL) { + # split input file + my $SPLIT_EXEC = `gsplit --help 2>/dev/null`; + if($SPLIT_EXEC) { $SPLIT_EXEC = 'gsplit'; -} -else { + } + else { $SPLIT_EXEC = 'split'; -} + } -$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x"; -`$cmd`; + $cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x"; + `$cmd`; -$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*"; -print STDERR "Executing: $cmd\n"; -`$cmd`; + $cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*"; + print STDERR "Executing: $cmd\n"; + `$cmd`; -$cmd = "cat $TMPDIR/out/x*.$SCHEME.tok > $infile.mada"; -print STDERR "Executing: $cmd\n"; -`$cmd`; + $cmd = "cat $TMPDIR/out/x*.$SCHEME.tok > $infile.mada"; + print STDERR "Executing: $cmd\n"; + `$cmd`; +} +else { + $cmd = "cd $MADA_DIR && java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput $infile -rawoutdir $TMPDIR/out -rawconfig $CONFIG"; + print STDERR "Executing: $cmd\n"; + `$cmd`; + + $cmd = "cat $TMPDIR/out/input.$SCHEME.tok > $infile.mada"; + print STDERR "Executing: $cmd\n"; + `$cmd`; +} # get stuff out of mada output open(MADA_OUT,"<$infile.mada"); diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl index 88d16b3f6..35714271c 100755 --- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl +++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl @@ -3,11 +3,18 @@ use warnings; use strict; -my ($lowercase, $cluster_file,$in,$out,$tmp) = @ARGV; +my ($lowercase,$cluster_file,$in,$out,$tmp) = @ARGV; my $CLUSTER = &read_cluster_from_mkcls($cluster_file); -open(IN,$in) || die("ERROR: could not open input"); +# is $lowercase a script? +if ($lowercase =~ /\//) { + open(IN,"$lowercase < $in|") || die("ERROR: could not open input"); + $lowercase = 0; +} +else { + open(IN,$in) || die("ERROR: could not open input"); +} binmode(IN, ":utf8"); open(OUT,">$out"); binmode(OUT, ":utf8"); @@ -18,6 +25,7 @@ while(<IN>) { s/ $//; my $first = 1; foreach my $word (split) { + # if lowercase is a flag if ($lowercase) { $word = lc($word); } diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl new file mode 100755 index 000000000..db978317e --- /dev/null +++ b/scripts/training/wrappers/make-factor-de-lemma.perl @@ -0,0 +1,33 @@ +#!/usr/bin/perl -w
+
+use strict;
+use Encode;
+use FindBin qw($RealBin);
+
+my ($in,$out,$tmpdir) = @ARGV;
+
+`mkdir -p $tmpdir`;
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`;
+`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`;
+
+open(LOPAR,"$tmpdir/lopar.$$");
+open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl > $out");
+while(<LOPAR>) {
+ chomp;
+ s/ +/ /g;
+ s/^ //;
+ s/ $//;
+ my $first = 1;
+ foreach (split) {
+ die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/;
+ my ($word,$pos,$lemma) = ($1,$2,$3);
+ print OUT " " unless $first;
+ $first = 0;
+ $lemma =~ s/\|.+$//;
+ $lemma = $word if $lemma =~ /^\<.+\>$/;
+ print OUT encode('utf8', decode('iso-8859-1', $lemma));
+ }
+ print OUT "\n";
+}
+close(LOPAR);
+close(OUT);
diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl index 1cc917bce..366a5a76d 100755 --- a/scripts/training/wrappers/make-factor-de-morph.perl +++ b/scripts/training/wrappers/make-factor-de-morph.perl @@ -1,31 +1,31 @@ -#!/usr/bin/env perl
-
-use warnings;
-use strict;
-use Encode;
-use FindBin qw($RealBin);
-my ($in,$out,$tmpdir) = @ARGV;
-
-`mkdir -p $tmpdir`;
-`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`;
-`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`;
-
-open(LOPAR,"$tmpdir/lopar.$$");
-open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl >$out");
-while(<LOPAR>) {
- chomp;
- s/ +/ /g;
- s/^ //;
- s/ $//;
- my $first = 1;
- foreach (split) {
- die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/;
- my ($word,$morph,$lemma) = ($1,$2,$3);
- print OUT " " unless $first;
- $first = 0;
- print OUT encode('utf8', decode('iso-8859-1', $morph));
- }
- print OUT "\n";
-}
-close(LOPAR);
-close(OUT);
+#!/usr/bin/env perl + +use warnings; +use strict; +use Encode; +use FindBin qw($RealBin); +my ($in,$out,$tmpdir) = @ARGV; + +`mkdir -p $tmpdir`; +`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`; +`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`; + +open(LOPAR,"$tmpdir/lopar.$$"); +open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl >$out"); +while(<LOPAR>) { + chomp; + s/ +/ /g; + s/^ //; + s/ $//; + my $first = 1; + foreach (split) { + die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/; + my ($word,$morph,$lemma) = ($1,$2,$3); + print OUT " " unless $first; + $first = 0; + print OUT encode('utf8', decode('iso-8859-1', $morph)); + } + print OUT "\n"; +} +close(LOPAR); +close(OUT); diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl new file mode 100755 index 000000000..749dc1318 --- /dev/null +++ b/scripts/training/wrappers/make-factor-en-porter.perl @@ -0,0 +1,10 @@ +#!/usr/bin/perl -w
+
+use strict;
+use FindBin qw($RealBin);
+
+my ($in,$out,$tmpdir) = @ARGV;
+
+my $porter_in = "$tmpdir/porter-in.$$";
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in > $porter_in`;
+`/home/pkoehn/statmt/bin/porter-stemmer $porter_in | $RealBin/../../tokenizer/escape-special-chars.perl > $out`;
|