Merge branch 'master' of github.com:moses-smt/mosesdecoder

author: Barry Haddow <barry.haddow@gmail.com> 2015-05-08 11:16:55 +0300
committer: Barry Haddow <barry.haddow@gmail.com> 2015-05-08 11:16:55 +0300
commit: 85c1af4d72686d2bc95960040cd8407b22f3df53 (patch)
tree: 67725a66e32fc9bcd109597b5854067aac01107e /scripts
parent: f403f5e4785361487969ad4865adea14651bfa15 (diff)
parent: 8e6eb067bca1ee4f9d36cb2c305f7ac60b81f230 (diff)
14 files changed, 379 insertions, 132 deletions
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 92e88c0f7..aa9a457bb 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -233,6 +233,8 @@ train
 	template: $lm-training -order $order $settings -text IN -lm OUT
 	error: cannot execute binary file
 	error: unrecognised option
+	not-error: BadDiscountException
+	not-error: To override this error
 randomize
 	in: lm
 	out: rlm
@@ -472,14 +474,32 @@ fast-align
 	in: prepared-data-fast-align
 	out: fast-alignment
 	rerun-on-change: fast-align-settings
+	ignore-if: fast-align-max-lines
 	template: $external-bin-dir/fast_align -i IN $fast-align-settings > OUT
 	default-name: fast-align
 fast-align-inverse
 	in: prepared-data-fast-align
 	out: fast-alignment-inverse
 	rerun-on-change: fast-align-settings
+	ignore-if: fast-align-max-lines
 	template: $external-bin-dir/fast_align -i IN -r $fast-align-settings > OUT
 	default-name: fast-align-inverse
+fast-align-in-parts
+	in: prepared-data-fast-align
+	out: fast-alignment
+	rerun-on-change: fast-align-settings fast-align-max-lines
+	ignore-unless: fast-align-max-lines
+	tmp-name: training/tmp.fast-align
+	template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
+	default-name: fast-align
+fast-align-in-parts-inverse
+	in: prepared-data-fast-align
+	out: fast-alignment-inverse
+	rerun-on-change: fast-align-settings fast-align-max-lines
+	ignore-unless: fast-align-max-lines
+	tmp-name: training/tmp.fast-align-inverse
+	template: $moses-script-dir/ems/support/fast-align-in-parts.perl -bin $external-bin-dir/fast_align -i IN -r -max-lines $fast-align-max-lines -tmp TMP -settings '$fast-align-settings' > OUT
+	default-name: fast-align-inverse
 symmetrize-fast-align
 	in: fast-alignment fast-alignment-inverse corpus-mml-prefilter=OR=corpus
 	out: word-alignment
@@ -1330,6 +1350,24 @@ multi-bleu-c
 	rerun-on-change: multi-bleu-c
 	template: $multi-bleu-c IN1 < IN > OUT
 	final-model: yes
+
+multi-bleu-detok
+        in: detokenized-output tokenized-reference
+        out: multi-bleu-detok-score
+        default-name: evaluation/multi-bleu-detok
+        ignore-unless: multi-bleu-detok
+        rerun-on-change: multi-bleu-detok
+        template: $multi-bleu-detok IN1 < IN > OUT
+        final-model: yes
+multi-bleu-c-detok
+        in: detokenized-output tokenized-reference
+        out: multi-bleu-c-detok-score
+        default-name: evaluation/multi-bleu-c-detok
+        ignore-unless: multi-bleu-c-detok
+        rerun-on-change: multi-bleu-c-detok
+        template: $multi-bleu-c-detok IN1 < IN > OUT
+        final-model: yes
+
 ter
 	in: wrapped-output reference-sgm
 	out: ter-score
@@ -1377,6 +1415,6 @@ analysis-precision
 
 [REPORTING] single
 report
-	in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model EVALUATION:wade-analysis
+	in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:multi-bleu-detok-score EVALUATION:multi-bleu-c-detok-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model EVALUATION:wade-analysis
 	out: report
 	default-name: evaluation/report
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index def5b9a82..5d68e409c 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -312,10 +312,10 @@ sub read_meta {
 		$ONLY_FACTOR_0{"$module:$step"}++;
 	    }
 	    elsif ($1 eq "error") {
-		@{$ERROR{"$module:$step"}} = split(/,/,$2);
+		push @{$ERROR{"$module:$step"}}, $2;
 	    }
 	    elsif ($1 eq "not-error") {
-		@{$NOT_ERROR{"$module:$step"}} = split(/,/,$2);
+		push @{$NOT_ERROR{"$module:$step"}}, $2;
 	    }
 	    else {
 		die("META ERROR unknown parameter: $1");
@@ -1282,10 +1282,10 @@ sub execute_steps {
 		&write_info($i);
 
 		# cluster job submission
-		if ($CLUSTER && ! &is_qsub_script($i)) {
+		if ($CLUSTER && (!&is_qsub_script($i) || (&backoff_and_get($DO_STEP[$i].":jobs") && (&backoff_and_get($DO_STEP[$i].":jobs")==1)))) {
 		    $DO{$i}++;
 		    my $qsub_args = &get_qsub_args($DO_STEP[$i]);		    
-		    print "\texecuting $step via qsub ($active active)\n";
+		    print "\texecuting $step via qsub $qsub_args ($active active)\n";
 		    my $qsub_command="qsub $qsub_args -S /bin/bash -e $step.STDERR -o $step.STDOUT $step";
 		    print "\t$qsub_command\n" if $VERBOSE;
 		    `$qsub_command`;
@@ -1338,15 +1338,15 @@ sub execute_steps {
 
 sub get_qsub_args {
     my ($step) = @_;
-    my $qsub_args = &get("$step:qsub-settings");
-    $qsub_args = &get("GENERAL:qsub-settings") unless defined($qsub_args);
+    my $qsub_args = &backoff_and_get("$step:qsub-settings"); # backoff lookup; presumably also covers the GENERAL setting - TODO confirm
     $qsub_args = "" unless defined($qsub_args);
     my $memory = &get("$step:qsub-memory");
     $qsub_args .= " -pe memory $memory" if defined($memory);
     my $hours = &get("$step:qsub-hours");
     $qsub_args .= " -l h_rt=$hours:0:0" if defined($hours);
     my $project = &backoff_and_get("$step:qsub-project");
-    $qsub_args = "-P $project" if defined($project);
+    $qsub_args .= " -P $project" if defined($project); # append, keeping the flags collected above
+    $qsub_args =~ s/^ //; # drop the leading blank left when the string started out empty
     print "qsub args: $qsub_args\n" if $VERBOSE;
     return $qsub_args;
 }
@@ -1880,7 +1880,7 @@ sub define_tuning_tune {
 
 	my $decoder_settings = &backoff_and_get("TUNING:decoder-settings");
 	$decoder_settings = "" unless $decoder_settings;
-	$decoder_settings .= " -v 0 " unless $CLUSTER && $jobs;
+	$decoder_settings .= " -v 0 " unless $CLUSTER && $jobs && $jobs>1;
 	
 	my $tuning_settings = &backoff_and_get("TUNING:tuning-settings");
 	$tuning_settings = "" unless $tuning_settings;
@@ -1891,9 +1891,9 @@ sub define_tuning_tune {
 	$cmd .= " --skip-decoder" if $skip_decoder;
 	$cmd .= " --inputtype $tune_inputtype" if defined($tune_inputtype);
     
-	my $qsub_args = &get_qsub_args("TUNING");
+	my $qsub_args = &get_qsub_args($DO_STEP[$step_id]);
 	$cmd .= " --queue-flags=\"$qsub_args\"" if ($CLUSTER && $qsub_args);
-	$cmd .= " --jobs $jobs" if $CLUSTER && $jobs;
+	$cmd .= " --jobs $jobs" if $CLUSTER && $jobs && $jobs>1;
 	my $tuning_dir = $tuned_config;
 	$tuning_dir =~ s/\/[^\/]+$//;
 	$cmd .= "\nmkdir -p $tuning_dir";
@@ -2575,6 +2575,7 @@ sub define_training_create_config {
 	    my $set = shift @LM_SETS;
       next if defined($INTERPOLATED_AWAY{$set});
 	    my $order = &check_backoff_and_get("LM:$set:order");
+
 	    my $lm_file = "$lm";
 	    my $type = 0; # default: SRILM
 
@@ -2590,6 +2591,13 @@ sub define_training_create_config {
       # manually set type 
       $type = &backoff_and_get("LM:$set:type") if (&backoff_and_get("LM:$set:type"));
 
+	    # binarized by INTERPOLATED-LM
+	    if (&get("INTERPOLATED-LM:lm-binarizer")) {
+	      $lm_file =~ s/\.lm/\.binlm/;
+	      $type = 1;
+              $type = &get("INTERPOLATED-LM:type") if &get("INTERPOLATED-LM:type");
+            }	
+
 	    # which factor is the model trained on?
 	    my $factor = 0;
 	    if (&backoff_and_get("TRAINING:output-factors") &&
@@ -2695,7 +2703,7 @@ sub define_interpolated_lm_interpolate {
 sub define_interpolated_lm_process {
   my ($step_id) = @_;
 
-  my ($processed_lm, $interpolatd_lm) = &get_output_and_input($step_id);
+  my ($processed_lm, $interpolated_lm) = &get_output_and_input($step_id);
   my ($module,$set,$stepname) = &deconstruct_name($DO_STEP[$step_id]);
   my $tool = &check_backoff_and_get("INTERPOLATED-LM:lm-${stepname}r");
   my $FACTOR = &backoff_and_get_array("TRAINING:output-factors");
@@ -2705,11 +2713,23 @@ sub define_interpolated_lm_process {
   my $cmd = "";
   foreach my $factor (keys %{$ILM_SETS}) {
     foreach my $order (keys %{$$ILM_SETS{$factor}}) {
-      next unless scalar(@{$$ILM_SETS{$factor}{$order}}) > 1;
-      my $suffix = "";
-      $suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
-      $suffix .= ".order$order" if $icount > 1;
-      $cmd .= "$tool $interpolatd_lm$suffix $processed_lm$suffix\n"; 
+      my ($name,$name_processed);
+      if (scalar(@{$$ILM_SETS{$factor}{$order}}) == 1) {
+        # not interpolated -> get name from LM version of these steps
+        my($id,$set) = split(/ /,$$ILM_SETS{$factor}{$order}[0]);
+        $name = &get_default_file("LM",$set,"train"); # well... works for now;
+        $name_processed = $STEP_OUTNAME{"LM:$stepname"};
+        $name_processed =~ s/^(.+\/)([^\/]+)$/$1$set.$2/;
+        $name_processed = &versionize(&long_file_name($name_processed,"lm",""));
+      }
+      else {
+        my $suffix = "";
+        $suffix = ".$$FACTOR[$factor]" if $icount > 1 && defined($FACTOR);
+        $suffix .= ".order$order" if $icount > 1;
+        $name = "$interpolated_lm$suffix";
+        $name_processed = "$processed_lm$suffix";
+      }
+      $cmd .= "$tool $name $name_processed\n"; 
     }
   }
 
@@ -3071,7 +3091,7 @@ sub define_evaluation_decode {
     my $nbest_size;
     $nbest_size = $nbest if $nbest;
     $nbest_size =~ s/[^\d]//g if $nbest;
-    if ($jobs && $CLUSTER) {
+    if ($jobs && $jobs>1 && $CLUSTER) {
 	$cmd .= "mkdir -p $dir/evaluation/tmp.$set.$VERSION\n";
 	$cmd .= "cd $dir/evaluation/tmp.$set.$VERSION\n";
 	if (defined $moses_parallel) {
@@ -3495,9 +3515,15 @@ sub check_backoff_and_get_array {
     return $CONFIG{$parameter} if defined($CONFIG{$parameter});
 
     # remove set -> find setting for module
-    $parameter =~ s/:.*:/:/;
+    $parameter =~ s/:[^:]+:/:/;
     return $CONFIG{$parameter} if defined($CONFIG{$parameter});
 
+    # remove step (if exists)
+    if ($parameter =~ /:[^:]+:/) {
+        $parameter =~ s/:[^:]+:/:/;
+        return $CONFIG{$parameter} if defined($CONFIG{$parameter});
+    }
+
     # remove model -> find global setting
     $parameter =~ s/^[^:]+:/GENERAL:/;
     return $CONFIG{$parameter} if defined($CONFIG{$parameter});
diff --git a/scripts/ems/support/build-sparse-features.perl b/scripts/ems/support/build-sparse-features.perl
index 5d9b786ad..3f4b505d5 100755
--- a/scripts/ems/support/build-sparse-features.perl
+++ b/scripts/ems/support/build-sparse-features.perl
@@ -12,15 +12,17 @@ use strict;
 my ($corpus,$input_extension,$output_extension,$outfile_prefix,$specification) = @ARGV;
 my $ini = "[feature]\n";
 my %ALREADY;
+my %ID;
 
 foreach my $feature_spec (split(/,\s*/,$specification)) {
   my @SPEC = split(/\s+/,$feature_spec);
 
   my $factor = ($SPEC[0] eq 'word-translation') ? "0-0" : "0";
   $factor = $1 if $feature_spec =~ / factor ([\d\-]+)/; 
+  $feature_spec =~ s/ factor ([\d\-]+)//;
 
   if ($SPEC[0] eq 'target-word-insertion') {
-    $ini .= "TargetWordInsertionFeature name=TWI factor=$factor";
+    $ini .= "TargetWordInsertionFeature name=TWI".&get_id($SPEC[0])." factor=$factor";
 
     if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
       my $file = &create_top_words($output_extension, $SPEC[2]);
@@ -34,7 +36,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
     $ini .= "\n";
   }
   elsif ($SPEC[0] eq 'source-word-deletion') {
-    $ini .= "SourceWordDeletionFeature name=SWD factor=$factor";
+    $ini .= "SourceWordDeletionFeature name=SWD".&get_id($SPEC[0])." factor=$factor";
     if ($SPEC[1] eq 'top' && $SPEC[2] =~ /^\d+$/) {
       my $file = &create_top_words($input_extension, $SPEC[2]);
       $ini .= " path=$file";
@@ -60,7 +62,7 @@ foreach my $feature_spec (split(/,\s*/,$specification)) {
       die("ERROR: Unknown parameter specification in '$SPEC[1]'\n");
     }
     my ($input_factor,$output_factor) = split(/\-/,$factor);
-    $ini .= "WordTranslationFeature name=WT input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
+    $ini .= "WordTranslationFeature name=WT".&get_id($SPEC[0])." input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
   }
   elsif ($SPEC[0] eq 'phrase-length') {
     $ini .= "PhraseLengthFeature name=PL\n";
@@ -111,3 +113,11 @@ sub create_top_words {
 
   return $file;
 }
+
+# Numeric suffix for a repeated feature name: "" on first use, then 2, 3, ...
+sub get_id {
+  my ($feature) = @_;
+  my $seen = ++$ID{$feature};
+  return $seen == 1 ? "" : $seen;
+}
+
diff --git a/scripts/ems/support/fast-align-in-parts.perl b/scripts/ems/support/fast-align-in-parts.perl
new file mode 100755
index 000000000..fa501b454
--- /dev/null
+++ b/scripts/ems/support/fast-align-in-parts.perl
@@ -0,0 +1,101 @@
+#!/usr/bin/env perl
+
+#######################
+# Revision history
+#
+# 28 Apr 2015 first version
+
+# Runs fast_align on a parallel corpus in chunks of at most -max-lines
+# sentence pairs. Each chunk is aligned separately in the -tmp directory
+# and the alignments are concatenated, in order, on STDOUT. All progress
+# messages go to STDERR so that STDOUT carries only alignment output.
+
+use warnings;
+use strict;
+use Getopt::Long qw(:config pass_through no_ignore_case permute);
+
+my ($BIN,$IN,$MAX_LINES,$SETTINGS,$REVERSE,$TMP);
+
+GetOptions('bin=s' => \$BIN,
+           'i=s' => \$IN,
+           'max-lines=i' => \$MAX_LINES,
+           'settings=s' => \$SETTINGS,
+           'r' => \$REVERSE,
+           'tmp=s' => \$TMP,
+          ) or exit(1);
+
+die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR")
+  unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP) && defined($MAX_LINES)
+      && $MAX_LINES > 0;
+die("ERROR - input file does not exist: $IN") unless -e $IN;
+die("ERROR - fast_align binary does not exist: $BIN") unless -e $BIN;
+
+chomp(my $line_count = `cat $IN | wc -l`);
+
+# corpus fits into a single part -> just run it regularly
+if ($MAX_LINES >= $line_count) {
+  my $cmd = "$BIN -i $IN $SETTINGS";
+  $cmd .= " -r" if defined($REVERSE);
+  safesystem($cmd) or die("ERROR: command failed: $cmd");
+  exit(0);
+}
+
+my $cmd = "mkdir -p $TMP";
+safesystem($cmd) or die("ERROR: command failed: $cmd");
+
+# split input into parts named prepared-aa, prepared-ab, ...
+$cmd = "split -a 2 -l $MAX_LINES $IN $TMP/prepared-";
+safesystem($cmd) or die("ERROR: command failed: $cmd");
+
+# process
+my @INPUT_FILES = `ls $TMP/prepared-*`;
+# chomp, not chop: only strip the trailing newline that ls emits
+chomp(@INPUT_FILES);
+die("ERROR: split produced no parts in $TMP") unless @INPUT_FILES;
+foreach my $input_file (@INPUT_FILES) {
+  # create output file name
+  die("ERROR: unexpected part file name: $input_file") unless $input_file =~ /prepared-(..)$/;
+  # keep the suffix in a variable; $1 would be clobbered by any later match
+  my $part = $1;
+  my $output_file = "$TMP/aligned-$part";
+
+  # process part
+  my $cmd = "$BIN -i $input_file $SETTINGS";
+  $cmd .= " -r" if defined($REVERSE);
+  $cmd .= " >$output_file";
+  safesystem($cmd) or die("ERROR: command failed: $cmd");
+  die("ERROR: no output produced from command $cmd") unless -e $output_file;
+
+  # check line count
+  chomp(my $input_line_count = `cat $input_file | wc -l`);
+  chomp(my $output_line_count = `cat $output_file | wc -l`);
+  die("ERROR: mismatched number of lines in part $part\n\t$input_line_count\t$input_file\n\t$output_line_count\t$output_file\n") unless $input_line_count == $output_line_count;
+}
+
+# join output (relies on the shell expanding the glob in sorted suffix order)
+$cmd = "cat $TMP/aligned-*";
+safesystem($cmd) or die("ERROR: command failed: $cmd");
+
+# best-effort cleanup; a failure here does not invalidate the output
+$cmd = "rm -r $TMP/* ; rmdir $TMP";
+safesystem($cmd);
+
+# Run a command, echoing it to STDERR; true iff it exits 0 (script exits on launch failure/signal).
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+    print STDERR "Failed to execute: @_\n  $!\n";
+    exit(1);
+  }
+  if (my $signal = $? & 127) {
+    # pass the command as an argument so a '%' in it is not read as a format directive
+    printf STDERR "Execution of: %s\n  died with signal %d, %s coredump\n",
+      "@_", $signal, ($? & 128) ? 'with' : 'without';
+    exit 1;
+  }
+  my $exitcode = $? >> 8;
+  print STDERR "Exit code: $exitcode\n" if $exitcode;
+  return ! $exitcode;
+}
+
diff --git a/scripts/ems/support/generic-parallelizer.perl b/scripts/ems/support/generic-parallelizer.perl
index 0b248be7e..fd7fb2552 100755
--- a/scripts/ems/support/generic-parallelizer.perl
+++ b/scripts/ems/support/generic-parallelizer.perl
@@ -4,7 +4,7 @@ use warnings;
 use strict;
 
 my $jobs = 20;
-my ($infile,$outfile,$cmd,$tmpdir);
+my ($infile,$outfile,$cmd,$tmpdir,$qflags);
 
 use Getopt::Long qw(:config pass_through no_ignore_case);
 GetOptions('jobs=i' => \$jobs,
@@ -12,7 +12,7 @@ GetOptions('jobs=i' => \$jobs,
 	   'in=s' => \$infile,
 	   'out=s' => \$outfile,
 	   'cmd=s' => \$cmd,
-       'queue-flags=s' => \$qflags,
+	   'queue-flags=s' => \$qflags,
 	   ) or exit(1);
 
 die("ERROR: specify infile with -in") unless $infile;
diff --git a/scripts/ems/support/lmplz-wrapper.perl b/scripts/ems/support/lmplz-wrapper.perl
index eadca6263..f36d2d9e0 100755
--- a/scripts/ems/support/lmplz-wrapper.perl
+++ b/scripts/ems/support/lmplz-wrapper.perl
@@ -7,11 +7,12 @@ use Getopt::Long "GetOptions";
 Getopt::Long::config("no_auto_abbrev");
 Getopt::Long::config("pass_through");
 
-
-my ($TEXT,$ORDER,$BIN,$LM);
+my ($TEXT,$ORDER,$BIN,$LM,$MEMORY,$TMPDIR);
 
 &GetOptions('text=s' => \$TEXT,
 	    'lm=s' => \$LM,
+	    'S=s' => \$MEMORY,
+	    'T=s' => \$TMPDIR,
             'bin=s' => \$BIN,
 	    'order=i' => \$ORDER);
 
@@ -19,8 +20,9 @@ die("ERROR: specify at least --bin BIN --text CORPUS --lm LM and --order N!")
   unless defined($BIN) && defined($TEXT) && defined($LM) && defined($ORDER);
 
 my $settings = join(' ', @ARGV);
-#print STDERR "settngs=$settings \n";
 
 my $cmd = "$BIN --text $TEXT --order $ORDER --arpa $LM $settings";
-print "exec: $cmd\n";
+$cmd .= " -T $TMPDIR" if defined($TMPDIR);
+$cmd .= " -S $MEMORY" if defined($MEMORY);
+print STDERR "Executing: $cmd\n";
 `$cmd`;
diff --git a/scripts/ems/support/report-experiment-scores.perl b/scripts/ems/support/report-experiment-scores.perl
index 2e433f291..ef64d4c2d 100755
--- a/scripts/ems/support/report-experiment-scores.perl
+++ b/scripts/ems/support/report-experiment-scores.perl
@@ -20,6 +20,9 @@ $TYPE{"bolt-bleu-c"}   = "BLEU-c";
 $TYPE{"bolt-ter"}      = "TER";
 $TYPE{"bolt-ter-c"}    = "TER-c";
 
+$TYPE{"multi-bleu-detok"}  = "BLEU";
+$TYPE{"multi-bleu-c-detok"}= "BLEU-c";
+
 my %SCORE;
 my %AVERAGE;
 foreach (@ARGV) {
@@ -59,7 +62,8 @@ sub process {
     elsif ($type eq 'ibm-bleu' || $type eq 'ibm-bleu-c') {
 	$SCORE{$set} .= &extract_ibm_bleu($file,$type)." ";
     }
-    elsif ($type eq 'multi-bleu' || $type eq 'multi-bleu-c') {
+    elsif ($type eq 'multi-bleu' || $type eq 'multi-bleu-c'
+	|| $type eq 'multi-bleu-detok' || $type eq 'multi-bleu-c-detok') {
 	$SCORE{$set} .= &extract_multi_bleu($file,$type)." ";
     }
     elsif ($type eq 'meteor') {
diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl
index ebfe6639a..fe5666a8b 100755
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@@ -6,11 +6,17 @@
 use warnings;
 use strict;
 use File::Basename;
-use Cwd 'abs_path';
 
 sub RunFork($);
 sub systemCheck($);
 sub NumStr($);
+sub DigitStr($);
+sub CharStr($);
+
+my $is_osx = ($^O eq "darwin");
+
+my $alph = "abcdefghijklmnopqrstuvwxyz";
+my @alph = (split(//,$alph));
 
 print "Started ".localtime() ."\n";
 
@@ -33,6 +39,7 @@ my $baselineExtract;
 my $glueFile;
 my $phraseOrientation = 0;
 my $phraseOrientationPriorsFile;
+my $splitCmdOption="-d";
 
 my $GZIP_EXEC;
 if(`which pigz`) {
@@ -63,13 +70,15 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i)
     $phraseOrientationPriorsFile = $ARGV[++$i];
     next;
   }
+  $splitCmdOption="",next if $ARGV[$i] eq "--NoNumericSuffix";
 
   $otherExtractArgs .= $ARGV[$i] ." ";
 }
 
 my $cmd;
 my $TMPDIR=dirname($extract)  ."/tmp.$$";
-$cmd = "mkdir -p $TMPDIR";
+$cmd = "mkdir -p $TMPDIR; ls -l $TMPDIR";
+print STDERR "Executing: $cmd \n";
 `$cmd`;
 
 my $totalLines = int(`cat $align | wc -l`);
@@ -82,20 +91,20 @@ my $pid;
 
 if ($numParallel > 1)
 {
-	$cmd = "$splitCmd -d -l $linesPerSplit -a 7 $target $TMPDIR/target.";
+	$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $target $TMPDIR/target.";
 	$pid = RunFork($cmd);
 	push(@children, $pid);
 	
-	$cmd = "$splitCmd -d -l $linesPerSplit -a 7 $source $TMPDIR/source.";
+	$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $source $TMPDIR/source.";
 	$pid = RunFork($cmd);
 	push(@children, $pid);
 
-	$cmd = "$splitCmd -d -l $linesPerSplit -a 7 $align $TMPDIR/align.";
+	$cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $align $TMPDIR/align.";
 	$pid = RunFork($cmd);
 	push(@children, $pid);
 
   if ($weights) {
-    $cmd = "$splitCmd -d -l $linesPerSplit -a 7 $weights $TMPDIR/weights.";
+    $cmd = "$splitCmd $splitCmdOption -l $linesPerSplit -a 7 $weights $TMPDIR/weights.";
     $pid = RunFork($cmd);
     push(@children, $pid);
   }
@@ -110,21 +119,17 @@ else
 {
   my $numStr = NumStr(0);
 
-  $cmd = "ln -s ".abs_path($target)." $TMPDIR/target.$numStr";
-	print STDERR "Executing: $cmd \n";
+  $cmd = "ln -s $target $TMPDIR/target.$numStr";
 	`$cmd`;
 
-  $cmd = "ln -s ".abs_path($source)." $TMPDIR/source.$numStr";
-	print STDERR "Executing: $cmd \n";
+  $cmd = "ln -s $source $TMPDIR/source.$numStr";
 	`$cmd`;
 
-  $cmd = "ln -s ".abs_path($align)." $TMPDIR/align.$numStr";
-	print STDERR "Executing: $cmd \n";
+  $cmd = "ln -s $align $TMPDIR/align.$numStr";
 	`$cmd`;
 
   if ($weights) {
-    $cmd = "ln -s ".abs_path($weights)." $TMPDIR/weights.$numStr";
-    print STDERR "Executing: $cmd \n";
+    $cmd = "ln -s $weights $TMPDIR/weights.$numStr";
     `$cmd`;
   }
 }
@@ -150,8 +155,8 @@ for (my $i = 0; $i < $numParallel; ++$i)
     print "glueArg=$glueArg \n";
 
     my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $glueArg $otherExtractArgs $weightsCmd --SentenceOffset ".($i*$linesPerSplit)." 2>> /dev/stderr \n";
+    `$cmd`;
 
-    safesystem($cmd) or die;
     exit();
   }
   else
@@ -163,10 +168,6 @@ for (my $i = 0; $i < $numParallel; ++$i)
 # wait for everything is finished
 foreach (@children) {
 	waitpid($_, 0);
-        if($? != 0) {
-                print STDERR "ERROR: Failed to execute: @_\n  $!\n";
-                exit(1);
-        }
 }
 
 # merge
@@ -268,7 +269,6 @@ if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
 
 # delete temporary files
 $cmd = "rm -rf $TMPDIR \n";
-print STDERR $cmd;
 `$cmd`;
 
 print STDERR "Finished ".localtime() ."\n";
@@ -301,7 +301,7 @@ sub systemCheck($)
   }
 }
 
-sub NumStr($)
+sub DigitStr($)
 {
     my $i = shift;
     my $numStr;
@@ -329,22 +329,30 @@ sub NumStr($)
     return $numStr;
 }
 
-sub safesystem {
-  print STDERR "Executing: @_\n";
-  system(@_);
-  if ($? == -1) {
-      print STDERR "ERROR: Failed to execute: @_\n  $!\n";
-      exit(1);
-  }
-  elsif ($? & 127) {
-      printf STDERR "ERROR: Execution of: @_\n  died with signal %d, %s coredump\n",
-          ($? & 127),  ($? & 128) ? 'with' : 'without';
-      exit(1);
-  }
-  else {
-    my $exitcode = $? >> 8;
-    print STDERR "Exit code: $exitcode\n" if $exitcode;
-    return ! $exitcode;
-  }
+# Encode $i as a base-26 string over the letters in @alph,
+# left-padded with "a" to a minimum width of 7 characters.
+sub CharStr($)
+{
+    my $value = shift;
+    my @digits;    # least significant base-26 digit first
+    while ($value>0){
+        push @digits, $value%26;
+        $value=int($value/26);
+    }
+    my $padding = 7 - scalar(@digits);
+    $padding = 0 if $padding < 0;
+    my $charStr = "a" x $padding;
+    $charStr .= $alph[$_] foreach reverse @digits;
+    return $charStr;
+}
+
+# Suffix string for part number $i: CharStr() when $is_osx is set,
+# DigitStr() otherwise.
+sub NumStr($)
+{
+    my $part = shift;
+    return $is_osx
+        ? CharStr($part)
+        : DigitStr($part);
 }
 
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 86084abbf..a7263d4bd 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -76,7 +76,7 @@ my $___N_BEST_LIST_SIZE = 100;
 my $___LATTICE_SAMPLES = 0;
 my $queue_flags = "-hard";  # extra parameters for parallelizer
       # the -l ws0ssmt was relevant only to JHU 2006 workshop
-my $___JOBS = undef; # if parallel, number of jobs to use (undef or 0 -> serial)
+my $___JOBS = undef; # if parallel, number of jobs to use (undef or <= 0 -> serial)
 my $___DECODER_FLAGS = ""; # additional parametrs to pass to the decoder
 my $continue = 0; # should we try to continue from the last saved step?
 my $skip_decoder = 0; # and should we skip the first decoder run (assuming we got interrupted during mert)
@@ -544,7 +544,7 @@ if ($__PROMIX_TRAINING) {
     my $___FILTER_F  = $___DEV_F;
     $___FILTER_F = $filterfile if (defined $filterfile);
     my $cmd = "$filtercmd ./$filtered_path $filtered_config $___FILTER_F";
-    &submit_or_exec($cmd, "filterphrases_$i.out", "filterphrases_$i.err");
+    &submit_or_exec($cmd, "filterphrases_$i.out", "filterphrases_$i.err", 1);
     push (@_PROMIX_TABLES_BIN,"$filtered_path/phrase-table.0-0.1.1");
   }
 }
@@ -559,7 +559,7 @@ if ($___FILTER_PHRASE_TABLE) {
     my $___FILTER_F  = $___DEV_F;
     $___FILTER_F = $filterfile if (defined $filterfile);
     my $cmd = "$filtercmd ./$outdir $___CONFIG $___FILTER_F";
-    &submit_or_exec($cmd, "filterphrases.out", "filterphrases.err");
+    &submit_or_exec($cmd, "filterphrases.out", "filterphrases.err", 1);
   }
 
   # make a backup copy of startup ini filepath
@@ -829,7 +829,7 @@ while (1) {
     # remove segmentation
     $cmd .= " -l $__REMOVE_SEGMENTATION" if  $__PROMIX_TRAINING;
     $cmd = &create_extractor_script($cmd, $___WORKING_DIR);
-    &submit_or_exec($cmd, "extract.out","extract.err");
+    &submit_or_exec($cmd, "extract.out","extract.err", 1);
   }
 
   # Create the initial weights file for mert: init.opt
@@ -919,11 +919,11 @@ while (1) {
   my $pro_optimizer_cmd = "$pro_optimizer $megam_default_options run$run.pro.data";
   if ($___PAIRWISE_RANKED_OPTIMIZER) {  # pro optimization
     $cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; echo 'not used' > $weights_out_file; $pro_optimizer_cmd";
-    &submit_or_exec($cmd, $mert_outfile, $mert_logfile);
+    &submit_or_exec($cmd, $mert_outfile, $mert_logfile, 1);
   } elsif ($___PRO_STARTING_POINT) {  # First, run pro, then mert
     # run pro...
     my $pro_cmd = "$mert_pro_cmd $proargs $seed_settings $pro_file_settings -o run$run.pro.data ; $pro_optimizer_cmd";
-    &submit_or_exec($pro_cmd, "run$run.pro.out", "run$run.pro.err");
+    &submit_or_exec($pro_cmd, "run$run.pro.out", "run$run.pro.err", 1);
     # ... get results ...
     ($bestpoint,$devbleu) = &get_weights_from_mert("run$run.pro.out","run$run.pro.err",scalar @{$featlist->{"names"}},\%sparse_weights, \@promix_weights);
     # Get the pro outputs ready for mert. Add the weight ranges,
@@ -951,11 +951,11 @@ while (1) {
 
     # ... and run mert
     $cmd =~ s/(--ifile \S+)/$1,run$run.init.pro/;
-    &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile);
+    &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile, ($__THREADS ? $__THREADS : 1) );
   } elsif ($___BATCH_MIRA) { # batch MIRA optimization
     safesystem("echo 'not used' > $weights_out_file") or die;
     $cmd = "$mert_mira_cmd $mira_settings $seed_settings $pro_file_settings -o $mert_outfile";
-    &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile);
+    &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile, 1);
   } elsif ($___HG_MIRA) {
     safesystem("echo 'not used' > $weights_out_file") or die;
     $mira_settings .= " --type hypergraph ";
@@ -963,7 +963,7 @@ while (1) {
     $mira_settings .= " --hgdir $hypergraph_dir ";
     #$mira_settings .= "--verbose "; 
     $cmd = "$mert_mira_cmd $mira_settings $seed_settings -o $mert_outfile";
-    &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile);
+    &submit_or_exec($cmd, "run$run.mira.out", $mert_logfile, 1);
   } elsif ($__PROMIX_TRAINING) {
     # PRO trained  mixture model
     safesystem("echo 'not used' > $weights_out_file") or die;
@@ -972,10 +972,10 @@ while (1) {
     $cmd .= join(" ", map {"-p $_"} @_PROMIX_TABLES_BIN);
     $cmd .= " -i $___DEV_F";
     print "Starting promix optimisation at " . `date`;
-    &submit_or_exec($cmd, "$mert_outfile", $mert_logfile);
+    &submit_or_exec($cmd, "$mert_outfile", $mert_logfile, 1);
     print "Finished promix optimisation at " . `date`;
   } else {  # just mert
-    &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile);
+    &submit_or_exec($cmd . $mert_settings, $mert_outfile, $mert_logfile, ($__THREADS ? $__THREADS : 1) );
   } 
 
   die "Optimization failed, file $weights_out_file does not exist or is empty"
@@ -1283,7 +1283,7 @@ sub run_decoder {
       $lsamp_cmd = " -lattice-samples $lsamp_filename $___LATTICE_SAMPLES ";
     }
 
-    if (defined $___JOBS && $___JOBS > 0) {
+    if (defined $___JOBS && $___JOBS > 1) {
       die "Hypergraph mira not supported by moses-parallel" if $___HG_MIRA;
       $decoder_cmd = "$moses_parallel_cmd $pass_old_sge -config $___CONFIG";
       $decoder_cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE); 
@@ -1378,9 +1378,9 @@ sub get_featlist_from_moses {
     print STDERR "Asking moses for feature names and values from $___CONFIG\n";
     my $cmd = "$___DECODER $___DECODER_FLAGS -config $configfn";
     $cmd .= " -inputtype $___INPUTTYPE" if defined($___INPUTTYPE);
-    $cmd .= " -show-weights > $featlistfn";
+    $cmd .= " -show-weights";
     print STDERR "Executing: $cmd\n";
-    safesystem($cmd) or die "Failed to run moses with the config $configfn";
+    &submit_or_exec($cmd, $featlistfn, "/dev/null", 1);
   }
   return get_featlist_from_file($featlistfn);
 }
@@ -1706,10 +1706,14 @@ sub ensure_full_path {
 }
 
 sub submit_or_exec {
-  my ($cmd, $stdout, $stderr) = @_;
+  my ($cmd, $stdout, $stderr, $threads) = @_;
   print STDERR "exec: $cmd\n";
-  if (defined $___JOBS && $___JOBS > 0) {
-    safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=$stdout -stderr=$stderr" )
+  if (defined $___JOBS && $___JOBS > 1) {
+    # request fewer CPU slots, if not needed
+    my $queue_flags_for_this_command = $queue_flags;
+    $threads = 1 unless defined($threads);
+    $queue_flags_for_this_command =~ s/(\-pe smp) \d+/$1 $threads/;
+    safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags_for_this_command\" -stdout=$stdout -stderr=$stderr" )
       or die "ERROR: Failed to submit '$cmd' (via $qsubwrapper)";
   } else {
     safesystem("$cmd > $stdout 2> $stderr") or die "ERROR: Failed to run '$cmd'.";
diff --git a/scripts/training/wrappers/madamira-tok.perl b/scripts/training/wrappers/madamira-tok.perl
index bc7e55d43..00639b7a7 100755
--- a/scripts/training/wrappers/madamira-tok.perl
+++ b/scripts/training/wrappers/madamira-tok.perl
@@ -16,6 +16,7 @@ my $KEEP_TMP = 0;
 my $MADA_DIR;
 my $CONFIG;
 my $SCHEME;
+my $USE_PARALLEL = 1;
 
 my $FACTORS_STR;
 my @FACTORS;
@@ -26,7 +27,8 @@ GetOptions(
   "mada-dir=s" => \$MADA_DIR,
   "factors=s" => \$FACTORS_STR,
   "config=s" => \$CONFIG,
-  "scheme=s" => \$SCHEME
+  "scheme=s" => \$SCHEME,
+  "use-parallel=i" => \$USE_PARALLEL
     ) or die("ERROR: unknown options");
 
 die("must have -scheme arg") unless defined($SCHEME);
@@ -61,25 +63,36 @@ close(TMP);
 
 my $cmd;
 
-# split input file
-my $SPLIT_EXEC = `gsplit --help 2>/dev/null`; 
-if($SPLIT_EXEC) {
+if ($USE_PARALLEL) {
+  # split input file
+  my $SPLIT_EXEC = `gsplit --help 2>/dev/null`; 
+  if($SPLIT_EXEC) {
     $SPLIT_EXEC = 'gsplit';
-}
-else {
+  }
+  else {
     $SPLIT_EXEC = 'split';
-}
+  }
 
-$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d  $TMPDIR/input $TMPDIR/split/x";
-`$cmd`;
+  $cmd = "$SPLIT_EXEC -l 10000 -a 7 -d  $TMPDIR/input $TMPDIR/split/x";
+  `$cmd`;
 
-$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir  $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*";
-print STDERR "Executing: $cmd\n";
-`$cmd`;
+  $cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir  $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*";
+  print STDERR "Executing: $cmd\n";
+  `$cmd`;
 
-$cmd = "cat $TMPDIR/out/x*.$SCHEME.tok > $infile.mada";
-print STDERR "Executing: $cmd\n";
-`$cmd`;
+  $cmd = "cat $TMPDIR/out/x*.$SCHEME.tok > $infile.mada";
+  print STDERR "Executing: $cmd\n";
+  `$cmd`;
+}
+else {
+  $cmd = "cd $MADA_DIR && java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput $infile -rawoutdir $TMPDIR/out -rawconfig $CONFIG";
+  print STDERR "Executing: $cmd\n";
+  `$cmd`;
+
+  $cmd = "cat $TMPDIR/out/input.$SCHEME.tok > $infile.mada";
+  print STDERR "Executing: $cmd\n";
+  `$cmd`;
+}
 
 # get stuff out of mada output
 open(MADA_OUT,"<$infile.mada");
diff --git a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
index 88d16b3f6..35714271c 100755
--- a/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
+++ b/scripts/training/wrappers/make-factor-brown-cluster-mkcls.perl
@@ -3,11 +3,18 @@
 use warnings;
 use strict;
 
-my ($lowercase, $cluster_file,$in,$out,$tmp) = @ARGV;
+my ($lowercase,$cluster_file,$in,$out,$tmp) = @ARGV;
 
 my $CLUSTER = &read_cluster_from_mkcls($cluster_file);
 
-open(IN,$in) || die("ERROR: could not open input");
+# is $lowercase a script?
+if ($lowercase =~ /\//) {
+  open(IN,"$lowercase < $in|") || die("ERROR: could not open input");
+  $lowercase = 0;
+}
+else {
+  open(IN,$in) || die("ERROR: could not open input");
+}
 binmode(IN, ":utf8");
 open(OUT,">$out");
 binmode(OUT, ":utf8");
@@ -18,6 +25,7 @@ while(<IN>) {
   s/ $//;
   my $first = 1;
   foreach my $word (split) {
+    # if lowercase is a flag
     if ($lowercase) {
       $word = lc($word);
     }
diff --git a/scripts/training/wrappers/make-factor-de-lemma.perl b/scripts/training/wrappers/make-factor-de-lemma.perl
new file mode 100755
index 000000000..db978317e
--- /dev/null
+++ b/scripts/training/wrappers/make-factor-de-lemma.perl
@@ -0,0 +1,49 @@
+#!/usr/bin/perl -w
+
+# Emit a lemma factor for German text: every token of the LoPar tagger
+# output is replaced by its lemma and the result is written to OUT.
+#
+# Usage: make-factor-de-lemma.perl IN OUT TMPDIR
+#
+# NOTE(review): unicode2latin1.perl and run-lopar-tagger.perl are referenced
+# by absolute, site-specific paths under /home/pkoehn; confirm they exist on
+# the machine running this script.
+
+use strict;
+use Encode;
+use FindBin qw($RealBin);
+
+die("ERROR: syntax: make-factor-de-lemma.perl IN OUT TMPDIR") unless scalar(@ARGV) >= 3;
+my ($in,$out,$tmpdir) = @ARGV;
+
+`mkdir -p $tmpdir`;
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`;
+`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`;
+
+# If the tagger output is missing, one of the steps above failed; stop here
+# instead of silently writing an empty factor file.
+open(my $lopar_fh, '<', "$tmpdir/lopar.$$")
+    or die("ERROR: could not open tagger output $tmpdir/lopar.$$: $!");
+open(my $out_fh, '|-', "$RealBin/../../tokenizer/escape-special-chars.perl > $out")
+    or die("ERROR: could not start escape-special-chars.perl: $!");
+while(my $line = <$lopar_fh>) {
+    chomp($line);
+    my $first = 1;
+    # split(' ') ignores leading/trailing whitespace and collapses runs of it
+    foreach my $token (split(' ', $line)) {
+        # each token is matched as word_pos_lemma
+        die("ERROR: choked on token '$token'") unless $token =~ /^(.+)_([^_]+)_(.+)$/;
+        my ($word,$lemma) = ($1,$3);
+        print $out_fh " " unless $first;
+        $first = 0;
+        # drop everything from the first '|' on (presumably alternative lemmas)
+        $lemma =~ s/\|.+$//;
+        # an angle-bracketed lemma is replaced by the surface word
+        $lemma = $word if $lemma =~ /^\<.+\>$/;
+        # the text was converted to Latin-1 before tagging; emit UTF-8 again
+        print $out_fh encode('utf8', decode('iso-8859-1', $lemma));
+    }
+    print $out_fh "\n";
+}
+close($lopar_fh);
+close($out_fh) or die("ERROR: escape-special-chars.perl failed while writing $out");
diff --git a/scripts/training/wrappers/make-factor-de-morph.perl b/scripts/training/wrappers/make-factor-de-morph.perl
index 1cc917bce..366a5a76d 100755
--- a/scripts/training/wrappers/make-factor-de-morph.perl
+++ b/scripts/training/wrappers/make-factor-de-morph.perl
@@ -1,31 +1,46 @@
-#!/usr/bin/env perl 
-
-use warnings;
-use strict;
-use Encode;
-use FindBin qw($RealBin);
-my ($in,$out,$tmpdir) = @ARGV;
-
-`mkdir -p $tmpdir`;
-`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`;
-`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`;
-
-open(LOPAR,"$tmpdir/lopar.$$");
-open(OUT,"|$RealBin/../../tokenizer/escape-special-chars.perl >$out");
-while(<LOPAR>) {
-    chomp;
-    s/ +/ /g;
-    s/^ //;
-    s/ $//;
-    my $first = 1;
-    foreach (split) {
-        die("ERROR: choked on token '$_'") unless /^(.+)_([^_]+)_(.+)$/;
-        my ($word,$morph,$lemma) = ($1,$2,$3);
-	print OUT " " unless $first;
-	$first = 0;
-	print OUT encode('utf8', decode('iso-8859-1', $morph));
-    }
-    print OUT "\n";
-}
-close(LOPAR);
-close(OUT);
+#!/usr/bin/env perl
+
+# Emit a morphology factor for German text: every token of the LoPar tagger
+# output is replaced by its middle field and the result is written to OUT.
+#
+# Usage: make-factor-de-morph.perl IN OUT TMPDIR
+#
+# NOTE(review): unicode2latin1.perl and run-lopar-tagger.perl are referenced
+# by absolute, site-specific paths under /home/pkoehn; confirm they exist on
+# the machine running this script.
+
+use warnings;
+use strict;
+use Encode;
+use FindBin qw($RealBin);
+
+die("ERROR: syntax: make-factor-de-morph.perl IN OUT TMPDIR") unless scalar(@ARGV) >= 3;
+my ($in,$out,$tmpdir) = @ARGV;
+
+`mkdir -p $tmpdir`;
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in | /home/pkoehn/statmt/bin/unicode2latin1.perl > $tmpdir/tok.$$`;
+`/home/pkoehn/statmt/bin/run-lopar-tagger.perl $tmpdir/tok.$$ $tmpdir/lopar.$$`;
+
+# If the tagger output is missing, one of the steps above failed; stop here
+# instead of silently writing an empty factor file.
+open(my $lopar_fh, '<', "$tmpdir/lopar.$$")
+    or die("ERROR: could not open tagger output $tmpdir/lopar.$$: $!");
+open(my $out_fh, '|-', "$RealBin/../../tokenizer/escape-special-chars.perl >$out")
+    or die("ERROR: could not start escape-special-chars.perl: $!");
+while(my $line = <$lopar_fh>) {
+    chomp($line);
+    my $first = 1;
+    # split(' ') ignores leading/trailing whitespace and collapses runs of it
+    foreach my $token (split(' ', $line)) {
+        # each token is matched as word_morph_lemma; only the middle field is used
+        die("ERROR: choked on token '$token'") unless $token =~ /^(.+)_([^_]+)_(.+)$/;
+        my $morph = $2;
+        print $out_fh " " unless $first;
+        $first = 0;
+        # the text was converted to Latin-1 before tagging; emit UTF-8 again
+        print $out_fh encode('utf8', decode('iso-8859-1', $morph));
+    }
+    print $out_fh "\n";
+}
+close($lopar_fh);
+close($out_fh) or die("ERROR: escape-special-chars.perl failed while writing $out");
diff --git a/scripts/training/wrappers/make-factor-en-porter.perl b/scripts/training/wrappers/make-factor-en-porter.perl
new file mode 100755
index 000000000..749dc1318
--- /dev/null
+++ b/scripts/training/wrappers/make-factor-en-porter.perl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl -w
+
+# Emit a Porter-stem factor for English text.
+# De-escapes IN, runs it through the porter-stemmer binary and re-escapes
+# the result into OUT.
+#
+# Usage: make-factor-en-porter.perl IN OUT TMPDIR
+#
+# NOTE(review): porter-stemmer is referenced by an absolute, site-specific
+# path under /home/pkoehn; confirm it exists on the machine running this.
+
+use strict;
+use FindBin qw($RealBin);
+
+die("ERROR: syntax: make-factor-en-porter.perl IN OUT TMPDIR") unless scalar(@ARGV) >= 3;
+my ($in,$out,$tmpdir) = @ARGV;
+
+# the intermediate file lives in $tmpdir, so make sure the directory exists
+`mkdir -p $tmpdir`;
+
+my $porter_in = "$tmpdir/porter-in.$$";
+`$RealBin/../../tokenizer/deescape-special-chars.perl < $in > $porter_in`;
+`/home/pkoehn/statmt/bin/porter-stemmer $porter_in | $RealBin/../../tokenizer/escape-special-chars.perl > $out`;
author	Barry Haddow <barry.haddow@gmail.com>	2015-05-08 11:16:55 +0300
committer	Barry Haddow <barry.haddow@gmail.com>	2015-05-08 11:16:55 +0300
commit	85c1af4d72686d2bc95960040cd8407b22f3df53 (patch)
tree	67725a66e32fc9bcd109597b5854067aac01107e /scripts
parent	f403f5e4785361487969ad4865adea14651bfa15 (diff)
parent	8e6eb067bca1ee4f9d36cb2c305f7ac60b81f230 (diff)