Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
author Philipp Koehn <phi@jhu.edu> 2015-05-02 00:47:08 +0300
committer Philipp Koehn <phi@jhu.edu> 2015-05-02 00:47:08 +0300
commit a4a7c14593766ab188e1d6ae1c29e67ed201d412 (patch)
tree fb1325c8d65ee5c231dfb0e24d780c550ad79cee /scripts
parent de6a9bd1b3ed1ecacf4e76f68ee1ef37f21d90b4 (diff)
allow breaking up training data for fast align (to avoid memory blowups for very large corpora)
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/ems/support/fast-align-in-parts.perl 91
1 files changed, 91 insertions, 0 deletions
diff --git a/scripts/ems/support/fast-align-in-parts.perl b/scripts/ems/support/fast-align-in-parts.perl
new file mode 100755
index 000000000..fa501b454
--- /dev/null
+++ b/scripts/ems/support/fast-align-in-parts.perl
@@ -0,0 +1,91 @@
+#!/usr/bin/env perl
+
+#######################
+# Revision history
+#
+# 28 Apr 2015 first version
+
+use warnings;
+use strict;
+use Getopt::Long qw(:config pass_through no_ignore_case permute);
+
+# Runs fast_align on a parallel corpus. If the corpus has at least -max-lines
+# lines it is split into parts of that size, fast_align is run on each part
+# separately, and the per-part output is concatenated to STDOUT in part order.
+
+my ($BIN,$IN,$MAX_LINES,$SETTINGS,$REVERSE,$TMP);
+
+# Command line options:
+#   -bin        path to the fast_align binary
+#   -i          parallel corpus to align
+#   -max-lines  maximum number of corpus lines per fast_align run
+#   -settings   option string appended to every fast_align command line
+#   -r          add -r to every fast_align command line
+#   -tmp        scratch directory for the corpus parts (removed at the end)
+GetOptions('bin=s' => \$BIN,
+           'i=s' => \$IN,
+           'max-lines=i' => \$MAX_LINES,
+           'settings=s' => \$SETTINGS,
+           'r' => \$REVERSE,
+           'tmp=s' => \$TMP,
+          ) or exit(1);
+
+die("ERROR - usage: fast-align-in-parts.perl -bin FAST_ALIGN_BIN -i PARALLEL_CORPUS -max-lines COUNT -settings CONFIG [-r] -tmp TMPDIR")
+  unless defined($BIN) && defined($IN) && defined($SETTINGS) && defined($TMP) && defined($MAX_LINES)
+         && $MAX_LINES > 0;
+die("ERROR - input file does not exist: $IN") unless -e $IN;
+die("ERROR - fast_align binary does not exist: $BIN") unless -e $BIN;
+
+my $line_count = count_lines($IN);
+
+# not more than maximal number of lines -> just run it regularly
+if ($MAX_LINES > $line_count) {
+  my $cmd = fast_align_command($IN);
+  safesystem($cmd) or die("ERROR: command failed: $cmd");
+  exit(0);
+}
+
+# list form: the path reaches mkdir as a single argument, no shell involved
+safesystem('mkdir', '-p', $TMP)
+  or die("ERROR: could not create temporary directory $TMP");
+
+# Remove parts left behind by an earlier, aborted run. Otherwise they would be
+# aligned again and concatenated into the output.
+foreach my $stale_file (list_parts('prepared'), list_parts('aligned')) {
+  unlink($stale_file) or die("ERROR: could not remove stale file $stale_file: $!");
+}
+
+# split input
+safesystem('split', '-a', '2', '-l', $MAX_LINES, $IN, "$TMP/prepared-")
+  or die("ERROR: could not split $IN");
+
+# process
+my @output_files;
+foreach my $input_file (list_parts('prepared')) {
+  # create output file name
+  die("ERROR: unexpected name of corpus part: $input_file")
+    unless $input_file =~ /prepared-(..)$/;
+  my $part = $1; # keep the suffix; $1 is not relied on further down
+  my $output_file = "$TMP/aligned-$part";
+
+  # process part
+  my $cmd = fast_align_command($input_file) . " >" . shell_quote($output_file);
+  safesystem($cmd) or die("ERROR: command failed: $cmd");
+  die("ERROR: no output produced from command $cmd") unless -e $output_file;
+
+  # check line count
+  my $input_line_count = count_lines($input_file);
+  my $output_line_count = count_lines($output_file);
+  die("ERROR: mismatched number of lines in part $part\n\t$input_line_count\t$input_file\n\t$output_line_count\t$output_file\n") unless $input_line_count == $output_line_count;
+
+  push @output_files, $output_file;
+}
+
+# join output
+# (guard: cat without file arguments would wait on STDIN)
+die("ERROR: no corpus parts found in $TMP") unless @output_files;
+safesystem('cat', @output_files) or die("ERROR: could not join aligned parts");
+
+# clean up; best effort, a failure here does not invalidate the output
+my $quoted_tmp = shell_quote($TMP);
+safesystem("rm -r $quoted_tmp/* ; rmdir $quoted_tmp");
+
+# Quote a string so that a POSIX shell reads it as exactly one word.
+sub shell_quote {
+  my ($text) = @_;
+  $text =~ s/'/'\\''/g;
+  return "'$text'";
+}
+
+# Build the fast_align shell command for one input file from -bin, -settings
+# and -r. The binary and the input path are quoted; the settings string is
+# inserted as given so that it can carry several options.
+sub fast_align_command {
+  my ($input_file) = @_;
+  my $cmd = shell_quote($BIN) . " -i " . shell_quote($input_file) . " $SETTINGS";
+  $cmd .= " -r" if defined($REVERSE);
+  return $cmd;
+}
+
+# Count the newline characters in a file (the number "wc -l" reports).
+# Dies if the file cannot be opened or read.
+sub count_lines {
+  my ($path) = @_;
+  open(my $fh, '<', $path) or die("ERROR: cannot read $path: $!");
+  binmode($fh);
+  my $count = 0;
+  my $buffer;
+  while (1) {
+    my $bytes_read = read($fh, $buffer, 1 << 20);
+    die("ERROR: read error on $path: $!") unless defined($bytes_read);
+    last if $bytes_read == 0;
+    $count += ($buffer =~ tr/\n//);
+  }
+  close($fh);
+  return $count;
+}
+
+# Return the sorted paths of all entries in the temporary directory whose
+# name starts with "<prefix>-".
+sub list_parts {
+  my ($prefix) = @_;
+  opendir(my $dir, $TMP) or die("ERROR: cannot read directory $TMP: $!");
+  my @names = sort grep { /^\Q$prefix\E-/ } readdir($dir);
+  closedir($dir);
+  return map { "$TMP/$_" } @names;
+}
+
+# Run an external command, log it to STDERR and report whether it succeeded.
+#
+# The arguments are handed to system() unchanged, so both a single command
+# string and a program-plus-arguments list are accepted.
+#
+# Returns true if the command exited with status 0, false otherwise (the
+# non-zero exit code is logged). Terminates the whole script with exit code 1
+# if the command could not be started or was killed by a signal.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+    print STDERR "Failed to execute: @_\n $!\n";
+    exit(1);
+  }
+  elsif ($? & 127) {
+    # The command text is passed as a %s argument instead of being
+    # interpolated into the format, so a '%' inside the command cannot be
+    # misread as a conversion.
+    printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
+      "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
+    exit 1;
+  }
+  else {
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
+