Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarcin Junczys-Dowmunt <junczys@amu.edu.pl>2016-10-11 02:05:40 +0300
committerMarcin Junczys-Dowmunt <junczys@amu.edu.pl>2016-10-11 02:05:40 +0300
commit674aac8da999c0a48185918064053c2513598659 (patch)
tree45466fe09646edfa07a8ac2121f87396dcb611d3 /scripts
parentf6e0d224700f37e33c6cf7253d4df08c5e1f6271 (diff)
script for creating vocabulary filter files
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/train_vocab_filter.pl88
1 files changed, 88 insertions, 0 deletions
diff --git a/scripts/train_vocab_filter.pl b/scripts/train_vocab_filter.pl
new file mode 100755
index 00000000..f938e62c
--- /dev/null
+++ b/scripts/train_vocab_filter.pl
@@ -0,0 +1,88 @@
+#!/usr/bin/env perl
+
+use strict;
+use Getopt::Long;
+use FindBin qw($Bin);
+use File::Temp qw(tempdir tempfile);
+use POSIX;
+
+# Remember the PID of the process that started the script. execute() sends
+# SIGINT to this PID on failure, so a forked child can bring down the
+# original process as well.
+my $PID = $$;
+
+# Turn TERM/INT/QUIT into a plain die() instead of the default immediate
+# termination.
+# NOTE(review): presumably so exit-time cleanup (e.g. of the temporary
+# directory) still runs -- confirm.
+$SIG{TERM} = $SIG{INT} = $SIG{QUIT} = sub { die; };
+
+# Defaults; each of them can be overridden on the command line.
+my $BINDIR = "$Bin/../build/bin";  # where fast_align, atools, extract_lex are looked up
+my $SRC;                           # source side of the corpus (--source, required)
+my $TRG;                           # target side of the corpus (--target, required)
+my $OUTPUT = "lex";                # prefix of the file names handed to extract_lex
+my $THREADS = 8;                   # exported to fast_align as OMP_NUM_THREADS
+my $PARALLEL = 0;                  # fork one process per alignment direction
+
+# GetOptions() returns false when the command line could not be parsed
+# (unknown option, missing value, non-integer --threads). It has already
+# printed a warning in that case; stop here instead of silently running
+# with defaults.
+GetOptions(
+    "bindir=s" => \$BINDIR,
+    "s|source=s" => \$SRC,
+    "t|target=s" => \$TRG,
+    "o|output=s" => \$OUTPUT,
+    "threads=i" => \$THREADS,
+    "parallel" => \$PARALLEL
+) or die "Error in command line arguments\n";
+
+# Each of these settings must have a value before any work starts. They are
+# checked in this order, so the first missing one determines the error.
+for my $required (
+    [ bindir => $BINDIR ],
+    [ source => $SRC ],
+    [ target => $TRG ],
+    [ output => $OUTPUT ],
+) {
+    my ($option, $value) = @$required;
+    die "--$option arg is required" if not defined $value;
+}
+
+# All three external tools have to exist inside $BINDIR.
+foreach my $tool ('fast_align', 'atools', 'extract_lex') {
+    next if -e "$BINDIR/$tool";
+    die "Could not find $tool in $BINDIR";
+}
+
+# Scratch directory for all intermediate files; CLEANUP => 1 asks File::Temp
+# to remove it again when the program exits.
+my $TEMPDIR = tempdir(CLEANUP => 1);
+
+# Reserve four temporary files inside the scratch directory. Only the names
+# are kept (the handles returned by tempfile() are discarded) because the
+# files are filled through shell redirection.
+my ($CORPUS, $ALN_S2T, $ALN_T2S, $ALN_GDF) =
+    map { (tempfile(DIR => $TEMPDIR))[1] } 1 .. 4;
+
+# Paste source and target together line by line and replace the first tab on
+# each line with " ||| ".
+execute("paste $SRC $TRG | sed 's/\\t/ ||| /' > $CORPUS");
+
+# One fast_align invocation per result file: "-vdo" writes $ALN_S2T and
+# "-vdor" writes $ALN_T2S (presumably the reverse direction -- confirm
+# against the fast_align usage text).
+my @COMMANDS = map {
+    my ($flags, $result) = @$_;
+    "OMP_NUM_THREADS=$THREADS $BINDIR/fast_align $flags -i $CORPUS > $result";
+} ([ "-vdo", $ALN_S2T ], [ "-vdor", $ALN_T2S ]);
+
+# Run the alignment commands: one after another by default, or each in its
+# own forked child process when --parallel was given.
+my @CHILDREN;
+for my $c (@COMMANDS) {
+    if ($PARALLEL) {
+        my $pid = fork();
+        # fork() returns undef on failure. Without this check the parent
+        # would take the child branch below and exit(0) after one command.
+        die "Could not fork for '$c': $!" if not defined $pid;
+        if ($pid == 0) {
+            # Child: run exactly one command, then leave.
+            execute($c);
+            exit(0);
+        }
+        print "Forked process $pid\n";
+        push @CHILDREN, $pid;
+    }
+    else {
+        execute($c);
+    }
+}
+
+# Reap every forked child, not just the first one to finish. The following
+# steps read the output of all alignment runs, so none may still be running.
+for my $pid (@CHILDREN) {
+    my $reaped = waitpid($pid, 0);
+    die "Forked process $pid failed (wait status $?)"
+        if $reaped == $pid and $? != 0;
+}
+
+# Post-process the alignments. The commands run strictly in order; execute()
+# aborts the script on failure, so the second one only runs after the first
+# one succeeded.
+execute($_) for (
+    # Merge both alignment files using atools' "grow-diag-final" command.
+    "$BINDIR/atools -c grow-diag-final -i $ALN_S2T -j $ALN_T2S > $ALN_GDF",
+    # Pass both corpus sides, the merged alignment and the two
+    # $OUTPUT-prefixed file names to extract_lex.
+    "$BINDIR/extract_lex $TRG $SRC $ALN_GDF $OUTPUT.s2t $OUTPUT.t2s",
+);
+
+# Run a command line through system() and abort the whole script if it fails.
+#
+# Parameters:
+#   $command - command string handed to system() unchanged.
+#
+# On success the sub simply returns. On failure it logs the reason, sends
+# SIGINT to the process whose PID was stored in $PID at startup (which may be
+# the parent when running inside a forked child) and dies.
+sub execute {
+    my $command = shift;
+    logMessage("Executing:\t$command");
+    my $ret = system($command);
+    return if $ret == 0;
+
+    # system() returns the raw wait status: -1 if the command could not be
+    # started, otherwise the signal number in the low 7 bits and the exit
+    # code in the high byte. Decode it right away, before $! can change.
+    my $reason = $ret == -1   ? "could not be started: $!"
+               : ($ret & 127) ? "was killed by signal " . ($ret & 127)
+               :                "exited with code " . ($ret >> 8);
+
+    logMessage("Command '$command' $reason (return status $ret)");
+    logMessage("Aborting and killing parent process");
+    kill('INT', $PID);
+    die "Command '$command' $reason\n";
+}
+
+# Write one log line to STDERR: the current local time formatted as
+# month/day/year hour:minute:second, a tab, the given message, a newline.
+#
+# Parameters:
+#   message - text to log (first positional argument).
+sub logMessage {
+    my ($text) = @_;
+    my $stamp = POSIX::strftime("%m/%d/%Y %H:%M:%S", localtime());
+    print STDERR "$stamp\t$text\n";
+}