diff options
author | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2016-10-11 02:05:40 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2016-10-11 02:05:40 +0300 |
commit | 674aac8da999c0a48185918064053c2513598659 (patch) | |
tree | 45466fe09646edfa07a8ac2121f87396dcb611d3 /scripts | |
parent | f6e0d224700f37e33c6cf7253d4df08c5e1f6271 (diff) |
Script for creating vocabulary filter files
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/train_vocab_filter.pl | 88 |
1 files changed, 88 insertions, 0 deletions
#!/usr/bin/env perl

# scripts/train_vocab_filter.pl
#
# Script for creating vocabulary filter files from a parallel corpus.
#
# Pipeline:
#   1. paste the source and target files together as "src ||| trg" lines,
#   2. run fast_align once in each direction (optionally in parallel),
#   3. merge both alignments with "atools -c grow-diag-final",
#   4. call extract_lex, handing it the paths <output>.s2t and <output>.t2s.
#
# Options:
#   --bindir DIR         directory containing fast_align, atools and
#                        extract_lex (default: <script dir>/../build/bin)
#   -s, --source FILE    source side of the corpus (required)
#   -t, --target FILE    target side of the corpus (required)
#   -o, --output PREFIX  prefix of the generated files (default: lex)
#   --threads N          value exported as OMP_NUM_THREADS for every
#                        fast_align run (default: 8)
#   --parallel           run both fast_align jobs at once in forked processes

use strict;
use warnings;
use Getopt::Long;
use FindBin qw($Bin);
use File::Temp qw(tempdir tempfile);
use POSIX;

# Remember the top-level process id so that execute() can signal it even
# when it is running inside a forked child.
my $PID = $$;
$SIG{TERM} = $SIG{INT} = $SIG{QUIT} = sub { die; };

# Unbuffered STDOUT: text still sitting in the buffer when fork() is called
# would otherwise be copied into the child and printed a second time.
$| = 1;

my $BINDIR = "$Bin/../build/bin";
my $SRC;
my $TRG;
my $OUTPUT = "lex";
my $THREADS = 8;
my $PARALLEL = 0;

# Abort on unknown or malformed options instead of silently ignoring them.
GetOptions(
    "bindir=s" => \$BINDIR,
    "s|source=s" => \$SRC,
    "t|target=s" => \$TRG,
    "o|output=s" => \$OUTPUT,
    "threads=i" => \$THREADS,
    "parallel" => \$PARALLEL
) or die "Error in command line arguments\n";

die "--bindir arg is required" if not defined $BINDIR;
die "--source arg is required" if not defined $SRC;
die "--target arg is required" if not defined $TRG;
die "--output arg is required" if not defined $OUTPUT;

# paste sits on the left-hand side of a pipe, so the shell would not report
# its failure; check the inputs up front instead.
die "Cannot read source file $SRC" if not -r $SRC;
die "Cannot read target file $TRG" if not -r $TRG;

for my $app (qw(fast_align atools extract_lex)) {
    die "Could not find executable $app in $BINDIR" if not -x "$BINDIR/$app";
}

# All intermediate files live in a directory that File::Temp removes when
# the script exits.
my $TEMPDIR = tempdir(CLEANUP => 1);

my (undef, $CORPUS) = tempfile( DIR => $TEMPDIR );
my (undef, $ALN_S2T) = tempfile( DIR => $TEMPDIR );
my (undef, $ALN_T2S) = tempfile( DIR => $TEMPDIR );
my (undef, $ALN_GDF) = tempfile( DIR => $TEMPDIR );

# Every path is quoted before it is interpolated into a shell command so
# that names containing spaces or shell metacharacters are passed verbatim.
my ($Q_SRC, $Q_TRG, $Q_CORPUS, $Q_S2T, $Q_T2S, $Q_GDF) =
    map { shellQuote($_) } ($SRC, $TRG, $CORPUS, $ALN_S2T, $ALN_T2S, $ALN_GDF);
my $FAST_ALIGN  = shellQuote("$BINDIR/fast_align");
my $ATOOLS      = shellQuote("$BINDIR/atools");
my $EXTRACT_LEX = shellQuote("$BINDIR/extract_lex");

# "\t" is expanded by Perl here, so sed receives a literal tab character;
# the "\t" escape inside a sed pattern is a GNU extension.
execute("paste $Q_SRC $Q_TRG | sed 's/\t/ ||| /' > $Q_CORPUS");

my @COMMANDS = (
    "OMP_NUM_THREADS=$THREADS $FAST_ALIGN -vdo -i $Q_CORPUS > $Q_S2T",
    "OMP_NUM_THREADS=$THREADS $FAST_ALIGN -vdor -i $Q_CORPUS > $Q_T2S"
);

my @CHILDREN;
for my $c (@COMMANDS) {
    if ($PARALLEL) {
        my $pid = fork();
        # undef means fork() failed; it must not be mistaken for the
        # child's return value 0.
        die "Could not fork: $!" if not defined $pid;
        if ($pid == 0) {
            execute($c);
            exit(0);
        }
        print "Forked process $pid\n";
        push @CHILDREN, $pid;
    }
    else {
        execute($c);
    }
}

# Reap every forked aligner, not just the first one to finish: atools needs
# both alignment files to be complete.
for my $pid (@CHILDREN) {
    waitpid($pid, 0);
    die "Forked process $pid exited with status $?" if $? != 0;
}

execute("$ATOOLS -c grow-diag-final -i $Q_S2T -j $Q_T2S > $Q_GDF");
execute("$EXTRACT_LEX $Q_TRG $Q_SRC $Q_GDF "
    . shellQuote("$OUTPUT.s2t") . " " . shellQuote("$OUTPUT.t2s"));

# Runs $command through the shell and logs it first.
# On a non-zero status from system() the failure is logged, SIGINT is sent
# to the top-level process ($PID) and the current process dies.
sub execute {
    my $command = shift;
    logMessage("Executing:\t$command");
    my $ret = system($command);
    if ($ret != 0) {
        logMessage("Command '$command' finished with return status $ret");
        logMessage("Aborting and killing parent process");
        kill(2, $PID);
        die;
    }
    return;
}

# Returns $string wrapped in single quotes for use as one /bin/sh word;
# embedded single quotes are rewritten as '\''.
sub shellQuote {
    my $string = shift;
    $string =~ s/'/'\\''/g;
    return "'$string'";
}

# Prints $message to STDERR, prefixed with a local timestamp and a tab.
sub logMessage {
    my $message = shift;
    my $time = POSIX::strftime("%m/%d/%Y %H:%M:%S", localtime());
    my $log_message = $time."\t$message\n";
    print STDERR $log_message;
    return;
}