#!/usr/bin/env perl # $Id$ ####################### # Revision history # # 02 Aug 2006 added strict requirement # 01 Aug 2006 fix bug about inputfile parameter # fix bug about suffix index generation # 31 Jul 2006 added parameter for reading queue parameters # 29 Jul 2006 added code to handling consfusion networks # 28 Jul 2006 added a better policy for removing jobs from the queue in case of killing signal (CTRL-C) # added the parameter -qsub-prefix which sets the prefix for the name of submitted jobs # 27 Jul 2006 added safesystem() function and other checks to handle with process failure # added checks for existence of decoder and configuration file # 26 Jul 2006 fix a bug related to the use of absolute path for srcfile and nbestfile use strict; ####################### #Customizable parameters #parameters for submiiting processes through Sun GridEngine my $queueparameters=""; #SGE-make-it-work: # my $queueparameters="-l q1w -l mem_free=0.5G -hard"; # my $queueparameters="-l q1dm -pe pe_mth 2 -hard"; # etc. # look for the correct pwdcmd my $pwdcmd = getPwdCmd(); my $workingdir = `$pwdcmd`; chomp $workingdir; my $tmpdir="$workingdir/tmp$$"; my $splitpfx="split$$"; $SIG{'INT'} = \&kill_all_and_quit; # catch exception for CTRL-C ####################### #Default parameters my $jobscript="$workingdir/job$$"; my $qsubout="$workingdir/out.job$$"; my $qsuberr="$workingdir/err.job$$"; my $mosesparameters=""; my $feed_moses_via_stdin = 0; # a workaround, for a reason, the default "-input-file X" blocks # my moses, while "< X" works fine. my $cfgfile=""; #configuration file my $version=undef; my $help=0; my $dbg=0; my $jobs=4; my $mosescmd="$ENV{MOSESBIN}/moses"; #decoder in use my $inputlist=undef; my $inputfile=undef; my $inputtype=0; my @nbestlist=(); my $nbestlist=undef; my $nbestfile=undef; my $oldnbestfile=undef; my $oldnbest=undef; my $nbestflag=0; my @wordgraphlist=(); my $wordgraphlist=undef; my $wordgraphfile=undef; my $wordgraphflag=0; my $robust=5; # resubmit crashed jobs robust-times my $alifile=undef; my $detailsfile=undef; my $logfile=""; my $logflag=""; my $searchgraphlist=""; my $searchgraphfile=""; my $searchgraphflag=0; my $qsubname="MOSES"; my $old_sge = 0; # assume old Sun Grid Engine (<6.0) where qsub does not # implement -sync and -b #SGE-make-it-work: # my $old_sge = "1"; ####################### # Command line options processing sub init(){ use Getopt::Long qw(:config pass_through no_ignore_case permute); GetOptions('version'=>\$version, 'help'=>\$help, 'debug'=>\$dbg, 'jobs=i'=>\$jobs, 'decoder=s'=> \$mosescmd, 'robust=i' => \$robust, 'decoder-parameters=s'=> \$mosesparameters, 'feed-decoder-via-stdin'=> \$feed_moses_via_stdin, 'logfile=s'=> \$logfile, 'i|inputfile|input-file=s'=> \$inputlist, 'n-best-list=s'=> \$nbestlist, 'n-best-file=s'=> \$oldnbestfile, 'n-best-size=i'=> \$oldnbest, 'output-search-graph|osg=s'=> \$searchgraphlist, 'output-word-graph|owg=s'=> \$wordgraphlist, 'alignment-output-file=s'=> \$alifile, 'translation-details|T=s'=> \$detailsfile, 'qsub-prefix=s'=> \$qsubname, 'queue-parameters=s'=> \$queueparameters, 'inputtype=i'=> \$inputtype, 'config|f=s'=>\$cfgfile, 'old-sge' => \$old_sge, ) or exit(1); getNbestParameters(); getSearchGraphParameters(); getWordGraphParameters(); getLogParameters(); #print_parameters(); #print STDERR "nbestflag:$nbestflag\n"; #print STDERR "searchgraphflag:$searchgraphflag\n"; print STDERR "wordgraphflag:$wordgraphflag\n"; #print STDERR "inputlist:$inputlist\n"; chomp($inputfile=`basename $inputlist`) if defined($inputlist); $mosesparameters.="@ARGV -config $cfgfile -inputtype $inputtype"; } ####################### ##print version sub version(){ # print STDERR "version 1.0 (15-07-2006)\n"; # print STDERR "version 1.1 (17-07-2006)\n"; # print STDERR "version 1.2 (18-07-2006)\n"; # print STDERR "version 1.3 (21-07-2006)\n"; # print STDERR "version 1.4 (26-07-2006)\n"; # print STDERR "version 1.5 (27-07-2006)\n"; # print STDERR "version 1.6 (28-07-2006)\n"; # print STDERR "version 1.7 (29-07-2006)\n"; # print STDERR "version 1.8 (31-07-2006)\n"; # print STDERR "version 1.9 (01-08-2006)\n"; # print STDERR "version 1.10 (02-08-2006)\n"; # print STDERR "version 1.11 (10-10-2006)\n"; # print STDERR "version 1.12 (27-12-2006)\n"; print STDERR "version 1.13 (29-12-2006)\n"; exit(1); } #usage sub usage(){ print STDERR "moses-parallel.pl [parallel-options] [moses-options]\n"; print STDERR "Options marked (*) are required.\n"; print STDERR "Parallel options:\n"; print STDERR "* -decoder Moses decoder to use\n"; print STDERR "* -i|inputfile|input-file the input text to translate\n"; print STDERR "* -jobs number of required jobs\n"; print STDERR " -logfile file where storing log files of all jobs\n"; print STDERR " -qsub-prefix name for sumbitte jobs\n"; print STDERR " -queue-parameters specific requirements for queue\n"; print STDERR " -old-sge Assume Sun Grid Engine < 6.0\n"; print STDERR " -debug debug\n"; print STDERR " -version print version of the script\n"; print STDERR " -help this help\n"; print STDERR "Moses options:\n"; print STDERR " -inputtype <0|1|2> 0 for text, 1 for confusion networks, 2 for lattices\n"; print STDERR " -output-search-graph (osg) : Output connected hypotheses of search into specified filename\n"; print STDERR " -output-word-graph (osg) ' <0|1>': Output stack info as word graph. Takes filename, 0=only hypos in stack, 1=stack + nbest hypos\n"; print STDERR " IMPORTANT NOTE: use single quote to group parameters of -output-word-graph\n"; print STDERR " This is different from standard moses\n"; print STDERR " -n-best-list ' [distinct]' where\n"; print STDERR " : file where storing nbest lists\n"; print STDERR " : size of nbest lists\n"; print STDERR " distinct: (optional) to activate generation of distinct nbest alternatives\n"; print STDERR " IMPORTANT NOTE: use single quote to group parameters of -n-best-list\n"; print STDERR " This is different from standard moses\n"; print STDERR " IMPORTANT NOTE: The following two parameters are now OBSOLETE, and they are no more supported\n"; print STDERR " -n-best-file file where storing nbet lists\n"; print STDERR " -n-best-size size of nbest lists\n"; print STDERR " NOTE: -n-best-file-n-best-size are passed to the decoder as \"-n-best-list \"\n"; print STDERR "* -config (f) configuration file\n"; print STDERR " -decoder-parameters specific parameters for the decoder\n"; print STDERR "All other options are passed to Moses\n"; print STDERR " (This way to pass parameters is maintained for back compatibility\n"; print STDERR " but preferably use -decoder-parameters)\n"; exit(1); } #printparameters sub print_parameters(){ print STDERR "Inputfile: $inputlist\n"; print STDERR "Configuration file: $cfgfile\n"; print STDERR "Decoder in use: $mosescmd\n"; print STDERR "Number of jobs:$jobs\n"; print STDERR "Nbest list: $nbestlist\n" if ($nbestflag); print STDERR "Output Search Graph: $searchgraphlist\n" if ($searchgraphflag); print STDERR "Output Word Graph: $wordgraphlist\n" if ($wordgraphflag); print STDERR "LogFile:$logfile\n" if ($logflag); print STDERR "Qsub name: $qsubname\n"; print STDERR "Queue parameters: $queueparameters\n"; print STDERR "Inputtype: text\n" if $inputtype == 0; print STDERR "Inputtype: confusion network\n" if $inputtype == 1; print STDERR "Inputtype: lattices\n" if $inputtype == 2; print STDERR "parameters directly passed to Moses: $mosesparameters\n"; } #get parameters for log file sub getLogParameters(){ if ($logfile){ $logflag=1; } } #get parameters for nbest computation (possibly from configuration file) sub getNbestParameters(){ if (!$nbestlist){ open (CFG, "$cfgfile"); while (chomp($_=)){ if (/^\[n-best-list\]/){ my $tmp; while (chomp($tmp=)){ last if $tmp eq "" || $tmp=~/^\[/; $nbestlist .= "$tmp "; } last; } } close(CFG); } if ($nbestlist){ if ($oldnbestfile){ print STDERR "There is a conflict between NEW parameter -n-best-list and OBSOLETE parameter -n-best-file\n"; print STDERR "Please use only -nbest-list ' [distinct]\n"; exit; } } else{ if ($oldnbestfile){ print STDERR "You are using the OBSOLETE parameter -n-best-file\n"; print STDERR "Next time please use only -n-best-list ' [distinct]\n"; $nbestlist="$oldnbestfile"; if ($oldnbest){ $nbestlist.=" $oldnbest"; } else { $nbestlist.=" 1"; } } } if ($nbestlist){ my @tmp=split(/[ \t]+/,$nbestlist); @nbestlist = @tmp; if ($nbestlist[0] eq '-'){ $nbestfile="nbest"; } else{ chomp($nbestfile=`basename $nbestlist[0]`); } $nbestflag=1; } } #get parameters for search graph computation (possibly from configuration file) sub getSearchGraphParameters(){ if (!$searchgraphlist){ open (CFG, "$cfgfile"); while (chomp($_=)){ if (/^\[output-search-graph\]/ || /^\[osg\]/){ my $tmp; while (chomp($tmp=)){ last if $tmp eq "" || $tmp=~/^\[/; $searchgraphlist = "$tmp"; } last; } } close(CFG); } if ($searchgraphlist){ if ($searchgraphlist eq '-'){ $searchgraphfile="searchgraph"; } else{ chomp($searchgraphfile=`basename $searchgraphlist`); } $searchgraphflag=1; } } #get parameters for word graph computation (possibly from configuration file) sub getWordGraphParameters(){ if (!$wordgraphlist){ open (CFG, "$cfgfile"); while (chomp($_=)){ if (/^\[output-word-graph\]/ || /^\[owg\]/){ my $tmp; while (chomp($tmp=)){ last if $tmp eq "" || $tmp=~/^\[/; $wordgraphlist .= "$tmp "; } last; } } close(CFG); } if ($wordgraphlist){ my @tmp=split(/[ \t]+/,$wordgraphlist); @wordgraphlist = @tmp; if ($wordgraphlist[0] eq '-'){ $wordgraphfile="wordgraph"; } else{ chomp($wordgraphfile=`basename $wordgraphlist[0]`); } $wordgraphflag=1; } } ####################### #Script starts here init(); version() if $version; usage() if $help; if (!defined $inputlist || !defined $mosescmd || ! defined $cfgfile) { print STDERR "Please specify -input-file, -decoder and -config\n"; usage(); } #checking if inputfile exists if (! -e ${inputlist} ){ print STDERR "Inputfile ($inputlist) does not exists\n"; usage(); } #checking if decoder exists if (! -e $mosescmd) { print STDERR "Decoder ($mosescmd) does not exists\n"; usage(); } #checking if configfile exists if (! -e $cfgfile) { print STDERR "Configuration file ($cfgfile) does not exists\n"; usage(); } print_parameters(); # so that people know exit(1) if $dbg; # debug mode: just print and do not run #splitting test file in several parts #$decimal="-d"; #split does not accept this options (on MAC OS) my $decimal=""; my $cmd; my $sentenceN; my $splitN; my @idxlist=(); if ($inputtype==0){ #text input #getting the number of input sentences (one sentence per line) chomp($sentenceN=`wc -l ${inputlist} | awk '{print \$1}' `); #Reducing the number of jobs if less sentences to translate if ($jobs>$sentenceN){ $jobs=$sentenceN; } #Computing the number of sentences for each files if ($sentenceN % $jobs == 0){ $splitN=int($sentenceN / $jobs); } else{ $splitN=int($sentenceN /$jobs) + 1; } if ($dbg){ print STDERR "There are $sentenceN sentences to translate\n"; print STDERR "There are at most $splitN sentences per job\n"; } $cmd="split $decimal -a 2 -l $splitN $inputlist ${inputfile}.$splitpfx-"; safesystem("$cmd") or die; } elsif ($inputtype==1){ #confusion network input my $tmpfile="/tmp/cnsplit$$"; $cmd="cat $inputlist | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' > $tmpfile"; safesystem("$cmd") or die; #getting the number of input CNs chomp($sentenceN=`wc -l $tmpfile | awk '{print \$1}' `); #Reducing the number of jobs if less CNs to translate if ($jobs>$sentenceN){ $jobs=$sentenceN; } #Computing the number of CNs for each files if ($sentenceN % $jobs == 0){ $splitN=int($sentenceN / $jobs); } else{ $splitN=int($sentenceN /$jobs) + 1; } if ($dbg){ print STDERR "There are $sentenceN confusion networks to translate\n"; print STDERR "There are at most $splitN sentences per job\n"; } $cmd="split $decimal -a 2 -l $splitN $tmpfile $tmpfile-"; safesystem("$cmd") or die; my @idxlist=(); chomp(@idxlist=`ls $tmpfile-*`); grep(s/.+(\-\S+)$/$1/e,@idxlist); foreach my $idx (@idxlist){ $cmd="perl -pe 's/ _CNendline_ /\\n/g;s/ _CNendline_/\\n/g;'"; safesystem("cat $tmpfile$idx | $cmd > ${inputfile}.$splitpfx$idx ; \\rm -f $tmpfile$idx;"); } } elsif ($inputtype==2){ #confusion network input #getting the number of input lattices (one lattice per line) chomp($sentenceN=`wc -l ${inputlist} | awk '{print \$1}' `); #Reducing the number of jobs if less lattices to translate if ($jobs>$sentenceN){ $jobs=$sentenceN; } #Computing the number of sentences for each files if ($sentenceN % $jobs == 0){ $splitN=int($sentenceN / $jobs); } else{ $splitN=int($sentenceN /$jobs) + 1; } if ($dbg){ print STDERR "There are $sentenceN lattices to translate\n"; print STDERR "There are at most $splitN lattices per job\n"; } $cmd="split $decimal -a 2 -l $splitN $inputlist ${inputfile}.$splitpfx-"; safesystem("$cmd") or die; } else{ #unknown input type die "INPUTTYPE:$inputtype is unknown!\n"; } chomp(@idxlist=`ls ${inputfile}.$splitpfx-*`); grep(s/.+(\-\S+)$/$1/e,@idxlist); safesystem("mkdir -p $tmpdir") or die; preparing_script(); #launching process through the queue my @sgepids =(); my @idx_todo = (); foreach (@idxlist) { push @idx_todo,$_; } # loop up to --robust times while ($robust && scalar @idx_todo) { $robust--; my $failure=0; foreach my $idx (@idx_todo){ my $batch_and_join = undef; if ($old_sge) { # old SGE understands -b no as the default and does not understand 'yes' $batch_and_join = "-j y"; } else { $batch_and_join = "-b no -j yes"; } $cmd="qsub $queueparameters $batch_and_join -o $qsubout$idx -e $qsuberr$idx -N $qsubname$idx ${jobscript}${idx}.bash > ${jobscript}${idx}.log 2>&1"; print STDERR "$cmd\n" if $dbg; safesystem($cmd) or die; my ($res,$id); open (IN,"${jobscript}${idx}.log") or die "Can't read id of job ${jobscript}${idx}.log"; chomp($res=); my @arrayStr = split(/\s+/,$res); $id=$arrayStr[2]; die "Failed to guess job id from $jobscript$idx.log, got: $res" if $id !~ /^[0-9]+$/; close(IN); push @sgepids, $id; } #waiting until all jobs have finished my $hj = "-hold_jid " . join(" -hold_jid ", @sgepids); if ($old_sge) { # we need to implement our own waiting script my $syncscript = "${jobscript}.sync_workaround_script.sh"; safesystem("echo 'date' > $syncscript") or kill_all_and_quit(); my $pwd = `$pwdcmd`; chomp $pwd; my $checkpointfile = "${jobscript}.sync_workaround_checkpoint"; # delete previous checkpoint, if left from previous runs safesystem("\\rm -f $checkpointfile") or kill_all_and_quit(); # start the 'hold' job, i.e. the job that will wait $cmd="qsub -cwd $queueparameters $hj -o $checkpointfile -e /dev/null -N $qsubname.W $syncscript 2> $qsubname.W.log"; safesystem($cmd) or kill_all_and_quit(); # and wait for checkpoint file to appear my $nr=0; while (!-e $checkpointfile) { sleep(10); $nr++; print STDERR "w" if $nr % 3 == 0; } print STDERR "End of waiting.\n"; safesystem("\\rm -f $checkpointfile $syncscript") or kill_all_and_quit(); my $failure = 1; my $nr = 0; while ($nr < 60 && $failure) { $nr ++; $failure=&check_exit_status(); if (!$failure) { $failure = check_translation_old_sge(); } last if !$failure; print STDERR "Extra wait ($nr) for possibly unfinished processes.\n"; sleep 10; } } else { # use the -sync option for qsub $cmd="qsub $queueparameters -sync y $hj -j y -o /dev/null -e /dev/null -N $qsubname.W -b y /bin/ls > $qsubname.W.log"; safesystem($cmd) or kill_all_and_quit(); $failure=&check_exit_status(); } kill_all_and_quit() if $failure && !$robust; # check if some translations failed my @idx_still_todo = check_translation(); if ($robust) { # if robust, redo crashed jobs if ((scalar @idx_still_todo) == (scalar @idxlist)) { # ... but not if all crashed print STDERR "everything crashed, not trying to resubmit jobs\n"; $robust = 0; kill_all_and_quit(); } @idx_todo = @idx_still_todo; } else { if (scalar (@idx_still_todo)) { print STDERR "some jobs crashed: ".join(" ",@idx_still_todo)."\n"; kill_all_and_quit(); } } } #concatenating translations and removing temporary files concatenate_1best(); concatenate_logs() if $logflag; concatenate_ali() if defined $alifile; concatenate_details() if defined $detailsfile; concatenate_nbest() if $nbestflag; safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-'; concatenate_searchgraph() if $searchgraphflag; safesystem("cat searchgraph$$ >> /dev/stdout") if $searchgraphlist eq '-'; concatenate_wordgraph() if $wordgraphflag; safesystem("cat wordgraph$$ >> /dev/stdout") if $wordgraphlist[0] eq '-'; remove_temporary_files(); #script creation sub preparing_script(){ my $currStartTranslationId = 0; foreach my $idx (@idxlist){ my $scriptheader=""; $scriptheader.="\#\! /bin/bash\n\n"; # !!! this is useless. qsub ignores the first line of the script. # Pass '-S /bin/bash' to qsub instead. #SGE-make-it-work: # $scriptheader.="## Sun's Grid Engine parameters\n"; # $scriptheader.="# ... (see 'man qsub' for complete documentation)\n"; # $scriptheader.="# ... job name\n"; # $scriptheader.="#\$ -N mert-$qsubname\n"; # $scriptheader.="# ... make sure to use proper shell\n"; # $scriptheader.="#\$ -S /bin/bash\n"; # $scriptheader.="# ... e-mail address to send notifications\n"; # $scriptheader.="#\$ -M ww.xx\@yy.zz\n\n"; $scriptheader.="uname -a\n\n"; $scriptheader.="ulimit -c 0\n\n"; # avoid coredumps $scriptheader.="cd $workingdir\n\n"; open (OUT, "> ${jobscript}${idx}.bash"); print OUT $scriptheader; my $inputmethod = $feed_moses_via_stdin ? "<" : "-input-file"; my $tmpnbestlist=""; if ($nbestflag){ $tmpnbestlist="$tmpdir/$nbestfile.$splitpfx$idx $nbestlist[1]"; $tmpnbestlist = "$tmpnbestlist $nbestlist[2]" if scalar(@nbestlist)==3; $tmpnbestlist = "-n-best-list $tmpnbestlist"; } my $tmpalioutfile = ""; if (defined $alifile){ $tmpalioutfile="-alignment-output-file $tmpdir/$alifile.$splitpfx$idx"; } my $tmpdetailsoutfile = ""; if (defined $detailsfile){ $tmpdetailsoutfile="-translation-details $tmpdir/$detailsfile.$splitpfx$idx"; } my $tmpsearchgraphlist=""; if ($searchgraphflag){ $tmpsearchgraphlist="-output-search-graph $tmpdir/$searchgraphfile.$splitpfx$idx"; } my $tmpwordgraphlist=""; if ($wordgraphflag){ $tmpwordgraphlist="-output-word-graph $tmpdir/$wordgraphfile.$splitpfx$idx $wordgraphlist[1]"; } my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId"; print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n"; print OUT "echo exit status \$\?\n\n"; if (defined $alifile){ print OUT "\\mv -f $tmpdir/${alifile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } if (defined $detailsfile){ print OUT "\\mv -f $tmpdir/${detailsfile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } if ($nbestflag){ print OUT "\\mv -f $tmpdir/${nbestfile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } if ($searchgraphflag){ print OUT "\\mv -f $tmpdir/${searchgraphfile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } if ($wordgraphflag){ print OUT "\\mv -f $tmpdir/${wordgraphfile}.$splitpfx$idx .\n\n"; print OUT "echo exit status \$\?\n\n"; } print OUT "\\mv -f $tmpdir/${inputfile}.$splitpfx$idx.trans .\n\n"; print OUT "echo exit status \$\?\n\n"; close(OUT); #setting permissions of each script chmod(oct(755),"${jobscript}${idx}.bash"); $currStartTranslationId += $splitN; } } sub concatenate_wordgraph(){ my $oldcode=""; my $newcode=-1; my %inplength = (); my $offset = 0; my $outwordgraph=$wordgraphlist[0]; if ($wordgraphlist[0] eq '-'){ $outwordgraph="wordgraph$$"; } open (OUT, "> $outwordgraph"); foreach my $idx (@idxlist){ #computing the length of each input file my @in=(); open (IN, "${inputfile}.${splitpfx}${idx}.trans"); @in=; close(IN); $inplength{$idx} = scalar(@in); open (IN, "${wordgraphfile}.${splitpfx}${idx}"); while (){ my $code=""; if (/^UTTERANCE=/){ ($code)=($_=~/^UTTERANCE=(\d+)/); print STDERR "code:$code offset:$offset\n"; $code += $offset; if ($code ne $oldcode){ # if there is a jump between two consecutive codes # it means that an input sentence is not translated # fill this hole with a "fictitious" list of wordgraphs # comprising just one "_EMPTYSEARCHGRAPH_ while ($code - $oldcode > 1){ $oldcode++; print OUT "UTTERANCE=$oldcode\n"; print STDERR " to OUT -> code:$oldcode\n"; print OUT "_EMPTYWORDGRAPH_\n"; } } $oldcode=$code; print OUT "UTTERANCE=$oldcode\n"; next; } print OUT "$_"; } close(IN); $offset += $inplength{$idx}; while ($offset - $oldcode > 1){ $oldcode++; print OUT "UTTERANCE=$oldcode\n"; print OUT "_EMPTYWORDGRAPH_\n"; } } close(OUT); } sub concatenate_searchgraph(){ my $oldcode=""; my $newcode=-1; my %inplength = (); my $offset = 0; my $outsearchgraph=$searchgraphlist; if ($searchgraphlist eq '-'){ $outsearchgraph="searchgraph$$"; } open (OUT, "> $outsearchgraph"); foreach my $idx (@idxlist){ #computing the length of each input file my @in=(); open (IN, "${inputfile}.${splitpfx}${idx}.trans"); @in=; close(IN); $inplength{$idx} = scalar(@in); open (IN, "${searchgraphfile}.${splitpfx}${idx}"); while (){ my ($code,@extra)=split(/[ \t]+/,$_); $code += $offset; if ($code ne $oldcode){ # if there is a jump between two consecutive codes # it means that an input sentence is not translated # fill this hole with a "fictitious" list of searchgraphs # comprising just one "_EMPTYSEARCHGRAPH_ while ($code - $oldcode > 1){ $oldcode++; print OUT "$oldcode _EMPTYSEARCHGRAPH_\n"; } } $oldcode=$code; print OUT join(" ",($oldcode,@extra)); } close(IN); $offset += $inplength{$idx}; while ($offset - $oldcode > 1){ $oldcode++; print OUT "$oldcode _EMPTYSEARCHGRAPH_\n"; } } close(OUT); } sub concatenate_nbest(){ my $oldcode=""; my $newcode=-1; my %inplength = (); my $offset = 0; # get the list of feature and set a fictitious string with zero scores open (IN, "${nbestfile}.${splitpfx}$idxlist[0]"); my $str = ; chomp($str); close(IN); my ($code,$trans,$featurescores,$globalscore)=split(/\|\|\|/,$str); my $emptytrans = " "; my $emptyglobalscore = " 0.0"; my $emptyfeaturescores = $featurescores; $emptyfeaturescores =~ s/[-0-9\.]+/0/g; my $outnbest=$nbestlist[0]; if ($nbestlist[0] eq '-'){ $outnbest="nbest$$"; } open (OUT, "> $outnbest"); foreach my $idx (@idxlist){ #computing the length of each input file my @in=(); open (IN, "${inputfile}.${splitpfx}${idx}.trans"); @in=; close(IN); $inplength{$idx} = scalar(@in); open (IN, "${nbestfile}.${splitpfx}${idx}"); while (){ my ($code,@extra)=split(/\|\|\|/,$_); $code += $offset; if ($code ne $oldcode){ # if there is a jump between two consecutive codes # it means that an input sentence is not translated # fill this hole with a "fictitious" list of translation # comprising just one "emtpy translation" with zero scores while ($code - $oldcode > 1){ $oldcode++; print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n"; } } $oldcode=$code; print OUT join("\|\|\|",($oldcode,@extra)); } close(IN); $offset += $inplength{$idx}; while ($offset - $oldcode > 1){ $oldcode++; print OUT join("\|\|\|",($oldcode,$emptytrans,$emptyfeaturescores,$emptyglobalscore)),"\n"; } } close(OUT); } sub concatenate_1best(){ foreach my $idx (@idxlist){ my @in=(); open (IN, "${inputfile}.${splitpfx}${idx}.trans"); @in=; print STDOUT "@in"; close(IN); } } sub concatenate_logs(){ open (OUT, "> ${logfile}"); foreach my $idx (@idxlist){ my @in=(); open (IN, "$qsubout$idx"); @in=; print OUT "@in"; close(IN); } close(OUT); } sub concatenate_ali(){ open (OUT, "> ${alifile}"); foreach my $idx (@idxlist){ my @in=(); open (IN, "$alifile.$splitpfx$idx"); @in=; print OUT "@in"; close(IN); } close(OUT); } sub concatenate_details(){ open (OUT, "> ${detailsfile}"); foreach my $idx (@idxlist){ my @in=(); open (IN, "$detailsfile.$splitpfx$idx"); @in=; print OUT "@in"; close(IN); } close(OUT); } sub check_exit_status(){ print STDERR "check_exit_status\n"; my $failure=0; foreach my $idx (@idxlist){ print STDERR "check_exit_status of job $idx\n"; open(IN,"$qsubout$idx"); while (){ $failure=1 if (/exit status 1/); } close(IN); } return $failure; } sub kill_all_and_quit(){ print STDERR "Got interrupt or something failed.\n"; print STDERR "kill_all_and_quit\n"; foreach my $id (@sgepids){ print STDERR "qdel $id\n"; safesystem("qdel $id"); } print STDERR "Translation was not performed correctly\n"; print STDERR "or some of the submitted jobs died.\n"; print STDERR "qdel function was called for all submitted jobs\n"; exit(1); } sub check_translation(){ #checking if all sentences were translated my $inputN; my $outputN; my @failed = (); foreach my $idx (@idx_todo){ if ($inputtype==0){#text input chomp($inputN=`wc -l ${inputfile}.$splitpfx$idx | cut -d' ' -f1`); } elsif ($inputtype==1){#confusion network input chomp($inputN=`cat ${inputfile}.$splitpfx$idx | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' | wc -l | cut -d' ' -f1 `); } elsif ($inputtype==2){#lattice input chomp($inputN=`wc -l ${inputfile}.$splitpfx$idx | cut -d' ' -f1`); } else{#unknown input die "INPUTTYPE:$inputtype is unknown!\n"; } chomp($outputN=`wc -l ${inputfile}.$splitpfx$idx.trans | cut -d' ' -f1`); if ($inputN != $outputN){ print STDERR "Split ($idx) were not entirely translated\n"; print STDERR "outputN=$outputN inputN=$inputN\n"; print STDERR "outputfile=${inputfile}.$splitpfx$idx.trans inputfile=${inputfile}.$splitpfx$idx\n"; push @failed,$idx; } } return @failed; } sub check_translation_old_sge(){ #checking if all sentences were translated my $inputN; my $outputN; foreach my $idx (@idx_todo){ if ($inputtype==0){#text input chomp($inputN=`wc -l ${inputfile}.$splitpfx$idx | cut -d' ' -f1`); } elsif ($inputtype==1){#confusion network input chomp($inputN=`cat ${inputfile}.$splitpfx$idx | perl -pe 's/\\n/ _CNendline_ /g;' | perl -pe 's/_CNendline_ _CNendline_ /_CNendline_\\n/g;' | wc -l | cut -d' ' -f1 `); } elsif ($inputtype==2){#lattice input chomp($inputN=`wc -l ${inputfile}.$splitpfx$idx | cut -d' ' -f1`); } else{#unknown input die "INPUTTYPE:$inputtype is unknown!\n"; } chomp($outputN=`wc -l ${inputfile}.$splitpfx$idx.trans | cut -d' ' -f1`); if ($inputN != $outputN){ print STDERR "Split ($idx) were not entirely translated\n"; print STDERR "outputN=$outputN inputN=$inputN\n"; print STDERR "outputfile=${inputfile}.$splitpfx$idx.trans inputfile=${inputfile}.$splitpfx$idx\n"; return 1; } } return 0; } sub remove_temporary_files(){ #removing temporary files foreach my $idx (@idxlist){ unlink("${inputfile}.${splitpfx}${idx}.trans"); unlink("${inputfile}.${splitpfx}${idx}"); if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); } if (defined $detailsfile){ unlink("${detailsfile}.${splitpfx}${idx}"); } if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); } if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); } if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); } unlink("${jobscript}${idx}.bash"); unlink("${jobscript}${idx}.log"); unlink("$qsubname.W.log"); unlink("$qsubout$idx"); unlink("$qsuberr$idx"); rmdir("$tmpdir"); } if ($nbestflag && $nbestlist[0] eq '-'){ unlink("${nbestfile}$$"); }; if ($searchgraphflag && $searchgraphlist eq '-'){ unlink("${searchgraphfile}$$"); }; if ($wordgraphflag && $wordgraphlist eq '-'){ unlink("${wordgraphfile}$$"); }; } sub safesystem { print STDERR "Executing: @_\n"; system(@_); if ($? == -1) { print STDERR "Failed to execute: @_\n $!\n"; exit(1); } elsif ($? & 127) { printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n", ($? & 127), ($? & 128) ? 'with' : 'without'; exit 1; } else { my $exitcode = $? >> 8; print STDERR "Exit code: $exitcode\n" if $exitcode; return ! $exitcode; } } # look for the correct pwdcmd (pwd by default, pawd if it exists) # I assume that pwd always exists sub getPwdCmd(){ my $pwdcmd="pwd"; my $a; chomp($a=`which pawd 2> /dev/null | head -1 | awk '{print $1}'`); if ($a && -e $a){ $pwdcmd=$a; } return $pwdcmd; }