diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2012-07-17 13:50:44 +0400 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2012-07-17 13:50:44 +0400 |
commit | 3de8151836ee0ff4ca74b176f025016fc67fae39 (patch) | |
tree | 851bc62db63b7d4447ded4ad63917ddbe65b0c96 | |
parent | a73dc0e5b4d7d580609fa630e89663fb3fa5b047 (diff) | |
parent | bc64284b313be234108fabfdd3020c44a1086e5c (diff) |
Merge pull request #20 from runn1ng/master
Spaces in filename
-rwxr-xr-x | scripts/generic/extract-parallel.perl | 39 | ||||
-rwxr-xr-x | scripts/generic/score-parallel.perl | 20 | ||||
-rwxr-xr-x | scripts/training/train-model.perl | 62 |
3 files changed, 64 insertions, 57 deletions
diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 8b61a33e8..14c2af9a3 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -35,7 +35,7 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i) my $TMPDIR=dirname($extract) ."/tmp.$$"; mkdir $TMPDIR; -my $totalLines = int(`cat $align | wc -l`); +my $totalLines = int(`cat '$align' | wc -l`); my $linesPerSplit = int($totalLines / $numParallel) + 1; print "total=$totalLines line-per-split=$linesPerSplit \n"; @@ -46,15 +46,15 @@ my $cmd; if ($numParallel > 1) { - $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $target $TMPDIR/target."; + $cmd = "$splitCmd -d -l $linesPerSplit -a 5 '$target' '$TMPDIR/target.'"; $pid = RunFork($cmd); push(@children, $pid); - $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $source $TMPDIR/source."; + $cmd = "$splitCmd -d -l $linesPerSplit -a 5 '$source' '$TMPDIR/source.'"; $pid = RunFork($cmd); push(@children, $pid); - $cmd = "$splitCmd -d -l $linesPerSplit -a 5 $align $TMPDIR/align."; + $cmd = "$splitCmd -d -l $linesPerSplit -a 5 '$align' '$TMPDIR/align.'"; $pid = RunFork($cmd); push(@children, $pid); @@ -68,15 +68,15 @@ else { my $numStr = NumStr(0); - $cmd = "ln -s $target $TMPDIR/target.$numStr"; + $cmd = "ln -s '$target' '$TMPDIR/target.$numStr'"; print STDERR "Executing: $cmd \n"; `$cmd`; - $cmd = "ln -s $source $TMPDIR/source.$numStr"; + $cmd = "ln -s '$source' '$TMPDIR/source.$numStr'"; print STDERR "Executing: $cmd \n"; `$cmd`; - $cmd = "ln -s $align $TMPDIR/align.$numStr"; + $cmd = "ln -s '$align' '$TMPDIR/align.$numStr'"; print STDERR "Executing: $cmd \n"; `$cmd`; } @@ -90,7 +90,7 @@ for (my $i = 0; $i < $numParallel; ++$i) if ($pid == 0) { # child my $numStr = NumStr($i); - my $cmd = "$extractCmd $TMPDIR/target.$numStr $TMPDIR/source.$numStr $TMPDIR/align.$numStr $TMPDIR/extract.$numStr $otherExtractArgs \n"; + my $cmd = "'$extractCmd' '$TMPDIR/target.$numStr' '$TMPDIR/source.$numStr' '$TMPDIR/align.$numStr' '$TMPDIR/extract.$numStr' $otherExtractArgs \n"; print STDERR $cmd; `$cmd`; @@ -108,20 +108,21 @@ foreach (@children) { } # merge -my $catCmd = "zcat "; -my $catInvCmd = "zcat "; -my $catOCmd = "zcat "; +my $is_osx = ($^O eq "darwin"); +my $catCmd = $is_osx?"gunzip -c ":"zcat "; +my $catInvCmd = $catCmd; +my $catOCmd = $catCmd; for (my $i = 0; $i < $numParallel; ++$i) { my $numStr = NumStr($i); - $catCmd .= "$TMPDIR/extract.$numStr.gz "; - $catInvCmd .= "$TMPDIR/extract.$numStr.inv.gz "; - $catOCmd .= "$TMPDIR/extract.$numStr.o.gz "; + $catCmd .= "'$TMPDIR/extract.$numStr.gz' "; + $catInvCmd .= "'$TMPDIR/extract.$numStr.inv.gz' "; + $catOCmd .= "'$TMPDIR/extract.$numStr.o.gz' "; } -$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.sorted.gz \n"; -$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.inv.sorted.gz \n"; -$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.o.sorted.gz \n"; +$catCmd .= " | LC_ALL=C $sortCmd -T '$TMPDIR' | gzip -c > '$extract.sorted.gz' \n"; +$catInvCmd .= " | LC_ALL=C $sortCmd -T '$TMPDIR' | gzip -c > '$extract.inv.sorted.gz' \n"; +$catOCmd .= " | LC_ALL=C $sortCmd -T '$TMPDIR' | gzip -c > '$extract.o.sorted.gz' \n"; @children = (); @@ -135,7 +136,7 @@ if ($makeTTable) } my $numStr = NumStr(0); -if (-e "$TMPDIR/extract.$numStr.o.gz") +if (-e "'$TMPDIR/extract.$numStr.o.gz'") { $pid = RunFork($catOCmd); push(@children, $pid); @@ -147,7 +148,7 @@ foreach (@children) { } # delete temporary files -$cmd = "rm -rf $TMPDIR \n"; +$cmd = "rm -rf '$TMPDIR' \n"; print STDERR $cmd; `$cmd`; diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl index 144571387..da9f9c162 100755 --- a/scripts/generic/score-parallel.perl +++ b/scripts/generic/score-parallel.perl @@ -44,7 +44,7 @@ my $cmd; my $fileCount = 0; if ($numParallel <= 1) { # don't do parallel. Just link the extract file into place - $cmd = "ln -s $extractFile $TMPDIR/extract.0.gz"; + $cmd = "ln -s '$extractFile' '$TMPDIR/extract.0.gz'"; print STDERR "$cmd \n"; systemCheck($cmd); @@ -121,7 +121,7 @@ for (my $i = 0; $i < $fileCount; ++$i) my $fileInd = $i % $numParallel; my $fh = $runFiles[$fileInd]; - my $cmd = "$scoreCmd $TMPDIR/extract.$i.gz $lexFile $TMPDIR/phrase-table.half.$numStr.gz $otherExtractArgs\n"; + my $cmd = "'$scoreCmd' '$TMPDIR'/extract.$i.gz '$lexFile' '$TMPDIR'/phrase-table.half.$numStr.gz $otherExtractArgs\n"; print $fh $cmd; } @@ -129,7 +129,7 @@ for (my $i = 0; $i < $fileCount; ++$i) for (my $i = 0; $i < $numParallel; ++$i) { close($runFiles[$i]); - my $path = "$TMPDIR/run.$i.sh"; + my $path = "'$TMPDIR'/run.$i.sh"; systemCheck("chmod +x $path"); } @@ -137,7 +137,7 @@ for (my $i = 0; $i < $numParallel; ++$i) my @children; for (my $i = 0; $i < $numParallel; ++$i) { - my $cmd = "$TMPDIR/run.$i.sh"; + my $cmd = "'$TMPDIR'/run.$i.sh"; my $pid = RunFork($cmd); push(@children, $pid); } @@ -152,17 +152,19 @@ $cmd = "\n\nOH SHIT. This should have been filled in \n\n"; if ($fileCount == 1 && !$doSort) { my $numStr = NumStr(0); - $cmd = "mv $TMPDIR/phrase-table.half.$numStr.gz $ptHalf"; + $cmd = "mv '$TMPDIR/phrase-table.half.$numStr.gz' '$ptHalf'"; } else { - $cmd = "zcat $TMPDIR/phrase-table.half.*.gz"; + my $_is_osx = ($^O eq "darwin"); + my $_catCmd = $_is_osx?"gunzip -c ":"zcat "; + $cmd = $_catCmd."'$TMPDIR'/phrase-table.half.*.gz"; if ($doSort) { - $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR "; + $cmd .= "| LC_ALL=C $sortCmd -T '$TMPDIR' "; } - $cmd .= " | gzip -c > $ptHalf"; + $cmd .= " | gzip -c > '$ptHalf'"; } print STDERR $cmd; systemCheck($cmd); @@ -213,7 +215,7 @@ if (-e $cocPath) close(FHCOC); } -$cmd = "rm -rf $TMPDIR \n"; +$cmd = "rm -rf '$TMPDIR' \n"; print STDERR $cmd; systemCheck($cmd); diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 6e18581af..5d60591a5 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -18,6 +18,7 @@ if ($SCRIPTS_ROOTDIR eq '') { $SCRIPTS_ROOTDIR = dirname(__FILE__); } $SCRIPTS_ROOTDIR =~ s/\/training$//; +$SCRIPTS_ROOTDIR = qq{'$SCRIPTS_ROOTDIR'}; #$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"}); my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_SORT_PARALLEL, $_CORPUS, @@ -586,7 +587,7 @@ die("ERROR: format for decoding steps is \"t0,g0,t1,g1:t2\", you provided $___DE sub prepare { print STDERR "(1) preparing corpus @ ".`date`; - safesystem("mkdir -p $___CORPUS_DIR") or die("ERROR: could not create corpus dir $___CORPUS_DIR"); + safesystem("mkdir -p '$___CORPUS_DIR'") or die("ERROR: could not create corpus dir $___CORPUS_DIR"); print STDERR "(1.0) selecting factors @ ".`date`; my ($factor_f,$factor_e) = split(/\-/,$___ALIGNMENT_FACTORS); @@ -725,7 +726,7 @@ sub reduce_factors { $realfull .= ".gz"; $reduced =~ s/(\.gz)?$/.gz/; } - safesystem("ln -s $realfull $reduced") + safesystem("ln -s '$realfull' '$reduced'") or die "Failed to create symlink $realfull -> $reduced"; return; } @@ -768,12 +769,12 @@ sub reduce_factors { print STDERR "\n"; close(OUT); close(IN); - `rm -f $reduced.lock`; + `rm -f '$reduced.lock'`; } sub make_classes { my ($corpus,$classes) = @_; - my $cmd = "$MKCLS -c50 -n2 -p$corpus -V$classes opt"; + my $cmd = "$MKCLS -c50 -n2 -p'$corpus' -V'$classes' opt"; print STDERR "(1.1) running mkcls @ ".`date`."$cmd\n"; if (-e $classes) { print STDERR " $classes already in place, reusing\n"; @@ -802,7 +803,7 @@ sub get_vocabulary { } my %VCB; - open(VCB,">$vcb") or die "ERROR: Can't write $vcb"; + open(VCB,">", "$vcb") or die "ERROR: Can't write $vcb"; print VCB "1\tUNK\t0\n"; my $id=2; foreach (reverse sort @NUM) { @@ -972,7 +973,7 @@ sub run_single_giza_on_parts { if ($i%3==1 && $part < ($___PARTS*$i)/$size && $part<$___PARTS) { close(PART) if $part; $part++; - safesystem("mkdir -p $___CORPUS_DIR/part$part") or die("ERROR: could not create $___CORPUS_DIR/part$part"); + safesystem("mkdir -p '$___CORPUS_DIR/part$part'") or die("ERROR: could not create $___CORPUS_DIR/part$part"); open(PART,">$___CORPUS_DIR/part$part/$f-$e-int-train.snt") or die "ERROR: Can't write $___CORPUS_DIR/part$part/$f-$e-int-train.snt"; } @@ -1090,6 +1091,9 @@ sub run_single_giza { my $GizaOptions; foreach my $option (sort keys %GizaDefaultOptions){ my $value = $GizaDefaultOptions{$option} ; + if ($value =~ /\s+/) { + $value = qq('$value') #makes '/file name/' from /file name/ + } $GizaOptions .= " -$option $value" ; } @@ -1115,17 +1119,17 @@ sub run_single_giza { die "ERROR: Giza did not produce the output file $dir/$f-$e.$___GIZA_EXTENSION. Is your corpus clean (reasonably-sized sentences)?" if ! -e "$dir/$f-$e.$___GIZA_EXTENSION"; - safesystem("rm -f $dir/$f-$e.$___GIZA_EXTENSION.gz") or die; - safesystem("gzip $dir/$f-$e.$___GIZA_EXTENSION") or die; + safesystem("rm -f '$dir/$f-$e.$___GIZA_EXTENSION.gz'") or die; + safesystem("gzip '$dir/$f-$e.$___GIZA_EXTENSION'") or die; } sub run_single_snt2cooc { my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_; print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n"; - safesystem("mkdir -p $dir") or die("ERROR"); + safesystem("mkdir -p '$dir'") or die("ERROR"); if ($SNT2COOC eq "$_EXTERNAL_BINDIR/snt2cooc.out") { print "$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc\n"; - safesystem("$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc") or die("ERROR"); + safesystem("$SNT2COOC '$vcb_e' '$vcb_f' '$train' > '$dir/$f-$e.cooc'") or die("ERROR"); } else { print "$SNT2COOC $dir/$f-$e.cooc $vcb_e $vcb_f $train\n"; safesystem("$SNT2COOC $dir/$f-$e.cooc $vcb_e $vcb_f $train") or die("ERROR"); @@ -1146,22 +1150,22 @@ sub word_align { my($__ALIGNMENT_CMD,$__ALIGNMENT_INV_CMD); if (-e "$___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.bz2"){ - $__ALIGNMENT_CMD="\"$BZCAT $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.bz2\""; + $__ALIGNMENT_CMD="\"$BZCAT '$___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.bz2'\""; } elsif (-e "$___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.gz") { - $__ALIGNMENT_CMD="\"$ZCAT $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.gz\""; + $__ALIGNMENT_CMD="\"$ZCAT '$___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.gz'\""; } else { die "ERROR: Can't read $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.{bz2,gz}\n"; } if ( -e "$___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.bz2"){ - $__ALIGNMENT_INV_CMD="\"$BZCAT $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.bz2\""; + $__ALIGNMENT_INV_CMD="\"$BZCAT '$___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.bz2'\""; }elsif (-e "$___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.gz"){ - $__ALIGNMENT_INV_CMD="\"$ZCAT $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.gz\""; + $__ALIGNMENT_INV_CMD="\"$ZCAT '$___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.gz'\""; }else{ die "ERROR: Can't read $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.{bz2,gz}\n\n"; } - safesystem("mkdir -p $___MODEL_DIR") or die("ERROR: could not create dir $___MODEL_DIR"); + safesystem("mkdir -p '$___MODEL_DIR'") or die("ERROR: could not create dir $___MODEL_DIR"); #build arguments for symal my($__symal_a)=""; @@ -1182,7 +1186,7 @@ sub word_align { safesystem("$GIZA2BAL -d $__ALIGNMENT_INV_CMD -i $__ALIGNMENT_CMD |". "$SYMAL -alignment=\"$__symal_a\" -diagonal=\"$__symal_d\" ". "-final=\"$__symal_f\" -both=\"$__symal_b\" > ". - "$___ALIGNMENT_FILE.$___ALIGNMENT") + "'$___ALIGNMENT_FILE.$___ALIGNMENT'") || die "ERROR: Can't generate symmetrized alignment file\n" @@ -1389,7 +1393,7 @@ sub extract_phrase { my @tempfiles = (); foreach my $f ($alignment_file_e, $alignment_file_f, $alignment_file_a) { if (! -e $f && -e $f.".gz") { - safesystem("gunzip < $f.gz > $f") or die("Failed to gunzip corpus $f"); + safesystem("gunzip < '$f.gz' > '$f'") or die("Failed to gunzip corpus $f"); push @tempfiles, "$f.gz"; } } @@ -1398,7 +1402,7 @@ sub extract_phrase { { my $max_length = &get_max_phrase_length($table_number); - $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file"; + $cmd = "$RULE_EXTRACT '$alignment_file_e' '$alignment_file_f' '$alignment_file_a' '$extract_file'"; $cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR; $cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE); $cmd .= " --PCFG" if $_PCFG; @@ -1415,14 +1419,14 @@ sub extract_phrase { { if ( $_EPPEX ) { # eppex sets max_phrase_length itself (as the maximum phrase length for which any Lossy Counter is defined) - $cmd = "$EPPEX $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $_EPPEX"; + $cmd = "$EPPEX '$alignment_file_e' '$alignment_file_f' '$alignment_file_a' '$extract_file' $_EPPEX"; } else { my $max_length = &get_max_phrase_length($table_number); print "MAX $max_length $reordering_flag $table_number\n"; $max_length = &get_max_phrase_length(-1) if $reordering_flag; - $cmd = "$PHRASE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file $max_length"; + $cmd = "$PHRASE_EXTRACT '$alignment_file_e' '$alignment_file_f' '$alignment_file_a' '$extract_file' '$max_length'"; } if ($reordering_flag) { $cmd .= " orientation"; @@ -1530,7 +1534,7 @@ sub score_phrase_phrase_extract { print STDERR "(6.".($substep++).") creating table half $ttable_file.half.$direction @ ".`date`; - my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction.gz $inverse"; + my $cmd = "$PHRASE_SCORE '$extract' '$lexical_file.$direction' '$ttable_file.half.$direction.gz' $inverse"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT; $cmd .= " --KneserNey" if $KNESER_NEY; @@ -1578,7 +1582,7 @@ sub score_phrase_phrase_extract { # merging the two halves print STDERR "(6.6) consolidating the two halves @ ".`date`; return if $___CONTINUE && -e "$ttable_file.gz"; - my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.gz /dev/stdout"; + my $cmd = "$PHRASE_CONSOLIDATE '$ttable_file.half.f2e.gz' '$ttable_file.half.e2f.gz' /dev/stdout"; $cmd .= " --Hierarchical" if $_HIERARCHICAL; $cmd .= " --LogProb" if $LOG_PROB; $cmd .= " --NegLogProb" if $NEG_LOG_PROB; @@ -1589,10 +1593,10 @@ sub score_phrase_phrase_extract { $cmd .= " --GoodTuring $ttable_file.half.f2e.gz.coc" if $GOOD_TURING; $cmd .= " --KneserNey $ttable_file.half.f2e.gz.coc" if $KNESER_NEY; - $cmd .= " | gzip -c > $ttable_file.gz"; + $cmd .= " | gzip -c > '$ttable_file.gz'"; safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed"; - if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); } + if (! $debug) { safesystem("rm -f '$ttable_file.half.'*") or die("ERROR"); } } sub score_phrase_memscore { @@ -1606,7 +1610,7 @@ sub score_phrase_memscore { # The output is sorted to avoid breaking scripts that rely on the # sorting behaviour of the previous scoring algorithm. - my $cmd = "$MEMSCORE $options | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip >$ttable_file.gz"; + my $cmd = "$MEMSCORE $options | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR | gzip >'$ttable_file.gz'"; if (-e "$extract_file.gz") { $cmd = "$ZCAT $extract_file.gz | ".$cmd; } else { @@ -1666,7 +1670,7 @@ sub get_reordering { print STDERR "(7.2) building tables @ ".`date`; #create cmd string for lexical reordering scoring - my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path"; + my $cmd = "$LEXICAL_REO_SCORER '$extract_file.o.sorted.gz' $smooth '$reo_model_path'"; $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/); for my $mtype (keys %REORDERING_MODEL_TYPES) { $cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}"; @@ -1764,8 +1768,8 @@ sub get_generation { } } close(GEN); - safesystem("rm -f $file.gz") or die("ERROR"); - safesystem("gzip $file") or die("ERROR"); + safesystem("rm -f '$file.gz'") or die("ERROR"); + safesystem("gzip '$file'") or die("ERROR"); } ### (9) CREATE CONFIGURATION FILE @@ -1776,7 +1780,7 @@ sub create_ini { &full_path(\$___MODEL_DIR); &full_path(\$___VCB_E); &full_path(\$___VCB_F); - `mkdir -p $___MODEL_DIR`; + `mkdir -p '$___MODEL_DIR'`; open(INI,">$___CONFIG") or die("ERROR: Can't write $___CONFIG"); print INI "######################### ### MOSES CONFIG FILE ### |