Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Eva Hasler <ehasler@saxnot.inf.ed.ac.uk> 2012-04-12 15:29:12 +0400
committer: Eva Hasler <ehasler@saxnot.inf.ed.ac.uk> 2012-04-12 15:29:12 +0400
commit: b008dabfce7fff8997a8810bf85d6757c5a33641 (patch)
tree: 1745b9f7fdcf3e8e8c6f2e3871492d58031839c2 /scripts
parent: f51ec46009422a5ae39028da5ec5e686b49cb8e0 (diff)
cherry-picked commit 2d47a5637bd95b9edd4dd19556658d6476dd0791 from master
Diffstat (limited to 'scripts')
-rw-r--r-- scripts/ems/experiment.machines 1
-rw-r--r-- scripts/ems/experiment.meta 5
-rwxr-xr-x scripts/ems/experiment.perl 16
-rwxr-xr-x scripts/tokenizer/deescape-special-chars.perl 13
-rwxr-xr-x scripts/tokenizer/escape-special-chars.perl 23
-rwxr-xr-x scripts/training/combine_factors.pl 4
6 files changed, 50 insertions, 12 deletions
diff --git a/scripts/ems/experiment.machines b/scripts/ems/experiment.machines
index dddc66c95..9e0294d60 100644
--- a/scripts/ems/experiment.machines
+++ b/scripts/ems/experiment.machines
@@ -1,2 +1,3 @@
cluster: townhill seville hermes lion seville sannox lutzow frontend
multicore-8: tyr thor odin crom saxnot vali vili freyja bragi hoenir
+multicore-24: syn hel skaol saga
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 4944286f3..856537040 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -52,8 +52,9 @@ truecase
rerun-on-change: input-truecaser output-truecaser
default-name: corpus/truecased
pass-unless: input-truecaser output-truecaser
- template-if: input-truecaser IN.$input-extension OUT.$input-extension -model IN1.$input-extension
- template-if: output-truecaser IN.$output-extension OUT.$output-extension -model IN1.$output-extension
+ template-if: input-truecaser IN.$input-extension OUT.$input-extension -model IN1.$input-extension
+ template-if: output-truecaser IN.$output-extension OUT.$output-extension -model IN1.$output-extension
+ parallelizable: yes
lowercase
in: truecased-stem
out: lowercased-stem
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 1443fbe74..c6cd375d0 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -5,6 +5,7 @@
use strict;
use Getopt::Long "GetOptions";
use FindBin qw($Bin);
+$SIG{CHLD} = "IGNORE"; # no zombies
my $host = `hostname`; chop($host);
print STDERR "STARTING UP AS PROCESS $$ ON $host AT ".`date`;
@@ -123,7 +124,10 @@ sub init_agenda_graph() {
."(its all gone blank...) show\n"
."showpage\n";
close(PS);
- `convert $graph_file.ps $graph_file.png`;
+
+ $SIG{CHLD} = undef;
+ `convert -alpha off $graph_file.ps $graph_file.png`;
+ $SIG{CHLD} = "IGNORE"; # no zombies
if (!$NO_GRAPH && !fork) {
# use ghostview by default, it it is installed
@@ -1292,7 +1296,8 @@ sub check_if_crashed {
'error','killed','core dumped','can\'t read',
'no such file or directory','unknown option',
'died at','exit code','permission denied',
- "Can't locate") {
+ 'segmentation fault','abort',
+ 'can\'t locate') {
if (/$pattern/i) {
my $not_error = 0;
if (defined($NOT_ERROR{&defined_step_id($i)})) {
@@ -2637,12 +2642,7 @@ sub define_template {
my $extra = join(" ",@EXTRA);
if (&backoff_and_get(&extend_local_name($module,$set,$command))) {
- if ($command eq "input-tokenizer") {
- $cmd .= "\$$command -r $VERSION -o $out < $in > $out $extra\n";
- }
- else {
$cmd .= "\$$command < $in > $out $extra\n";
- }
}
else {
$cmd .= "ln -s $in $out\n";
@@ -2712,7 +2712,7 @@ sub define_template {
$cmd =~ s/OUT/$output/g;
$cmd =~ s/VERSION/$VERSION/g;
print "\tcmd is $cmd\n" if $VERBOSE;
- while ($cmd =~ /^([\S\s]*)\$([^\s\/]+)([\S\s]*)$/) {
+ while ($cmd =~ /^([\S\s]*)\$([^\s\/\"\']+)([\S\s]*)$/) {
my ($pre,$variable,$post) = ($1,$2,$3);
$cmd = $pre
. &check_backoff_and_get(&extend_local_name($module,$set,$variable))
diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl
new file mode 100755
index 000000000..c98e01ccc
--- /dev/null
+++ b/scripts/tokenizer/deescape-special-chars.perl
@@ -0,0 +1,13 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+while(<STDIN>) {
+ s/\&bar;/\|/g;
+ s/\&lt;/\</g;
+ s/\&gt;/\>/g;
+ s/\&bra;/\[/g;
+ s/\&ket;/\]/g;
+ s/\&amp;/\&/g;
+ print $_;
+}
diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl
new file mode 100755
index 000000000..5c4dc9bb3
--- /dev/null
+++ b/scripts/tokenizer/escape-special-chars.perl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+while(<STDIN>) {
+ chop;
+
+ # avoid general madness
+ s/\s+/ /g;
+ s/^ //g;
+ s/ $//g;
+ s/[\000-\037]//g;
+
+ # special characters in moses
+ s/\&/\&amp;/g;
+ s/\|/\&bar;/g;
+ s/\</\&lt;/g;
+ s/\>/\&gt;/g;
+ s/\[/\&bra;/g;
+ s/\]/\&ket;/g;
+
+ print $_."\n";
+}
diff --git a/scripts/training/combine_factors.pl b/scripts/training/combine_factors.pl
index 13054013b..8a57a6b57 100755
--- a/scripts/training/combine_factors.pl
+++ b/scripts/training/combine_factors.pl
@@ -35,7 +35,7 @@ while (defined $_) {
print STDERR "." if $nr % 10000 == 0;
print STDERR "($nr)" if $nr % 100000 == 0;
chomp;
- s/ +/ /g; s/^ //; s/ $//;
+ s/\s+/ /g; s/^ //; s/ $//;
my @intokens = split / /;
# load lines of corresponding streams and ensure equal number of words
my @lines_of_extratoks;
@@ -44,7 +44,7 @@ while (defined $_) {
die "Additional factor file $addfactors[$factor] contains too few sentences!"
if !defined $line;
chomp($line);
- $line =~ s/ +/ /g; $line =~ s/^ //; $line =~ s/ $//;
+ $line =~ s/\s+/ /g; $line =~ s/^ //; $line =~ s/ $//;
my @toks = split / /, $line;
die "Incompatible number of words in factor $factor on line $nr. ($#toks != $#intokens)"
if $#toks != $#intokens;