Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- moses/FF/TargetNgramFeature.cpp 4
-rwxr-xr-x scripts/ems/support/wrap-xml.perl 13
-rwxr-xr-x scripts/generic/strip-xml.perl 11
-rwxr-xr-x scripts/recaser/truecase.perl 4
-rwxr-xr-x scripts/training/mert-moses.pl 3
-rwxr-xr-x scripts/training/reduce-factors.perl 25
-rwxr-xr-x scripts/training/train-model.perl 4
7 files changed, 56 insertions, 8 deletions
diff --git a/moses/FF/TargetNgramFeature.cpp b/moses/FF/TargetNgramFeature.cpp
index 9ea1ccadf..b0abb07a1 100644
--- a/moses/FF/TargetNgramFeature.cpp
+++ b/moses/FF/TargetNgramFeature.cpp
@@ -82,6 +82,7 @@ void TargetNgramFeature::Load()
m_vocab.insert(EOS_);
while (getline(inFile, line)) {
m_vocab.insert(line);
+ cerr << "ADD TO VOCAB: '" << line << "'" << endl;
}
inFile.close();
@@ -119,7 +120,9 @@ FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo,
// const string& curr_w = targetPhrase.GetWord(i).GetFactor(m_factorType)->GetString();
const StringPiece curr_w = targetPhrase.GetWord(i).GetString(m_factorType);
+ //cerr << "CHECK WORD '" << curr_w << "'" << endl;
if (m_vocab.size() && (FindStringPiece(m_vocab, curr_w) == m_vocab.end())) continue; // skip ngrams
+ //cerr << "ALLOWED WORD '" << curr_w << "'" << endl;
if (n > 1) {
// can we build an ngram at this position? ("<s> this" --> cannot build 3gram at this position)
@@ -154,6 +157,7 @@ FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo,
if (!skip) {
curr_ngram << curr_w;
+ //cerr << "SCORE '" << curr_ngram.str() << "'" << endl;
accumulator->PlusEquals(this,curr_ngram.str(),1);
}
curr_ngram.str("");
diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl
index 4ef6a1de6..beeca6cdd 100755
--- a/scripts/ems/support/wrap-xml.perl
+++ b/scripts/ems/support/wrap-xml.perl
@@ -10,6 +10,7 @@ open(SRC,$src) or die "Cannot open: $!";
my @OUT = <STDIN>;
chomp(@OUT);
#my @OUT = `cat $decoder_output`;
+my $missing_end_seg = 0;
while(<SRC>) {
chomp;
if (/^<srcset/) {
@@ -27,10 +28,20 @@ while(<SRC>) {
$line = "" if $line =~ /NO BEST TRANSLATION/;
if (/<\/seg>/) {
s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
+ $missing_end_seg = 0;
}
else {
- s/(<seg[^>]+> *)[^<]*/$1$line/i;
+ s/(<seg[^>]+> *)[^<]*/$1$line<\/seg>/i;
+ $missing_end_seg = 1;
}
}
+ elsif ($missing_end_seg) {
+ if (/<\/doc>/) {
+ $missing_end_seg = 0;
+ }
+ else {
+ next;
+ }
+ }
print $_."\n";
}
diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl
index 9fc43d4d9..40a61302a 100755
--- a/scripts/generic/strip-xml.perl
+++ b/scripts/generic/strip-xml.perl
@@ -9,13 +9,14 @@ while (my $line = <STDIN>) {
my $len = length($line);
my $inXML = 0;
my $prevSpace = 1;
+ my $prevBar = 0;
for (my $i = 0; $i < $len; ++$i) {
my $c = substr($line, $i, 1);
- if ($c eq "<") {
+ if ($c eq "<" && !$prevBar) {
++$inXML;
}
- elsif ($c eq ">") {
+ elsif ($c eq ">" && $inXML>0) {
--$inXML;
}
elsif ($prevSpace == 1 && $c eq " ")
@@ -24,9 +25,15 @@ while (my $line = <STDIN>) {
elsif ($inXML == 0) {
if ($c eq " ") {
$prevSpace = 1;
+ $prevBar = 0;
+ }
+ elsif ($c eq "|") {
+ $prevSpace = 0;
+ $prevBar = 1;
}
else {
$prevSpace = 0;
+ $prevBar = 0;
}
print $c;
}
diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl
index 74b55045b..0a4d366e0 100755
--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@@ -35,7 +35,7 @@ while(<STDIN>) {
my ($WORD,$MARKUP) = split_xml($_);
my $sentence_start = 1;
for(my $i=0;$i<=$#$WORD;$i++) {
- print " " if $i;
+ print " " if $i && $$MARKUP[$i] eq '';
print $$MARKUP[$i];
my ($word,$otherfactors);
@@ -67,7 +67,7 @@ while(<STDIN>) {
if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; }
elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; }
}
- print " ".$$MARKUP[$#$MARKUP];
+ print $$MARKUP[$#$MARKUP];
print "\n";
}
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 682b6fe7e..d1ac5828a 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -863,7 +863,8 @@ while (1) {
$mira_settings .= "$batch_mira_args ";
}
- $mira_settings .= " --dense-init run$run.dense";
+ $mira_settings .= " --dense-init run$run.$weights_in_file";
+ #$mira_settings .= " --dense-init run$run.dense";
if (-e "run$run.sparse-weights") {
$mira_settings .= " --sparse-init run$run.sparse-weights";
}
diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl
index fd4906a48..c7269abf9 100755
--- a/scripts/training/reduce-factors.perl
+++ b/scripts/training/reduce-factors.perl
@@ -47,7 +47,9 @@ sub reduce_factors {
$firstline =~ s/^\s*//;
$firstline =~ s/\s.*//;
# count factors
- my $maxfactorindex = $firstline =~ tr/|/|/;
+ my @WORD = split(/ /,$firstline);
+ my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
+ my $maxfactorindex = scalar(@FACTOR)-1;
if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
# create just symlink; preserving compression
my $realfull = $full;
@@ -107,3 +109,24 @@ sub open_or_zcat {
open($hdl,$read) or die "Can't read $fn ($read)";
return $hdl;
}
+
+sub safesystem {
+ print STDERR "Executing: @_\n";
+ system(@_);
+ if ($? == -1) {
+ print STDERR "ERROR: Failed to execute: @_\n $!\n";
+ exit(1);
+ }
+ elsif ($? & 127) {
+ printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
+ ($? & 127), ($? & 128) ? 'with' : 'without';
+ exit(1);
+ }
+ else {
+ my $exitcode = $? >> 8;
+ print STDERR "Exit code: $exitcode\n" if $exitcode;
+ return ! $exitcode;
+ }
+}
+
+
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 5a13b6e1f..a9ed58535 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -755,7 +755,9 @@ sub reduce_factors {
$firstline =~ s/^\s*//;
$firstline =~ s/\s.*//;
# count factors
- my $maxfactorindex = $firstline =~ tr/$___FACTOR_DELIMITER/$___FACTOR_DELIMITER/;
+ my @WORD = split(/ /,$firstline);
+ my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]);
+ my $maxfactorindex = scalar(@FACTOR)-1;
if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) {
# create just symlink; preserving compression
my $realfull = $full;