diff options
-rw-r--r-- | moses/FF/TargetNgramFeature.cpp | 4 | ||||
-rwxr-xr-x | scripts/ems/support/wrap-xml.perl | 13 | ||||
-rwxr-xr-x | scripts/generic/strip-xml.perl | 11 | ||||
-rwxr-xr-x | scripts/recaser/truecase.perl | 4 | ||||
-rwxr-xr-x | scripts/training/mert-moses.pl | 3 | ||||
-rwxr-xr-x | scripts/training/reduce-factors.perl | 25 | ||||
-rwxr-xr-x | scripts/training/train-model.perl | 4 |
7 files changed, 56 insertions, 8 deletions
diff --git a/moses/FF/TargetNgramFeature.cpp b/moses/FF/TargetNgramFeature.cpp index 9ea1ccadf..b0abb07a1 100644 --- a/moses/FF/TargetNgramFeature.cpp +++ b/moses/FF/TargetNgramFeature.cpp @@ -82,6 +82,7 @@ void TargetNgramFeature::Load() m_vocab.insert(EOS_); while (getline(inFile, line)) { m_vocab.insert(line); + cerr << "ADD TO VOCAB: '" << line << "'" << endl; } inFile.close(); @@ -119,7 +120,9 @@ FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo, // const string& curr_w = targetPhrase.GetWord(i).GetFactor(m_factorType)->GetString(); const StringPiece curr_w = targetPhrase.GetWord(i).GetString(m_factorType); + //cerr << "CHECK WORD '" << curr_w << "'" << endl; if (m_vocab.size() && (FindStringPiece(m_vocab, curr_w) == m_vocab.end())) continue; // skip ngrams + //cerr << "ALLOWED WORD '" << curr_w << "'" << endl; if (n > 1) { // can we build an ngram at this position? ("<s> this" --> cannot build 3gram at this position) @@ -154,6 +157,7 @@ FFState* TargetNgramFeature::Evaluate(const Hypothesis& cur_hypo, if (!skip) { curr_ngram << curr_w; + //cerr << "SCORE '" << curr_ngram.str() << "'" << endl; accumulator->PlusEquals(this,curr_ngram.str(),1); } curr_ngram.str(""); diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl index 4ef6a1de6..beeca6cdd 100755 --- a/scripts/ems/support/wrap-xml.perl +++ b/scripts/ems/support/wrap-xml.perl @@ -10,6 +10,7 @@ open(SRC,$src) or die "Cannot open: $!"; my @OUT = <STDIN>; chomp(@OUT); #my @OUT = `cat $decoder_output`; +my $missing_end_seg = 0; while(<SRC>) { chomp; if (/^<srcset/) { @@ -27,10 +28,20 @@ while(<SRC>) { $line = "" if $line =~ /NO BEST TRANSLATION/; if (/<\/seg>/) { s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i; + $missing_end_seg = 0; } else { - s/(<seg[^>]+> *)[^<]*/$1$line/i; + s/(<seg[^>]+> *)[^<]*/$1$line<\/seg>/i; + $missing_end_seg = 1; } } + elsif ($missing_end_seg) { + if (/<\/doc>/) { + $missing_end_seg = 0; + } + else { + next; + } + } print $_."\n"; } diff --git a/scripts/generic/strip-xml.perl b/scripts/generic/strip-xml.perl index 9fc43d4d9..40a61302a 100755 --- a/scripts/generic/strip-xml.perl +++ b/scripts/generic/strip-xml.perl @@ -9,13 +9,14 @@ while (my $line = <STDIN>) { my $len = length($line); my $inXML = 0; my $prevSpace = 1; + my $prevBar = 0; for (my $i = 0; $i < $len; ++$i) { my $c = substr($line, $i, 1); - if ($c eq "<") { + if ($c eq "<" && !$prevBar) { ++$inXML; } - elsif ($c eq ">") { + elsif ($c eq ">" && $inXML>0) { --$inXML; } elsif ($prevSpace == 1 && $c eq " ") @@ -24,9 +25,15 @@ while (my $line = <STDIN>) { elsif ($inXML == 0) { if ($c eq " ") { $prevSpace = 1; + $prevBar = 0; + } + elsif ($c eq "|") { + $prevSpace = 0; + $prevBar = 1; } else { $prevSpace = 0; + $prevBar = 0; } print $c; } diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl index 74b55045b..0a4d366e0 100755 --- a/scripts/recaser/truecase.perl +++ b/scripts/recaser/truecase.perl @@ -35,7 +35,7 @@ while(<STDIN>) { my ($WORD,$MARKUP) = split_xml($_); my $sentence_start = 1; for(my $i=0;$i<=$#$WORD;$i++) { - print " " if $i; + print " " if $i && $$MARKUP[$i] eq ''; print $$MARKUP[$i]; my ($word,$otherfactors); @@ -67,7 +67,7 @@ while(<STDIN>) { if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; } elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; } } - print " ".$$MARKUP[$#$MARKUP]; + print $$MARKUP[$#$MARKUP]; print "\n"; } diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl index 682b6fe7e..d1ac5828a 100755 --- a/scripts/training/mert-moses.pl +++ b/scripts/training/mert-moses.pl @@ -863,7 +863,8 @@ while (1) { $mira_settings .= "$batch_mira_args "; } - $mira_settings .= " --dense-init run$run.dense"; + $mira_settings .= " --dense-init run$run.$weights_in_file"; + #$mira_settings .= " --dense-init run$run.dense"; if (-e "run$run.sparse-weights") { $mira_settings .= " --sparse-init run$run.sparse-weights"; } diff --git a/scripts/training/reduce-factors.perl b/scripts/training/reduce-factors.perl index fd4906a48..c7269abf9 100755 --- a/scripts/training/reduce-factors.perl +++ b/scripts/training/reduce-factors.perl @@ -47,7 +47,9 @@ sub reduce_factors { $firstline =~ s/^\s*//; $firstline =~ s/\s.*//; # count factors - my $maxfactorindex = $firstline =~ tr/|/|/; + my @WORD = split(/ /,$firstline); + my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]); + my $maxfactorindex = scalar(@FACTOR)-1; if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) { # create just symlink; preserving compression my $realfull = $full; @@ -107,3 +109,24 @@ sub open_or_zcat { open($hdl,$read) or die "Can't read $fn ($read)"; return $hdl; } + +sub safesystem { + print STDERR "Executing: @_\n"; + system(@_); + if ($? == -1) { + print STDERR "ERROR: Failed to execute: @_\n $!\n"; + exit(1); + } + elsif ($? & 127) { + printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + } + else { + my $exitcode = $? >> 8; + print STDERR "Exit code: $exitcode\n" if $exitcode; + return ! $exitcode; + } +} + + diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 5a13b6e1f..a9ed58535 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -755,7 +755,9 @@ sub reduce_factors { $firstline =~ s/^\s*//; $firstline =~ s/\s.*//; # count factors - my $maxfactorindex = $firstline =~ tr/$___FACTOR_DELIMITER/$___FACTOR_DELIMITER/; + my @WORD = split(/ /,$firstline); + my @FACTOR = split(/$___FACTOR_DELIMITER/,$WORD[0]); + my $maxfactorindex = scalar(@FACTOR)-1; if (join(",", @INCLUDE) eq join(",", 0..$maxfactorindex)) { # create just symlink; preserving compression my $realfull = $full; |