Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Barry Haddow <barry.haddow@gmail.com> 2019-11-05 18:58:07 +0300
committer: Barry Haddow <barry.haddow@gmail.com> 2019-11-05 18:58:07 +0300
commit: 56b2bad9073b6e50db692d9b0003ed805684849c (patch)
tree: 2d674c09b5dded11b2eac253a20be46be1109fbe /scripts
parent: 3910cd6c4625eefa57600159e66f9a86122750fa (diff)
fix abbrev rule
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/ems/support/split-sentences.perl 5
1 file changed, 4 insertions, 1 deletion
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 0279a0b88..1dfb36d26 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -209,11 +209,14 @@ sub preprocess {
my $starting_punct = $2;
if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
# Not breaking;
- } elsif ($words[$i] =~ /(\.)[$sentence_start\-]+(\.+)$/) {
+ #print "NBP1 $words[$i] $words[$i+1]\n";
+ } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
# Not breaking - upper case acronym
+ #print "NBP2 $words[$i] $words[$i+1]\n";
} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
# The next word has a bunch of initial quotes, maybe a
# space, then either upper case or a number
+ #print "MAYBE $words[$i] $words[$i+1]\n";
$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
# We always add a return for these, unless we have a
# numeric non-breaker and a number start.