Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Barry Haddow <barry.haddow@gmail.com> 2019-09-30 17:33:33 +0300
committer Barry Haddow <barry.haddow@gmail.com> 2019-09-30 17:33:33 +0300
commit 01a8ec41e835b5e9b1b7f7b82a8d49769a354d6d (patch)
tree 09c0c66b4abe87f9b2fb05fc8eebcd735f241d9b /scripts/ems/support
parent 768944d85147e5aa333b6ea7928ae2b9e6e8974a (diff)
parent b21b071a662547bbfc1a39c168c0cec083cca76e (diff)
Merge branch 'master' of github.com:moses-smt/mosesdecoder
Diffstat (limited to 'scripts/ems/support')
-rwxr-xr-x scripts/ems/support/split-sentences.perl 16
1 files changed, 15 insertions, 1 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index b7d5c5a6d..2c2319a12 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -167,6 +167,20 @@ sub preprocess {
}{$1\n$2}gx;
}
+ # Urdu support
+ # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
+ if ($language eq 'ur') {
+ $text =~ s{
+ ( (?: [\.\?!\x{06d4}] | \.\.+ )
+ [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
+ )
+ \s+
+ ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
+ [\x{0600}-\x{06ff}]
+ )
+ }{$1\n$2}gx;
+ }
+
# Special punctuation cases are covered. Check all remaining periods.
my $word;
my $i;
@@ -179,7 +193,7 @@ sub preprocess {
my $starting_punct = $2;
if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
# Not breaking;
- } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
+ } elsif ($words[$i] =~ /(\.?)[\p{IsUpper}\-]+(\.+)$/) {
# Not breaking - upper case acronym
} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
# The next word has a bunch of initial quotes, maybe a