Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x scripts/ems/support/split-sentences.perl 14
1 files changed, 14 insertions, 0 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 90fa6ac90..a1cfb0d37 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -165,6 +165,20 @@ sub preprocess {
}{$1\n$2}gx;
}
+ # Urdu support
+ # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
+ if ($language eq 'ur') {
+ $text =~ s{
+ ( (?: [\.\?!\x{06d4}] | \.\.+ )
+ [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
+ )
+ \s+
+ ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
+ [\x{0600}-\x{06ff}]
+ )
+ }{$1\n$2}gx;
+ }
+
# Special punctuation cases are covered. Check all remaining periods.
my $word;
my $i;