Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hieu Hoang <hieuhoang@gmail.com> 2019-08-22 00:05:45 +0300
committer GitHub <noreply@github.com> 2019-08-22 00:05:45 +0300
commit 9f08d77b0ddfa80764d2bdd21b1b17090a367fcc (patch)
tree 4d40099d413637eda1c87f4dc99aeb15f6c1d091
parent 158d25238909ff24d2364ead2ee0929f7cac9965 (diff)
parent 7ad5ffa0c0e57308b18b2c705c98ba103902f135 (diff)
Merge pull request #211 from achimr/master
Support for Urdu in sentence splitter
-rwxr-xr-x scripts/ems/support/split-sentences.perl 14
1 file changed, 14 insertions, 0 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 90fa6ac90..a1cfb0d37 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -165,6 +165,20 @@ sub preprocess {
}{$1\n$2}gx;
}
+ # Urdu support
+ # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
+ if ($language eq 'ur') {
+ $text =~ s{
+ ( (?: [\.\?!\x{06d4}] | \.\.+ )
+ [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
+ )
+ \s+
+ ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
+ [\x{0600}-\x{06ff}]
+ )
+ }{$1\n$2}gx;
+ }
+
# Special punctuation cases are covered. Check all remaining periods.
my $word;
my $i;