diff options
author | Achim Ruopp <achim@polyglot.technology> | 2019-07-10 17:48:32 +0300 |
---|---|---|
committer | Achim Ruopp <achim@polyglot.technology> | 2019-07-10 17:48:32 +0300 |
commit | 7ad5ffa0c0e57308b18b2c705c98ba103902f135 (patch) | |
tree | 4d40099d413637eda1c87f4dc99aeb15f6c1d091 | |
parent | 158d25238909ff24d2364ead2ee0929f7cac9965 (diff) |
Support for Urdu in sentence splitter
-rwxr-xr-x | scripts/ems/support/split-sentences.perl | 14 |
1 file changed, 14 insertions, 0 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 90fa6ac90..a1cfb0d37 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -165,6 +165,20 @@ sub preprocess {
 		}{$1\n$2}gx;
 	}
 
+	# Urdu support
+	# https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
+	if ($language eq 'ur') {
+		$text =~ s{
+			( (?: [\.\?!\x{06d4}] | \.\.+ )
+				[\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
+			)
+			\s+
+			( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
+				[\x{0600}-\x{06ff}]
+			)
+		}{$1\n$2}gx;
+	}
+
 	# Special punctuation cases are covered. Check all remaining periods.
 	my $word;
 	my $i;