Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kenneth Heafield <github@kheafield.com>, 2020-03-19 15:06:50 +0300
committer: Kenneth Heafield <github@kheafield.com>, 2020-03-19 15:06:50 +0300
commit: 0a892749bcdaae40a15962072b378aa5cd408686 (patch)
tree: 1b1671a27c15aea063c2816476672a6e4a50172e /scripts
parent: d30a1d51c88aad9908f3a025c86371b8916e9da4 (diff)
Add Pashto ؟ as a sentence splitting character
Diffstat (limited to 'scripts')
-rwxr-xr-x scripts/ems/support/split-sentences.perl | 8
1 file changed, 4 insertions, 4 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index b6e9e2456..03febea63 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -141,7 +141,7 @@ sub preprocess {
# we include danda and double danda (U+0964 and U+0965) as sentence split characters
# Non-period end of sentence markers (?!) followed by sentence starters.
- $text =~ s/([?!\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
+ $text =~ s/([?!؟\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
# Multi-dots followed by sentence starters.
$text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
@@ -149,11 +149,11 @@ sub preprocess {
# Add breaks for sentences that end with some sort of punctuation
# inside a quote or parenthetical and are followed by a possible
# sentence starter punctuation and upper case.
- $text =~ s/([?!\.\x{0964}\x{0965}][\ ]*[\x{300d}\x{300f}\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[$sentence_start])/$1\n$2/g;
+ $text =~ s/([?!؟\.\x{0964}\x{0965}][\ ]*[\x{300d}\x{300f}\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[$sentence_start])/$1\n$2/g;
# Add breaks for sentences that end with some sort of punctuation,
# and are followed by a sentence starter punctuation and upper case.
- $text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
+ $text =~ s/([?!؟\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
#NOTE: Korean no longer handled here, cos Korean has spaces.
@@ -167,7 +167,7 @@ sub preprocess {
# A normal full-stop or other Western sentence enders followed
# by an ideograph is an end-of-sentence, always.
- $text =~ s/([\.?!]) *(\p{CJK})/$1\n$2/g;
+ $text =~ s/([\.?!؟]) *(\p{CJK})/$1\n$2/g;
# Split close-paren-then-comma into two.
$text =~ s/(\p{Punctuation}) *(\p{Punctuation})/ $1 $2 /g;