diff options
author | Kenneth Heafield <github@kheafield.com> | 2020-03-19 15:06:50 +0300 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2020-03-19 15:06:50 +0300 |
commit | 0a892749bcdaae40a15962072b378aa5cd408686 (patch) | |
tree | 1b1671a27c15aea063c2816476672a6e4a50172e /scripts | |
parent | d30a1d51c88aad9908f3a025c86371b8916e9da4 (diff) |
Add Pashto ؟ as a sentence splitting character
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/ems/support/split-sentences.perl | 8 |
1 file changed, 4 insertions, 4 deletions
```diff
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index b6e9e2456..03febea63 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -141,7 +141,7 @@ sub preprocess {
  # we include danda and double danda (U+0964 and U+0965) as sentence split characters
  # Non-period end of sentence markers (?!) followed by sentence starters.
- $text =~ s/([?!\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
+ $text =~ s/([?!؟\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
  # Multi-dots followed by sentence starters.
  $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
@@ -149,11 +149,11 @@ sub preprocess {
  # Add breaks for sentences that end with some sort of punctuation
  # inside a quote or parenthetical and are followed by a possible
  # sentence starter punctuation and upper case.
- $text =~ s/([?!\.\x{0964}\x{0965}][\ ]*[\x{300d}\x{300f}\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[$sentence_start])/$1\n$2/g;
+ $text =~ s/([?!؟\.\x{0964}\x{0965}][\ ]*[\x{300d}\x{300f}\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[$sentence_start])/$1\n$2/g;
  # Add breaks for sentences that end with some sort of punctuation,
  # and are followed by a sentence starter punctuation and upper case.
- $text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
+ $text =~ s/([?!؟\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
  #NOTE: Korean no longer handled here, cos Korean has spaces.
@@ -167,7 +167,7 @@ sub preprocess {
  # A normal full-stop or other Western sentence enders followed
  # by an ideograph is an end-of-sentence, always.
- $text =~ s/([\.?!]) *(\p{CJK})/$1\n$2/g;
+ $text =~ s/([\.?!؟]) *(\p{CJK})/$1\n$2/g;
  # Split close-paren-then-comma into two.
  $text =~ s/(\p{Punctuation}) *(\p{Punctuation})/ $1 $2 /g;
```