Add Pashto ؟ (Arabic question mark, U+061F) as a sentence-splitting character

author: Kenneth Heafield <github@kheafield.com> 2020-03-19 15:06:50 +0300
committer: Kenneth Heafield <github@kheafield.com> 2020-03-19 15:06:50 +0300
commit: 0a892749bcdaae40a15962072b378aa5cd408686 (patch)
tree: 1b1671a27c15aea063c2816476672a6e4a50172e /scripts
parent: d30a1d51c88aad9908f3a025c86371b8916e9da4 (diff)
1 file changed, 4 insertions, 4 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index b6e9e2456..03febea63 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -141,7 +141,7 @@ sub preprocess {
 	# we include danda and double danda (U+0964 and U+0965) as sentence split characters
 
 	# Non-period end of sentence markers (?!) followed by sentence starters.
-	$text =~ s/([?!\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
+	$text =~ s/([?!؟\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
 
 	# Multi-dots followed by sentence starters.
 	$text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g;
@@ -149,11 +149,11 @@ sub preprocess {
 	# Add breaks for sentences that end with some sort of punctuation
 	# inside a quote or parenthetical and are followed by a possible
 	# sentence starter punctuation and upper case.
-	$text =~ s/([?!\.\x{0964}\x{0965}][\ ]*[\x{300d}\x{300f}\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[$sentence_start])/$1\n$2/g;
+	$text =~ s/([?!؟\.\x{0964}\x{0965}][\ ]*[\x{300d}\x{300f}\'\"\)\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[$sentence_start])/$1\n$2/g;
 
 	# Add breaks for sentences that end with some sort of punctuation,
 	# and are followed by a sentence starter punctuation and upper case.
-	$text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
+	$text =~ s/([?!؟\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g;
 
 
 	#NOTE: Korean no longer handled here, cos Korean has spaces.
@@ -167,7 +167,7 @@ sub preprocess {
 
 		# A normal full-stop or other Western sentence enders followed
 		# by an ideograph is an end-of-sentence, always.
-		$text =~ s/([\.?!]) *(\p{CJK})/$1\n$2/g;
+		$text =~ s/([\.?!؟]) *(\p{CJK})/$1\n$2/g;
 
 		# Split close-paren-then-comma into two.
 		$text =~ s/(\p{Punctuation}) *(\p{Punctuation})/ $1 $2 /g;
author	Kenneth Heafield <github@kheafield.com>	2020-03-19 15:06:50 +0300
committer	Kenneth Heafield <github@kheafield.com>	2020-03-19 15:06:50 +0300
commit	0a892749bcdaae40a15962072b378aa5cd408686 (patch)
tree	1b1671a27c15aea063c2816476672a6e4a50172e /scripts
parent	d30a1d51c88aad9908f3a025c86371b8916e9da4 (diff)