Sentence splitter: add -k option to keep existing line boundaries

author: Kenneth Heafield <github@kheafield.com> 2020-03-19 18:44:41 +0300
committer: Kenneth Heafield <github@kheafield.com> 2020-03-19 18:44:41 +0300
commit: 89b9b4fba2cb11dc2a2602ecdcace17b6ec4a86a (patch)
tree: 3cf702f0ee3b445fe507db22827047ce792841e4 /scripts/ems/support
parent: 0a892749bcdaae40a15962072b378aa5cd408686 (diff)
1 files changed, 11 insertions, 7 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 03febea63..206b7ebe9 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -24,6 +24,7 @@ my $QUIET = 0;
 my $HELP = 0;
 my $LIST_ITEM = 0;
 my $NOP = 0;
+my $KEEP_LINES = 0;
 
 while (@ARGV) {
 	$_ = shift;
@@ -33,6 +34,7 @@ while (@ARGV) {
 	/^-h$/ && ($HELP = 1, next);
 	/^-i$/ && ($LIST_ITEM = 1, next);
 	/^-n$/ && ($NOP = 1, next);
+	/^-k$/ && ($KEEP_LINES = 1, next);
 	/^-b$/ && ($|++, next); # no output buffering
 }
 
@@ -43,6 +45,7 @@ if ($HELP) {
 	print "-p: use a custom prefix file, overriding the installed one\n";
 	print "-i: avoid splitting on list items (e.g. 1. This is the first)\n";
 	print "-n: do not emit <P> after paragraphs\n";
+	print "-k: keep existing line boundaries\n";
 	exit;
 }
 if (!$QUIET) {
@@ -89,13 +92,14 @@ if (-e "$prefixfile") {
 my $text = "";
 while (<STDIN>) {
 	chomp;
-	if (/^<.+>$/ || /^\s*$/) {
+	if ($KEEP_LINES) {
+		&do_it_for($_,"");
+	} elsif (/^<.+>$/ || /^\s*$/) {
 		# Time to process this block; we've hit a blank or <p>
 		&do_it_for($text, $_);
 		print "<P>\n" if $NOP == 0 && (/^\s*$/ && $text); ## If we have text followed by <P>
 		$text = "";
-	}
-	else {
+	} else {
 		# Append the text, with a space.
 		$text .= $_. " ";
 	}
@@ -163,7 +167,7 @@ sub preprocess {
 		# There does not appear to be any unicode category for full-stops
 		# in general, so list them here.  U+3002 U+FF0E U+FF1F U+FF01
 		#$text =~ s/([。．？！♪])/$1\n/g;
-    	$text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
+		$text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
 
 		# A normal full-stop or other Western sentence enders followed
 		# by an ideograph is an end-of-sentence, always.
@@ -179,7 +183,7 @@ sub preprocess {
 		# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
 
 		# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
-    	# what we want from sentence-splitter!
+		# what we want from sentence-splitter!
 		#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
 		#$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
 		#$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
@@ -204,10 +208,10 @@ sub preprocess {
 	my $word;
 	my $i;
 	my @words = split(/\h/,$text);
-    #print "NOW $text\n";
+	#print "NOW $text\n";
 	$text = "";
 	for ($i=0;$i<(scalar(@words)-1);$i++) {
-    #print "Checking $words[$i] $words[$i+1]\n";
+	#print "Checking $words[$i] $words[$i+1]\n";
 		if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
 			# Check if $1 is a known honorific and $2 is empty, never break.
 			my $prefix = $1;
author	Kenneth Heafield <github@kheafield.com>	2020-03-19 18:44:41 +0300
committer	Kenneth Heafield <github@kheafield.com>	2020-03-19 18:44:41 +0300
commit	89b9b4fba2cb11dc2a2602ecdcace17b6ec4a86a (patch)
tree	3cf702f0ee3b445fe507db22827047ce792841e4 /scripts/ems/support
parent	0a892749bcdaae40a15962072b378aa5cd408686 (diff)