Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Kenneth Heafield <github@kheafield.com> 2020-03-19 18:44:41 +0300
committer Kenneth Heafield <github@kheafield.com> 2020-03-19 18:44:41 +0300
commit 89b9b4fba2cb11dc2a2602ecdcace17b6ec4a86a (patch)
tree 3cf702f0ee3b445fe507db22827047ce792841e4 /scripts/ems/support
parent 0a892749bcdaae40a15962072b378aa5cd408686 (diff)
sentence splitter -k option to keep line boundaries
Diffstat (limited to 'scripts/ems/support')
-rwxr-xr-x scripts/ems/support/split-sentences.perl 18
1 files changed, 11 insertions, 7 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 03febea63..206b7ebe9 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -24,6 +24,7 @@ my $QUIET = 0;
my $HELP = 0;
my $LIST_ITEM = 0;
my $NOP = 0;
+my $KEEP_LINES = 0;
while (@ARGV) {
$_ = shift;
@@ -33,6 +34,7 @@ while (@ARGV) {
/^-h$/ && ($HELP = 1, next);
/^-i$/ && ($LIST_ITEM = 1, next);
/^-n$/ && ($NOP = 1, next);
+ /^-k$/ && ($KEEP_LINES = 1, next);
/^-b$/ && ($|++, next); # no output buffering
}
@@ -43,6 +45,7 @@ if ($HELP) {
print "-p: use a custom prefix file, overriding the installed one\n";
print "-i: avoid splitting on list items (e.g. 1. This is the first)\n";
print "-n: do not emit <P> after paragraphs\n";
+ print "-k: keep existing line boundaries\n";
exit;
}
if (!$QUIET) {
@@ -89,13 +92,14 @@ if (-e "$prefixfile") {
my $text = "";
while (<STDIN>) {
chomp;
- if (/^<.+>$/ || /^\s*$/) {
+ if ($KEEP_LINES) {
+ &do_it_for($_,"");
+ } elsif (/^<.+>$/ || /^\s*$/) {
# Time to process this block; we've hit a blank or <p>
&do_it_for($text, $_);
print "<P>\n" if $NOP == 0 && (/^\s*$/ && $text); ## If we have text followed by <P>
$text = "";
- }
- else {
+ } else {
# Append the text, with a space.
$text .= $_. " ";
}
@@ -163,7 +167,7 @@ sub preprocess {
# There does not appear to be any unicode category for full-stops
# in general, so list them here. U+3002 U+FF0E U+FF1F U+FF01
#$text =~ s/([。.?!♪])/$1\n/g;
- $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
+ $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g;
# A normal full-stop or other Western sentence enders followed
# by an ideograph is an end-of-sentence, always.
@@ -179,7 +183,7 @@ sub preprocess {
# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
# bhaddow - Comment this out since it adds white-space between Chinese characters. This is not
- # what we want from sentence-splitter!
+ # what we want from sentence-splitter!
#$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
#$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
#$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
@@ -204,10 +208,10 @@ sub preprocess {
my $word;
my $i;
my @words = split(/\h/,$text);
- #print "NOW $text\n";
+ #print "NOW $text\n";
$text = "";
for ($i=0;$i<(scalar(@words)-1);$i++) {
- #print "Checking $words[$i] $words[$i+1]\n";
+ #print "Checking $words[$i] $words[$i+1]\n";
if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
# Check if $1 is a known honorific and $2 is empty, never break.
my $prefix = $1;