diff options
author | Barry Haddow <barry.haddow@gmail.com> | 2019-11-05 19:52:50 +0300 |
---|---|---|
committer | Barry Haddow <barry.haddow@gmail.com> | 2019-11-05 19:52:50 +0300 |
commit | 61b1d06570e0f9b9043a91517bdde317ddd3bffa (patch) | |
tree | aaf2c19765c38470a8c4a60d8b6368307263cb7a | |
parent | 4da86c360f0586c08fbf654efe50fd6b4e1a6dee (diff) |
list items
-rwxr-xr-x | scripts/ems/support/split-sentences.perl | 8 |
1 file changed, 8 insertions, 0 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 1dfb36d26..9d588c265 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -22,6 +22,7 @@ my $prefixfile = "";
 my $is_cjk = 0;
 my $QUIET = 0;
 my $HELP = 0;
+my $LIST_ITEM = 0;
 
 while (@ARGV) {
 	$_ = shift;
@@ -29,6 +30,7 @@ while (@ARGV) {
 	/^-p$/ && ($prefixfile = shift, next);
 	/^-q$/ && ($QUIET = 1, next);
 	/^-h$/ && ($HELP = 1, next);
+	/^-i$/ && ($LIST_ITEM = 1, next);
 	/^-b$/ && ($|++, next); # no output buffering
 }
 
@@ -37,6 +39,7 @@ if ($HELP) {
 	print "-q: quiet mode\n";
 	print "-b: no output buffering (for use in bidirectional pipes)\n";
 	print "-p: use a custom prefix file, overriding the installed one\n";
+	print "-i: avoid splitting on list items (e.g. 1. This is the first)\n";
 	exit;
 }
 if (!$QUIET) {
@@ -213,6 +216,11 @@ sub preprocess {
       } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
         # Not breaking - upper case acronym
         #print "NBP2 $words[$i] $words[$i+1]\n";
+      } elsif ($LIST_ITEM
+        && ($i == 0 || substr($words[$i-1], -1) eq "\n")
+        && $words[$i] =~ /^\(?(([0-9]+)|([ivx]+)|([A-Za-z]))\)?\.$/) {
+        #Maybe list item - non breaking
+        #print "NBP3 $words[$i] $words[$i+1]\n";
       } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) {
         # The next word has a bunch of initial quotes, maybe a
         # space, then either upper case or a number