From d03df21e88365b7094f5ad9c7dd79b1291707cf8 Mon Sep 17 00:00:00 2001 From: alvations Date: Mon, 6 Jan 2020 11:43:31 +0800 Subject: Proper spacing --- scripts/ems/support/split-sentences.perl | 104 +++++++++++++++---------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 5a63961ad..240195c7c 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -27,10 +27,10 @@ my $LIST_ITEM = 0; while (@ARGV) { $_ = shift; /^-l$/ && ($language = shift, next); - /^-p$/ && ($prefixfile = shift, next); + /^-p$/ && ($prefixfile = shift, next); /^-q$/ && ($QUIET = 1, next); /^-h$/ && ($HELP = 1, next); - /^-i$/ && ($LIST_ITEM = 1, next); + /^-i$/ && ($LIST_ITEM = 1, next); /^-b$/ && ($|++, next); # no output buffering } @@ -39,7 +39,7 @@ if ($HELP) { print "-q: quiet mode\n"; print "-b: no output buffering (for use in bidirectional pipes)\n"; print "-p: use a custom prefix file, overriding the installed one\n"; - print "-i: avoid splitting on list items (e.g. 1. This is the first)\n"; + print "-i: avoid splitting on list items (e.g. 1. This is the first)\n"; exit; } if (!$QUIET) { @@ -53,17 +53,17 @@ if ($language eq "yue" || $language eq "zh" || $language eq "ja") { } if ($prefixfile ne "") { - print STDERR "Loading non-breaking prefixes from $prefixfile\n"; + print STDERR "Loading non-breaking prefixes from $prefixfile\n"; } else { - $prefixfile = "$mydir/nonbreaking_prefix.$language"; + $prefixfile = "$mydir/nonbreaking_prefix.$language"; -# Default to English, if we don't have a language-specific prefix file. - if (!(-e $prefixfile)) { - $prefixfile = "$mydir/nonbreaking_prefix.en"; - print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; - die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); - } + # Default to English, if we don't have a language-specific prefix file. + if (!(-e $prefixfile)) { + $prefixfile = "$mydir/nonbreaking_prefix.en"; + print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; + die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); + } } if (-e "$prefixfile") { @@ -122,21 +122,20 @@ sub preprocess { ##### Add sentence breaks as needed ##### - # Sentences can start with upper-case, numnbers, or Indic characters - my $sentence_start = "\\p{IsUpper}0-9"; - $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr"); - $sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu"; - $sentence_start .= "\\p{Block: Bengali}" if ($language eq "as" || $language eq "bn" || $language eq "mni"); - $sentence_start .= "\\p{Block: Kannada}" if $language eq "kn"; - $sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml"; - $sentence_start .= "\\p{Block: Oriya}" if $language eq "or"; - $sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa"; - $sentence_start .= "\\p{Block: Tamil}" if $language eq "ta"; - $sentence_start .= "\\p{Block: Telugu}" if $language eq "te"; - $sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko"; - - - # we include danda and double danda (U+0964 and U+0965) as sentence split characters + # Sentences can start with upper-case, numnbers, or Indic characters + my $sentence_start = "\\p{IsUpper}0-9"; + $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr"); + $sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu"; + $sentence_start .= "\\p{Block: Bengali}" if ($language eq "as" || $language eq "bn" || $language eq "mni"); + $sentence_start .= "\\p{Block: Kannada}" if $language eq "kn"; + $sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml"; + $sentence_start .= "\\p{Block: Oriya}" if $language eq "or"; + $sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa"; + $sentence_start .= "\\p{Block: Tamil}" if $language eq "ta"; + $sentence_start .= "\\p{Block: Telugu}" if $language eq "te"; + $sentence_start .= "\\p{Block: Hangul}\\p{Block: Hangul_Compatibility_Jamo}\\p{Block: Hangul_Jamo}\\p{Block: Hangul_Jamo_Extended_A}\\p{Block: Hangul_Jamo_Extended_B}" if $language eq "ko"; + + # we include danda and double danda (U+0964 and U+0965) as sentence split characters # Non-period end of sentence markers (?!) followed by sentence starters. $text =~ s/([?!\x{0964}\x{0965}]) +([\'\"\(\[\¿\¡\p{IsPi}]*[$sentence_start])/$1\n$2/g; @@ -153,15 +152,15 @@ sub preprocess { # and are followed by a sentence starter punctuation and upper case. $text =~ s/([?!\.\x{0964}\x{0965}]) +([\x{300d}\x{300f}\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[$sentence_start])/$1\n$2/g; - -#NOTE: Korean no longer handled here. + + #NOTE: Korean no longer handled here, cos Korean has spaces. if ($is_cjk == 1) { # Chinese uses unusual end-of-sentence markers. These are NOT # followed by whitespace. Nor is there any idea of capitalization. # There does not appear to be any unicode category for full-stops # in general, so list them here. U+3002 U+FF0E U+FF1F U+FF01 #$text =~ s/([。.?!♪])/$1\n/g; - $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g; + $text =~ s/([\x{3002}\x{ff0e}\x{FF1F}\x{FF01}]+\s*["\x{201d}\x{201e}\x{300d}\x{300f}]?\s*)/$1\n/g; # A normal full-stop or other Western sentence enders followed # by an ideograph is an end-of-sentence, always. @@ -175,33 +174,34 @@ sub preprocess { # spaces here, so that later processing stages can tokenize readily. # Note that this handles mixed latinate+CJK. # TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ? - # bhaddow - Comment this out since it adds white-space between Chinese characters. This is not - # what we want from sentence-splitter! + + # bhaddow - Comment this out since it adds white-space between Chinese characters. This is not + # what we want from sentence-splitter! #$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g; #$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g; #$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g; #$text =~ s/ +/ /g; - } - - # Urdu support - # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode - if ($language eq 'ur') { - $text =~ s{ - ( (?: [\.\?!\x{06d4}] | \.\.+ ) - [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]* - ) - \s+ - ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]* - [\x{0600}-\x{06ff}] - ) - }{$1\n$2}gx; - } + } + + # Urdu support + # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode + if ($language eq 'ur') { + $text =~ s{ + ( (?: [\.\?!\x{06d4}] | \.\.+ ) + [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]* + ) + \s+ + ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]* + [\x{0600}-\x{06ff}] + ) + }{$1\n$2}gx; + } # Special punctuation cases are covered. Check all remaining periods. my $word; my $i; my @words = split(/\h/,$text); - #print "NOW $text\n"; + #print "NOW $text\n"; $text = ""; for ($i=0;$i<(scalar(@words)-1);$i++) { #print "Checking $words[$i] $words[$i+1]\n"; @@ -211,19 +211,19 @@ sub preprocess { my $starting_punct = $2; if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) { # Not breaking; - # print "NBP1 $words[$i] $words[$i+1]\n"; + ## print "NBP1 $words[$i] $words[$i+1]\n"; } elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) { # Not breaking - upper case acronym - # print "NBP2 $words[$i] $words[$i+1]\n"; + #print "NBP2 $words[$i] $words[$i+1]\n"; } elsif ($LIST_ITEM && ($i == 0 || substr($words[$i-1], -1) eq "\n") && $words[$i] =~ /^\(?(([0-9]+)|([ivx]+)|([A-Za-z]))\)?\.$/) { - #Maybe list item - non breaking - # print "NBP3 $words[$i] $words[$i+1]\n"; + # Maybe list item - non breaking + #print "NBP3 $words[$i] $words[$i+1]\n"; } elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[0-9$sentence_start])/) { # The next word has a bunch of initial quotes, maybe a # space, then either upper case or a number -# print "MAYBE $words[$i] $words[$i+1]\n"; + #print "MAYBE $words[$i] $words[$i+1]\n"; $words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/)); # We always add a return for these, unless we have a # numeric non-breaker and a number start. -- cgit v1.2.3