3 files changed, 155 insertions, 15 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 7f2fb3ced..19d05d8e1 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -2,8 +2,8 @@
 #
 # This file is part of moses.  Its use is licensed under the GNU Lesser General
 # Public License version 2.1 or, at your option, any later version.
-
-# Based on Preprocessor written by Philipp Koehn
+#
+# Based on a preprocessor written by Philipp Koehn.
 
 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
@@ -12,11 +12,13 @@ binmode(STDERR, ":utf8");
 use warnings;
 use FindBin qw($RealBin);
 use strict;
+use utf8;
 
 my $mydir = "$RealBin/../../share/nonbreaking_prefixes";
 
 my %NONBREAKING_PREFIX = ();
 my $language = "en";
+my $is_cjk = 0;
 my $QUIET = 0;
 my $HELP = 0;
 
@@ -39,9 +41,14 @@ if (!$QUIET) {
 	print STDERR "Language: $language\n";
 }
 
+# Is it a CJK language? Only Chinese (zh, yue) is recognized so far.
+if ($language eq "yue" || $language eq "zh") {
+	$is_cjk = 1;
+}
+
 my $prefixfile = "$mydir/nonbreaking_prefix.$language";
 
-#default back to English if we don't have a language-specific prefix file
+# Default to English, if we don't have a language-specific prefix file.
 if (!(-e $prefixfile)) {
 	$prefixfile = "$mydir/nonbreaking_prefix.en";
 	print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
@@ -69,13 +76,13 @@ my $text = "";
 while (<STDIN>) {
 	chop;
 	if (/^<.+>$/ || /^\s*$/) {
-		#time to process this block, we've hit a blank or <p>
-		&do_it_for($text,$_);
-		print "<P>\n" if (/^\s*$/ && $text); ##if we have text followed by <P>
+		# Time to process this block; we've hit a blank or <p>
+		&do_it_for($text, $_);
+		print "<P>\n" if (/^\s*$/ && $text); ## If we have text followed by <P>
 		$text = "";
 	}
 	else {
-		#append the text, with a space
+		# Append the text, with a space.
 		$text .= $_. " ";
 	}
 }
@@ -91,7 +98,7 @@ sub do_it_for {
 }
 
 sub preprocess {
-	# This is one paragraph.
+	# Argument is one paragraph.
 	my($text) = @_;
 
 	# Clean up spaces at head and tail of each line, as well as
@@ -119,31 +126,58 @@ sub preprocess {
 	# and are followed by a sentence starter punctuation and upper case.
 	$text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g;
 
-	# special punctuation cases are covered. Check all remaining periods.
+	if ($is_cjk == 1) {
+		# Chinese uses unusual end-of-sentence markers. These are NOT
+		# followed by whitespace.  Nor is there any idea of capitalization.
+		# There does not appear to be any Unicode category for full-stops
+		# in general, so list them: U+3002 U+FF0E U+FF1F U+FF01, plus U+266A.
+		$text =~ s/([。．？！♪])/$1\n/g;
+
+		# A normal full-stop or other Western sentence enders followed
+		# by an ideograph is an end-of-sentence, always.
+		$text =~ s/([\.?!]) *(\p{CJK})/$1\n$2/g;
+
+		# Split close-paren-then-comma into two.
+		$text =~ s/(\p{Punctuation}) *(\p{Punctuation})/ $1 $2 /g;
+
+		# Chinese does not use any sort of white-space between ideographs.
+		# Nominally, each single ideograph corresponds to one word. Add
+		# spaces here, so that later processing stages can tokenize readily.
+		# Note that this handles mixed latinate+CJK.
+		# TODO: perhaps also CJKExtA CJKExtB etc ??? CJK_Radicals_Sup ?
+		$text =~ s/(\p{Punctuation}) *(\p{CJK})/ $1 $2/g;
+		$text =~ s/(\p{CJK}) *(\p{Punctuation})/$1 $2 /g;
+		$text =~ s/([\p{CJK}\p{CJKSymbols}])/ $1 /g;
+		$text =~ s/ +/ /g;
+	}
+
+	# Special punctuation cases are covered. Check all remaining periods.
 	my $word;
 	my $i;
 	my @words = split(/ /,$text);
 	$text = "";
 	for ($i=0;$i<(scalar(@words)-1);$i++) {
 		if ($words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/) {
-			#check if $1 is a known honorific and $2 is empty, never break
+			# Check if $1 is a known honorific and $2 is empty, never break.
 			my $prefix = $1;
 			my $starting_punct = $2;
-			if($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
-				#not breaking;
+			if ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 1 && !$starting_punct) {
+				# Not breaking;
 			} elsif ($words[$i] =~ /(\.)[\p{IsUpper}\-]+(\.+)$/) {
-				#not breaking - upper case acronym
+				# Not breaking - upper case acronym
 			} elsif($words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/) {
 				# The next word has a bunch of initial quotes, maybe a
 				# space, then either upper case or a number
 				$words[$i] = $words[$i]."\n" unless ($prefix && $NONBREAKING_PREFIX{$prefix} && $NONBREAKING_PREFIX{$prefix} == 2 && !$starting_punct && ($words[$i+1] =~ /^[0-9]+/));
-				#we always add a return for these unless we have a numeric non-breaker and a number start
+				# We always add a return for these, unless we have a
+				# numeric non-breaker and a number start.
 			}
 		}
 		$text = $text.$words[$i]." ";
 	}
 
-	# We stopped one token from the end to allow for easy look-ahead. Append it now.
+	# We stopped one token from the end to allow for easy look-ahead.
+	# Append it now.
 	$text = $text.$words[$i];
 
 	# Clean up spaces at head and tail of each line as well as any double-spacing
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.yue b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.yue
new file mode 100644
index 000000000..37942ade9
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.yue
@@ -0,0 +1,53 @@
+#
+# Cantonese (Chinese)
+#
+# A period that follows any entry in this file
+# is NOT treated as an end-of-sentence marker.
+#
+# English/Euro-language given-name initials (appearing in
+# news, periodicals, etc.)
+A
+Ā
+B
+C
+Č
+D
+E
+Ē
+F
+G
+Ģ
+H
+I
+Ī
+J
+K
+Ķ
+L
+Ļ
+M
+N
+Ņ
+O
+P
+Q
+R
+S
+Š
+T
+U
+Ū
+V
+W
+X
+Y
+Z
+Ž
+
+# Numbers only. These are non-breaking only when followed by
+# a numeric sequence; otherwise the period ends the sentence.
+# Add NUMERIC_ONLY after the word for this function. This case is
+# mostly for the English "No." which can either be a sentence of its
+# own, or if followed by a number, a non-breaking prefix.
+No #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.zh b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.zh
new file mode 100644
index 000000000..df4c2ff88
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.zh
@@ -0,0 +1,53 @@
+#
+# Mandarin (Chinese)
+#
+# A period that follows any entry in this file
+# is NOT treated as an end-of-sentence marker.
+#
+# English/Euro-language given-name initials (appearing in
+# news, periodicals, etc.)
+A
+Ā
+B
+C
+Č
+D
+E
+Ē
+F
+G
+Ģ
+H
+I
+Ī
+J
+K
+Ķ
+L
+Ļ
+M
+N
+Ņ
+O
+P
+Q
+R
+S
+Š
+T
+U
+Ū
+V
+W
+X
+Y
+Z
+Ž
+
+# Numbers only. These are non-breaking only when followed by
+# a numeric sequence; otherwise the period ends the sentence.
+# Add NUMERIC_ONLY after the word for this function. This case is
+# mostly for the English "No." which can either be a sentence of its
+# own, or if followed by a number, a non-breaking prefix.
+No #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#