when detokenizing, remove whitespace between a pair of CJK (Chinese/Japanese/Korean) words

This gets the Chinese and Japanese tests working, so remove the failure expectation. git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4134 1f5c12ca-751b-0410-a591-d2e778427230
author: bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230> 2011-08-08 19:30:54 +0400
committer: bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230> 2011-08-08 19:30:54 +0400
commit: 24f5bf6723460d838e2ef03158d0ff6ce700ddbf (patch)
tree: 5373beced8b88140adec152286a23621139aa8f9
parent: 14587cdafc42cdbff9221c07b5545551ecec475b (diff)
2 files changed, 91 insertions, 13 deletions
diff --git a/regression-testing/run-test-detokenizer.t b/regression-testing/run-test-detokenizer.t
index 9d677b43e..da6585cc8 100644
--- a/regression-testing/run-test-detokenizer.t
+++ b/regression-testing/run-test-detokenizer.t
@@ -107,9 +107,7 @@ Frau Präsidentin! Frau Díez González und ich hatten einige Anfragen
 EXP
 );
 
-# A (failing) simple Chinese test
-{
-my $testCase =
+# A simple Chinese test
 &addDetokenizerTest("TEST_CHINESE_EASY", undef,
 <<'TOK'
 这 是 一个 简单 的的 汉语 句子 。
@@ -120,12 +118,7 @@ TOK
 EXP
 );
 
-$testCase->setExpectedToFail("Chinese detokenization is not implemented yet.");
-}
-
-# A (failing) simple Japanese test
-{
-my $testCase =
+# A simple Japanese test
 &addDetokenizerTest("TEST_JAPANESE_EASY", undef,
 <<'TOK'
 どう しょ う か な 。
@@ -138,9 +131,6 @@ TOK
 EXP
 );
 
-$testCase->setExpectedToFail("Japanese detokenization is not implemented yet.");
-}
-
 
 ######################################
 # Now run those babies ...
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index f049b8080..ac88834b0 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -73,7 +73,16 @@ sub detokenize {
 	my %quoteCount =  ("\'"=>0,"\""=>0);
 	my $prependSpace = " ";
 	for ($i=0;$i<(scalar(@words));$i++) {		
-		if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+		if (&startsWithCJKChar($words[$i])) {
+		    if ($i > 0 && &endsWithCJKChar($words[$i-1])) {
+			# perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word
+			$text=$text.$words[$i];
+		    } else {
+			# ... but do nothing special if this is a CJK word that doesn't follow a CJK word
+			$text=$text.$prependSpace.$words[$i];
+		    }
+		    $prependSpace = " ";
+		} elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
 			#perform right shift on currency and other random punctuation items
 			$text = $text.$prependSpace.$words[$i];
 			$prependSpace = "";
@@ -161,3 +170,82 @@ sub detokenize {
 	return $text;
 }
 
+# Returns charIsCJK() of the first character of $str, or 0 for an empty string.
+sub startsWithCJKChar {
+    my ($word) = @_;
+    my $head = length($word) ? substr($word, 0, 1) : undef;
+    return defined($head) ? &charIsCJK($head) : 0;
+}
+
+# Returns charIsCJK() of the last character of $str, or 0 for an empty string.
+sub endsWithCJKChar {
+    my ($word) = @_;
+    my $tail = length($word) ? substr($word, -1) : undef;
+    return defined($tail) ? &charIsCJK($tail) : 0;
+}
+
+# Given a string consisting of one character, returns 1 if its code point falls in
+# any of the CJK (Chinese/Japanese/Korean) ranges checked below, and 0 otherwise.
+sub charIsCJK {
+    my ($char) = @_;
+    # Expected to be exactly one character; this sub does not check the length itself.
+    my $codepoint = &codepoint_dec($char);
+    
+    # Block names and ranges follow http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
+
+    # Hangul Jamo (1100–11FF)
+    return 1 if (&between_hexes($codepoint, '1100', '11FF'));
+
+    # CJK Radicals Supplement (2E80–2EFF)
+    # Kangxi Radicals (2F00–2FDF)
+    # Ideographic Description Characters (2FF0–2FFF)
+    # CJK Symbols and Punctuation (3000–303F)
+    # Hiragana (3040–309F)
+    # Katakana (30A0–30FF)
+    # Bopomofo (3100–312F)
+    # Hangul Compatibility Jamo (3130–318F)
+    # Kanbun (3190–319F)
+    # Bopomofo Extended (31A0–31BF)
+    # CJK Strokes (31C0–31EF)
+    # Katakana Phonetic Extensions (31F0–31FF)
+    # Enclosed CJK Letters and Months (3200–32FF)
+    # CJK Compatibility (3300–33FF)
+    # CJK Unified Ideographs Extension A (3400–4DBF)
+    # Yijing Hexagram Symbols (4DC0–4DFF)
+    # CJK Unified Ideographs (4E00–9FFF)
+    # Yi Syllables (A000–A48F)
+    # Yi Radicals (A490–A4CF)
+    return 1 if (&between_hexes($codepoint, '2E80', 'A4CF'));  # one span from the first listed block's start to the last one's end
+
+    # Phags-pa (A840–A87F)
+    return 1 if (&between_hexes($codepoint, 'A840', 'A87F'));
+
+    # Hangul Syllables (AC00–D7AF)
+    return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF'));
+
+    # CJK Compatibility Ideographs (F900–FAFF)
+    return 1 if (&between_hexes($codepoint, 'F900', 'FAFF'));
+
+    # CJK Compatibility Forms (FE30–FE4F)
+    return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F'));
+
+    # Halfwidth forms of Katakana and Hangul characters (FF65–FFDC)
+    return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC'));
+
+    # Supplementary Ideographic Plane (20000–2FFFF)
+    return 1 if (&between_hexes($codepoint, '20000', '2FFFF'));
+
+    return 0;  # code point is outside every range tested above
+}
+
+# Returns the code point of a Unicode char as a decimal number (nothing if no char is given)
+sub codepoint_dec {
+    my ($char) = @_;
+    return unless defined($char) && length($char);  # test length, not truth: "0" is a false string
+    return unpack('U0U*', $char);
+}
+
+sub between_hexes {    # true iff hex($left) <= $num <= hex($right), bounds inclusive
+    my ($value, $lo_hex, $hi_hex) = @_;
+    return hex($lo_hex) <= $value && $value <= hex($hi_hex);
+}
author	bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230>	2011-08-08 19:30:54 +0400
committer	bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230>	2011-08-08 19:30:54 +0400
commit	24f5bf6723460d838e2ef03158d0ff6ce700ddbf (patch)
tree	5373beced8b88140adec152286a23621139aa8f9
parent	14587cdafc42cdbff9221c07b5545551ecec475b (diff)