github.com/moses-smt/mosesdecoder.git
author     bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230>  2011-08-08 19:30:54 +0400
committer  bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230>  2011-08-08 19:30:54 +0400
commit     24f5bf6723460d838e2ef03158d0ff6ce700ddbf (patch)
tree       5373beced8b88140adec152286a23621139aa8f9
parent     14587cdafc42cdbff9221c07b5545551ecec475b (diff)
When detokenizing, remove whitespace between a pair of CJK (Chinese/Japanese/Korean) words.

This gets the Chinese and Japanese tests working, so remove the failure expectation.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4134 1f5c12ca-751b-0410-a591-d2e778427230
-rw-r--r--  regression-testing/run-test-detokenizer.t  | 14
-rwxr-xr-x  scripts/tokenizer/detokenizer.perl         | 90
2 files changed, 91 insertions(+), 13 deletions(-)
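
Before the diff itself, a minimal, self-contained sketch of the joining rule this commit adds. This is an illustration only: the char_is_cjk helper here is a deliberately simplified assumption that covers just two Unicode blocks, whereas the real charIsCJK added in the patch below covers many more ranges.

#!/usr/bin/env perl
# Sketch of the CJK-joining rule (illustration only; not the patched script).
use strict;
use warnings;
use utf8;
binmode STDOUT, ':encoding(UTF-8)';

# For this toy example, treat only CJK Unified Ideographs (U+4E00-U+9FFF)
# and CJK Symbols and Punctuation (U+3000-U+303F) as CJK.
sub char_is_cjk {
    my ($c) = @_;
    my $cp = ord($c);
    return ($cp >= 0x4E00 && $cp <= 0x9FFF) || ($cp >= 0x3000 && $cp <= 0x303F);
}

# The tokenized Chinese sentence from the regression test below.
my @words = split / /, '这 是 一个 简单 的的 汉语 句子 。';
my $text  = shift @words;
for my $w (@words) {
    # Drop the space when the previous word ends with a CJK character and
    # the current word starts with one; otherwise keep the usual space.
    if (char_is_cjk(substr($text, -1)) && char_is_cjk(substr($w, 0, 1))) {
        $text .= $w;
    } else {
        $text .= ' ' . $w;
    }
}
print "$text\n";    # prints: 这是一个简单的的汉语句子。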
diff --git a/regression-testing/run-test-detokenizer.t b/regression-testing/run-test-detokenizer.t
index 9d677b43e..da6585cc8 100644
--- a/regression-testing/run-test-detokenizer.t
+++ b/regression-testing/run-test-detokenizer.t
@@ -107,9 +107,7 @@ Frau Präsidentin! Frau Díez González und ich hatten einige Anfragen
EXP
);
-# A (failing) simple Chinese test
-{
-my $testCase =
+# A simple Chinese test
&addDetokenizerTest("TEST_CHINESE_EASY", undef,
<<'TOK'
这 是 一个 简单 的的 汉语 句子 。
@@ -120,12 +118,7 @@ TOK
EXP
);
-$testCase->setExpectedToFail("Chinese detokenization is not implemented yet.");
-}
-
-# A (failing) simple Japanese test
-{
-my $testCase =
+# A simple Japanese test
&addDetokenizerTest("TEST_JAPANESE_EASY", undef,
<<'TOK'
どう しょ う か な 。
@@ -138,9 +131,6 @@ TOK
EXP
);
-$testCase->setExpectedToFail("Japanese detokenization is not implemented yet.");
-}
-
######################################
# Now run those babies ...
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index f049b8080..ac88834b0 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -73,7 +73,16 @@ sub detokenize {
my %quoteCount = ("\'"=>0,"\""=>0);
my $prependSpace = " ";
for ($i=0;$i<(scalar(@words));$i++) {
- if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+ if (&startsWithCJKChar($words[$i])) {
+ if ($i > 0 && &endsWithCJKChar($words[$i-1])) {
+ # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word
+ $text=$text.$words[$i];
+ } else {
+ # ... but do nothing special if this is a CJK word that doesn't follow a CJK word
+ $text=$text.$prependSpace.$words[$i];
+ }
+ $prependSpace = " ";
+ } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
#perform right shift on currency and other random punctuation items
$text = $text.$prependSpace.$words[$i];
$prependSpace = "";
@@ -161,3 +170,82 @@ sub detokenize {
return $text;
}
+sub startsWithCJKChar {
+ my ($str) = @_;
+ return 0 if length($str) == 0;
+ my $firstChar = substr($str, 0, 1);
+ return &charIsCJK($firstChar);
+}
+
+sub endsWithCJKChar {
+ my ($str) = @_;
+ return 0 if length($str) == 0;
+ my $lastChar = substr($str, length($str)-1, 1);
+ return &charIsCJK($lastChar);
+}
+
+# Given a string consisting of one character, returns true iff the character
+# is a CJK (Chinese/Japanese/Korean) character
+sub charIsCJK {
+ my ($char) = @_;
+ # $char should be a string of length 1
+ my $codepoint = &codepoint_dec($char);
+
+ # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
+
+ # Hangul Jamo (1100–11FF)
+ return 1 if (&between_hexes($codepoint, '1100', '11FF'));
+
+ # CJK Radicals Supplement (2E80–2EFF)
+ # Kangxi Radicals (2F00–2FDF)
+ # Ideographic Description Characters (2FF0–2FFF)
+ # CJK Symbols and Punctuation (3000–303F)
+ # Hiragana (3040–309F)
+ # Katakana (30A0–30FF)
+ # Bopomofo (3100–312F)
+ # Hangul Compatibility Jamo (3130–318F)
+ # Kanbun (3190–319F)
+ # Bopomofo Extended (31A0–31BF)
+ # CJK Strokes (31C0–31EF)
+ # Katakana Phonetic Extensions (31F0–31FF)
+ # Enclosed CJK Letters and Months (3200–32FF)
+ # CJK Compatibility (3300–33FF)
+ # CJK Unified Ideographs Extension A (3400–4DBF)
+ # Yijing Hexagram Symbols (4DC0–4DFF)
+ # CJK Unified Ideographs (4E00–9FFF)
+ # Yi Syllables (A000–A48F)
+ # Yi Radicals (A490–A4CF)
+ return 1 if (&between_hexes($codepoint, '2E80', 'A4CF'));
+
+ # Phags-pa (A840–A87F)
+ return 1 if (&between_hexes($codepoint, 'A840', 'A87F'));
+
+ # Hangul Syllables (AC00–D7AF)
+ return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF'));
+
+ # CJK Compatibility Ideographs (F900–FAFF)
+ return 1 if (&between_hexes($codepoint, 'F900', 'FAFF'));
+
+ # CJK Compatibility Forms (FE30–FE4F)
+ return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F'));
+
+	# Range U+FF65–FFDC encodes halfwidth forms of Katakana and Hangul characters
+ return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC'));
+
+ # Supplementary Ideographic Plane 20000–2FFFF
+ return 1 if (&between_hexes($codepoint, '20000', '2FFFF'));
+
+ return 0;
+}
+
+# Returns the code point of a Unicode char, represented as a decimal number
+sub codepoint_dec {
+ if (my $char = shift) {
+ return unpack('U0U*', $char);
+ }
+}
+
+sub between_hexes {
+ my ($num, $left, $right) = @_;
+ return $num >= hex($left) && $num <= hex($right);
+}
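
For reference, a quick sanity check of the classification logic introduced above. This is a standalone sketch: it uses simplified stand-ins for codepoint_dec and between_hexes rather than loading detokenizer.perl (the subs are not packaged as a module), and it assumes each tested value is a single already-decoded character.

#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
binmode STDOUT, ':encoding(UTF-8)';

# Simplified stand-ins for the helpers added above (assumption: input is one
# already-decoded character, so ord() yields its code point directly).
sub codepoint_dec { return ord($_[0]); }
sub between_hexes { my ($n, $l, $r) = @_; return $n >= hex($l) && $n <= hex($r); }

for my $c ('繁', 'あ', '한', 'a') {
    my $cp     = codepoint_dec($c);
    my $is_cjk = between_hexes($cp, '2E80', 'A4CF')    # CJK ideographs, kana, bopomofo, ...
              || between_hexes($cp, 'AC00', 'D7AF');    # Hangul Syllables
    printf "U+%04X %s\n", $cp, $is_cjk ? 'CJK' : 'not CJK';
}
# Expected: 繁 (U+7E41), あ (U+3042) and 한 (U+D55C) report CJK; 'a' does not.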