diff options
author | bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-08-08 19:30:54 +0400 |
---|---|---|
committer | bgottesman <bgottesman@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-08-08 19:30:54 +0400 |
commit | 24f5bf6723460d838e2ef03158d0ff6ce700ddbf (patch) | |
tree | 5373beced8b88140adec152286a23621139aa8f9 | |
parent | 14587cdafc42cdbff9221c07b5545551ecec475b (diff) |
When detokenizing, remove whitespace between a pair of adjacent CJK (Chinese/Japanese/Korean) words.
This gets the Chinese and Japanese tests working, so their expected-failure markers are removed.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4134 1f5c12ca-751b-0410-a591-d2e778427230
-rw-r--r-- | regression-testing/run-test-detokenizer.t | 14 | ||||
-rwxr-xr-x | scripts/tokenizer/detokenizer.perl | 90 |
2 files changed, 91 insertions, 13 deletions
diff --git a/regression-testing/run-test-detokenizer.t b/regression-testing/run-test-detokenizer.t index 9d677b43e..da6585cc8 100644 --- a/regression-testing/run-test-detokenizer.t +++ b/regression-testing/run-test-detokenizer.t @@ -107,9 +107,7 @@ Frau Präsidentin! Frau Díez González und ich hatten einige Anfragen EXP ); -# A (failing) simple Chinese test -{ -my $testCase = +# A simple Chinese test &addDetokenizerTest("TEST_CHINESE_EASY", undef, <<'TOK' 这 是 一个 简单 的的 汉语 句子 。 @@ -120,12 +118,7 @@ TOK EXP ); -$testCase->setExpectedToFail("Chinese detokenization is not implemented yet."); -} - -# A (failing) simple Japanese test -{ -my $testCase = +# A simple Japanese test &addDetokenizerTest("TEST_JAPANESE_EASY", undef, <<'TOK' どう しょ う か な 。 @@ -138,9 +131,6 @@ TOK EXP ); -$testCase->setExpectedToFail("Japanese detokenization is not implemented yet."); -} - ###################################### # Now run those babies ... diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl index f049b8080..ac88834b0 100755 --- a/scripts/tokenizer/detokenizer.perl +++ b/scripts/tokenizer/detokenizer.perl @@ -73,7 +73,16 @@ sub detokenize { my %quoteCount = ("\'"=>0,"\""=>0); my $prependSpace = " "; for ($i=0;$i<(scalar(@words));$i++) { - if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { + if (&startsWithCJKChar($words[$i])) { + if ($i > 0 && &endsWithCJKChar($words[$i-1])) { + # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word + $text=$text.$words[$i]; + } else { + # ... 
but do nothing special if this is a CJK word that doesn't follow a CJK word + $text=$text.$prependSpace.$words[$i]; + } + $prependSpace = " "; + } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { #perform right shift on currency and other random punctuation items $text = $text.$prependSpace.$words[$i]; $prependSpace = ""; @@ -161,3 +170,82 @@ sub detokenize { return $text; } +sub startsWithCJKChar { + my ($str) = @_; + return 0 if length($str) == 0; + my $firstChar = substr($str, 0, 1); + return &charIsCJK($firstChar); +} + +sub endsWithCJKChar { + my ($str) = @_; + return 0 if length($str) == 0; + my $lastChar = substr($str, length($str)-1, 1); + return &charIsCJK($lastChar); +} + +# Given a string consisting of one character, returns true iff the character +# is a CJK (Chinese/Japanese/Korean) character +sub charIsCJK { + my ($char) = @_; + # $char should be a string of length 1 + my $codepoint = &codepoint_dec($char); + + # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane + + # Hangul Jamo (1100–11FF) + return 1 if (&between_hexes($codepoint, '1100', '11FF')); + + # CJK Radicals Supplement (2E80–2EFF) + # Kangxi Radicals (2F00–2FDF) + # Ideographic Description Characters (2FF0–2FFF) + # CJK Symbols and Punctuation (3000–303F) + # Hiragana (3040–309F) + # Katakana (30A0–30FF) + # Bopomofo (3100–312F) + # Hangul Compatibility Jamo (3130–318F) + # Kanbun (3190–319F) + # Bopomofo Extended (31A0–31BF) + # CJK Strokes (31C0–31EF) + # Katakana Phonetic Extensions (31F0–31FF) + # Enclosed CJK Letters and Months (3200–32FF) + # CJK Compatibility (3300–33FF) + # CJK Unified Ideographs Extension A (3400–4DBF) + # Yijing Hexagram Symbols (4DC0–4DFF) + # CJK Unified Ideographs (4E00–9FFF) + # Yi Syllables (A000–A48F) + # Yi Radicals (A490–A4CF) + return 1 if (&between_hexes($codepoint, '2E80', 'A4CF')); + + # Phags-pa (A840–A87F) + return 1 if (&between_hexes($codepoint, 'A840', 'A87F')); + + # Hangul Syllables 
(AC00–D7AF) + return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF')); + + # CJK Compatibility Ideographs (F900–FAFF) + return 1 if (&between_hexes($codepoint, 'F900', 'FAFF')); + + # CJK Compatibility Forms (FE30–FE4F) + return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F')); + + # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters + return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC')); + + # Supplementary Ideographic Plane 20000–2FFFF + return 1 if (&between_hexes($codepoint, '20000', '2FFFF')); + + return 0; +} + +# Returns the code point of a Unicode char, represented as a decimal number +sub codepoint_dec { + if (my $char = shift) { + return unpack('U0U*', $char); + } +} + +sub between_hexes { + my ($num, $left, $right) = @_; + return $num >= hex($left) && $num <= hex($right); +} |