Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2008-02-22 18:07:46 +0300
committer: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2008-02-22 18:07:46 +0300
commit: 7f3e34207ad822f4543ba11769426520d0453349 (patch)
tree: bc0e96ac0e8bd2271322370ff38f325b39f07979 /scripts/recaser
parent: 6af3140978d271f2222971b73ca5a363e8d68bd7 (diff)
added some heuristics for Czech quotation marks
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1567 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/recaser')
-rwxr-xr-x scripts/recaser/detokenizer.perl 21
1 files changed, 17 insertions, 4 deletions
diff --git a/scripts/recaser/detokenizer.perl b/scripts/recaser/detokenizer.perl
index 0ab8c1588..386f71e2d 100755
--- a/scripts/recaser/detokenizer.perl
+++ b/scripts/recaser/detokenizer.perl
@@ -7,6 +7,7 @@
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
use strict;
+use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
my $language = "en";
my $QUIET = 0;
@@ -84,9 +85,21 @@ sub detokenize {
$text = $text.$prependSpace.$words[$i].$words[$i+1];
$i++; # advance over the dash
$prependSpace = "";
- } elsif ($words[$i] =~ /^[\'\"]+$/) {
+ } elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
#combine punctuation smartly
- if (($quoteCount{$words[$i]} % 2) eq 0) {
+ my $normalized_quo = $words[$i];
+ $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
+ $quoteCount{$normalized_quo} = 0
+ if !defined $quoteCount{$normalized_quo};
+ if ($language eq "cs" && $words[$i] eq "„") {
+ # this is always the starting quote in Czech
+ $quoteCount{$normalized_quo} = 0;
+ }
+ if ($language eq "cs" && $words[$i] eq "“") {
+ # this is usually the ending quote in Czech
+ $quoteCount{$normalized_quo} = 1;
+ }
+ if (($quoteCount{$normalized_quo} % 2) eq 0) {
if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
#single quote for posesssives ending in s... "The Jones' house"
#left shift
@@ -96,14 +109,14 @@ sub detokenize {
#right shift
$text = $text.$prependSpace.$words[$i];
$prependSpace = "";
- $quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+ $quoteCount{$normalized_quo} ++;
}
} else {
#left shift
$text=$text.$words[$i];
$prependSpace = " ";
- $quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+ $quoteCount{$normalized_quo} ++;
}