Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorbojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>2011-02-14 16:32:41 +0300
committerbojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>2011-02-14 16:32:41 +0300
commit26ccace94644af74bd945a3f91b8cb047b919e4f (patch)
tree4bf31ed7f3026cd451d703f8f9bafb35f263a409 /scripts/tokenizer
parent4c6dfbddc3685fa780c2e3d89471e92344278d7a (diff)
Czech detokenization
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3879 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/tokenizer')
-rwxr-xr-xscripts/tokenizer/detokenizer.perl60
1 files changed, 53 insertions, 7 deletions
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index 5eac00077..1745580d6 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -1,30 +1,41 @@
#!/usr/bin/perl -w
-# $Id: detokenizer.perl 928 2009-09-02 02:58:01Z philipp $
+# $Id$
# Sample De-Tokenizer
# written by Josh Schroeder, based on code by Philipp Koehn
+# further modifications by Ondrej Bojar
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
use strict;
+use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
my $language = "en";
my $QUIET = 0;
my $HELP = 0;
+my $UPPERCASE_SENT = 0;
while (@ARGV) {
$_ = shift;
/^-l$/ && ($language = shift, next);
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
+ /^-u$/ && ($UPPERCASE_SENT = 1, next);
}
if ($HELP) {
- print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n";
+ print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
+ print "Options:\n";
+ print " -u ... uppercase the first char in the final sentence.\n";
+ print " -q ... don't report detokenizer revision.\n";
exit;
}
+
+die "No built-in rules for language $language, claim en for default behaviour."
+ if $language !~ /^(cs|en|fr|it)$/;
+
if (!$QUIET) {
- print STDERR "Detokenizer Version 1.0\n";
+ print STDERR "Detokenizer Version ".'$Revision$'."\n";
print STDERR "Language: $language\n";
}
@@ -38,6 +49,14 @@ while(<STDIN>) {
}
}
+
+sub ucsecondarg {
+ # uppercase the second argument
+ my $arg1 = shift;
+ my $arg2 = shift;
+ return $arg1.uc($arg2);
+}
+
sub detokenize {
my($text) = @_;
chomp($text);
@@ -63,13 +82,38 @@ sub detokenize {
#left-shift the contraction for English
$text=$text.$words[$i];
$prependSpace = " ";
+ } elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) {
+ #left-shift floats in Czech
+ $text=$text.$words[$i];
+ $prependSpace = " ";
} elsif ((($language eq "fr") ||($language eq "it")) && ($i<(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
#right-shift the contraction for French and Italian
$text = $text.$prependSpace.$words[$i];
$prependSpace = "";
- } elsif ($words[$i] =~ /^[\'\"]+$/) {
+ } elsif (($language eq "cs") && ($i<(scalar(@words)-3))
+ && ($words[$i] =~ /[\p{IsAlpha}]$/)
+ && ($words[$i+1] =~ /^[-–]$/)
+ && ($words[$i+2] =~ /^li$|^mail.*/i)
+ ) {
+ #right-shift "-li" in Czech and a few Czech dashed words (e-mail)
+ $text = $text.$prependSpace.$words[$i].$words[$i+1];
+ $i++; # advance over the dash
+ $prependSpace = "";
+ } elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
#combine punctuation smartly
- if (($quoteCount{$words[$i]} % 2) eq 0) {
+ my $normalized_quo = $words[$i];
+ $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
+ $quoteCount{$normalized_quo} = 0
+ if !defined $quoteCount{$normalized_quo};
+ if ($language eq "cs" && $words[$i] eq "„") {
+ # this is always the starting quote in Czech
+ $quoteCount{$normalized_quo} = 0;
+ }
+ if ($language eq "cs" && $words[$i] eq "“") {
+ # this is usually the ending quote in Czech
+ $quoteCount{$normalized_quo} = 1;
+ }
+ if (($quoteCount{$normalized_quo} % 2) eq 0) {
if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
#single quote for posesssives ending in s... "The Jones' house"
#left shift
@@ -79,14 +123,14 @@ sub detokenize {
#right shift
$text = $text.$prependSpace.$words[$i];
$prependSpace = "";
- $quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+ $quoteCount{$normalized_quo} ++;
}
} else {
#left shift
$text=$text.$words[$i];
$prependSpace = " ";
- $quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+ $quoteCount{$normalized_quo} ++;
}
@@ -106,6 +150,8 @@ sub detokenize {
#add trailing break
$text .= "\n" unless $text =~ /\n$/;
+ $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+
return $text;
}