Czech detokenization

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3879 1f5c12ca-751b-0410-a591-d2e778427230
author: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2011-02-14 16:32:41 +0300
committer: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2011-02-14 16:32:41 +0300
commit: 26ccace94644af74bd945a3f91b8cb047b919e4f (patch)
tree: 4bf31ed7f3026cd451d703f8f9bafb35f263a409 /scripts/tokenizer
parent: 4c6dfbddc3685fa780c2e3d89471e92344278d7a (diff)
1 files changed, 53 insertions, 7 deletions
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index 5eac00077..1745580d6 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -1,30 +1,41 @@
 #!/usr/bin/perl -w
 
-# $Id: detokenizer.perl 928 2009-09-02 02:58:01Z philipp $
+# $Id$
 # Sample De-Tokenizer
 # written by Josh Schroeder, based on code by Philipp Koehn
+# further modifications by Ondrej Bojar
 
 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 use strict;
+use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
 
 my $language = "en";
 my $QUIET = 0;
 my $HELP = 0;
+my $UPPERCASE_SENT = 0;
 
 while (@ARGV) {
 	$_ = shift;
 	/^-l$/ && ($language = shift, next);
 	/^-q$/ && ($QUIET = 1, next);
 	/^-h$/ && ($HELP = 1, next);
+	/^-u$/ && ($UPPERCASE_SENT = 1, next);
 }
 
 if ($HELP) {
-	print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n";
+	print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
+        print "Options:\n";
+        print "  -u  ... uppercase the first char in the final sentence.\n";
+        print "  -q  ... don't report detokenizer revision.\n";
 	exit;
 }
+
+die "No built-in rules for language $language, claim en for default behaviour."
+	if $language !~ /^(cs|en|fr|it)$/;
+
 if (!$QUIET) {
-	print STDERR "Detokenizer Version 1.0\n";
+	print STDERR "Detokenizer Version ".'$Revision$'."\n";
 	print STDERR "Language: $language\n";
 }
 
@@ -38,6 +49,14 @@ while(<STDIN>) {
 	}
 }
 
+
+sub ucsecondarg {
+  # return the first argument unchanged, followed by the uppercased second argument (s///e helper for -u)
+  my $arg1 = shift;
+  my $arg2 = shift;
+  return $arg1.uc($arg2);
+}
+
 sub detokenize {
 	my($text) = @_;
 	chomp($text);
@@ -63,13 +82,38 @@ sub detokenize {
 			#left-shift the contraction for English
 			$text=$text.$words[$i];
 			$prependSpace = " ";
+		} elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) {
+			#left-shift floats in Czech
+			$text=$text.$words[$i];
+			$prependSpace = " ";
 		}  elsif ((($language eq "fr") ||($language eq "it")) && ($i<(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
 			#right-shift the contraction for French and Italian
 			$text = $text.$prependSpace.$words[$i];
 			$prependSpace = "";
-		} elsif ($words[$i] =~ /^[\'\"]+$/) {
+		} elsif (($language eq "cs") && ($i<(scalar(@words)-3))
+				&& ($words[$i] =~ /[\p{IsAlpha}]$/)
+				&& ($words[$i+1] =~ /^[-–]$/)
+				&& ($words[$i+2] =~ /^li$|^mail.*/i)
+				) {
+			#right-shift "-li" in Czech and a few Czech dashed words (e-mail)
+			$text = $text.$prependSpace.$words[$i].$words[$i+1];
+			$i++; # advance over the dash
+			$prependSpace = "";
+		} elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
 			#combine punctuation smartly
-			if (($quoteCount{$words[$i]} % 2) eq 0) {
+                        my $normalized_quo = $words[$i];
+                        $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
+                        $quoteCount{$normalized_quo} = 0
+                                if !defined $quoteCount{$normalized_quo};
+                        if ($language eq "cs" && $words[$i] eq "„") {
+                          # this is always the starting quote in Czech
+                          $quoteCount{$normalized_quo} = 0;
+                        }
+                        if ($language eq "cs" && $words[$i] eq "“") {
+                          # this is usually the ending quote in Czech
+                          $quoteCount{$normalized_quo} = 1;
+                        }
+			if (($quoteCount{$normalized_quo} % 2) eq 0) {
 				if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
 					#single quote for posesssives ending in s... "The Jones' house"
 					#left shift
@@ -79,14 +123,14 @@ sub detokenize {
 					#right shift
 					$text = $text.$prependSpace.$words[$i];
 					$prependSpace = "";
-					$quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+					$quoteCount{$normalized_quo} ++;
 
 				}
 			} else {
 				#left shift
 				$text=$text.$words[$i];
 				$prependSpace = " ";
-				$quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+				$quoteCount{$normalized_quo} ++;
 
 			}
 			
@@ -106,6 +150,8 @@ sub detokenize {
 	#add trailing break
 	$text .= "\n" unless $text =~ /\n$/;
 
+        $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+
 	return $text;
 }
author	bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>	2011-02-14 16:32:41 +0300
committer	bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230>	2011-02-14 16:32:41 +0300
commit	26ccace94644af74bd945a3f91b8cb047b919e4f (patch)
tree	4bf31ed7f3026cd451d703f8f9bafb35f263a409 /scripts/tokenizer
parent	4c6dfbddc3685fa780c2e3d89471e92344278d7a (diff)