From 6af3140978d271f2222971b73ca5a363e8d68bd7 Mon Sep 17 00:00:00 2001
From: bojar
Date: Fri, 22 Feb 2008 14:50:43 +0000
Subject: added optional sentence uppercasing (use -u)

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1566 1f5c12ca-751b-0410-a591-d2e778427230
---
 scripts/recaser/detokenizer.perl | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'scripts')

diff --git a/scripts/recaser/detokenizer.perl b/scripts/recaser/detokenizer.perl
index 60d8732c9..0ab8c1588 100755
--- a/scripts/recaser/detokenizer.perl
+++ b/scripts/recaser/detokenizer.perl
@@ -2,6 +2,7 @@

 # Sample De-Tokenizer
 # written by Josh Schroeder, based on code by Philipp Koehn
+# further modifications by Ondrej Bojar

 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
@@ -10,16 +11,21 @@ use strict;
 my $language = "en";
 my $QUIET = 0;
 my $HELP = 0;
+my $UPPERCASE_SENT = 0;

 while (@ARGV) {
     $_ = shift;
     /^-l$/ && ($language = shift, next);
     /^-q$/ && ($QUIET = 1, next);
     /^-h$/ && ($HELP = 1, next);
+    /^-u$/ && ($UPPERCASE_SENT = 1, next);
 }

 if ($HELP) {
-    print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n";
+    print "Usage ./detokenizer.perl (-l [en|fr|cs]) < tokenizedfile > detokenizedfile\n";
+    print "Options:\n";
+    print " -u ... uppercase the first char in the final sentence.\n";
+    print " -q ... don't report detokenizer revision.\n";
     exit;
 }

@@ -117,6 +123,8 @@ sub detokenize {
     #add trailing break
     $text .= "\n" unless $text =~ /\n$/;

+    $text = ucfirst($text) if $UPPERCASE_SENT;
+
     return $text;
 }

-- 
cgit v1.2.3