Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/recaser/detokenizer.perl')
-rwxr-xr-x scripts/recaser/detokenizer.perl 143
1 files changed, 143 insertions, 0 deletions
diff --git a/scripts/recaser/detokenizer.perl b/scripts/recaser/detokenizer.perl
new file mode 100755
index 000000000..2a43c30cf
--- /dev/null
+++ b/scripts/recaser/detokenizer.perl
@@ -0,0 +1,143 @@
+#!/usr/bin/perl -w
+
+# Sample De-Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+# further modifications by Ondrej Bojar
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+use strict;
+use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
+
+# Command-line settings; $language and $UPPERCASE_SENT are read by detokenize().
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $UPPERCASE_SENT = 0;
+
+# Hand-rolled option parsing; unrecognised arguments are silently ignored.
+while (@ARGV) {
+    my $arg = shift @ARGV;
+    if    ($arg =~ /^-l$/) { $language = shift @ARGV; }
+    elsif ($arg =~ /^-q$/) { $QUIET = 1; }
+    elsif ($arg =~ /^-h$/) { $HELP = 1; }
+    elsif ($arg =~ /^-u$/) { $UPPERCASE_SENT = 1; }
+}
+
+if ($HELP) {
+    print "Usage ./detokenizer.perl (-l [en|fr|cs]) < tokenizedfile > detokenizedfile\n";
+    print "Options:\n";
+    print " -u ... uppercase the first char in the final sentence.\n";
+    print " -q ... don't report detokenizer revision.\n";
+    exit;
+}
+
+# A trailing "-l" without a value leaves $language undefined; report it clearly.
+die "Option -l requires a language code (cs, en or fr).\n" if !defined $language;
+die "No built-in rules for language $language, claim en for default behaviour."
+    if $language !~ /^(cs|en|fr)$/;
+
+if (!$QUIET) {
+    print STDERR "Detokenizer Version ".'$Revision: 1567 $'."\n";
+    print STDERR "Language: $language\n";
+}
+
+# Pass XML/HTML tag lines and blank lines through unchanged; detokenize the rest.
+while (my $line = <STDIN>) {
+    my $is_passthrough = $line =~ /^<.+>$/ || $line =~ /^\s*$/;
+    print STDOUT ($is_passthrough ? $line : detokenize($line));
+}
+
+# detokenize($text): undo tokenizer spacing on one line of tokenized text.
+# $text is one input line whose tokens are expected to be separated by single
+# spaces.  Returns the detokenized line, always terminated by "\n".
+# Reads the file-level $language (selects the en/fr/cs specific rules) and
+# $UPPERCASE_SENT (upper-cases the first character of the result).
+sub detokenize {
+    my ($text) = @_;
+    chomp($text);
+    $text = " $text ";
+
+    # The leading space makes element 0 empty, so real tokens start at index 1.
+    my @words = split(/ /, $text);
+    $text = "";
+    my %quoteCount = ("\'" => 0, "\"" => 0);    # quotes seen so far, per kind
+    my $prependSpace = " ";                     # separator for the next token
+    for (my $i = 0; $i < scalar(@words); $i++) {
+        my $tok = $words[$i];
+        if ($tok =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+            # currency symbols and opening punctuation attach to what follows
+            $text .= $prependSpace . $tok;
+            $prependSpace = "";
+        } elsif ($tok =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/) {
+            # closing punctuation attaches to what precedes it
+            $text .= $tok;
+            $prependSpace = " ";
+        } elsif ($language eq "en" && $i > 0 && $tok =~ /^[\'][\p{IsAlpha}]/ && $words[$i-1] =~ /[\p{IsAlnum}]$/) {
+            # English contraction ('s, 'll, ...): attach to the previous word
+            $text .= $tok;
+            $prependSpace = " ";
+        } elsif ($language eq "fr" && $i < $#words && $tok =~ /[\p{IsAlpha}][\']$/ && $words[$i+1] =~ /^[\p{IsAlpha}]/) {
+            # French elision (l', d', ...): attach to the next word.  The bound
+            # is $i < $#words so an elision right before the last token is joined.
+            $text .= $prependSpace . $tok;
+            $prependSpace = "";
+        } elsif ($language eq "cs" && $i + 2 <= $#words && $tok =~ /[\p{IsAlpha}]$/
+                 && $words[$i+1] =~ /^[-–]$/ && $words[$i+2] =~ /^li$/i) {
+            # Czech "-li": glue word and dash here; "li" is attached on the next
+            # iteration.  The bound allows "li" to be the last token of the line.
+            $text .= $prependSpace . $tok . $words[$i+1];
+            $i++;    # advance over the dash
+            $prependSpace = "";
+        } elsif ($tok =~ /^[\'\"„“”`]+$/) {
+            # Quotes: the parity of the running count decides opening vs closing.
+            # The right curly quote is accepted here so the normalization can see it.
+            my $normalized_quo = $tok;
+            $normalized_quo = '"' if $tok =~ /^[„“”]+$/;
+            $quoteCount{$normalized_quo} = 0 if !defined $quoteCount{$normalized_quo};
+            if ($language eq "cs" && $tok eq "„") {
+                # this is always the starting quote in Czech
+                $quoteCount{$normalized_quo} = 0;
+            }
+            if ($language eq "cs" && $tok eq "“") {
+                # this is usually the ending quote in Czech
+                $quoteCount{$normalized_quo} = 1;
+            }
+            if ($quoteCount{$normalized_quo} % 2 == 0) {
+                if ($language eq "en" && $tok eq "'" && $i > 0 && $words[$i-1] =~ /[s]$/) {
+                    # possessive of a word ending in s ("The Jones' house"):
+                    # attach to the previous word and do not count it as a quote
+                    $text .= $tok;
+                    $prependSpace = " ";
+                } else {
+                    # opening quote: attach to what follows
+                    $text .= $prependSpace . $tok;
+                    $prependSpace = "";
+                    $quoteCount{$normalized_quo}++;
+                }
+            } else {
+                # closing quote: attach to what precedes it
+                $text .= $tok;
+                $prependSpace = " ";
+                $quoteCount{$normalized_quo}++;
+            }
+        } else {
+            # ordinary token
+            $text .= $prependSpace . $tok;
+            $prependSpace = " ";
+        }
+    }
+
+    # clean up spaces at head and tail of each line as well as any double-spacing
+    $text =~ s/ +/ /g;
+    $text =~ s/\n /\n/g;
+    $text =~ s/ \n/\n/g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    # add trailing break
+    $text .= "\n" unless $text =~ /\n$/;
+    $text = ucfirst($text) if $UPPERCASE_SENT;
+    return $text;
+}
+