Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2007-03-26 09:46:50 +0400
committer: bojar <bojar@1f5c12ca-751b-0410-a591-d2e778427230> 2007-03-26 09:46:50 +0400
commit: 58bf2089af2bcdf4ad6f70906cdaea6975863457 (patch)
tree: 9e07d671cff826f4772e50b0cce0f28a3cacff39 /scripts/recaser
parent: 3d288d81e4cd4da62b976b2aaaabb63576d582a2 (diff)
Adding detokenizer from WMT07 shared scripts.tgz, hoping there are no copyright
problems. Please withdraw if necessary.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1327 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/recaser')
-rwxr-xr-x scripts/recaser/detokenizer.perl 109
1 files changed, 109 insertions, 0 deletions
diff --git a/scripts/recaser/detokenizer.perl b/scripts/recaser/detokenizer.perl
new file mode 100755
index 000000000..f23237c46
--- /dev/null
+++ b/scripts/recaser/detokenizer.perl
@@ -0,0 +1,109 @@
+#!/usr/bin/perl -w
+
+# Sample De-Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+use strict;
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# Run-time settings, overridable from the command line.
+my $language = "en";    # -l LANG: selects the language-specific joining rules
+my $QUIET = 0;          # -q: suppress the banner printed on STDERR
+my $HELP = 0;           # -h: print the usage line and exit
+
+# Hand-rolled option scan; arguments that match no option are skipped.
+while (@ARGV) {
+    my $arg = shift @ARGV;
+    if ($arg =~ /^-l$/) {
+        # Without this check a trailing -l would leave $language undefined.
+        die "Option -l requires a language code (e.g. en or fr)\n" unless @ARGV;
+        $language = shift @ARGV;
+    }
+    elsif ($arg =~ /^-q$/) { $QUIET = 1; }
+    elsif ($arg =~ /^-h$/) { $HELP = 1; }
+}
+
+if ($HELP) {
+    print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n";
+    exit;
+}
+print STDERR "Detokenizer Version 1.0\nLanguage: $language\n" unless $QUIET;
+
+# Filter STDIN to STDOUT one line at a time.
+while (my $line = <STDIN>) {
+    # Lines that are a single <...> span, and blank lines, pass through as is.
+    my $out = ($line =~ /^<.+>$/ || $line =~ /^\s*$/) ? $line : detokenize($line);
+    print $out;
+}
+
+# Rejoin one line of space-separated tokens into naturally spaced text.
+#   $text   - one tokenized line; a trailing newline is optional
+#   returns - the detokenized line, always ending in "\n"
+# Reads the file-level $language: "en" turns on the apostrophe rules for
+# English clitics and possessives, "fr" the rule for French elisions.
+sub detokenize {
+    my ($text) = @_;
+    chomp($text);
+
+    # The padded string splits into an empty first field followed by the
+    # tokens; split() drops the trailing empty field on its own.
+    my @words = split(/ /, " $text ");
+    my %quoteCount = ("\'" => 0, "\"" => 0);    # quote tokens seen so far
+    my $prependSpace = " ";                     # separator owed to the next token
+    $text = "";
+
+    for my $i (0 .. $#words) {
+        my $word = $words[$i];
+        my $prev = $i > 0 ? $words[$i-1] : "";
+        # "left": no space before this token; "right": no space after it.
+        my $attach = "";
+
+        if ($word =~ /^[\p{IsSc}\(\[\{\x{BF}\x{A1}]+$/) {
+            # Currency signs, opening brackets and inverted ?/! (U+00BF, U+00A1,
+            # spelled as code points so the source file's encoding is irrelevant).
+            $attach = "right";
+        }
+        elsif ($word =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/) {
+            # Closing punctuation hugs the preceding token.
+            $attach = "left";
+        }
+        elsif ($language eq "en" && $word =~ /^[\'][\p{IsAlpha}]/ && $prev =~ /[\p{IsAlnum}]$/) {
+            # English clitic ('s, 're, ...) joins the word before it.
+            $attach = "left";
+        }
+        elsif ($language eq "fr" && $i < $#words
+               && $word =~ /[\p{IsAlpha}][\']$/ && $words[$i+1] =~ /^[\p{IsAlpha}]/) {
+            # French elision (l', d', ...) joins the word after it. The bound
+            # is $#words so that the final pair of tokens is covered as well.
+            $attach = "right";
+        }
+        elsif ($word =~ /^[\'\"]+$/) {
+            # Multi-character quote tokens ('' etc.) have no preset counter.
+            my $seen = $quoteCount{$word} || 0;
+            if ($seen % 2 == 0 && $language eq "en" && $word eq "'" && $prev =~ /[s]$/) {
+                # Possessive after a word in -s ("The Jones' house"): not counted.
+                $attach = "left";
+            }
+            else {
+                # Even count opens a quotation, odd count closes it.
+                $attach = ($seen % 2 == 0) ? "right" : "left";
+                $quoteCount{$word} = $seen + 1;
+            }
+        }
+
+        $text .= ($attach eq "left" ? "" : $prependSpace) . $word;
+        $prependSpace = ($attach eq "right") ? "" : " ";
+    }
+
+    # Collapse runs of spaces and trim the ends of the line.
+    $text =~ s/ +/ /g;
+    $text =~ s/\n /\n/g;
+    $text =~ s/ \n/\n/g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    $text .= "\n" unless $text =~ /\n$/;
+    return $text;
+}
+