Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
author Hieu Hoang <hieuhoang@gmail.com> 2013-05-16 20:03:37 +0400
committer Hieu Hoang <hieuhoang@gmail.com> 2013-05-16 20:03:37 +0400
commit f96a82d26c126d7733116a23d7cb557d61646e0c (patch)
tree 2b1ef7543034926be9e15c5c9be9f7fb41bfef58 /scripts/tokenizer
parent 97d6cb1caae7c4c37847041408c3779b84d87b31 (diff)
add normalize-punctuation.perl, from WMT
Diffstat (limited to 'scripts/tokenizer')
-rwxr-xr-x scripts/tokenizer/normalize-punctuation.perl 73
1 files changed, 73 insertions, 0 deletions
diff --git a/scripts/tokenizer/normalize-punctuation.perl b/scripts/tokenizer/normalize-punctuation.perl
new file mode 100755
index 000000000..76f58714f
--- /dev/null
+++ b/scripts/tokenizer/normalize-punctuation.perl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+# Normalize punctuation and spacing, line by line, from STDIN to STDOUT.
+# Usage: normalize-punctuation.perl [LANGUAGE] < input > output
+# LANGUAGE (en, de, es, fr, cs/cz, ...) picks the quote/comma order and the
+# digit separator. Text is matched as raw bytes (no "use utf8"), assumed UTF-8.
+use strict;
+
+my ($language) = @ARGV;
+# No argument: compare against "" instead of undef (same branches, no -w warning per line).
+$language = "" unless defined $language;
+# Invisible characters are spelled as UTF-8 byte escapes; typed literally they look like a plain space or nothing.
+my $nbsp = qr/\xC2\xA0/;      # U+00A0 no-break space, the "pseudo-space"
+my $bom  = qr/\xEF\xBB\xBF/;  # U+FEFF; presumably what the check before the digit rule looked for - TODO confirm
+my $digit_sep = (grep { $language eq $_ } qw(de es cz cs fr)) ? "," : ".";  # joins digits split by a pseudo-space
+
+while(<STDIN>) {
+    s/\r//g;
+    # remove extra spaces
+    s/\(/ \(/g;
+    s/\)/\) /g; s/ +/ /g;
+    s/\) ([\.\!\:\?\;\,])/\)$1/g;
+    s/\( /\(/g;
+    s/ \)/\)/g;
+    s/(\d) \%/$1\%/g;
+    s/ :/:/g;
+    s/ ;/;/g;
+    # normalize unicode punctuation
+    s/„|“|”/\"/g;
+    s/–/-/g;
+    s/—/ - /g; s/ +/ /g;
+    s/´/\'/g;
+    s/([a-z])‘([a-z])/$1\'$2/gi;   # a single quote between letters is an apostrophe
+    s/([a-z])’([a-z])/$1\'$2/gi;
+    s/‘|‚|’/\"/g;
+    s/''/\"/g;                     # also covers doubled acute accents, which are '' by now
+    s/…/.../g;
+    # French quotes - NOTE(review): these spaces may originally have been no-break spaces; confirm
+    s/ « / \"/g;
+    s/« /\"/g;
+    s/«/\"/g;
+    s/ » /\" /g;
+    s/ »/\"/g;
+    s/»/\"/g;
+    # handle pseudo-spaces
+    s/${nbsp}\%/\%/g;
+    s/nº${nbsp}/nº /g;
+    s/${nbsp}:/:/g;
+    s/${nbsp}ºC/ ºC/g;
+    s/${nbsp}cm/ cm/g;
+    s/${nbsp}\?/\?/g;
+    s/${nbsp}\!/\!/g;
+    s/${nbsp};/;/g;
+    s/,${nbsp}/, /g; s/ +/ /g;
+
+    # English "quotation," followed by comma, style
+    if ($language eq "en") {
+        s/\"([,\.]+)/$1\"/g;
+    }
+    # Czech is confused
+    elsif ($language eq "cs" || $language eq "cz") {
+    }
+    # German/Spanish/French "quotation", followed by comma, style
+    else {
+        s/,\"/\",/g;
+        s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
+    }
+
+    # Report lines holding the invisible mark; an empty pattern would re-run the last matched regex instead.
+    print STDERR $_ if /$bom/;
+    # Only a pseudo-space between digits is replaced; a plain space would glue unrelated numbers together.
+    s/(\d)${nbsp}(\d)/$1$digit_sep$2/g;
+    print $_;
+}