Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-05-05 03:04:10 +0400
committer: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-05-05 03:04:10 +0400
commit: 2ed6804f12e6703610fb4ad40567de7cd9f17348 (patch)
tree: 5a207145f8d07173018043a28b06709269c46301 /scripts/tokenizer
parent: 0118733a45ad8f9513187717f19289bbc1c7ba53 (diff)
official release of experiment.perl
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3224 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/tokenizer')
-rwxr-xr-x scripts/tokenizer/detokenizer.perl 111
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de 325
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el 2
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en 107
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it 109
-rwxr-xr-x scripts/tokenizer/tokenizer.perl 175
6 files changed, 829 insertions, 0 deletions
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
new file mode 100755
index 000000000..5eac00077
--- /dev/null
+++ b/scripts/tokenizer/detokenizer.perl
@@ -0,0 +1,111 @@
+#!/usr/bin/perl -w
+
+# $Id: detokenizer.perl 928 2009-09-02 02:58:01Z philipp $
+# Sample De-Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+use strict;
+
+my $language = "en";   # language code; overridden by -l
+my $QUIET = 0;         # set by -q: suppress the version banner on STDERR
+my $HELP = 0;          # set by -h: print usage and exit
+
+while (@ARGV) {
+ $_ = shift;   # arguments that match none of the flags below are silently ignored
+ /^-l$/ && ($language = shift, next);
+ /^-q$/ && ($QUIET = 1, next);
+ /^-h$/ && ($HELP = 1, next);
+}
+
+if ($HELP) {
+ print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n";
+ exit;
+}
+if (!$QUIET) {
+ print STDERR "Detokenizer Version 1.0\n";
+ print STDERR "Language: $language\n";
+}
+
+while(<STDIN>) {   # one input line per iteration, held in $_
+ if (/^<.+>$/ || /^\s*$/) {
+ #don't try to detokenize XML/HTML tag lines (or blank lines); echo them unchanged
+ print $_;
+ }
+ else {
+ print &detokenize($_);
+ }
+}
+
+# Rejoin one line of space-separated tokens into natural-looking text.
+# Takes the tokenized line (trailing newline optional) and returns the
+# detokenized line, always ending in "\n".  Reads the file-level $language
+# to choose the contraction rules ("en", or "fr"/"it").
+sub detokenize {
+    my ($text) = @_;
+    chomp($text);
+    $text = " $text ";
+    $text =~ s/ \@\-\@ /-/g;    # collapse the " @-@ " token back into a bare hyphen
+
+    # The padding above makes $words[0] empty; split drops trailing empty fields.
+    my @words = split(/ /, $text);
+    $text = "";
+    my %quoteCount = ("\'" => 0, "\"" => 0);
+    my $prependSpace = " ";     # separator emitted before the next ordinary token
+    for (my $i = 0; $i < scalar(@words); $i++) {
+        if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+            #perform right shift on currency and other random punctuation items
+            $text = $text.$prependSpace.$words[$i];
+            $prependSpace = "";
+        } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/) {
+            #perform left shift on punctuation items
+            $text = $text.$words[$i];
+            $prependSpace = " ";
+        } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+            #left-shift the contraction for English
+            $text = $text.$words[$i];
+            $prependSpace = " ";
+        } elsif ((($language eq "fr") || ($language eq "it")) && ($i < scalar(@words)-1) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
+            #right-shift the contraction for French and Italian (only $words[$i+1] has to exist)
+            $text = $text.$prependSpace.$words[$i];
+            $prependSpace = "";
+        } elsif ($words[$i] =~ /^[\'\"]+$/) {
+            #combine punctuation smartly: even count so far = right shift, odd = left shift
+            my $seen = $quoteCount{$words[$i]} || 0;    # multi-character quote tokens start unset
+            if (($seen % 2) == 0) {
+                if (($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
+                    #single quote for possessives ending in s... "The Jones' house"
+                    #left shift, and leave the quote count untouched
+                    $text = $text.$words[$i];
+                    $prependSpace = " ";
+                } else {
+                    #right shift
+                    $text = $text.$prependSpace.$words[$i];
+                    $prependSpace = "";
+                    $quoteCount{$words[$i]} = $seen + 1;
+                }
+            } else {
+                #left shift
+                $text = $text.$words[$i];
+                $prependSpace = " ";
+                $quoteCount{$words[$i]} = $seen + 1;
+            }
+        } else {
+            $text = $text.$prependSpace.$words[$i];
+            $prependSpace = " ";
+        }
+    }
+
+    # clean up spaces at head and tail of each line as well as any double-spacing
+    $text =~ s/ +/ /g;
+    $text =~ s/\n /\n/g;
+    $text =~ s/ \n/\n/g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    #add trailing break
+    $text .= "\n" unless $text =~ /\n$/;
+    return $text;
+}
+
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de
new file mode 100644
index 000000000..35fdf5eee
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de
@@ -0,0 +1,325 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+#no german words end in single lower-case letters, so we throw those in too.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in German.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#Titles and Honorifics
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Ens
+Gen
+Gov
+Hon
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#Misc symbols
+Mio
+Mrd
+bzw
+v
+vs
+usw
+d.h
+z.B
+u.a
+etc
+Mrd
+MwSt
+ggf
+d.J
+D.h
+m.E
+vgl
+I.F
+z.T
+sogen
+ff
+u.E
+g.U
+g.g.A
+c.-à-d
+Buchst
+u.s.w
+sog
+u.ä
+Std
+evtl
+Zt
+Chr
+u.U
+o.ä
+Ltd
+b.A
+z.Zt
+spp
+sen
+SA
+k.o
+jun
+i.H.v
+dgl
+dergl
+Co
+zzt
+usf
+s.p.a
+Dkr
+Corp
+bzgl
+BSE
+
+#Number indicators
+# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
+No
+Nos
+Art
+Nr
+pp
+ca
+Ca
+
+#Ordinals are done with . in German - "1." = "1st" in English
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el
new file mode 100644
index 000000000..0470f9192
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el
@@ -0,0 +1,2 @@
+# for now, just include the Greek equivalent of "Mr."
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en
new file mode 100644
index 000000000..e1a3733b5
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en
@@ -0,0 +1,107 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are non-breaking prefixes ONLY when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY#
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it
new file mode 100644
index 000000000..ad4d355c7
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it
@@ -0,0 +1,109 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+dott
+Sig
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are non-breaking prefixes ONLY when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY#
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
new file mode 100755
index 000000000..54be11644
--- /dev/null
+++ b/scripts/tokenizer/tokenizer.perl
@@ -0,0 +1,175 @@
+#!/usr/bin/perl -w
+
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# Sample Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use FindBin qw($Bin);
+use strict;
+#use Time::HiRes;
+
+my $mydir = "$Bin/nonbreaking_prefixes";   # prefix lists are looked up relative to FindBin's $Bin
+
+my %NONBREAKING_PREFIX = ();   # prefix => 1 or 2, filled in by load_prefixes below
+my $language = "en";           # language code; overridden by -l
+my $QUIET = 0;                 # set by -q: suppress the version banner on STDERR
+my $HELP = 0;                  # set by -h: print usage and exit
+my $AGGRESSIVE = 0;            # set by -a: enables the hyphen splitting in tokenize
+
+#my $start = [ Time::HiRes::gettimeofday( ) ];
+
+while (@ARGV) {
+ $_ = shift;   # arguments that match none of the flags below are silently ignored
+ /^-l$/ && ($language = shift, next);
+ /^-q$/ && ($QUIET = 1, next);
+ /^-h$/ && ($HELP = 1, next);
+ /^-a$/ && ($AGGRESSIVE = 1, next);
+}
+
+if ($HELP) {
+ print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
+ exit;
+}
+if (!$QUIET) {
+ print STDERR "Tokenizer Version 1.0\n";
+ print STDERR "Language: $language\n";
+}
+
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+if (scalar(%NONBREAKING_PREFIX) eq 0){   # nothing was loaded: warn, but keep going
+ print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+while(<STDIN>) {   # one input line per iteration, held in $_
+ if (/^<.+>$/ || /^\s*$/) {
+ #don't try to tokenize XML/HTML tag lines (or blank lines); echo them unchanged
+ print $_;
+ }
+ else {
+ print &tokenize($_);
+ }
+}
+
+#my $duration = Time::HiRes::tv_interval( $start );
+#print STDERR ("EXECUTION TIME: ".$duration."\n");
+# Tokenize one line of text; returns it with tokens separated by single spaces and a final "\n".
+# Reads the file-level $language, $AGGRESSIVE and %NONBREAKING_PREFIX.
+sub tokenize {
+ my($text) = @_;
+ chomp($text);
+ $text = " $text ";   # pad so the two-sided patterns below also apply at the line edges
+
+ # separate out all "other" special characters
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+
+ # aggressive hyphen splitting: a hyphen between two alphanumerics becomes " @-@ "
+ if ($AGGRESSIVE) {
+ $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+ }
+
+ #multi-dots stay together: a run of dots is swapped for a placeholder with one "DOT" per dot, ending in "MULTI"
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+ while($text =~ /DOTMULTI\./) {
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+ }
+
+ # separate out "," except if within numbers (5,300)
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ # separate , pre and post number
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+ # turn ` into '
+ $text =~ s/\`/\'/g;
+
+ #turn '' into "
+ $text =~ s/\'\'/ \" /g;
+
+ if ($language eq "en") {
+ #split contractions right: between two letters the apostrophe stays with what follows (don 't)
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+ #special case for "1990's"
+ $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+ } elsif (($language eq "fr") or ($language eq "it")) {
+ #split contractions left: between two letters the apostrophe stays with what precedes (l' amico)
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+ } else {
+ $text =~ s/\'/ \' /g;   # any other language: isolate every apostrophe
+ }
+
+ #word token method: decide word by word whether a final period is split off
+ my @words = split(/\s/,$text);
+ $text = "";
+ for (my $i=0;$i<(scalar(@words));$i++) {
+ my $word = $words[$i];
+ if ( $word =~ /^(\S+)\.$/) {
+ my $pre = $1;
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
+ #no change: inner period plus a letter, a type-1 prefix, or the next word starts lower-case
+ } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
+ #no change: type-2 (numeric-only) prefix and the next word starts with a digit
+ } else {
+ $word = $pre." .";
+ }
+ }
+ $text .= $word." ";
+ }
+
+ # clean up extraneous spaces
+ $text =~ s/ +/ /g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
+
+ #restore multi-dots: each "DOT" in a placeholder turns back into one "."
+ while($text =~ /DOTDOTMULTI/) {
+ $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+ }
+ $text =~ s/DOTMULTI/./g;
+
+ #ensure final line break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ return $text;
+}
+
+# Load the non-breaking prefix list for $language into %$PREFIX_REF: plain
+# entries map to 1, entries tagged "#NUMERIC_ONLY#" map to 2.  Falls back to the
+# English list (dying if that is missing too); an unreadable file only warns.
+sub load_prefixes {
+    my ($language, $PREFIX_REF) = @_;
+    my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+    #default back to English if we don't have a language-specific prefix file
+    if (!(-e $prefixfile)) {
+        $prefixfile = "$mydir/nonbreaking_prefix.en";
+        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+    }
+
+    open(my $prefix_fh, "<:utf8", $prefixfile) or do {
+        print STDERR "WARNING: Cannot read $prefixfile: $!\n";
+        return;
+    };
+    while (my $item = <$prefix_fh>) {
+        chomp($item);
+        next if ($item eq "") || (substr($item,0,1) eq "#");    # blank or comment; a literal "0" entry is kept
+        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
+            $PREFIX_REF->{$1} = 2;
+        } else {
+            $PREFIX_REF->{$item} = 1;
+        }
+    }
+    close($prefix_fh);
+}
+