official release of experiment.perl

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3224 1f5c12ca-751b-0410-a591-d2e778427230
author: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-05-05 03:04:10 +0400
committer: phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2010-05-05 03:04:10 +0400
commit: 2ed6804f12e6703610fb4ad40567de7cd9f17348 (patch)
tree: 5a207145f8d07173018043a28b06709269c46301 /scripts/tokenizer
parent: 0118733a45ad8f9513187717f19289bbc1c7ba53 (diff)
6 files changed, 829 insertions, 0 deletions
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
new file mode 100755
index 000000000..5eac00077
--- /dev/null
+++ b/scripts/tokenizer/detokenizer.perl
@@ -0,0 +1,111 @@
+#!/usr/bin/perl -w
+
+# $Id: detokenizer.perl 928 2009-09-02 02:58:01Z philipp $
+# Sample De-Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+use strict;
+
+# Run-time settings; each can be overridden from the command line below.
+my ($language, $QUIET, $HELP) = ("en", 0, 0);
+
+# -l LANG picks the language, -q silences the banner, -h asks for the usage
+# line; any other argument is dropped without a message.
+while (@ARGV) {
+	my $arg = shift;
+	if    ($arg =~ /^-l$/) { $language = shift; }
+	elsif ($arg =~ /^-q$/) { $QUIET = 1; }
+	elsif ($arg =~ /^-h$/) { $HELP = 1; }
+}
+
+if ($HELP) {
+	print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n";
+	exit;
+}
+unless ($QUIET) {
+	print STDERR "Detokenizer Version 1.0\n";
+	print STDERR "Language: $language\n";
+}
+
+while (my $line = <STDIN>) {
+	if ($line !~ /^<.+>$/ && $line !~ /^\s*$/) {
+		print detokenize($line);
+		next;
+	}
+	#XML/HTML tag lines and blank lines are copied through unchanged
+	print $line;
+}
+
+# Re-attach the space-separated tokens of one line into natural text.
+# $text: one tokenized line (trailing newline optional); the file-level $language
+# selects the contraction rules.  Returns the detokenized line ending in "\n".
+sub detokenize {
+	my($text) = @_;
+	chomp($text);
+	$text = " $text ";
+	$text =~ s/ \@\-\@ /-/g;	# turn the " @-@ " marker back into a plain hyphen
+	
+	my $i;
+	my @words = split(/ /,$text);
+	$text = "";
+	my %quoteCount =  ("\'"=>0,"\""=>0);
+	my $prependSpace = " ";	# separator to emit before the next token
+	for ($i=0;$i<(scalar(@words));$i++) {
+		if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+			#perform right shift on currency and other random punctuation items
+			$text = $text.$prependSpace.$words[$i];
+			$prependSpace = "";
+		} elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
+			#perform left shift on punctuation items
+			$text=$text.$words[$i];
+			$prependSpace = " ";
+		} elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+			#left-shift the contraction for English
+			$text=$text.$words[$i];
+			$prependSpace = " ";
+		}  elsif ((($language eq "fr") ||($language eq "it")) && ($i<$#words) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
+			#right-shift the contraction for French and Italian (needs only one following token)
+			$text = $text.$prependSpace.$words[$i];
+			$prependSpace = "";
+		} elsif ($words[$i] =~ /^[\'\"]+$/) {
+			#combine punctuation smartly; a multi-character quote token gets its own counter
+			$quoteCount{$words[$i]} = 0 unless exists $quoteCount{$words[$i]};
+			if (($quoteCount{$words[$i]} % 2) == 0) {
+				if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
+					#single quote for possessives ending in s... "The Jones' house"
+					#left shift; deliberately not counted as an opening quote
+					$text=$text.$words[$i];
+					$prependSpace = " ";
+				} else {
+					#right shift (even count: treated as an opening quote)
+					$text = $text.$prependSpace.$words[$i];
+					$prependSpace = "";
+					$quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+				}
+			} else {
+				#left shift (odd count: treated as a closing quote)
+				$text=$text.$words[$i];
+				$prependSpace = " ";
+				$quoteCount{$words[$i]} = $quoteCount{$words[$i]} + 1;
+			}
+		} else {
+			$text=$text.$prependSpace.$words[$i];
+			$prependSpace = " ";
+		}
+	}
+	
+	# clean up spaces at head and tail of each line as well as any double-spacing
+	$text =~ s/ +/ /g;
+	$text =~ s/\n /\n/g;
+	$text =~ s/ \n/\n/g;
+	$text =~ s/^ //g;
+	$text =~ s/ $//g;
+	
+	#add trailing break
+	$text .= "\n" unless $text =~ /\n$/;
+
+	return $text;
+}
+
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de
new file mode 100644
index 000000000..35fdf5eee
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de
@@ -0,0 +1,325 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+#no german words end in single lower-case letters, so we throw those in too.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in German.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#Titles and Honorifics
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Ens
+Gen
+Gov
+Hon
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#Misc symbols
+Mio
+Mrd
+bzw
+v
+vs
+usw
+d.h
+z.B
+u.a
+etc
+Mrd
+MwSt
+ggf
+d.J
+D.h
+m.E
+vgl
+I.F
+z.T
+sogen
+ff
+u.E
+g.U
+g.g.A
+c.-à-d
+Buchst
+u.s.w
+sog
+u.ä
+Std
+evtl
+Zt
+Chr
+u.U
+o.ä
+Ltd
+b.A
+z.Zt
+spp
+sen
+SA
+k.o
+jun
+i.H.v
+dgl
+dergl
+Co
+zzt
+usf
+s.p.a
+Dkr
+Corp
+bzgl
+BSE
+
+#Number indicators
+# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
+No
+Nos
+Art
+Nr
+pp
+ca
+Ca
+
+#Ordinals are done with . in German - "1." = "1st" in English
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el
new file mode 100644
index 000000000..0470f9192
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el
@@ -0,0 +1,2 @@
+# for now, just include the Greek equivalent of "Mr."
+κ
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en
new file mode 100644
index 000000000..e1a3733b5
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en
@@ -0,0 +1,107 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These should only be treated as non-breaking when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it
new file mode 100644
index 000000000..ad4d355c7
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it
@@ -0,0 +1,109 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+dott
+Sig
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These should only be treated as non-breaking when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
new file mode 100755
index 000000000..54be11644
--- /dev/null
+++ b/scripts/tokenizer/tokenizer.perl
@@ -0,0 +1,175 @@
+#!/usr/bin/perl -w
+
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# Sample Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use FindBin qw($Bin);
+use strict;
+#use Time::HiRes;
+
+# The prefix lists are looked up in a directory next to this script.
+my $mydir = "$Bin/nonbreaking_prefixes";
+
+# Run-time settings; %NONBREAKING_PREFIX is filled by load_prefixes() below.
+my %NONBREAKING_PREFIX = ();
+my ($language, $QUIET, $HELP, $AGGRESSIVE) = ("en", 0, 0, 0);
+
+#my $start = [ Time::HiRes::gettimeofday( ) ];
+
+# Flags: -l LANG, -q (no banner), -h (usage), -a (aggressive hyphen splitting); anything else is dropped.
+while (@ARGV) {
+	my $arg = shift;
+	if    ($arg =~ /^-l$/) { $language = shift; }
+	elsif ($arg =~ /^-q$/) { $QUIET = 1; }
+	elsif ($arg =~ /^-h$/) { $HELP = 1; }
+	elsif ($arg =~ /^-a$/) { $AGGRESSIVE = 1; }
+}
+
+if ($HELP) {
+	print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
+	exit;
+}
+unless ($QUIET) {
+	print STDERR "Tokenizer Version 1.0\n";
+	print STDERR "Language: $language\n";
+}
+
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+# an empty hash means load_prefixes() stored no entries at all
+unless (%NONBREAKING_PREFIX) {
+	print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+while (my $line = <STDIN>) {
+	if ($line !~ /^<.+>$/ && $line !~ /^\s*$/) {
+		print tokenize($line);
+		next;
+	}
+	#XML/HTML tag lines and blank lines are copied through unchanged
+	print $line;
+}
+
+#my $duration = Time::HiRes::tv_interval( $start );
+#print STDERR ("EXECUTION TIME: ".$duration."\n");
+
+# Split one line of text into space-separated tokens (reads the file-level $AGGRESSIVE, $language and %NONBREAKING_PREFIX); returns it with a final "\n".
+sub tokenize {
+	my($text) = @_;
+	chomp($text);
+	$text = " $text ";
+	
+	# separate out all "other" special characters (anything except alphanumerics, whitespace and . ' ` , -)
+	$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+	
+	# aggressive hyphen splitting: a hyphen between two alphanumerics becomes " @-@ "
+        if ($AGGRESSIVE) {
+	   $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+        }
+
+	#multi-dots stay together: runs of dots are encoded as DOTMULTI/DOTDOTMULTI placeholders until the end of the sub
+	$text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+	while($text =~ /DOTMULTI\./) {
+		$text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+		$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+	}
+
+	# separate out "," except if within numbers (5,300)
+	$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+	# separate , pre and post number
+	$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+	$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+	      
+	# turn ` into '
+	$text =~ s/\`/\'/g;
+	
+	#turn '' into "
+	$text =~ s/\'\'/ \" /g;
+
+	if ($language eq "en") {
+		#split contractions right: between two letters the apostrophe stays with the following part (x 'y)
+		$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+		#special case for "1990's"
+		$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+	} elsif (($language eq "fr") or ($language eq "it")) {
+		#split contractions left: between two letters the apostrophe stays with the preceding part (x' y)
+		$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+		$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+	} else {
+		$text =~ s/\'/ \' /g;
+	}
+	
+	#word token method: decide for each whitespace-separated word whether a final period is split off
+	my @words = split(/\s/,$text);
+	$text = "";
+	for (my $i=0;$i<(scalar(@words));$i++) {
+		my $word = $words[$i];
+		if ( $word =~ /^(\S+)\.$/) {
+			my $pre = $1;
+			if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
+				#no change: $pre has an inner dot plus a letter, or is a prefix stored with value 1, or a lowercase word follows
+			} elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
+				#no change: prefix stored with value 2 and the next word starts with a digit
+			} else {
+				$word = $pre." .";
+			}
+		}
+		$text .= $word." ";
+	}		
+
+	# clean up extraneous spaces
+	$text =~ s/ +/ /g;
+	$text =~ s/^ //g;
+	$text =~ s/ $//g;
+
+	#restore multi-dots: each DOTDOTMULTI yields one dot, the remaining DOTMULTI the last one
+	while($text =~ /DOTDOTMULTI/) {
+		$text =~ s/DOTDOTMULTI/DOTMULTI./g;
+	}
+	$text =~ s/DOTMULTI/./g;
+	
+	#ensure final line break
+	$text .= "\n" unless $text =~ /\n$/;
+
+	return $text;
+}
+
+# Fill the hash behind $PREFIX_REF from nonbreaking_prefix.$language in $mydir (English as
+# fall-back; dies if that is missing too).  Plain entries get value 1, entries tagged
+# #NUMERIC_ONLY# get value 2.  A file that cannot be opened only warns; the hash stays as is.
+sub load_prefixes {
+	my ($language, $PREFIX_REF) = @_;
+	my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+	#default back to English if we don't have a language-specific prefix file
+	if (!(-e $prefixfile)) {
+		$prefixfile = "$mydir/nonbreaking_prefix.en";
+		print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+		die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+	}
+	
+	my $PREFIX;	# lexical handle instead of a global bareword
+	if (!open($PREFIX, "<:utf8", $prefixfile)) {
+		print STDERR "WARNING: Cannot read $prefixfile: $!\n";
+		return;
+	}
+	while (my $item = <$PREFIX>) {
+		$item =~ s/\s+$//;	# drops the newline plus any trailing blanks or CR
+		next if ($item eq "") || (substr($item,0,1) eq "#");
+		if ($item =~ /^(.*?)\s+\#NUMERIC_ONLY\#/) {	# non-greedy: no blanks end up in the key
+			$PREFIX_REF->{$1} = 2;
+		} else {
+			$PREFIX_REF->{$item} = 1;
+		}
+	}
+	close($PREFIX);
+}
+
author	phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>	2010-05-05 03:04:10 +0400
committer	phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230>	2010-05-05 03:04:10 +0400
commit	2ed6804f12e6703610fb4ad40567de7cd9f17348 (patch)
tree	5a207145f8d07173018043a28b06709269c46301 /scripts/tokenizer
parent	0118733a45ad8f9513187717f19289bbc1c7ba53 (diff)