Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPidong Wang <wangpd@comp.nus.edu.sg>2012-06-30 19:04:09 +0400
committerPidong Wang <wangpd@comp.nus.edu.sg>2012-06-30 19:04:09 +0400
commiteecefbede6b9fa7b23190711de67560b552f082a (patch)
treec08ca4438e85971ac7d82c8c66710b6656a13626 /scripts/tokenizer
parent3a44cd3f69ffb1c3917acab570d223cdf7fa4770 (diff)
add multi-threading feature to the tokenizer.perl
Diffstat (limited to 'scripts/tokenizer')
-rwxr-xr-xscripts/tokenizer/tokenizer.perl432
1 files changed, 292 insertions, 140 deletions
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index b23628b65..f59cd5f86 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -1,7 +1,14 @@
#!/usr/bin/perl -w
-# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines given to each thread (default is 2000); this option controls the amount of memory needed: the larger the number, the more memory is required (and the higher the tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
# written by Josh Schroeder, based on code by Philipp Koehn
binmode(STDIN, ":utf8");
@@ -9,7 +16,8 @@ binmode(STDOUT, ":utf8");
use FindBin qw($RealBin);
use strict;
-#use Time::HiRes;
+use Time::HiRes;
+use Thread;
my $mydir = "$RealBin/../share/nonbreaking_prefixes";
@@ -19,10 +27,12 @@ my $QUIET = 0;
my $HELP = 0;
my $AGGRESSIVE = 0;
my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
-#my $start = [ Time::HiRes::gettimeofday( ) ];
-
-while (@ARGV) {
+while (@ARGV)
+{
$_ = shift;
/^-b$/ && ($| = 1, next);
/^-l$/ && ($language = shift, next);
@@ -30,167 +40,309 @@ while (@ARGV) {
/^-h$/ && ($HELP = 1, next);
/^-x$/ && ($SKIP_XML = 1, next);
/^-a$/ && ($AGGRESSIVE = 1, next);
+ /^-time$/ && ($TIMING = 1, next);
+ /^-threads$/ && ($NUM_THREADS = int(shift), next);
+ /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+}
+
+# for time calculation
+my $start_time;
+if ($TIMING)
+{
+ $start_time = [ Time::HiRes::gettimeofday( ) ];
}
-if ($HELP) {
- print "Usage ./tokenizer.perl (-l [en|de|...]) < textfile > tokenizedfile\n";
+# print help message
+if ($HELP)
+{
+ print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
print "Options:\n";
- print " -q ... quiet.\n";
- print " -a ... aggressive hyphen splitting.\n";
- print " -b ... disable Perl buffering.\n";
+ print " -q ... quiet.\n";
+ print " -a ... aggressive hyphen splitting.\n";
+ print " -b ... disable Perl buffering.\n";
+ print " -time ... enable processing time calculation.\n";
exit;
}
-if (!$QUIET) {
- print STDERR "Tokenizer Version 1.0\n";
+
+if (!$QUIET)
+{
+ print STDERR "Tokenizer Version 1.1\n";
print STDERR "Language: $language\n";
+ print STDERR "Number of threads: $NUM_THREADS\n";
}
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
load_prefixes($language,\%NONBREAKING_PREFIX);
-if (scalar(%NONBREAKING_PREFIX) eq 0){
+if (scalar(%NONBREAKING_PREFIX) eq 0)
+{
print STDERR "Warning: No known abbreviations for language '$language'\n";
}
-while(<STDIN>) {
- if (($SKIP_XML && /^<.+>$/) || /^\s*$/) {
- #don't try to tokenize XML/HTML tag lines
- print $_;
- }
- else {
- print &tokenize($_);
- }
-}
+my @batch_sentences = ();
+my @thread_list = ();
+my $count_sentences = 0;
-#my $duration = Time::HiRes::tv_interval( $start );
-#print STDERR ("EXECUTION TIME: ".$duration."\n");
+# Main driver: read STDIN, tokenize it (in parallel when -threads > 1) and
+# print the result to STDOUT in input order; optionally report timing.
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+    # A -lines value below 1 would create empty slices and then clear the
+    # batch, silently discarding the input, so fall back to one line per thread.
+    my $lines_per_thread = $NUM_SENTENCES_PER_THREAD;
+    if ($lines_per_thread < 1)
+    {
+        print STDERR "Warning: -lines must be at least 1, using 1 instead\n";
+        $lines_per_thread = 1;
+    }
+    my $batch_size = $lines_per_thread*$NUM_THREADS;
+
+    while(<STDIN>)
+    {
+        $count_sentences++;
+        push(@batch_sentences, $_);
+        if (scalar(@batch_sentences) >= $batch_size)
+        {
+            tokenize_batch_in_threads(\@batch_sentences, $lines_per_thread);
+            # reset for the new run
+            @batch_sentences = ();
+        }
+    }
+    # the last (possibly partial) batch
+    if (scalar(@batch_sentences) > 0)
+    {
+        tokenize_batch_in_threads(\@batch_sentences, $lines_per_thread);
+    }
+}
+else
+{# single thread only
+    while(<STDIN>)
+    {
+        # count lines here as well, so that -time works without -threads
+        $count_sentences++;
+        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+        {
+            #don't try to tokenize XML/HTML tag lines
+            print $_;
+        }
+        else
+        {
+            print tokenize($_);
+        }
+    }
+}
+
+# report the timing statistics requested with -time
+if ($TIMING)
+{
+    my $duration = Time::HiRes::tv_interval( $start_time );
+    print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+    if ($count_sentences > 0)
+    {
+        print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
+    }
+    else
+    {
+        # empty input: avoid dividing by zero
+        print STDERR ("TOKENIZATION SPEED: n/a (no input lines)\n");
+    }
+}
+
+# tokenize one batch of lines in parallel and print the results in input order
+# input: a reference to the array of lines, and the number of lines per thread
+# return: nothing; the tokenized lines are printed to STDOUT
+sub tokenize_batch_in_threads
+{
+    my ($batch_ref, $lines_per_thread) = @_;
+    my $last_index = scalar(@$batch_ref)-1;
+
+    # assign each thread a contiguous slice of the batch
+    for (my $start_index=0; $start_index<=$last_index; $start_index+=$lines_per_thread)
+    {
+        my $end_index = $start_index+$lines_per_thread-1;
+        if ($end_index > $last_index)
+        {
+            $end_index = $last_index;
+        }
+        # create the thread in scalar context: tokenize_batch returns a single
+        # array reference, which join is expected to hand back below
+        my $new_thread = Thread->new(\&tokenize_batch, @{$batch_ref}[$start_index..$end_index]);
+        push(@thread_list, $new_thread);
+    }
+
+    # join in creation order so that the output order matches the input order
+    foreach my $thread (@thread_list)
+    {
+        my $tokenized_list = $thread->join;
+        # a missing result would silently drop lines from the output
+        die ("ERROR: a tokenizer thread did not return a result\n") unless (defined $tokenized_list);
+        print @$tokenized_list;
+    }
+    @thread_list = ();
+}
-sub tokenize {
- my($text) = @_;
- chomp($text);
- $text = " $text ";
-
- # remove ASCII junk
- $text =~ s/\s+/ /g;
- $text =~ s/[\000-\037]//g;
+#####################################################################################
+# subroutines afterward
- # seperate out all "other" special characters
- $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
-
- # aggressive hyphen splitting
- if ($AGGRESSIVE) {
- $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+# tokenize every line of a batch, keeping the input order
+# input: an array containing a batch of texts
+# return: a reference to another array containing the tokenized text for each input text
+sub tokenize_batch
+{
+    my @lines = @_;
+    my @tokenized_lines = ();
+    foreach my $line (@lines)
+    {
+        # blank lines, and XML/HTML tag lines when -x is given, are passed through untouched
+        my $keep_as_is = ($SKIP_XML && $line =~ /^<.+>$/) || $line =~ /^\s*$/;
+        push(@tokenized_lines, $keep_as_is ? $line : tokenize($line));
+    }
+    return \@tokenized_lines;
+}
+
+
+# the actual tokenize function which tokenizes one input string
+# input: one string
+# return: the tokenized string for the input string
+sub tokenize
+{
+ my($text) = @_;
+ chomp($text);
+ $text = " $text ";
+
+ # remove ASCII junk
+ $text =~ s/\s+/ /g;
+ $text =~ s/[\000-\037]//g;
- #multi-dots stay together
- $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
- while($text =~ /DOTMULTI\./) {
- $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
- $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
- }
-
- # seperate out "," except if within numbers (5,300)
- $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
- # separate , pre and post number
- $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
- $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+ # separate out all "other" special characters
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+
+ # aggressive hyphen splitting
+ if ($AGGRESSIVE)
+ {
+ $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+ }
+
+ #multi-dots stay together
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+ while($text =~ /DOTMULTI\./)
+ {
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+ }
+
+ # separate out "," except if within numbers (5,300)
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ # separate , pre and post number
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
- # turn `into '
- $text =~ s/\`/\'/g;
+ # turn ` into '
+ $text =~ s/\`/\'/g;
- #turn '' into "
- $text =~ s/\'\'/ \" /g;
-
- if ($language eq "en") {
- #split contractions right
- $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
- #special case for "1990's"
- $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
- } elsif (($language eq "fr") or ($language eq "it")) {
- #split contractions left
- $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
- $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
- } else {
- $text =~ s/\'/ \' /g;
- }
+ #turn '' into "
+ $text =~ s/\'\'/ \" /g;
+
+ if ($language eq "en")
+ {
+ #split contractions right
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+ #special case for "1990's"
+ $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+ }
+ elsif (($language eq "fr") or ($language eq "it"))
+ {
+ #split contractions left
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+ }
+ else
+ {
+ $text =~ s/\'/ \' /g;
+ }
- #word token method
- my @words = split(/\s/,$text);
- $text = "";
- for (my $i=0;$i<(scalar(@words));$i++) {
- my $word = $words[$i];
- if ( $word =~ /^(\S+)\.$/) {
- my $pre = $1;
- if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) {
- #no change
- } elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) {
- #no change
- } else {
- $word = $pre." .";
- }
- }
- $text .= $word." ";
- }
-
- # clean up extraneous spaces
- $text =~ s/ +/ /g;
- $text =~ s/^ //g;
- $text =~ s/ $//g;
-
- #restore multi-dots
- while($text =~ /DOTDOTMULTI/) {
- $text =~ s/DOTDOTMULTI/DOTMULTI./g;
- }
- $text =~ s/DOTMULTI/./g;
-
- #escape special chars
- $text =~ s/\&/\&amp;/g; # escape escape
- $text =~ s/\|/\&#124;/g; # factor separator
- $text =~ s/\</\&lt;/g; # xml
- $text =~ s/\>/\&gt;/g; # xml
- $text =~ s/\'/\&apos;/g; # xml
- $text =~ s/\"/\&quot;/g; # xml
- $text =~ s/\[/\&#91;/g; # syntax non-terminal
- $text =~ s/\]/\&#93;/g; # syntax non-terminal
-
- #ensure final line break
- $text .= "\n" unless $text =~ /\n$/;
-
- return $text;
+ #word token method
+ my @words = split(/\s/,$text);
+ $text = "";
+ for (my $i=0;$i<(scalar(@words));$i++)
+ {
+ my $word = $words[$i];
+ if ( $word =~ /^(\S+)\.$/)
+ {
+ my $pre = $1;
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+ {
+ #no change
+ }
+ elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+ {
+ #no change
+ }
+ else
+ {
+ $word = $pre." .";
+ }
+ }
+ $text .= $word." ";
+ }
+
+ # clean up extraneous spaces
+ $text =~ s/ +/ /g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
+
+ #restore multi-dots
+ while($text =~ /DOTDOTMULTI/)
+ {
+ $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+ }
+ $text =~ s/DOTMULTI/./g;
+
+ #escape special chars
+ $text =~ s/\&/\&amp;/g; # escape escape
+ $text =~ s/\|/\&#124;/g; # factor separator
+ $text =~ s/\</\&lt;/g; # xml
+ $text =~ s/\>/\&gt;/g; # xml
+ $text =~ s/\'/\&apos;/g; # xml
+ $text =~ s/\"/\&quot;/g; # xml
+ $text =~ s/\[/\&#91;/g; # syntax non-terminal
+ $text =~ s/\]/\&#93;/g; # syntax non-terminal
+
+ #ensure final line break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ return $text;
}
-sub load_prefixes {
- my ($language, $PREFIX_REF) = @_;
+sub load_prefixes
+{
+ my ($language, $PREFIX_REF) = @_;
- my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+ my $prefixfile = "$mydir/nonbreaking_prefix.$language";
- #default back to English if we don't have a language-specific prefix file
- if (!(-e $prefixfile)) {
- $prefixfile = "$mydir/nonbreaking_prefix.en";
- print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
- die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
- }
-
- if (-e "$prefixfile") {
- open(PREFIX, "<:utf8", "$prefixfile");
- while (<PREFIX>) {
- my $item = $_;
- chomp($item);
- if (($item) && (substr($item,0,1) ne "#")) {
- if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) {
- $PREFIX_REF->{$1} = 2;
- } else {
- $PREFIX_REF->{$item} = 1;
- }
- }
- }
- close(PREFIX);
- }
+ #default back to English if we don't have a language-specific prefix file
+ if (!(-e $prefixfile))
+ {
+ $prefixfile = "$mydir/nonbreaking_prefix.en";
+ print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+ die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+ }
+ if (-e "$prefixfile")
+ {
+ open(PREFIX, "<:utf8", "$prefixfile");
+ while (<PREFIX>)
+ {
+ my $item = $_;
+ chomp($item);
+ if (($item) && (substr($item,0,1) ne "#"))
+ {
+ if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+ {
+ $PREFIX_REF->{$1} = 2;
+ }
+ else
+ {
+ $PREFIX_REF->{$item} = 1;
+ }
+ }
+ }
+ close(PREFIX);
+ }
}