Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author phikoehn <pkoehn@inf.ed.ac.uk> 2012-03-23 11:17:08 +0400
committer phikoehn <pkoehn@inf.ed.ac.uk> 2012-03-23 11:17:08 +0400
commit 4d0fc996baa806172455177a435dc529dd07a21b (patch)
tree e564c659314d1791672a6d44a6a80258f29a7d7b /scripts/tokenizer
parent 1794bccd9086504e27f0b9222d0f237af6c264c1 (diff)
bug fix to filter hierarchical
Diffstat (limited to 'scripts/tokenizer')
-rwxr-xr-x scripts/tokenizer/detokenizer.perl 2
-rwxr-xr-x scripts/tokenizer/tokenizer.perl 6
2 files changed, 8 insertions, 0 deletions
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index 6f23ab9c6..e2d7ea0bb 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -68,6 +68,8 @@ sub detokenize {
$text =~ s/\&bar;/\|/g;
$text =~ s/\&lt;/\</g;
$text =~ s/\&gt;/\>/g;
+ $text =~ s/\&bra;/\[/g;
+ $text =~ s/\&ket;/\]/g;
$text =~ s/\&amp;/\&/g;
my $word;
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 7158c417a..a97d5e160 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -70,6 +70,10 @@ sub tokenize {
chomp($text);
$text = " $text ";
+ # remove ASCII junk
+ $text =~ s/\s+/ /g;
+ $text =~ s/[\000-\037]//g;
+
# seperate out all "other" special characters
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
@@ -149,6 +153,8 @@ sub tokenize {
$text =~ s/\|/\&bar;/g;
$text =~ s/\</\&lt;/g;
$text =~ s/\>/\&gt;/g;
+ $text =~ s/\[/\&bra;/g;
+ $text =~ s/\]/\&ket;/g;
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;