Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: phikoehn <pkoehn@inf.ed.ac.uk> 2012-03-20 08:57:37 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2012-03-20 08:57:37 +0400
commit: 791b5a7676ba1b0604b0dfe46a837cf96c643d08 (patch)
tree: 081961b2ff508cd470449af08ea4c6a13616559f /scripts/tokenizer
parent: 2fdb47fe67a0bcca8cebdb3f86c2f2a5d7fb2055 (diff)
lotsa minor changes: mostly bug fixes, tokenizer now escapes special Moses characters (|<>&)
Diffstat (limited to 'scripts/tokenizer')
-rwxr-xr-x scripts/tokenizer/detokenizer.perl 9
-rwxr-xr-x scripts/tokenizer/tokenizer.perl 12
2 files changed, 17 insertions, 4 deletions
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index d8d6e5da7..6f23ab9c6 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -63,8 +63,13 @@ sub detokenize {
my($text) = @_;
chomp($text);
$text = " $text ";
- $text =~ s/ \@\-\@ /-/g;
-
+ $text =~ s/ \@\-\@ /-/g;
+ # de-escape special chars
+ $text =~ s/\&bar;/\|/g;
+ $text =~ s/\&lt;/\</g;
+ $text =~ s/\&gt;/\>/g;
+ $text =~ s/\&amp;/\&/g;
+
my $word;
my $i;
my @words = split(/ /,$text);
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 84fdc3462..7158c417a 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -18,6 +18,7 @@ my $language = "en";
my $QUIET = 0;
my $HELP = 0;
my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
#my $start = [ Time::HiRes::gettimeofday( ) ];
@@ -27,6 +28,7 @@ while (@ARGV) {
/^-l$/ && ($language = shift, next);
/^-q$/ && ($QUIET = 1, next);
/^-h$/ && ($HELP = 1, next);
+ /^-x$/ && ($SKIP_XML = 1, next);
/^-a$/ && ($AGGRESSIVE = 1, next);
}
@@ -50,7 +52,7 @@ if (scalar(%NONBREAKING_PREFIX) eq 0){
}
while(<STDIN>) {
- if (/^<.+>$/ || /^\s*$/) {
+ if (($SKIP_XML && /^<.+>$/) || /^\s*$/) {
#don't try to tokenize XML/HTML tag lines
print $_;
}
@@ -141,7 +143,13 @@ sub tokenize {
$text =~ s/DOTDOTMULTI/DOTMULTI./g;
}
$text =~ s/DOTMULTI/./g;
-
+
+ #escape special chars
+ $text =~ s/\&/\&amp;/g;
+ $text =~ s/\|/\&bar;/g;
+ $text =~ s/\</\&lt;/g;
+ $text =~ s/\>/\&gt;/g;
+
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;