Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-30 03:58:18 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-30 03:58:18 +0400
commit: 2e370ed11b0cd8989118891dc4385619837dd39f (patch)
tree: e0b8ab423399453a90bc4cacabc323289620042c /scripts/tokenizer
parent: fd577d7a65cab923b9102d61873a032654d573a1 (diff)
more escaping in tokenizer; wrapper for berkeley parser (german)
Diffstat (limited to 'scripts/tokenizer')
-rwxr-xr-x scripts/tokenizer/deescape-special-chars.perl 18
-rwxr-xr-x scripts/tokenizer/detokenizer.perl 18
-rwxr-xr-x scripts/tokenizer/escape-special-chars.perl 14
-rwxr-xr-x scripts/tokenizer/tokenizer.perl 14
4 files changed, 36 insertions, 28 deletions
diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl
index 55035ae6d..345555990 100755
--- a/scripts/tokenizer/deescape-special-chars.perl
+++ b/scripts/tokenizer/deescape-special-chars.perl
@@ -3,13 +3,15 @@
use strict;
while(<STDIN>) {
- s/\&bar;/\|/g;
- s/\&lt;/\</g;
- s/\&gt;/\>/g;
- s/\&bra;/\[/g;
- s/\&ket;/\]/g;
- s/\&#91;/\[/g;
- s/\&#93;/\]/g;
- s/\&amp;/\&/g;
+ s/\&bar;/\|/g; # factor separator
+ s/\&lt;/\</g; # xml
+ s/\&gt;/\>/g; # xml
+ s/\&bra;/\[/g; # syntax non-terminal (legacy)
+ s/\&ket;/\]/g; # syntax non-terminal (legacy)
+ s/\&quot;/\"/g; # xml
+ s/\&apos;/\'/g; # xml
+ s/\&#91;/\[/g; # syntax non-terminal
+ s/\&#93;/\]/g; # syntax non-terminal
+ s/\&amp;/\&/g; # escape escape
print $_;
}
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index e55a1a26e..8233b419c 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -66,14 +66,16 @@ sub detokenize {
$text = " $text ";
$text =~ s/ \@\-\@ /-/g;
# de-escape special chars
- $text =~ s/\&bar;/\|/g;
- $text =~ s/\&lt;/\</g;
- $text =~ s/\&gt;/\>/g;
- $text =~ s/\&bra;/\[/g;
- $text =~ s/\&ket;/\]/g;
- $text =~ s/\&#91;/\[/g;
- $text =~ s/\&#93;/\]/g;
- $text =~ s/\&amp;/\&/g;
+ $text =~ s/\&bar;/\|/g; # factor separator
+ $text =~ s/\&lt;/\</g; # xml
+ $text =~ s/\&gt;/\>/g; # xml
+ $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy)
+ $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy)
+ $text =~ s/\&quot;/\"/g; # xml
+ $text =~ s/\&apos;/\'/g; # xml
+ $text =~ s/\&#91;/\[/g; # syntax non-terminal
+ $text =~ s/\&#93;/\]/g; # syntax non-terminal
+ $text =~ s/\&amp;/\&/g; # escape escape
my $word;
my $i;
diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl
index f4c1b4dd5..5d9690c04 100755
--- a/scripts/tokenizer/escape-special-chars.perl
+++ b/scripts/tokenizer/escape-special-chars.perl
@@ -12,12 +12,14 @@ while(<STDIN>) {
s/ $//g;
# special characters in moses
- s/\&/\&amp;/g;
- s/\|/\&bar;/g;
- s/\</\&lt;/g;
- s/\>/\&gt;/g;
- s/\[/\&#91;/g;
- s/\]/\&#93;/g;
+ s/\&/\&amp;/g; # escape escape
+ s/\|/\&bar;/g; # factor separator
+ s/\</\&lt;/g; # xml
+ s/\>/\&gt;/g; # xml
+ s/\'/\&apos;/g; # xml
+ s/\"/\&quot;/g; # xml
+ s/\[/\&#91;/g; # syntax non-terminal
+ s/\]/\&#93;/g; # syntax non-terminal
# restore xml instructions
s/\&lt;(\S+) translation="([^\"]+)"&gt; (.+?) &lt;\/(\S+)&gt;/\<$1 translation=\"$2\"> $3 <\/$4>/g;
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 70bb318f7..0cb713740 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -149,12 +149,14 @@ sub tokenize {
$text =~ s/DOTMULTI/./g;
#escape special chars
- $text =~ s/\&/\&amp;/g;
- $text =~ s/\|/\&bar;/g;
- $text =~ s/\</\&lt;/g;
- $text =~ s/\>/\&gt;/g;
- $text =~ s/\[/\&#91;/g;
- $text =~ s/\]/\&#93;/g;
+ $text =~ s/\&/\&amp;/g; # escape escape
+ $text =~ s/\|/\&bar;/g; # factor separator
+ $text =~ s/\</\&lt;/g; # xml
+ $text =~ s/\>/\&gt;/g; # xml
+ $text =~ s/\'/\&apos;/g; # xml
+ $text =~ s/\"/\&quot;/g; # xml
+ $text =~ s/\[/\&#91;/g; # syntax non-terminal
+ $text =~ s/\]/\&#93;/g; # syntax non-terminal
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;