Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-26 03:09:50 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-26 03:09:50 +0400
commit: 561b9ac9567d3e5b0bbc56fdae3b29961b8bc728 (patch)
tree: 2d65e7f4b0a4beec227a3ee173d120edaa3ae3d6 /scripts/tokenizer
parent: 180dd773f6507829c551c5512aaad7128f958385 (diff)
minor fixes
Diffstat (limited to 'scripts/tokenizer')
-rwxr-xr-x scripts/tokenizer/deescape-special-chars.perl 2
-rwxr-xr-x scripts/tokenizer/detokenizer.perl 7
-rwxr-xr-x scripts/tokenizer/escape-special-chars.perl 8
-rwxr-xr-x scripts/tokenizer/tokenizer.perl 4
4 files changed, 14 insertions, 7 deletions
diff --git a/scripts/tokenizer/deescape-special-chars.perl b/scripts/tokenizer/deescape-special-chars.perl
index c98e01ccc..55035ae6d 100755
--- a/scripts/tokenizer/deescape-special-chars.perl
+++ b/scripts/tokenizer/deescape-special-chars.perl
@@ -8,6 +8,8 @@ while(<STDIN>) {
s/\&gt;/\>/g;
s/\&bra;/\[/g;
s/\&ket;/\]/g;
+ s/\&#91;/\[/g;
+ s/\&#93;/\]/g;
s/\&amp;/\&/g;
print $_;
}
diff --git a/scripts/tokenizer/detokenizer.perl b/scripts/tokenizer/detokenizer.perl
index e2d7ea0bb..e55a1a26e 100755
--- a/scripts/tokenizer/detokenizer.perl
+++ b/scripts/tokenizer/detokenizer.perl
@@ -33,8 +33,9 @@ if ($HELP) {
exit;
}
-die "No built-in rules for language $language, claim en for default behaviour."
- if $language !~ /^(cs|en|fr|it)$/;
+if ($language !~ /^(cs|en|fr|it)$/) {
+ print STDERR "Warning: No built-in rules for language $language.\n"
+}
if (!$QUIET) {
print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
@@ -70,6 +71,8 @@ sub detokenize {
$text =~ s/\&gt;/\>/g;
$text =~ s/\&bra;/\[/g;
$text =~ s/\&ket;/\]/g;
+ $text =~ s/\&#91;/\[/g;
+ $text =~ s/\&#93;/\]/g;
$text =~ s/\&amp;/\&/g;
my $word;
diff --git a/scripts/tokenizer/escape-special-chars.perl b/scripts/tokenizer/escape-special-chars.perl
index 5c4dc9bb3..f4c1b4dd5 100755
--- a/scripts/tokenizer/escape-special-chars.perl
+++ b/scripts/tokenizer/escape-special-chars.perl
@@ -6,18 +6,20 @@ while(<STDIN>) {
chop;
# avoid general madness
+ s/[\000-\037]//g;
s/\s+/ /g;
s/^ //g;
s/ $//g;
- s/[\000-\037]//g;
# special characters in moses
s/\&/\&amp;/g;
s/\|/\&bar;/g;
s/\</\&lt;/g;
s/\>/\&gt;/g;
- s/\[/\&bra;/g;
- s/\]/\&ket;/g;
+ s/\[/\&#91;/g;
+ s/\]/\&#93;/g;
+ # restore xml instructions
+ s/\&lt;(\S+) translation="([^\"]+)"&gt; (.+?) &lt;\/(\S+)&gt;/\<$1 translation=\"$2\"> $3 <\/$4>/g;
print $_."\n";
}
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index a97d5e160..70bb318f7 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -153,8 +153,8 @@ sub tokenize {
$text =~ s/\|/\&bar;/g;
$text =~ s/\</\&lt;/g;
$text =~ s/\>/\&gt;/g;
- $text =~ s/\[/\&bra;/g;
- $text =~ s/\]/\&ket;/g;
+ $text =~ s/\[/\&#91;/g;
+ $text =~ s/\]/\&#93;/g;
#ensure final line break
$text .= "\n" unless $text =~ /\n$/;