Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hieu Hoang <hieuhoang@gmail.com> 2018-03-06 18:02:18 +0300
committer: Hieu Hoang <hieuhoang@gmail.com> 2018-03-06 18:02:18 +0300
commit: d9f380b2821c933da8e62dd9d4e2e24a7ea957db (patch)
tree: d26754c38cc0b41af7227d38fcac0cfd42f79aba
parent: 3f66a2fbb216762806a38ab2ef2727ad8b701935 (diff)
escape, but not quotes wipoNew
-rwxr-xr-x scripts/tokenizer/escape-special-chars-un.perl 27
1 files changed, 27 insertions, 0 deletions
diff --git a/scripts/tokenizer/escape-special-chars-un.perl b/scripts/tokenizer/escape-special-chars-un.perl
new file mode 100755
index 000000000..ff88de1cc
--- /dev/null
+++ b/scripts/tokenizer/escape-special-chars-un.perl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl -w
+# Reads text from STDIN and prints each line to STDOUT with Moses-special characters escaped; double quotes stay literal.
+use strict;
+
+while(<STDIN>) {
+ chomp; # strip only a trailing newline; chop would eat a real character from an unterminated last line
+
+ # avoid general madness: drop control characters, collapse whitespace runs, trim both ends
+ s/[\000-\037]//g;
+ s/\s+/ /g;
+ s/^ //g;
+ s/ $//g;
+
+ # special characters in moses
+ s/\&/\&amp;/g; # escape escape (runs first so the entities produced below are not escaped again)
+ s/\|/\&#124;/g; # factor separator
+ s/\</\&lt;/g; # xml
+ s/\>/\&gt;/g; # xml
+ s/\'/\&apos;/g; # xml
+ #s/\"/\&quot;/g; # xml -- left disabled: this variant does not escape double quotes
+ s/\[/\&#91;/g; # syntax non-terminal
+ s/\]/\&#93;/g; # syntax non-terminal
+
+ # restore xml instructions; quotes are never escaped above, so match them literally
+ s/\&lt;(\S+) translation="(.+?)"\&gt; (.+?) \&lt;\/(\S+)\&gt;/\<$1 translation=\"$2\"> $3 <\/$4>/g;
+ print $_."\n";
+}