diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2018-03-06 18:02:18 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2018-03-06 18:02:18 +0300 |
commit | d9f380b2821c933da8e62dd9d4e2e24a7ea957db (patch) | |
tree | d26754c38cc0b41af7227d38fcac0cfd42f79aba | |
parent | 3f66a2fbb216762806a38ab2ef2727ad8b701935 (diff) |
escape, but not quotes | wipoNew
-rwxr-xr-x | scripts/tokenizer/escape-special-chars-un.perl | 27 |
1 files changed, 27 insertions, 0 deletions
#!/usr/bin/perl -w
#
# scripts/tokenizer/escape-special-chars-un.perl
#
# Filter: reads text from STDIN one line at a time, normalises whitespace,
# replaces the characters that have a special meaning in Moses input with
# XML entities, and prints the result to STDOUT.  Double quotes are
# deliberately left unescaped in this variant.

use strict;
use warnings;

# Escape a single line (without its trailing newline) and return the result.
#
#   $line  - the text to process
#
# Returns the cleaned and escaped string; the argument is not modified.
sub escape_special_chars {
    my ($line) = @_;

    # avoid general madness
    # Control characters are deleted outright.  This runs before whitespace
    # is collapsed, so a tab (\011) is removed rather than turned into a
    # space.
    $line =~ s/[\000-\037]//g;
    $line =~ s/\s+/ /g;
    $line =~ s/^ //;
    $line =~ s/ $//;

    # special characters in moses
    # '&' must be handled first, otherwise the ampersands introduced by the
    # entities below would themselves be escaped again.
    $line =~ s/&/&amp;/g;     # escape escape
    $line =~ s/\|/&#124;/g;   # factor separator
    $line =~ s/</&lt;/g;      # xml
    $line =~ s/>/&gt;/g;      # xml
    $line =~ s/'/&apos;/g;    # xml
    # $line =~ s/"/&quot;/g;  # xml -- intentionally disabled: quotes stay as-is
    $line =~ s/\[/&#91;/g;    # syntax non-terminal
    $line =~ s/\]/&#93;/g;    # syntax non-terminal

    # restore xml instructions
    # Markup of the form <tag translation="..."> words </tag> was escaped by
    # the substitutions above; turn its angle brackets back into real ones.
    # The quotes were never escaped, so they are matched literally.
    $line =~ s{&lt;(\S+) translation="(.+?)"&gt; (.+?) &lt;/(\S+)&gt;}
              {<$1 translation="$2"> $3 </$4>}g;

    return $line;
}

# Process STDIN line by line and write the escaped lines to STDOUT.
sub main {
    while (my $line = <STDIN>) {
        # chomp, not chop: a last line without a newline must keep its
        # final character.
        chomp $line;
        print escape_special_chars($line), "\n";
    }
    return;
}

# Run as a filter when executed directly; stay quiet when loaded by other code.
main() unless caller;