Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Hieu Hoang <hieuhoang@gmail.com> 2018-09-10 20:30:46 +0300
committer Hieu Hoang <hieuhoang@gmail.com> 2018-09-10 20:30:46 +0300
commit fd1758ba74b70728c9f2841bfc50a514d03a69d8 (patch)
tree 6d8295f2454223f996a2a8fd60d304f3e488f87b
parent e760db2d178e5b4325de24414f8e453d4a01e85f (diff)
parent 06f519d4e2b077971718966d0b5e20fd087ef8eb (diff)
Merge branch 'master' of github.com:moses-smt/mosesdecoder
-rwxr-xr-x scripts/tokenizer/tokenizer.perl 7
1 file changed, 7 insertions, 0 deletions
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index b1eba5ca6..f9b5cd60b 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -325,6 +325,13 @@ sub tokenize
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
}
+ elsif ($language eq "so")
+ {
+ # Don't split glottals
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ }
else
{
$text =~ s/\'/ \' /g;