diff options
author | Hieu Hoang <hieuhoang@gmail.com> | 2018-09-10 20:30:46 +0300 |
---|---|---|
committer | Hieu Hoang <hieuhoang@gmail.com> | 2018-09-10 20:30:46 +0300 |
commit | fd1758ba74b70728c9f2841bfc50a514d03a69d8 (patch) | |
tree | 6d8295f2454223f996a2a8fd60d304f3e488f87b | |
parent | e760db2d178e5b4325de24414f8e453d4a01e85f (diff) | |
parent | 06f519d4e2b077971718966d0b5e20fd087ef8eb (diff) |
Merge branch 'master' of github.com:moses-smt/mosesdecoder
-rwxr-xr-x | scripts/tokenizer/tokenizer.perl | 7 |
1 files changed, 7 insertions, 0 deletions
```diff
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index b1eba5ca6..f9b5cd60b 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -325,6 +325,13 @@ sub tokenize
         $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
         $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
     }
+    elsif ($language eq "so")
+    {
+        # Don't split glottals
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+    }
     else
     {
         $text =~ s/\'/ \' /g;
```