Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/tokenizer/tokenizer.perl')
-rwxr-xr-x scripts/tokenizer/tokenizer.perl 8
1 files changed, 7 insertions, 1 deletions
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 4bc5f9a0d..124ddd819 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -265,6 +265,12 @@ sub tokenize
# if a colon is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
}
+ elsif ($language eq "tdt") {
+ # in Tetun, the apostrophe can be used inside words as a letter-like character (glottal stop):
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+ # if an apostrophe is not immediately followed by lower-case characters, separate it out anyway
+ $text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g;
+ }
elsif (($language eq "ca")) {
# in Catalan, the middle dot can be used inside words:
# il·lusio
@@ -332,7 +338,7 @@ sub tokenize
$text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
$text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
}
- elsif ($language eq "so")
+ elsif (($language eq "so") or ($language eq "tdt"))
{
# Don't split glottals
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;