Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCristina España i Bonet <cristinae@users.noreply.github.com>2020-07-31 16:22:47 +0300
committerGitHub <noreply@github.com>2020-07-31 16:22:47 +0300
commit8d78dae6348009274cbd8fc045a192ca19e938bc (patch)
tree18a3b87a596bb4e41a638cf8e7677ae4d8562d1f
parent47915b561fb711973199daa2fc5a5f4dac3e22c7 (diff)
adding rules for Catalan
special characters within words and contractions closer to French than to English
-rwxr-xr-xscripts/tokenizer/tokenizer.perl9
1 files changed, 8 insertions, 1 deletions
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index b84b9eb31..4bc5f9a0d 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -265,6 +265,13 @@ sub tokenize
# if a colon is not immediately followed by lower-case characters, separate it out anyway
$text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
}
+ elsif (($language eq "ca")) {
+ # in Catalan, the middle dot can be used inside words:
+ # il·lusió
+ $text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g;
+ # if a middot is not immediately followed by lower-case characters, separate it out anyway
+ $text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g;
+ }
else {
$text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
}
@@ -317,7 +324,7 @@ sub tokenize
#special case for "1990's"
$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
}
- elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga"))
+ elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca"))
{
#split contractions left
$text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;