diff options
author | Cristina España i Bonet <cristinae@users.noreply.github.com> | 2020-07-31 16:22:47 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-31 16:22:47 +0300 |
commit | 8d78dae6348009274cbd8fc045a192ca19e938bc (patch) | |
tree | 18a3b87a596bb4e41a638cf8e7677ae4d8562d1f | |
parent | 47915b561fb711973199daa2fc5a5f4dac3e22c7 (diff) |
adding rules for Catalan
special characters within words and contractions closer to French than to English
-rwxr-xr-x | scripts/tokenizer/tokenizer.perl | 9 |
1 files changed, 8 insertions, 1 deletions
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index b84b9eb31..4bc5f9a0d 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -265,6 +265,13 @@ sub tokenize # if a colon is not immediately followed by lower-case characters, separate it out anyway $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g; } + elsif (($language eq "ca")) { + # in Catalan, the middle dot can be used inside words: + # il·lusio + $text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g; + # if a middot is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g; + } else { $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; } @@ -317,7 +324,7 @@ sub tokenize #special case for "1990's" $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; } - elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga")) + elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca")) { #split contractions left $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; |