From 75d4c672e86cce7c7875fc131bbaf1299b7b9259 Mon Sep 17 00:00:00 2001 From: Raphael Merx Date: Sat, 13 Mar 2021 18:37:30 +0800 Subject: Add tokenisation support for the Tetun language --- .../nonbreaking_prefixes/nonbreaking_prefix.tdt | 210 +++++++++++++++++++++ scripts/tokenizer/tokenizer.perl | 8 +- 2 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt new file mode 100644 index 000000000..1303bba5b --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt @@ -0,0 +1,210 @@ +#File adapted for TDT from PT by Raphael Merx. Last update: 10.11.2009. +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + + +#Roman Numerals. A dot after one of these is not treated as a sentence break (list inherited from the Portuguese file). +I +II +III +IV +V +VI +VII +VIII +IX +X +XI +XII +XIII +XIV +XV +XVI +XVII +XVIII +XIX +XX +i +ii +iii +iv +v +vi +vii +viii +ix +x +xi +xii +xiii +xiv +xv +xvi +xvii +xviii +xix +xx + +#List of titles. 
These are often followed by upper-case names, but do not indicate sentence breaks +Adj +Adm +Adv +Art +Ca +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +DRA +Dr +Dra +Dras +Drs +Eng +Enga +Engas +Engos +Ex +Exo +Exmo +Fig +Gen +Hosp +Insp +Lda +MM +MR +MRS +MS +Maj +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +Sra +Sras +Srs +Sto +Supt +Surg +adj +adm +adv +art +cit +col +con +corp +cpl +dr +dra +dras +drs +eng +enga +engas +engos +ex +exo +exmo +fig +op +prof +sr +sra +sras +srs +sto + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +v +vs +i.e +rev +e.g + +#Numbers only. These should only be treated as non-breaking prefixes when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the English "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nos +Art #NUMERIC_ONLY# +Nr +p #NUMERIC_ONLY# +pp #NUMERIC_ONLY# + diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl index 4bc5f9a0d..124ddd819 100755 --- a/scripts/tokenizer/tokenizer.perl +++ b/scripts/tokenizer/tokenizer.perl @@ -265,6 +265,12 @@ sub tokenize # if a colon is not immediately followed by lower-case characters, separate it out anyway $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g; } + elsif ($language eq "tdt") { + # in Tetun, the apostrophe can be used inside words (as a glottal), so it is not separated out here: + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + # if an apostrophe is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g; + } elsif (($language eq "ca")) { # in Catalan, the middle dot can be used inside words: # il�lusio @@ -332,7 +338,7 @@ sub tokenize $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; } - elsif ($language eq "so") + 
elsif (($language eq "so") or ($language eq "tdt")) { # Don't split glottals $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; -- cgit v1.2.3