Add tokenisation support for the Tetun language

author: Raphael Merx <raphael.merx@gmail.com> 2021-03-13 13:37:30 +0300
committer: Raphael Merx <raphael.merx@gmail.com> 2021-03-13 13:39:56 +0300
commit: 75d4c672e86cce7c7875fc131bbaf1299b7b9259 (patch)
tree: a52ec50b7c1340dd3f9c31a78d51ecb298ae68e4
parent: 7dd812180e52eedc26b6f6ea49c875febcc7488c (diff)
2 files changed, 217 insertions, 1 deletions
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt
new file mode 100644
index 000000000..1303bba5b
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.tdt
@@ -0,0 +1,210 @@
+#File adapted for TDT (Tetun) from the PT (Portuguese) list by Raphael Merx. Last update: 13.03.2021.
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#upper-case letters are usually initials in a name; lower-case letters are listed as well
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman numerals. A dot after one of these is not treated as a sentence break (rule carried over from the Portuguese list).
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Art
+Ca
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+DRA
+Dr
+Dra
+Dras
+Drs
+Eng
+Enga
+Engas
+Engos
+Ex
+Exo
+Exmo
+Fig
+Gen
+Hosp
+Insp
+Lda
+MM
+MR
+MRS
+MS
+Maj
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+Sra
+Sras
+Srs
+Sto
+Supt
+Surg
+adj
+adm
+adv
+art
+cit
+col
+con
+corp
+cpl
+dr
+dra
+dras
+drs
+eng
+enga
+engas
+engos
+ex
+exo
+exmo
+fig
+op
+prof
+sr
+sra
+sras
+srs
+sto
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These act as non-breaking prefixes only when followed by a numeric sequence.
+# Add #NUMERIC_ONLY# after the word to enable this behaviour.
+#This case is mostly for the English "No.", which can either be a sentence of its own or,
+#if followed by a number, a non-breaking prefix.
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+p #NUMERIC_ONLY#
+pp #NUMERIC_ONLY#
+
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 4bc5f9a0d..124ddd819 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -265,6 +265,12 @@ sub tokenize
         # if a colon is not immediately followed by lower-case characters, separate it out anyway
         $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
     }
+    elsif ($language eq "tdt") {
+        # in Tetun, the apostrophe can be used inside words to mark a glottal stop (e.g. ha'u):
+        $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+        # if an apostrophe is not immediately followed by lower-case characters, separate it out anyway
+        $text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g;
+    }
     elsif (($language eq "ca")) {
         # in Catalan, the middle dot can be used inside words:
         # il·lusio
@@ -332,7 +338,7 @@ sub tokenize
         $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
         $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
     }
-    elsif ($language eq "so") 
+    elsif (($language eq "so")  or ($language eq "tdt"))
     {
         # Don't split glottals
         $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
author	Raphael Merx <raphael.merx@gmail.com>	2021-03-13 13:37:30 +0300
committer	Raphael Merx <raphael.merx@gmail.com>	2021-03-13 13:39:56 +0300
commit	75d4c672e86cce7c7875fc131bbaf1299b7b9259 (patch)
tree	a52ec50b7c1340dd3f9c31a78d51ecb298ae68e4
parent	7dd812180e52eedc26b6f6ea49c875febcc7488c (diff)