diff options
author | Barry Haddow <barry.haddow@gmail.com> | 2019-11-08 17:56:58 +0300 |
---|---|---|
committer | Barry Haddow <barry.haddow@gmail.com> | 2019-11-08 17:56:58 +0300 |
commit | 103707002699a1e114a2f45c1ef1c2b20a981964 (patch) | |
tree | d76f79e98e8d3553853918a977e9fd2b84c28ab1 | |
parent | b1163966b1a9b4a3d6eec5a54b8bbf5f674a447b (diff) |
support for several Indic languages
14 files changed, 944 insertions, 285 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl index 9d588c265..38d56e0d6 100755 --- a/scripts/ems/support/split-sentences.perl +++ b/scripts/ems/support/split-sentences.perl @@ -124,8 +124,15 @@ sub preprocess { # Sentences can start with upper-case, numnbers, or Indic characters my $sentence_start = "\\p{IsUpper}0-9"; - $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if $language eq "hi"; + $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr"); $sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu"; + $sentence_start .= "\\p{Block: Bengali}" if ($language eq "asm" || $language eq "bn" || $language eq "mni"); + $sentence_start .= "\\p{Block: Kannada}" if $language eq "kn"; + $sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml"; + $sentence_start .= "\\p{Block: Oriya}" if $language eq "ory"; + $sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa"; + $sentence_start .= "\\p{Block: Tamil}" if $language eq "ta"; + $sentence_start .= "\\p{Block: Telugu}" if $language eq "te"; # we include danda and double danda (U+0964 and U+0965) as sentence split characters @@ -171,21 +178,6 @@ sub preprocess { #$text =~ s/ +/ /g; } - # Indic languages dealt with using regular rules above - # Hindi and Gujarati do not capitalise beginning of sentence characters. - # Also Hindi traditionally uses a danda as a sentence separator (U+0964) - #if ($language eq 'hi' || $language eq 'gu') { - # $text =~ s{ - # ( (?: [\.\?!\x{0964}\x{0965}] | \.\.+ ) - # [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]* - # ) - # \s+ - # ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]* - # [\p{Block: Devanagari_Extended}\p{Block: Gujarati}] - # ) - # }{$1\n$2}gx; - #} - # Urdu support # https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode if ($language eq 'ur') { diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.asm b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.asm new file mode 100644 index 000000000..866ee158a --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.asm @@ -0,0 +1,65 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Dr +ড + +#others + + +#phonetics +# A +এ +# B +বি +# C +সি +# D +ডি +# E +ই +# F +এফ +# G +জি +# H +এইচ +# I +আম +# J +জে +# K +কে +# L +এল +# M +এম +# N +এন +# O +হে +# P +পি +# Q +কিউ +# R +আর +# S +এস +# T +টি +# U +ইউ +# V +ভি +# W +ডব্লু +# X +এক্স +# Y +ওয়াই +# Z +জেড + +#consonants + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.bn b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.bn new file mode 100644 index 000000000..866ee158a --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.bn @@ -0,0 +1,65 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Dr +ড + +#others + + +#phonetics +# A +এ +# B +বি +# C +সি +# D +ডি +# E +ই +# F +এফ +# G +জি +# H +এইচ +# I +আম +# J +জে +# K +কে +# L +এল +# M +এম +# N +এন +# O +হে +# P +পি +# Q +কিউ +# R +আর +# S +এস +# T +টি +# U +ইউ +# V +ভি +# W +ডব্লু +# X +এক্স +# Y +ওয়াই +# Z +জেড + +#consonants + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et new file mode 120000 index 000000000..adf849495 --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et @@ -0,0 +1 @@ +nonbreaking_prefix.fi
\ No newline at end of file diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu new file mode 100644 index 000000000..856cdb9ab --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu @@ -0,0 +1,105 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Rs +રૂ +# Dr +ડો +# Dr +ડૉ +# Mr +શ્રી + +#others + + +#phonetics +# A +એ +# B +બી +# C +સી +# D +ડી +# E +ઇ +# F +એફ +# G +જી +# H +એચ +# I +આઈ +# J +જે +# K +કે +# L +એલ +# M +એમ +# N +એન +# O +ઓ +# P +પી +# Q +ક્યૂ +# R +આર +# S +એસ +# T +ટી +# U +યુ +# V +વી +# W +ડબલ્યુ +# X +એક્સ +# Y +વાય +# Z +ઝેડ + +#consonants +ક +ખ +ગ +ઘ +ઙ +ચ +છ +જ +ઝ +ઞ +ટ +ઠ +ડ +ઢ +ણ +ત +થ +દ +ધ +ન +પ +ફ +બ +ભ +મ +ય +ર +લ +ળ +વ +શ +ષ +સ +હ + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hi b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hi index cf4eaa3c6..aa1f960ba 100644 --- a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hi +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hi @@ -1,17 +1,22 @@ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. #common exceptions -# Dr -डॉ # Rs रु +# Dr +डॉ +# Dr +डा +# Mr +श्री #others -टी.वी +टीवी #phonetics # A ए +ऐ # B बी # C @@ -22,9 +27,11 @@ ई # F ऐफ +एफ # G जी # H +ऐच एच # I आइ @@ -34,10 +41,13 @@ के # L ऐल +एल # M ऐम +एम # N ऐन +एन # O ओ # P @@ -47,6 +57,7 @@ # R आर # S +ऐस एस # T टी @@ -58,8 +69,45 @@ डब्ल्यू # X ऐक्स +एक्स # Y वाय +वाई # Z ज़ैड +#consonants +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ल +व +श +ष +स +ह + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn new file mode 100644 index 000000000..1c20f61c2 --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn @@ -0,0 +1,70 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Rs +ರೂ +# Dr +ಡಾ +# Mr +ಶ್ರೀ + +#others + + +#phonetics +# A +ಎ +# B +ಬಿ +# C +ಸಿ +# D +ಡಿ +# E +ಇ +# F +ಎಫ್ +# G +ಜಿ +# H +ಹೆಚ್ +ಎಚ್ +# I +ಐ +# J +ಜೆ +# K +ಕೆ +# L +ಎಲ್ +# M +ಎಂ +# N +ಎನ್ +# O +ಒ +# P +ಪಿ +# Q +ಕ್ಯೂ +# R +ಆರ್ +# S +ಎಸ್ +# T +ಟಿ +# U +ಯು +# V +ವಿ +# W +ಡಬ್ಲ್ಯೂ +# X +ಎಕ್ಸ್ +# Y +ವೈ +# Z +ಜೆಡ್ + +#consonants + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml new file mode 100644 index 000000000..35ffc8f97 --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml @@ -0,0 +1,67 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Dr +ഡോ +# Mr +ശ്രീ + +#others + + +#phonetics +# A +എ +# B +ബി +# C +സി +# D +ഡി +# E +ഇ +# F +എഫ് +# G +ജി +# H +എച്ച് +# I +ഐ +# J +ജെ +# K +കെ +# L +എൽ +# M +എം +# N +എൻ +# O +ഒ +# P +പി +# Q +ക്യൂ +# R +ആർ +# S +എസ് +# T +ടി +# U +യു +# V +വി +# W +ഡബ്ല്യു +# X +എക്സ് +# Y +വൈ +# Z +സെഡ് + +#consonants + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni new file mode 100644 index 000000000..22ecbae70 --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni @@ -0,0 +1,65 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Dr +দা + +#others + + +#phonetics +# A +এ +# B +বি +# C +সি +# D +ডি +# E +ই +# F +এফ +# G +জি +# H +এইচ +# I +আম +# J +জে +# K +কে +# L +এল +# M +এম +# N +এন +# O +হে +# P +পি +# Q +কিউ +# R +আর +# S +এস +# T +টি +# U +ইউ +# V +ভি +# W +ডব্লু +# X +এক্স +# Y +ওয়াই +# Z +জেড + +#consonants + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr new file mode 100644 index 000000000..1ece23c12 --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr @@ -0,0 +1,113 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Rs +रु +# Dr +डॉ +# Dr +डा +# Mr +श्री + +#others + + +#phonetics +# A +ए +ऐ +# B +बी +# C +सी +# D +डी +# E +ई +# F +ऐफ +एफ +# G +जी +# H +ऐच +एच +# I +आइ +# J +जे +# K +के +# L +ऐल +एल +# M +ऐम +एम +# N +ऐन +एन +# O +ओ +# P +पी +# Q +क्यू +# R +आर +# S +ऐस +एस +# T +टी +# U +यू +# V +वी +# W +डब्ल्यू +# X +ऐक्स +एक्स +# Y +वाय +वाई +# Z +ज़ैड + +#consonants +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +प +फ +ब +भ +म +य +र +ल +व +श +ष +स +ह + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ory b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ory new file mode 100644 index 000000000..8442c0b77 --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ory @@ -0,0 +1,101 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Mr +ରୀ + +#others + + +#phonetics +# A + +# B + +# C + +# D + +# E + +# F + +# G + +# H + +# I + +# J + +# K + +# L + +# M + +# N + +# O + +# P + +# Q + +# R + +# S + +# T + +# U + +# V + +# W + +# X + +# Y + +# Z + + +#consonants +କ +ଖ +ଗ +ଘ +ଙ +ଚ +ଛ +ଜ +ଝ +ଞ +ଟ +ଠ +ଡ +ଢ +ଣ +ତ +ଥ +ଦ +ଧ +ନ +ପ +ଫ +ବ +ଵ +ଭ +ମ +ଯ +ୟ +ର +ଲ +ଳ +ୱ +ଶ +ଷ +ସ +ହ + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.pa b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.pa new file mode 100644 index 000000000..d4ea62748 --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.pa @@ -0,0 +1,102 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Dr +ਡਾ +# Dr +ਪ੍ਰੋ +# Mr +ਸ੍ਰੀ + +#others + + +#phonetics +# A +ਏ +# B +ਬੀ +# C +ਸੀ +# D +ਡੀ +# E +ਈ +# F +ਐੱਫ +# G +ਜੀ +# H +ਐਚ +# I +ਆਈ +# J +ਜੇ +# K +ਕੇ +# L +ਐਲ +# M +ਐੱਮ +# N +ਐੱਨ +# O +ਓ +# P +ਪੀ +# Q +ਕੀਓ +# R +ਆਰ +# S +ਐੱਸ +ਸ +# T +ਟੀ +# U +ਯੂ +# V +ਵੀ +# W +ਡਬਲਿਊ +# X +ਐਕ੍ਸ +# Y +ਵਾਈ +# Z +ਜ਼ੈਡ + +#consonants +ਕ +ਖ +ਗ +ਘ +ਙ +ਚ +ਛ +ਜ +ਝ +ਞ +ਟ +ਠ +ਡ +ਢ +ਣ +ਤ +ਥ +ਦ +ਧ +ਨ +ਪ +ਫ +ਬ +ਭ +ਮ +ਯ +ਰ +ਲ +ਵ +ੜ +ਸ +ਹ + diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta index a138e69bd..8e8bbcd3e 100644 --- a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta @@ -1,276 +1,71 @@ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. -#Special cases are included for prefixes that ONLY appear before 0-9 numbers. -#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) -#usually upper case letters are initials in a name -அ -ஆ -இ -ஈ -உ -ஊ -எ +#common exceptions +# Rs +ர +# Rs +ூ +# Mr +திரு + +#others + + +#phonetics +# A ஏ -ஐ -ஒ -ஓ -ஔ -ஃ -க -கா -கி -கீ -கு -கூ -கெ -கே -கை -கொ -கோ -கௌ -க் -ச -சா -சி +# B +பீ +# C சீ -சு -சூ -செ -சே -சை -சொ -சோ -சௌ -ச் -ட -டா -டி +# D டீ -டு -டூ -டெ -டே -டை -டொ -டோ -டௌ -ட் -த -தா -தி -தீ -து -தூ -தெ -தே -தை -தொ -தோ -தௌ -த் -ப -பா -பி -பீ -பு -பூ -பெ -பே -பை -பொ -போ -பௌ -ப் -ற -றா -றி -றீ -று -றூ -றெ -றே -றை -றொ -றோ -றௌ -ற் -ய -யா -யி -யீ -யு +# E +ஈ +# F +எஃப் +# G +ஜீ +# H +எச் +ஹெச் +# I +ஐ +# J +ஜே +ஜை +# K +கே +# L +எல் +# M +எம் +# N +என் +# O +ஓ +# P +ப்பீ +# Q +கியூ +# R +ஆர் +# S +எஸ் +# T +ட்டீ +# U யூ -யெ -யே -யை -யொ -யோ -யௌ -ய் -ர -ரா -ரி -ரீ -ரு -ரூ -ரெ -ரே -ரை -ரொ -ரோ -ரௌ -ர் -ல -லா -லி -லீ -லு -லூ -லெ -லே -லை -லொ -லோ -லௌ -ல் -வ -வா -வி +# V வீ -வு -வூ -வெ -வே +# W +டபிள்-யூ +# X +எக்ஸ் +# Y வை -வொ -வோ -வௌ -வ் -ள -ளா -ளி -ளீ -ளு -ளூ -ளெ -ளே -ளை -ளொ -ளோ -ளௌ -ள் -ழ -ழா -ழி -ழீ -ழு -ழூ -ழெ -ழே -ழை -ழொ -ழோ -ழௌ -ழ் -ங -ஙா -ஙி -ஙீ -ஙு -ஙூ -ஙெ -ஙே -ஙை -ஙொ -ஙோ -ஙௌ -ங் -ஞ -ஞா -ஞி -ஞீ -ஞு -ஞூ -ஞெ -ஞே -ஞை -ஞொ -ஞோ -ஞௌ -ஞ் -ண -ணா -ணி -ணீ -ணு -ணூ -ணெ -ணே -ணை -ணொ -ணோ -ணௌ -ண் -ந -நா -நி -நீ -நு -நூ -நெ -நே -நை -நொ -நோ -நௌ -ந் -ம -மா -மி -மீ -மு -மூ -மெ -மே -மை -மொ -மோ -மௌ -ம் -ன -னா -னி -னீ -னு -னூ -னெ -னே -னை -னொ -னோ -னௌ -ன் - - -#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks -திரு -திருமதி -வண -கௌரவ - - -#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) -உ.ம் -#கா.ம் -#எ.ம் +# Z +செட் +#consonants -#Numbers only. These should only induce breaks when followed by a numeric sequence -# add NUMERIC_ONLY after the word for this function -#This case is mostly for the english "No." which can either be a sentence of its own, or -#if followed by a number, a non-breaking prefix -No #NUMERIC_ONLY# -Nos -Art #NUMERIC_ONLY# -Nr -pp #NUMERIC_ONLY# diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.te b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.te new file mode 100644 index 000000000..a596aab65 --- /dev/null +++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.te @@ -0,0 +1,70 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. + +#common exceptions +# Rs +ర +# Rs +ూ +# Mr +శ్రీ + +#others + + +#phonetics +# A +ఎ +# B +బి +# C +సి +# D +డి +# E +ఇ +# F +ఎఫ్ +# G +జి +# H +హెచ్ +# I +ఐ +# J +జె +# K +కె +# L +ఎల్ +# M +ఎం +ఎమ్ +# N +ఎన్ +# O +ఓ +# P +పి +# Q +క్యూ +# R +ఆర్ +# S +ఎస్ +# T +టి +# U +యు +# V +వి +# W +డబ్ల్యూ +# X +ఎక్స్ +# Y +వై +# Z +జెడ్ + +#consonants + |