Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Barry Haddow <barry.haddow@gmail.com> 2019-11-08 17:56:58 +0300
committer Barry Haddow <barry.haddow@gmail.com> 2019-11-08 17:56:58 +0300
commit 103707002699a1e114a2f45c1ef1c2b20a981964 (patch)
tree d76f79e98e8d3553853918a977e9fd2b84c28ab1
parent b1163966b1a9b4a3d6eec5a54b8bbf5f674a447b (diff)
support for several Indic languages
-rwxr-xr-x scripts/ems/support/split-sentences.perl 24
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.asm 65
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.bn 65
l--------- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et 1
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu 105
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hi 54
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn 70
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml 67
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni 65
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr 113
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ory 101
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.pa 102
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta 327
-rw-r--r-- scripts/share/nonbreaking_prefixes/nonbreaking_prefix.te 70
14 files changed, 944 insertions, 285 deletions
diff --git a/scripts/ems/support/split-sentences.perl b/scripts/ems/support/split-sentences.perl
index 9d588c265..38d56e0d6 100755
--- a/scripts/ems/support/split-sentences.perl
+++ b/scripts/ems/support/split-sentences.perl
@@ -124,8 +124,15 @@ sub preprocess {
# Sentences can start with upper-case, numbers, or Indic characters
my $sentence_start = "\\p{IsUpper}0-9";
- $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if $language eq "hi";
+ $sentence_start .= "\\p{Block: Devanagari}\\p{Block: Devanagari_Extended}" if ($language eq "hi" || $language eq "mr");
$sentence_start .= "\\p{Block: Gujarati}" if $language eq "gu";
+ $sentence_start .= "\\p{Block: Bengali}" if ($language eq "asm" || $language eq "bn" || $language eq "mni");
+ $sentence_start .= "\\p{Block: Kannada}" if $language eq "kn";
+ $sentence_start .= "\\p{Block: Malayalam}" if $language eq "ml";
+ $sentence_start .= "\\p{Block: Oriya}" if $language eq "ory";
+ $sentence_start .= "\\p{Block: Gurmukhi}" if $language eq "pa";
+ $sentence_start .= "\\p{Block: Tamil}" if $language eq "ta";
+ $sentence_start .= "\\p{Block: Telugu}" if $language eq "te";
# we include danda and double danda (U+0964 and U+0965) as sentence split characters
@@ -171,21 +178,6 @@ sub preprocess {
#$text =~ s/ +/ /g;
}
- # Indic languages dealt with using regular rules above
- # Hindi and Gujarati do not capitalise beginning of sentence characters.
- # Also Hindi traditionally uses a danda as a sentence separator (U+0964)
- #if ($language eq 'hi' || $language eq 'gu') {
- # $text =~ s{
- # ( (?: [\.\?!\x{0964}\x{0965}] | \.\.+ )
- # [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPf}]*
- # )
- # \s+
- # ( [\'\"\x{201e}\x{bb}\(\[\¿\¡\p{IsPi}]*
- # [\p{Block: Devanagari_Extended}\p{Block: Gujarati}]
- # )
- # }{$1\n$2}gx;
- #}
-
# Urdu support
# https://en.wikipedia.org/wiki/Urdu_alphabet#Encoding_Urdu_in_Unicode
if ($language eq 'ur') {
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.asm b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.asm
new file mode 100644
index 000000000..866ee158a
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.asm
@@ -0,0 +1,65 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Dr
+ড
+
+#others
+
+
+#phonetics
+# A
+এ
+# B
+বি
+# C
+সি
+# D
+ডি
+# E
+ই
+# F
+এফ
+# G
+জি
+# H
+এইচ
+# I
+আম
+# J
+জে
+# K
+কে
+# L
+এল
+# M
+এম
+# N
+এন
+# O
+হে
+# P
+পি
+# Q
+কিউ
+# R
+আর
+# S
+এস
+# T
+টি
+# U
+ইউ
+# V
+ভি
+# W
+ডব্লু
+# X
+এক্স
+# Y
+ওয়াই
+# Z
+জেড
+
+#consonants
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.bn b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.bn
new file mode 100644
index 000000000..866ee158a
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.bn
@@ -0,0 +1,65 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Dr
+ড
+
+#others
+
+
+#phonetics
+# A
+এ
+# B
+বি
+# C
+সি
+# D
+ডি
+# E
+ই
+# F
+এফ
+# G
+জি
+# H
+এইচ
+# I
+আম
+# J
+জে
+# K
+কে
+# L
+এল
+# M
+এম
+# N
+এন
+# O
+হে
+# P
+পি
+# Q
+কিউ
+# R
+আর
+# S
+এস
+# T
+টি
+# U
+ইউ
+# V
+ভি
+# W
+ডব্লু
+# X
+এক্স
+# Y
+ওয়াই
+# Z
+জেড
+
+#consonants
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et
new file mode 120000
index 000000000..adf849495
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.et
@@ -0,0 +1 @@
+nonbreaking_prefix.fi \ No newline at end of file
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu
new file mode 100644
index 000000000..856cdb9ab
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.gu
@@ -0,0 +1,105 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Rs
+રૂ
+# Dr
+ડો
+# Dr
+ડૉ
+# Mr
+શ્રી
+
+#others
+
+
+#phonetics
+# A
+એ
+# B
+બી
+# C
+સી
+# D
+ડી
+# E
+ઇ
+# F
+એફ
+# G
+જી
+# H
+એચ
+# I
+આઈ
+# J
+જે
+# K
+કે
+# L
+એલ
+# M
+એમ
+# N
+એન
+# O
+ઓ
+# P
+પી
+# Q
+ક્યૂ
+# R
+આર
+# S
+એસ
+# T
+ટી
+# U
+યુ
+# V
+વી
+# W
+ડબલ્યુ
+# X
+એક્સ
+# Y
+વાય
+# Z
+ઝેડ
+
+#consonants
+ક
+ખ
+ગ
+ઘ
+ઙ
+ચ
+છ
+જ
+ઝ
+ઞ
+ટ
+ઠ
+ડ
+ઢ
+ણ
+ત
+થ
+દ
+ધ
+ન
+પ
+ફ
+બ
+ભ
+મ
+ય
+ર
+લ
+ળ
+વ
+શ
+ષ
+સ
+હ
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hi b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hi
index cf4eaa3c6..aa1f960ba 100644
--- a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hi
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hi
@@ -1,17 +1,22 @@
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
#common exceptions
-# Dr
-डॉ
# Rs
रु
+# Dr
+डॉ
+# Dr
+डा
+# Mr
+श्री
#others
-टी.वी
+टीवी
#phonetics
# A
+ऐ
# B
बी
# C
@@ -22,9 +27,11 @@
# F
ऐफ
+एफ
# G
जी
# H
+ऐच
एच
# I
आइ
@@ -34,10 +41,13 @@
के
# L
ऐल
+एल
# M
ऐम
+एम
# N
ऐन
+एन
# O
# P
@@ -47,6 +57,7 @@
# R
आर
# S
+ऐस
एस
# T
टी
@@ -58,8 +69,45 @@
डब्ल्यू
# X
ऐक्स
+एक्स
# Y
वाय
+वाई
# Z
ज़ैड
+#consonants
+क
+ख
+ग
+घ
+ङ
+च
+छ
+ज
+झ
+ञ
+ट
+ठ
+ड
+ढ
+ण
+त
+थ
+द
+ध
+न
+प
+फ
+ब
+भ
+म
+य
+र
+ल
+व
+श
+ष
+स
+ह
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn
new file mode 100644
index 000000000..1c20f61c2
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.kn
@@ -0,0 +1,70 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Rs
+ರೂ
+# Dr
+ಡಾ
+# Mr
+ಶ್ರೀ
+
+#others
+
+
+#phonetics
+# A
+ಎ
+# B
+ಬಿ
+# C
+ಸಿ
+# D
+ಡಿ
+# E
+ಇ
+# F
+ಎಫ್
+# G
+ಜಿ
+# H
+ಹೆಚ್
+ಎಚ್‌
+# I
+ಐ
+# J
+ಜೆ
+# K
+ಕೆ
+# L
+ಎಲ್
+# M
+ಎಂ
+# N
+ಎನ್
+# O
+ಒ
+# P
+ಪಿ
+# Q
+ಕ್ಯೂ
+# R
+ಆರ್
+# S
+ಎಸ್
+# T
+ಟಿ
+# U
+ಯು
+# V
+ವಿ
+# W
+ಡಬ್ಲ್ಯೂ
+# X
+ಎಕ್ಸ್
+# Y
+ವೈ
+# Z
+ಜೆಡ್
+
+#consonants
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml
new file mode 100644
index 000000000..35ffc8f97
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ml
@@ -0,0 +1,67 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Dr
+ഡോ
+# Mr
+ശ്രീ
+
+#others
+
+
+#phonetics
+# A
+എ
+# B
+ബി
+# C
+സി
+# D
+ഡി
+# E
+ഇ
+# F
+എഫ്
+# G
+ജി
+# H
+എച്ച്
+# I
+ഐ
+# J
+ജെ
+# K
+കെ
+# L
+എൽ
+# M
+എം
+# N
+എൻ
+# O
+ഒ
+# P
+പി
+# Q
+ക്യൂ
+# R
+ആർ
+# S
+എസ്
+# T
+ടി
+# U
+യു
+# V
+വി
+# W
+ഡബ്ല്യു
+# X
+എക്സ്
+# Y
+വൈ
+# Z
+സെഡ്
+
+#consonants
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni
new file mode 100644
index 000000000..22ecbae70
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mni
@@ -0,0 +1,65 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Dr
+দা
+
+#others
+
+
+#phonetics
+# A
+এ
+# B
+বি
+# C
+সি
+# D
+ডি
+# E
+ই
+# F
+এফ
+# G
+জি
+# H
+এইচ
+# I
+আম
+# J
+জে
+# K
+কে
+# L
+এল
+# M
+এম
+# N
+এন
+# O
+হে
+# P
+পি
+# Q
+কিউ
+# R
+আর
+# S
+এস
+# T
+টি
+# U
+ইউ
+# V
+ভি
+# W
+ডব্লু
+# X
+এক্স
+# Y
+ওয়াই
+# Z
+জেড
+
+#consonants
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr
new file mode 100644
index 000000000..1ece23c12
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.mr
@@ -0,0 +1,113 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Rs
+रु
+# Dr
+डॉ
+# Dr
+डा
+# Mr
+श्री
+
+#others
+
+
+#phonetics
+# A
+ए
+ऐ
+# B
+बी
+# C
+सी
+# D
+डी
+# E
+ई
+# F
+ऐफ
+एफ
+# G
+जी
+# H
+ऐच
+एच
+# I
+आइ
+# J
+जे
+# K
+के
+# L
+ऐल
+एल
+# M
+ऐम
+एम
+# N
+ऐन
+एन
+# O
+ओ
+# P
+पी
+# Q
+क्यू
+# R
+आर
+# S
+ऐस
+एस
+# T
+टी
+# U
+यू
+# V
+वी
+# W
+डब्ल्यू
+# X
+ऐक्स
+एक्स
+# Y
+वाय
+वाई
+# Z
+ज़ैड
+
+#consonants
+क
+ख
+ग
+घ
+ङ
+च
+छ
+ज
+झ
+ञ
+ट
+ठ
+ड
+ढ
+ण
+त
+थ
+द
+ध
+न
+प
+फ
+ब
+भ
+म
+य
+र
+ल
+व
+श
+ष
+स
+ह
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ory b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ory
new file mode 100644
index 000000000..8442c0b77
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ory
@@ -0,0 +1,101 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Mr
+ରୀ
+
+#others
+
+
+#phonetics
+# A
+
+# B
+
+# C
+
+# D
+
+# E
+
+# F
+
+# G
+
+# H
+
+# I
+
+# J
+
+# K
+
+# L
+
+# M
+
+# N
+
+# O
+
+# P
+
+# Q
+
+# R
+
+# S
+
+# T
+
+# U
+
+# V
+
+# W
+
+# X
+
+# Y
+
+# Z
+
+
+#consonants
+କ
+ଖ
+ଗ
+ଘ
+ଙ
+ଚ
+ଛ
+ଜ
+ଝ
+ଞ
+ଟ
+ଠ
+ଡ
+ଢ
+ଣ
+ତ
+ଥ
+ଦ
+ଧ
+ନ
+ପ
+ଫ
+ବ
+ଵ
+ଭ
+ମ
+ଯ
+ୟ
+ର
+ଲ
+ଳ
+ୱ
+ଶ
+ଷ
+ସ
+ହ
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.pa b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.pa
new file mode 100644
index 000000000..d4ea62748
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.pa
@@ -0,0 +1,102 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Dr
+ਡਾ
+# Prof
+ਪ੍ਰੋ
+# Mr
+ਸ੍ਰੀ
+
+#others
+
+
+#phonetics
+# A
+ਏ
+# B
+ਬੀ
+# C
+ਸੀ
+# D
+ਡੀ
+# E
+ਈ
+# F
+ਐੱਫ
+# G
+ਜੀ
+# H
+ਐਚ
+# I
+ਆਈ
+# J
+ਜੇ
+# K
+ਕੇ
+# L
+ਐਲ
+# M
+ਐੱਮ
+# N
+ਐੱਨ
+# O
+ਓ
+# P
+ਪੀ
+# Q
+ਕੀਓ
+# R
+ਆਰ
+# S
+ਐੱਸ
+ਸ
+# T
+ਟੀ
+# U
+ਯੂ
+# V
+ਵੀ
+# W
+ਡਬਲਿਊ
+# X
+ਐਕ੍ਸ
+# Y
+ਵਾਈ
+# Z
+ਜ਼ੈਡ
+
+#consonants
+ਕ
+ਖ
+ਗ
+ਘ
+ਙ
+ਚ
+ਛ
+ਜ
+ਝ
+ਞ
+ਟ
+ਠ
+ਡ
+ਢ
+ਣ
+ਤ
+ਥ
+ਦ
+ਧ
+ਨ
+ਪ
+ਫ
+ਬ
+ਭ
+ਮ
+ਯ
+ਰ
+ਲ
+ਵ
+ੜ
+ਸ
+ਹ
+
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta
index a138e69bd..8e8bbcd3e 100644
--- a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.ta
@@ -1,276 +1,71 @@
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-அ
-ஆ
-இ
-ஈ
-உ
-ஊ
-எ
+#common exceptions
+# Rs
+ர
+# Rs
+ூ
+# Mr
+திரு
+
+#others
+
+
+#phonetics
+# A
-ஐ
-ஒ
-ஓ
-ஔ
-ஃ
-க
-கா
-கி
-கீ
-கு
-கூ
-கெ
-கே
-கை
-கொ
-கோ
-கௌ
-க்
-ச
-சா
-சி
+# B
+பீ
+# C
சீ
-சு
-சூ
-செ
-சே
-சை
-சொ
-சோ
-சௌ
-ச்
-ட
-டா
-டி
+# D
டீ
-டு
-டூ
-டெ
-டே
-டை
-டொ
-டோ
-டௌ
-ட்
-த
-தா
-தி
-தீ
-து
-தூ
-தெ
-தே
-தை
-தொ
-தோ
-தௌ
-த்
-ப
-பா
-பி
-பீ
-பு
-பூ
-பெ
-பே
-பை
-பொ
-போ
-பௌ
-ப்
-ற
-றா
-றி
-றீ
-று
-றூ
-றெ
-றே
-றை
-றொ
-றோ
-றௌ
-ற்
-ய
-யா
-யி
-யீ
-யு
+# E
+ஈ
+# F
+எஃப்
+# G
+ஜீ
+# H
+எச்
+ஹெச்
+# I
+ஐ
+# J
+ஜே
+ஜை
+# K
+கே
+# L
+எல்
+# M
+எம்
+# N
+என்
+# O
+ஓ
+# P
+ப்பீ
+# Q
+கியூ
+# R
+ஆர்
+# S
+எஸ்
+# T
+ட்டீ
+# U
யூ
-யெ
-யே
-யை
-யொ
-யோ
-யௌ
-ய்
-ர
-ரா
-ரி
-ரீ
-ரு
-ரூ
-ரெ
-ரே
-ரை
-ரொ
-ரோ
-ரௌ
-ர்
-ல
-லா
-லி
-லீ
-லு
-லூ
-லெ
-லே
-லை
-லொ
-லோ
-லௌ
-ல்
-வ
-வா
-வி
+# V
வீ
-வு
-வூ
-வெ
-வே
+# W
+டபிள்-யூ
+# X
+எக்ஸ்
+# Y
வை
-வொ
-வோ
-வௌ
-வ்
-ள
-ளா
-ளி
-ளீ
-ளு
-ளூ
-ளெ
-ளே
-ளை
-ளொ
-ளோ
-ளௌ
-ள்
-ழ
-ழா
-ழி
-ழீ
-ழு
-ழூ
-ழெ
-ழே
-ழை
-ழொ
-ழோ
-ழௌ
-ழ்
-ங
-ஙா
-ஙி
-ஙீ
-ஙு
-ஙூ
-ஙெ
-ஙே
-ஙை
-ஙொ
-ஙோ
-ஙௌ
-ங்
-ஞ
-ஞா
-ஞி
-ஞீ
-ஞு
-ஞூ
-ஞெ
-ஞே
-ஞை
-ஞொ
-ஞோ
-ஞௌ
-ஞ்
-ண
-ணா
-ணி
-ணீ
-ணு
-ணூ
-ணெ
-ணே
-ணை
-ணொ
-ணோ
-ணௌ
-ண்
-ந
-நா
-நி
-நீ
-நு
-நூ
-நெ
-நே
-நை
-நொ
-நோ
-நௌ
-ந்
-ம
-மா
-மி
-மீ
-மு
-மூ
-மெ
-மே
-மை
-மொ
-மோ
-மௌ
-ம்
-ன
-னா
-னி
-னீ
-னு
-னூ
-னெ
-னே
-னை
-னொ
-னோ
-னௌ
-ன்
-
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-திரு
-திருமதி
-வண
-கௌரவ
-
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-உ.ம்
-#கா.ம்
-#எ.ம்
+# Z
+செட்
+#consonants
-#Numbers only. These should only induce breaks when followed by a numeric sequence
-# add NUMERIC_ONLY after the word for this function
-#This case is mostly for the english "No." which can either be a sentence of its own, or
-#if followed by a number, a non-breaking prefix
-No #NUMERIC_ONLY#
-Nos
-Art #NUMERIC_ONLY#
-Nr
-pp #NUMERIC_ONLY#
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.te b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.te
new file mode 100644
index 000000000..a596aab65
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.te
@@ -0,0 +1,70 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+
+#common exceptions
+# Rs
+ర
+# Rs
+ూ
+# Mr
+శ్రీ
+
+#others
+
+
+#phonetics
+# A
+ఎ
+# B
+బి
+# C
+సి
+# D
+డి
+# E
+ఇ
+# F
+ఎఫ్
+# G
+జి
+# H
+హెచ్‌
+# I
+ఐ
+# J
+జె
+# K
+కె
+# L
+ఎల్
+# M
+ఎం
+ఎమ్
+# N
+ఎన్
+# O
+ఓ
+# P
+పి
+# Q
+క్యూ
+# R
+ఆర్
+# S
+ఎస్
+# T
+టి
+# U
+యు
+# V
+వి
+# W
+డబ్ల్యూ
+# X
+ఎక్స్
+# Y
+వై
+# Z
+జెడ్
+
+#consonants
+