lock m_vocab variable access in Encode() and Lookup(). Other functions are still not threadsafe

author: Hieu Hoang <fishandfrolick@gmail.com> 2012-06-26 20:20:46 +0400
committer: Hieu Hoang <fishandfrolick@gmail.com> 2012-06-26 21:33:34 +0400
commit: 93bff3f2013b2732c67355d2e9bd253fba4670a7 (patch)
tree: 9fb8e2359a4d78c3ba554f75ae22dd6402634c43 /scripts/tokenizer
parent: 272f39a48719bb7c2e4582e73e769b2387e2dcb9 (diff)
18 files changed, 1 insertions, 2720 deletions
diff --git a/scripts/tokenizer/nonbreaking_prefixes/README.txt b/scripts/tokenizer/nonbreaking_prefixes/README.txt
deleted file mode 100644
index 02cdfccb9..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/README.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-The language suffix can be found here:
-
-http://www.loc.gov/standards/iso639-2/php/code_list.php
-
-
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca
deleted file mode 100644
index 2f4fdfc67..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca
+++ /dev/null
@@ -1,75 +0,0 @@
-Dr
-Dra
-pàg
-p
-c
-av
-Sr
-Sra
-adm
-esq
-Prof
-S.A
-S.L
-p.e
-ptes
-Sta
-St
-pl
-màx
-cast
-dir
-nre
-fra
-admdora
-Emm
-Excma
-espf
-dc
-admdor
-tel
-angl
-aprox
-ca
-dept
-dj
-dl
-dt
-ds
-dg
-dv
-ed
-entl
-al
-i.e
-maj
-smin
-n
-núm
-pta
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de
deleted file mode 100644
index 35fdf5eee..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.de
+++ /dev/null
@@ -1,325 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-#no german words end in single lower-case letters, so we throw those in too.
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
-
-
-#Roman Numerals. A dot after one of these is not a sentence break in German.
-I
-II
-III
-IV
-V
-VI
-VII
-VIII
-IX
-X
-XI
-XII
-XIII
-XIV
-XV
-XVI
-XVII
-XVIII
-XIX
-XX
-i
-ii
-iii
-iv
-v
-vi
-vii
-viii
-ix
-x
-xi
-xii
-xiii
-xiv
-xv
-xvi
-xvii
-xviii
-xix
-xx
-
-#Titles and Honorifics
-Adj
-Adm
-Adv
-Asst
-Bart
-Bldg
-Brig
-Bros
-Capt
-Cmdr
-Col
-Comdr
-Con
-Corp
-Cpl
-DR
-Dr
-Ens
-Gen
-Gov
-Hon
-Hosp
-Insp
-Lt
-MM
-MR
-MRS
-MS
-Maj
-Messrs
-Mlle
-Mme
-Mr
-Mrs
-Ms
-Msgr
-Op
-Ord
-Pfc
-Ph
-Prof
-Pvt
-Rep
-Reps
-Res
-Rev
-Rt
-Sen
-Sens
-Sfc
-Sgt
-Sr
-St
-Supt
-Surg
-
-#Misc symbols
-Mio
-Mrd
-bzw
-v
-vs
-usw
-d.h
-z.B
-u.a
-etc
-Mrd
-MwSt
-ggf
-d.J
-D.h
-m.E
-vgl
-I.F
-z.T
-sogen
-ff
-u.E
-g.U
-g.g.A
-c.-à-d
-Buchst
-u.s.w
-sog
-u.ä
-Std
-evtl
-Zt
-Chr
-u.U
-o.ä
-Ltd
-b.A
-z.Zt
-spp
-sen
-SA
-k.o
-jun
-i.H.v
-dgl
-dergl
-Co
-zzt
-usf
-s.p.a
-Dkr
-Corp
-bzgl
-BSE
-
-#Number indicators
-# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
-No
-Nos
-Art
-Nr
-pp
-ca
-Ca
-
-#Ordinals are done with . in German - "1." = "1st" in English
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20
-21
-22
-23
-24
-25
-26
-27
-28
-29
-30
-31
-32
-33
-34
-35
-36
-37
-38
-39
-40
-41
-42
-43
-44
-45
-46
-47
-48
-49
-50
-51
-52
-53
-54
-55
-56
-57
-58
-59
-60
-61
-62
-63
-64
-65
-66
-67
-68
-69
-70
-71
-72
-73
-74
-75
-76
-77
-78
-79
-80
-81
-82
-83
-84
-85
-86
-87
-88
-89
-90
-91
-92
-93
-94
-95
-96
-97
-98
-99
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el
deleted file mode 100644
index 0470f9192..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.el
+++ /dev/null
@@ -1,2 +0,0 @@
-# for now, just include the Greek equivalent of "Mr."
-κ
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en
deleted file mode 100644
index e1a3733b5..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.en
+++ /dev/null
@@ -1,107 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-Adj
-Adm
-Adv
-Asst
-Bart
-Bldg
-Brig
-Bros
-Capt
-Cmdr
-Col
-Comdr
-Con
-Corp
-Cpl
-DR
-Dr
-Drs
-Ens
-Gen
-Gov
-Hon
-Hr
-Hosp
-Insp
-Lt
-MM
-MR
-MRS
-MS
-Maj
-Messrs
-Mlle
-Mme
-Mr
-Mrs
-Ms
-Msgr
-Op
-Ord
-Pfc
-Ph
-Prof
-Pvt
-Rep
-Reps
-Res
-Rev
-Rt
-Sen
-Sens
-Sfc
-Sgt
-Sr
-St
-Supt
-Surg
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-v
-vs
-i.e
-rev
-e.g
-
-#Numbers only. These should only induce breaks when followed by a numeric sequence
-# add NUMERIC_ONLY after the word for this function
-#This case is mostly for the english "No." which can either be a sentence of its own, or
-#if followed by a number, a non-breaking prefix
-No #NUMERIC_ONLY# 
-Nos
-Art #NUMERIC_ONLY#
-Nr
-pp #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es
deleted file mode 100644
index d8b275518..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es
+++ /dev/null
@@ -1,118 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-
-# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
-
-A.C
-Apdo
-Av
-Bco
-CC.AA
-Da
-Dep
-Dn
-Dr
-Dra
-EE.UU
-Excmo
-FF.CC
-Fil 
-Gral
-J.C
-Let
-Lic
-N.B
-P.D
-P.V.P
-Prof
-Pts
-Rte
-S.A
-S.A.R
-S.E
-S.L
-S.R.C
-Sr
-Sra
-Srta
-Sta
-Sto
-T.V.E
-Tel
-Ud
-Uds
-V.B
-V.E
-Vd
-Vds
-a/c
-adj
-admón
-afmo
-apdo
-av
-c
-c.f
-c.g
-cap
-cm
-cta
-dcha
-doc
-ej
-entlo
-esq
-etc
-f.c
-gr 
-grs
-izq
-kg
-km
-mg
-mm
-nÃºm
-núm
-p
-p.a
-p.ej
-ptas
-pÃ¡g 
-pÃ¡gs
-pág
-págs
-q.e.g.e
-q.e.s.m
-s
-s.s.s
-vid
-vol
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.fr b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.fr
deleted file mode 100644
index 28126fa57..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.fr
+++ /dev/null
@@ -1,153 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-#
-#any single upper case letter  followed by a period is not a sentence ender
-#usually upper case letters are initials in a name
-#no French words end in single lower-case letters, so we throw those in too?
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
-
-# Period-final abbreviation list for French
-A.C.N
-A.M
-art
-ann
-apr
-av
-auj
-lib
-B.P
-boul
-ca
-c.-à-d
-cf
-ch.-l
-chap
-contr
-C.P.I
-C.Q.F.D
-C.N
-C.N.S
-C.S
-dir
-éd
-e.g
-env
-al
-etc
-E.V
-ex
-fasc
-fém
-fig
-fr
-hab
-ibid
-id
-i.e
-inf
-LL.AA
-LL.AA.II
-LL.AA.RR
-LL.AA.SS
-L.D
-LL.EE
-LL.MM
-LL.MM.II.RR
-loc.cit
-masc
-MM
-ms
-N.B
-N.D.A
-N.D.L.R
-N.D.T
-n/réf
-NN.SS
-N.S
-N.D
-N.P.A.I
-p.c.c
-pl
-pp
-p.ex
-p.j
-P.S
-R.A.S
-R.-V
-R.P
-R.I.P
-SS
-S.S
-S.A
-S.A.I
-S.A.R
-S.A.S
-S.E
-sec
-sect
-sing
-S.M
-S.M.I.R
-sq
-sqq
-suiv
-sup
-suppl
-tél
-T.S.V.P
-vb
-vol
-vs
-X.O
-Z.I
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is
deleted file mode 100644
index 5b8a71086..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is
+++ /dev/null
@@ -1,251 +0,0 @@
-no #NUMERIC_ONLY#
-No #NUMERIC_ONLY#
-nr #NUMERIC_ONLY#
-Nr #NUMERIC_ONLY#
-nR #NUMERIC_ONLY#
-NR #NUMERIC_ONLY#
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
-^
-í
-á
-ó
-æ
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-ab.fn
-a.fn
-afs
-al
-alm
-alg
-andh
-ath
-aths
-atr
-ao
-au
-aukaf
-áfn
-áhrl.s
-áhrs
-ákv.gr
-ákv
-bh
-bls
-dr
-e.Kr
-et
-ef
-efn
-ennfr
-eink
-end
-e.st
-erl
-fél
-fskj
-fh
-f.hl
-físl
-fl
-fn
-fo
-forl
-frb
-frl
-frh
-frt
-fsl
-fsh
-fs
-fsk
-fst
-f.Kr
-ft
-fv
-fyrrn
-fyrrv
-germ
-gm
-gr
-hdl
-hdr
-hf
-hl
-hlsk
-hljsk
-hljv
-hljóðv
-hr
-hv
-hvk
-holl
-Hos
-höf
-hk
-hrl
-ísl
-kaf
-kap
-Khöfn
-kk
-kg
-kk
-km
-kl
-klst
-kr
-kt
-kgúrsk
-kvk
-leturbr
-lh
-lh.nt
-lh.þt
-lo
-ltr
-mlja
-mljó
-millj
-mm
-mms
-m.fl
-miðm
-mgr
-mst
-mín
-nf
-nh
-nhm
-nl
-nk
-nmgr
-no
-núv
-nt
-o.áfr
-o.m.fl
-ohf
-o.fl
-o.s.frv
-ófn
-ób
-óákv.gr
-óákv
-pfn
-PR
-pr
-Ritstj
-Rvík
-Rvk
-samb
-samhlj
-samn
-samn
-sbr
-sek
-sérn
-sf
-sfn
-sh
-sfn
-sh
-s.hl
-sk
-skv
-sl
-sn
-so
-ss.us
-s.st
-samþ
-sbr
-shlj
-sign
-skál
-st
-st.s
-stk
-sþ
-teg
-tbl
-tfn
-tl
-tvíhlj
-tvt
-till
-to
-umr
-uh
-us
-uppl
-útg
-vb
-Vf
-vh
-vkf
-Vl
-vl
-vlf
-vmf
-8vo
-vsk
-vth
-þt
-þf
-þjs
-þgf
-þlt
-þolm
-þm
-þml
-þýð
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it
deleted file mode 100644
index 992b9ecd4..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it
+++ /dev/null
@@ -1,180 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-Adj
-Adm
-Adv
-Amn 
-Arch 
-Asst
-Avv
-Bart
-Bcc
-Bldg
-Brig
-Bros
-C.A.P
-C.P
-Capt
-Cc
-Cmdr
-Co
-Col
-Comdr
-Con
-Corp
-Cpl
-DR
-Dott
-Dr
-Drs
-Egr
-Ens
-Gen
-Geom
-Gov
-Hon
-Hosp
-Hr
-Id
-Ing
-Insp
-Lt
-MM
-MR
-MRS
-MS
-Maj
-Messrs
-Mlle
-Mme
-Mo
-Mons
-Mr
-Mrs
-Ms
-Msgr
-N.B
-Op
-Ord
-P.S
-P.T
-Pfc
-Ph
-Prof
-Pvt
-RP
-RSVP
-Rag
-Rep
-Reps
-Res
-Rev
-Rif
-Rt
-S.A
-S.B.F
-S.P.M
-S.p.A
-S.r.l
-Sen
-Sens
-Sfc
-Sgt
-Sig
-Sigg
-Soc
-Spett
-Sr
-St
-Supt
-Surg
-V.P
-
-# other
-a.c 
-acc
-all 
-banc
-c.a
-c.c.p
-c.m
-c.p
-c.s
-c.v
-corr
-dott
-e.p.c
-ecc
-es 
-fatt
-gg
-int
-lett
-ogg
-on
-p.c
-p.c.c
-p.es
-p.f
-p.r
-p.v
-post
-pp
-racc
-ric
-s.n.c
-seg
-sgg
-ss
-tel
-u.s
-v.r
-v.s
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-v
-vs
-i.e
-rev
-e.g
-
-#Numbers only. These should only induce breaks when followed by a numeric sequence
-# add NUMERIC_ONLY after the word for this function
-#This case is mostly for the english "No." which can either be a sentence of its own, or
-#if followed by a number, a non-breaking prefix
-No #NUMERIC_ONLY# 
-Nos
-Art #NUMERIC_ONLY#
-Nr
-pp #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.nl b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.nl
deleted file mode 100644
index c80c41772..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.nl
+++ /dev/null
@@ -1,115 +0,0 @@
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 
-#         http://nl.wikipedia.org/wiki/Aanspreekvorm
-#         http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-bacc
-bc
-bgen
-c.i
-dhr
-dr
-dr.h.c
-drs
-drs
-ds
-eint
-fa
-Fa
-fam
-gen
-genm
-ing
-ir
-jhr
-jkvr
-jr
-kand
-kol
-lgen
-lkol
-Lt
-maj
-Mej
-mevr
-Mme
-mr
-mr
-Mw
-o.b.s
-plv
-prof
-ritm
-tint
-Vz
-Z.D
-Z.D.H
-Z.E
-Z.Em
-Z.H
-Z.K.H
-Z.K.M
-Z.M
-z.v
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
-a.g.v
-bijv
-bijz
-bv
-d.w.z
-e.c
-e.g
-e.k
-ev
-i.p.v
-i.s.m
-i.t.t
-i.v.m
-m.a.w
-m.b.t
-m.b.v
-m.h.o
-m.i
-m.i.v
-v.w.t
-
-#Numbers only. These should only induce breaks when followed by a numeric sequence
-# add NUMERIC_ONLY after the word for this function
-#This case is mostly for the english "No." which can either be a sentence of its own, or
-#if followed by a number, a non-breaking prefix
-Nr #NUMERIC_ONLY# 
-Nrs 
-nrs
-nr #NUMERIC_ONLY#
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl
deleted file mode 100644
index 6b7c106e6..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl
+++ /dev/null
@@ -1,283 +0,0 @@
-adw
-afr
-akad
-al
-Al
-am
-amer
-arch
-art
-Art
-artyst
-astr
-austr
-bałt
-bdb
-bł
-bm
-br
-bryg
-bryt
-centr
-ces
-chem
-chiń
-chir
-c.k
-c.o
-cyg
-cyw
-cyt
-czes
-czw
-cd
-Cd
-czyt
-ćw
-ćwicz
-daw
-dcn
-dekl
-demokr
-det
-diec
-dł
-dn
-dot
-dol
-dop
-dost
-dosł
-h.c
-ds
-dst
-duszp
-dypl
-egz
-ekol
-ekon
-elektr
-em
-ew
-fab
-farm
-fot
-fr
-gat
-gastr
-geogr
-geol
-gimn
-głęb
-gm
-godz
-górn
-gosp
-gr
-gram
-hist
-hiszp
-hr
-Hr
-hot
-id
-in
-im
-iron
-jn
-kard
-kat
-katol
-k.k
-kk
-kol
-kl
-k.p.a
-kpc
-k.p.c
-kpt
-kr
-k.r
-krak
-k.r.o
-kryt
-kult
-laic
-łac
-niem
-woj
-nb
-np
-Nb
-Np
-pol
-pow
-m.in
-pt
-ps
-Pt
-Ps
-cdn
-jw
-ryc
-rys
-Ryc
-Rys
-tj
-tzw
-Tzw
-tzn
-zob
-ang
-ub
-ul
-pw
-pn
-pl
-al
-k
-n
-nr #NUMERIC_ONLY#
-Nr #NUMERIC_ONLY#
-ww
-wł
-ur
-zm
-żyd
-żarg
-żyw
-wył
-bp
-bp
-wyst
-tow
-Tow
-o
-sp
-Sp
-st
-spółdz
-Spółdz
-społ
-spółgł
-stoł
-stow
-Stoł
-Stow
-zn
-zew
-zewn
-zdr
-zazw
-zast
-zaw
-zał
-zal
-zam
-zak
-zakł
-zagr
-zach
-adw
-Adw
-lek
-Lek
-med
-mec
-Mec
-doc
-Doc
-dyw
-dyr
-Dyw
-Dyr
-inż
-Inż
-mgr
-Mgr
-dh
-dr
-Dh
-Dr
-p
-P
-red
-Red
-prof
-prok
-Prof
-Prok
-hab
-płk
-Płk
-nadkom
-Nadkom
-podkom
-Podkom
-ks
-Ks
-gen
-Gen
-por
-Por
-reż
-Reż
-przyp
-Przyp
-śp
-św
-śW
-Śp
-Św
-ŚW
-szer
-Szer
-pkt #NUMERIC_ONLY#
-str #NUMERIC_ONLY#
-tab #NUMERIC_ONLY#
-Tab #NUMERIC_ONLY#
-tel
-ust #NUMERIC_ONLY#
-par #NUMERIC_ONLY#
-poz
-pok
-oo
-oO
-Oo
-OO
-r #NUMERIC_ONLY#
-l #NUMERIC_ONLY#
-s #NUMERIC_ONLY#
-najśw
-Najśw
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-Ś
-Ć
-Ż
-Ź
-Dz
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pt b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pt
deleted file mode 100644
index 5d65bf25a..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pt
+++ /dev/null
@@ -1,210 +0,0 @@
-#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
-#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
-#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
-
-#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
-#usually upper case letters are initials in a name
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-a
-b
-c
-d
-e
-f
-g
-h
-i
-j
-k
-l
-m
-n
-o
-p
-q
-r
-s
-t
-u
-v
-w
-x
-y
-z
-
-
-#Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
-I
-II
-III
-IV
-V
-VI
-VII
-VIII
-IX
-X
-XI
-XII
-XIII
-XIV
-XV
-XVI
-XVII
-XVIII
-XIX
-XX
-i
-ii
-iii
-iv
-v
-vi
-vii
-viii
-ix
-x
-xi
-xii
-xiii
-xiv
-xv
-xvi
-xvii
-xviii
-xix
-xx
-
-#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-Adj
-Adm
-Adv
-Art
-Ca
-Capt
-Cmdr
-Col
-Comdr
-Con
-Corp
-Cpl
-DR
-DRA
-Dr
-Dra
-Dras
-Drs
-Eng
-Enga
-Engas
-Engos
-Ex
-Exo
-Exmo
-Fig
-Gen
-Hosp
-Insp
-Lda
-MM
-MR
-MRS
-MS
-Maj
-Mrs
-Ms
-Msgr
-Op
-Ord
-Pfc
-Ph
-Prof
-Pvt
-Rep
-Reps
-Res
-Rev
-Rt
-Sen
-Sens
-Sfc
-Sgt
-Sr
-Sra
-Sras
-Srs
-Sto
-Supt
-Surg
-adj
-adm
-adv
-art
-cit
-col
-con
-corp
-cpl
-dr
-dra
-dras
-drs
-eng
-enga
-engas
-engos
-ex
-exo
-exmo
-fig
-op
-prof
-sr
-sra
-sras
-srs
-sto
-
-#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
-v
-vs
-i.e
-rev
-e.g
-
-#Numbers only. These should only induce breaks when followed by a numeric sequence
-# add NUMERIC_ONLY after the word for this function
-#This case is mostly for the english "No." which can either be a sentence of its own, or
-#if followed by a number, a non-breaking prefix
-No #NUMERIC_ONLY# 
-Nos
-Art #NUMERIC_ONLY#
-Nr
-p #NUMERIC_ONLY#
-pp #NUMERIC_ONLY#
-
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro
deleted file mode 100644
index d489f4654..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro
+++ /dev/null
@@ -1,38 +0,0 @@
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-dpdv
-etc
-șamd
-M.Ap.N
-dl
-Dl
-d-na
-D-na
-dvs
-Dvs
-pt
-Pt
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru
deleted file mode 100644
index 444465b35..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru
+++ /dev/null
@@ -1,259 +0,0 @@
-TBD: Russian uppercase alphabet [А-Я]
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-0гг
-1гг
-2гг
-3гг
-4гг
-5гг
-6гг
-7гг
-8гг
-9гг
-0г
-1г
-2г
-3г
-4г
-5г
-6г
-7г
-8г
-9г
-Xвв
-Vвв
-Iвв
-Lвв
-Mвв
-Cвв
-Xв
-Vв
-Iв
-Lв
-Mв
-Cв
-0м
-1м
-2м
-3м
-4м
-5м
-6м
-7м
-8м
-9м
-0мм
-1мм
-2мм
-3мм
-4мм
-5мм
-6мм
-7мм
-8мм
-9мм
-0см
-1см
-2см
-3см
-4см
-5см
-6см
-7см
-8см
-9см
-0дм
-1дм
-2дм
-3дм
-4дм
-5дм
-6дм
-7дм
-8дм
-9дм
-0л
-1л
-2л
-3л
-4л
-5л
-6л
-7л
-8л
-9л
-0км
-1км
-2км
-3км
-4км
-5км
-6км
-7км
-8км
-9км
-0га
-1га
-2га
-3га
-4га
-5га
-6га
-7га
-8га
-9га
-0кг
-1кг
-2кг
-3кг
-4кг
-5кг
-6кг
-7кг
-8кг
-9кг
-0т
-1т
-2т
-3т
-4т
-5т
-6т
-7т
-8т
-9т
-0г
-1г
-2г
-3г
-4г
-5г
-6г
-7г
-8г
-9г
-0мг
-1мг
-2мг
-3мг
-4мг
-5мг
-6мг
-7мг
-8мг
-9мг
-бульв
-в
-вв
-г
-га
-гг
-гл
-гос
-д
-дм
-доп
-др
-е
-ед
-ед
-зам
-и
-инд
-исп
-Исп
-к
-кап
-кг
-кв
-кл
-км
-кол
-комн
-коп
-куб
-л
-лиц
-лл
-м
-макс
-мг
-мин
-мл
-млн
-млрд
-мм
-н
-наб
-нач
-неуд
-ном
-о
-обл
-обр
-общ
-ок
-ост
-отл
-п
-пер
-перераб
-пл
-пос
-пр
-просп
-проф
-р
-ред
-руб
-с
-сб
-св
-см
-соч
-ср
-ст
-стр
-т
-тел
-Тел
-тех
-тт
-туп
-тыс
-уд
-ул
-уч
-физ
-х
-хор
-ч
-чел
-шт
-экз
-э
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk
deleted file mode 100644
index 1198d4829..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk
+++ /dev/null
@@ -1,474 +0,0 @@
-Bc
-Mgr
-RNDr
-PharmDr
-PhDr
-JUDr
-PaedDr
-ThDr
-Ing
-MUDr
-MDDr
-MVDr
-Dr
-ThLic
-PhD
-ArtD
-ThDr
-Dr
-DrSc
-CSs
-prof
-obr
-Obr
-Č
-č
-absol
-adj
-admin
-adr
-Adr
-adv
-advok
-afr
-ak
-akad
-akc
-akuz
-et
-al
-alch
-amer
-anat
-angl
-Angl
-anglosas
-anorg
-ap
-apod
-arch
-archeol
-archit
-arg
-art
-astr
-astrol
-astron
-atp
-atď
-austr
-Austr
-aut
-belg
-Belg
-bibl
-Bibl
-biol
-bot
-bud
-bás
-býv
-cest
-chem
-cirk
-csl
-čs
-Čs
-dat
-dep
-det
-dial
-diaľ
-dipl
-distrib
-dokl
-dosl
-dopr
-dram
-duš
-dv
-dvojčl
-dór
-ekol
-ekon
-el
-elektr
-elektrotech
-energet
-epic
-est
-etc
-etonym
-eufem
-európ
-Európ
-ev
-evid
-expr
-fa
-fam
-farm
-fem
-feud
-fil
-filat
-filoz
-fi
-fon
-form
-fot
-fr
-Fr
-franc
-Franc
-fraz
-fut
-fyz
-fyziol
-garb
-gen
-genet
-genpor
-geod
-geogr
-geol
-geom
-germ
-gr
-Gr
-gréc
-Gréc
-gréckokat
-hebr
-herald
-hist
-hlav
-hosp
-hromad
-hud
-hypok
-ident
-i.e
-ident
-imp
-impf
-indoeur
-inf
-inform
-instr
-int
-interj
-inšt
-inštr
-iron
-jap
-Jap
-jaz
-jedn
-juhoamer
-juhových
-juhozáp
-juž
-kanad
-Kanad
-kanc
-kapit
-kpt
-kart
-katastr
-knih
-kniž
-komp
-konj
-konkr
-kozmet
-krajč
-kresť
-kt
-kuch
-lat
-latinskoamer
-lek
-lex
-lingv
-lit
-litur
-log
-lok
-max
-Max
-maď
-Maď
-medzinár
-mest
-metr
-mil
-Mil
-min
-Min
-miner
-ml
-mld
-mn
-mod
-mytol
-napr
-nar
-Nar
-nasl
-nedok
-neg
-negat
-neklas
-nem
-Nem
-neodb
-neos
-neskl
-nesklon
-nespis
-nespráv
-neved
-než
-niekt
-niž
-nom
-náb
-nákl
-námor
-nár
-obch
-obj
-obv
-obyč
-obč
-občian
-odb
-odd
-ods
-ojed
-okr
-Okr
-opt
-opyt
-org
-os
-osob
-ot
-ovoc
-par
-part
-pejor
-pers
-pf
-Pf 
-P.f
-p.f
-pl
-Plk
-pod
-podst
-pokl
-polit
-politol
-polygr
-pomn
-popl
-por
-porad
-porov
-posch
-potrav
-použ
-poz
-pozit
-poľ
-poľno
-poľnohosp
-poľov
-pošt
-pož
-prac
-predl
-pren
-prep
-preuk
-priezv
-Priezv
-privl
-prof
-práv
-príd
-príj
-prík
-príp
-prír
-prísl
-príslov
-príč
-psych
-publ
-pís
-písm
-pôv
-refl
-reg
-rep
-resp
-rozk
-rozlič
-rozpráv
-roč
-Roč
-ryb
-rádiotech
-rím
-samohl
-semest
-sev
-severoamer
-severových
-severozáp
-sg
-skr
-skup
-sl
-Sloven
-soc
-soch
-sociol
-sp
-spol
-Spol
-spoloč
-spoluhl
-správ
-spôs
-st
-star
-starogréc
-starorím
-s.r.o
-stol
-stor
-str
-stredoamer
-stredoškol
-subj
-subst
-superl
-sv
-sz
-súkr
-súp
-súvzť
-tal
-Tal
-tech
-tel
-Tel
-telef
-teles
-telev
-teol
-trans
-turist
-tuzem
-typogr
-tzn
-tzv
-ukaz
-ul
-Ul
-umel
-univ
-ust
-ved
-vedľ
-verb
-veter
-vin
-viď
-vl
-vod
-vodohosp
-pnl
-vulg
-vyj
-vys
-vysokoškol
-vzťaž
-vôb
-vých
-výd
-výrob
-výsk
-výsl
-výtv
-výtvar
-význ
-včel
-vš
-všeob
-zahr
-zar
-zariad
-zast
-zastar
-zastaráv
-zb
-zdravot
-združ
-zjemn
-zlat
-zn
-Zn
-zool
-zr
-zried
-zv
-záhr
-zák
-zákl
-zám
-záp
-západoeur
-zázn
-územ
-účt
-čast
-čes
-Čes
-čl
-čísl
-živ
-pr
-fak
-Kr
-p.n.l
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl
deleted file mode 100644
index 230062c69..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl
+++ /dev/null
@@ -1,78 +0,0 @@
-dr
-Dr
-itd
-itn
-št #NUMERIC_ONLY#
-Št #NUMERIC_ONLY#
-d
-jan
-Jan
-feb
-Feb
-mar
-Mar
-apr
-Apr
-jun
-Jun
-jul
-Jul
-avg
-Avg
-sept
-Sept
-sep
-Sep
-okt
-Okt
-nov
-Nov
-dec
-Dec
-tj
-Tj
-npr
-Npr
-sl
-Sl
-op
-Op
-gl
-Gl
-oz
-Oz
-prev
-dipl
-ing
-prim
-Prim
-cf
-Cf
-gl
-Gl
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sv b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sv
deleted file mode 100644
index df5ef2959..000000000
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sv
+++ /dev/null
@@ -1,46 +0,0 @@
-#single upper case letter are usually initials
-A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z
-#misc abbreviations
-AB
-G
-VG
-dvs
-etc
-from
-iaf
-jfr
-kl
-kr
-mao
-mfl
-mm
-osv
-pga
-tex
-tom
-vs
diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index 08cfe7dad..b23628b65 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -11,7 +11,7 @@ use FindBin qw($RealBin);
 use strict;
 #use Time::HiRes;
 
-my $mydir = "$RealBin/nonbreaking_prefixes";
+my $mydir = "$RealBin/../share/nonbreaking_prefixes";
 
 my %NONBREAKING_PREFIX = ();
 my $language = "en";
author	Hieu Hoang <fishandfrolick@gmail.com>	2012-06-26 20:20:46 +0400
committer	Hieu Hoang <fishandfrolick@gmail.com>	2012-06-26 21:33:34 +0400
commit	93bff3f2013b2732c67355d2e9bd253fba4670a7 (patch)
tree	9fb8e2359a4d78c3ba554f75ae22dd6402634c43 /scripts/tokenizer
parent	272f39a48719bb7c2e4582e73e769b2387e2dcb9 (diff)