Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
author phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2011-02-16 13:44:26 +0300
committer phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> 2011-02-16 13:44:26 +0300
commit df901e7ce69641c06b239d29a34f784c748cd69d (patch)
tree 9e66abc8b3e2e14961932914970582dbcac5e066 /scripts/tokenizer
parent 76174ccd4bb95a775c1fa4a89a74e9d95da81a4e (diff)
added files from Tom Hoar
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3881 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/tokenizer')
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca 75
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es 86
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is 251
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it 77
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl 283
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro 38
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru 259
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk 474
-rw-r--r-- scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl 78
9 files changed, 1592 insertions, 29 deletions
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca
new file mode 100644
index 000000000..2f4fdfc67
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca
@@ -0,0 +1,75 @@
+Dr
+Dra
+pàg
+p
+c
+av
+Sr
+Sra
+adm
+esq
+Prof
+S.A
+S.L
+p.e
+ptes
+Sta
+St
+pl
+màx
+cast
+dir
+nre
+fra
+admdora
+Emm
+Excma
+espf
+dc
+admdor
+tel
+angl
+aprox
+ca
+dept
+dj
+dl
+dt
+ds
+dg
+dv
+ed
+entl
+al
+i.e
+maj
+smin
+n
+núm
+pta
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es
index cc5bc580d..d8b275518 100644
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es
@@ -31,54 +31,88 @@ Y
Z
# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
+
A.C
+Apdo
+Av
+Bco
+CC.AA
+Da
+Dep
+Dn
+Dr
+Dra
+EE.UU
+Excmo
+FF.CC
+Fil
+Gral
+J.C
+Let
+Lic
+N.B
+P.D
+P.V.P
+Prof
+Pts
+Rte
+S.A
+S.A.R
+S.E
+S.L
+S.R.C
+Sr
+Sra
+Srta
+Sta
+Sto
+T.V.E
+Tel
+Ud
+Uds
+V.B
+V.E
+Vd
+Vds
a/c
adj
+admón
afmo
-J.C
apdo
-Av
-cap
+av
+c
c.f
c.g
+cap
cm
+cta
+dcha
doc
-Dr
-EE.UU
+ej
+entlo
esq
etc
f.c
gr
grs
-Gral
+izq
kg
km
-Lic
-Fil
-Let
mg
mm
-N.B
núm
+núm
+p
+p.a
+p.ej
+ptas
pág
págs
-P.D
-ej
-Prof
-ptas
+pág
+págs
q.e.g.e
q.e.s.m
-S.A
-Sr
-Sra
-S.R.C
-Srta
+s
s.s.s
-Tel
-T.V.E
-Ud
-Uds
-Vd
-Vds
vid
-
+vol
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is
new file mode 100644
index 000000000..5b8a71086
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is
@@ -0,0 +1,251 @@
+no #NUMERIC_ONLY#
+No #NUMERIC_ONLY#
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+nR #NUMERIC_ONLY#
+NR #NUMERIC_ONLY#
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+^
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+ab.fn
+a.fn
+afs
+al
+alm
+alg
+andh
+ath
+aths
+atr
+ao
+au
+aukaf
+áfn
+áhrl.s
+áhrs
+ákv.gr
+ákv
+bh
+bls
+dr
+e.Kr
+et
+ef
+efn
+ennfr
+eink
+end
+e.st
+erl
+fél
+fskj
+fh
+f.hl
+físl
+fl
+fn
+fo
+forl
+frb
+frl
+frh
+frt
+fsl
+fsh
+fs
+fsk
+fst
+f.Kr
+ft
+fv
+fyrrn
+fyrrv
+germ
+gm
+gr
+hdl
+hdr
+hf
+hl
+hlsk
+hljsk
+hljv
+hljóðv
+hr
+hv
+hvk
+holl
+Hos
+höf
+hk
+hrl
+ísl
+kaf
+kap
+Khöfn
+kk
+kg
+kk
+km
+kl
+klst
+kr
+kt
+kgúrsk
+kvk
+leturbr
+lh
+lh.nt
+lh.þt
+lo
+ltr
+mlja
+mljó
+millj
+mm
+mms
+m.fl
+miðm
+mgr
+mst
+mín
+nf
+nh
+nhm
+nl
+nk
+nmgr
+no
+núv
+nt
+o.áfr
+o.m.fl
+ohf
+o.fl
+o.s.frv
+ófn
+ób
+óákv.gr
+óákv
+pfn
+PR
+pr
+Ritstj
+Rvík
+Rvk
+samb
+samhlj
+samn
+samn
+sbr
+sek
+sérn
+sf
+sfn
+sh
+sfn
+sh
+s.hl
+sk
+skv
+sl
+sn
+so
+ss.us
+s.st
+samþ
+sbr
+shlj
+sign
+skál
+st
+st.s
+stk
+sþ
+teg
+tbl
+tfn
+tl
+tvíhlj
+tvt
+till
+to
+umr
+uh
+us
+uppl
+útg
+vb
+Vf
+vh
+vkf
+Vl
+vl
+vlf
+vmf
+8vo
+vsk
+vth
+þt
+þf
+þjs
+þgf
+þlt
+þolm
+þm
+þml
+þýð
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it
index ad4d355c7..992b9ecd4 100644
--- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it
@@ -31,32 +31,43 @@ Y
Z
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
-dott
-Sig
Adj
Adm
Adv
+Amn
+Arch
Asst
+Avv
Bart
+Bcc
Bldg
Brig
Bros
+C.A.P
+C.P
Capt
+Cc
Cmdr
+Co
Col
Comdr
Con
Corp
Cpl
DR
+Dott
Dr
Drs
+Egr
Ens
Gen
+Geom
Gov
Hon
-Hr
Hosp
+Hr
+Id
+Ing
Insp
Lt
MM
@@ -67,29 +78,89 @@ Maj
Messrs
Mlle
Mme
+Mo
+Mons
Mr
Mrs
Ms
Msgr
+N.B
Op
Ord
+P.S
+P.T
Pfc
Ph
Prof
Pvt
+RP
+RSVP
+Rag
Rep
Reps
Res
Rev
+Rif
Rt
+S.A
+S.B.F
+S.P.M
+S.p.A
+S.r.l
Sen
Sens
Sfc
Sgt
+Sig
+Sigg
+Soc
+Spett
Sr
St
Supt
Surg
+V.P
+
+# other
+a.c
+acc
+all
+banc
+c.a
+c.c.p
+c.m
+c.p
+c.s
+c.v
+corr
+dott
+e.p.c
+ecc
+es
+fatt
+gg
+int
+lett
+ogg
+on
+p.c
+p.c.c
+p.es
+p.f
+p.r
+p.v
+post
+pp
+racc
+ric
+s.n.c
+seg
+sgg
+ss
+tel
+u.s
+v.r
+v.s
#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
v
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl
new file mode 100644
index 000000000..6b7c106e6
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl
@@ -0,0 +1,283 @@
+adw
+afr
+akad
+al
+Al
+am
+amer
+arch
+art
+Art
+artyst
+astr
+austr
+bałt
+bdb
+bł
+bm
+br
+bryg
+bryt
+centr
+ces
+chem
+chiń
+chir
+c.k
+c.o
+cyg
+cyw
+cyt
+czes
+czw
+cd
+Cd
+czyt
+ćw
+ćwicz
+daw
+dcn
+dekl
+demokr
+det
+diec
+dł
+dn
+dot
+dol
+dop
+dost
+dosł
+h.c
+ds
+dst
+duszp
+dypl
+egz
+ekol
+ekon
+elektr
+em
+ew
+fab
+farm
+fot
+fr
+gat
+gastr
+geogr
+geol
+gimn
+głęb
+gm
+godz
+górn
+gosp
+gr
+gram
+hist
+hiszp
+hr
+Hr
+hot
+id
+in
+im
+iron
+jn
+kard
+kat
+katol
+k.k
+kk
+kol
+kl
+k.p.a
+kpc
+k.p.c
+kpt
+kr
+k.r
+krak
+k.r.o
+kryt
+kult
+laic
+łac
+niem
+woj
+nb
+np
+Nb
+Np
+pol
+pow
+m.in
+pt
+ps
+Pt
+Ps
+cdn
+jw
+ryc
+rys
+Ryc
+Rys
+tj
+tzw
+Tzw
+tzn
+zob
+ang
+ub
+ul
+pw
+pn
+pl
+al
+k
+n
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+ww
+wł
+ur
+zm
+żyd
+żarg
+żyw
+wył
+bp
+bp
+wyst
+tow
+Tow
+o
+sp
+Sp
+st
+spółdz
+Spółdz
+społ
+spółgł
+stoł
+stow
+Stoł
+Stow
+zn
+zew
+zewn
+zdr
+zazw
+zast
+zaw
+zał
+zal
+zam
+zak
+zakł
+zagr
+zach
+adw
+Adw
+lek
+Lek
+med
+mec
+Mec
+doc
+Doc
+dyw
+dyr
+Dyw
+Dyr
+inż
+Inż
+mgr
+Mgr
+dh
+dr
+Dh
+Dr
+p
+P
+red
+Red
+prof
+prok
+Prof
+Prok
+hab
+płk
+Płk
+nadkom
+Nadkom
+podkom
+Podkom
+ks
+Ks
+gen
+Gen
+por
+Por
+reż
+Reż
+przyp
+Przyp
+śp
+św
+śW
+Śp
+Św
+ŚW
+szer
+Szer
+pkt #NUMERIC_ONLY#
+str #NUMERIC_ONLY#
+tab #NUMERIC_ONLY#
+Tab #NUMERIC_ONLY#
+tel
+ust #NUMERIC_ONLY#
+par #NUMERIC_ONLY#
+poz
+pok
+oo
+oO
+Oo
+OO
+r #NUMERIC_ONLY#
+l #NUMERIC_ONLY#
+s #NUMERIC_ONLY#
+najśw
+Najśw
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Dz
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro
new file mode 100644
index 000000000..d489f4654
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro
@@ -0,0 +1,38 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+dpdv
+etc
+șamd
+M.Ap.N
+dl
+Dl
+d-na
+D-na
+dvs
+Dvs
+pt
+Pt
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru
new file mode 100644
index 000000000..444465b35
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru
@@ -0,0 +1,259 @@
+TBD: Russian uppercase alphabet [А-Я]
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+0гг
+1гг
+2гг
+3гг
+4гг
+5гг
+6гг
+7гг
+8гг
+9гг
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+Xвв
+Vвв
+Iвв
+Lвв
+Mвв
+Cвв
+Xв
+Vв
+Iв
+Lв
+Mв
+Cв
+0м
+1м
+2м
+3м
+4м
+5м
+6м
+7м
+8м
+9м
+0мм
+1мм
+2мм
+3мм
+4мм
+5мм
+6мм
+7мм
+8мм
+9мм
+0см
+1см
+2см
+3см
+4см
+5см
+6см
+7см
+8см
+9см
+0дм
+1дм
+2дм
+3дм
+4дм
+5дм
+6дм
+7дм
+8дм
+9дм
+0л
+1л
+2л
+3л
+4л
+5л
+6л
+7л
+8л
+9л
+0км
+1км
+2км
+3км
+4км
+5км
+6км
+7км
+8км
+9км
+0га
+1га
+2га
+3га
+4га
+5га
+6га
+7га
+8га
+9га
+0кг
+1кг
+2кг
+3кг
+4кг
+5кг
+6кг
+7кг
+8кг
+9кг
+0т
+1т
+2т
+3т
+4т
+5т
+6т
+7т
+8т
+9т
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+0мг
+1мг
+2мг
+3мг
+4мг
+5мг
+6мг
+7мг
+8мг
+9мг
+бульв
+вв
+га
+гг
+гл
+гос
+дм
+доп
+др
+ед
+ед
+зам
+инд
+исп
+Исп
+кап
+кг
+кв
+кл
+км
+кол
+комн
+коп
+куб
+лиц
+лл
+макс
+мг
+мин
+мл
+млн
+млрд
+мм
+наб
+нач
+неуд
+ном
+обл
+обр
+общ
+ок
+ост
+отл
+п
+пер
+перераб
+пл
+пос
+пр
+просп
+проф
+ред
+руб
+сб
+св
+см
+соч
+ср
+ст
+стр
+тел
+Тел
+тех
+тт
+туп
+тыс
+уд
+ул
+уч
+физ
+хор
+чел
+шт
+экз
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk
new file mode 100644
index 000000000..1198d4829
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk
@@ -0,0 +1,474 @@
+Bc
+Mgr
+RNDr
+PharmDr
+PhDr
+JUDr
+PaedDr
+ThDr
+Ing
+MUDr
+MDDr
+MVDr
+Dr
+ThLic
+PhD
+ArtD
+ThDr
+Dr
+DrSc
+CSs
+prof
+obr
+Obr
+absol
+adj
+admin
+adr
+Adr
+adv
+advok
+afr
+ak
+akad
+akc
+akuz
+et
+al
+alch
+amer
+anat
+angl
+Angl
+anglosas
+anorg
+ap
+apod
+arch
+archeol
+archit
+arg
+art
+astr
+astrol
+astron
+atp
+atď
+austr
+Austr
+aut
+belg
+Belg
+bibl
+Bibl
+biol
+bot
+bud
+bás
+býv
+cest
+chem
+cirk
+csl
+čs
+Čs
+dat
+dep
+det
+dial
+diaľ
+dipl
+distrib
+dokl
+dosl
+dopr
+dram
+duš
+dv
+dvojčl
+dór
+ekol
+ekon
+el
+elektr
+elektrotech
+energet
+epic
+est
+etc
+etonym
+eufem
+európ
+Európ
+ev
+evid
+expr
+fa
+fam
+farm
+fem
+feud
+fil
+filat
+filoz
+fi
+fon
+form
+fot
+fr
+Fr
+franc
+Franc
+fraz
+fut
+fyz
+fyziol
+garb
+gen
+genet
+genpor
+geod
+geogr
+geol
+geom
+germ
+gr
+Gr
+gréc
+Gréc
+gréckokat
+hebr
+herald
+hist
+hlav
+hosp
+hromad
+hud
+hypok
+ident
+i.e
+ident
+imp
+impf
+indoeur
+inf
+inform
+instr
+int
+interj
+inšt
+inštr
+iron
+jap
+Jap
+jaz
+jedn
+juhoamer
+juhových
+juhozáp
+juž
+kanad
+Kanad
+kanc
+kapit
+kpt
+kart
+katastr
+knih
+kniž
+komp
+konj
+konkr
+kozmet
+krajč
+kresť
+kt
+kuch
+lat
+latinskoamer
+lek
+lex
+lingv
+lit
+litur
+log
+lok
+max
+Max
+maď
+Maď
+medzinár
+mest
+metr
+mil
+Mil
+min
+Min
+miner
+ml
+mld
+mn
+mod
+mytol
+napr
+nar
+Nar
+nasl
+nedok
+neg
+negat
+neklas
+nem
+Nem
+neodb
+neos
+neskl
+nesklon
+nespis
+nespráv
+neved
+než
+niekt
+niž
+nom
+náb
+nákl
+námor
+nár
+obch
+obj
+obv
+obyč
+obč
+občian
+odb
+odd
+ods
+ojed
+okr
+Okr
+opt
+opyt
+org
+os
+osob
+ot
+ovoc
+par
+part
+pejor
+pers
+pf
+Pf
+P.f
+p.f
+pl
+Plk
+pod
+podst
+pokl
+polit
+politol
+polygr
+pomn
+popl
+por
+porad
+porov
+posch
+potrav
+použ
+poz
+pozit
+poľ
+poľno
+poľnohosp
+poľov
+pošt
+pož
+prac
+predl
+pren
+prep
+preuk
+priezv
+Priezv
+privl
+prof
+práv
+príd
+príj
+prík
+príp
+prír
+prísl
+príslov
+príč
+psych
+publ
+pís
+písm
+pôv
+refl
+reg
+rep
+resp
+rozk
+rozlič
+rozpráv
+roč
+Roč
+ryb
+rádiotech
+rím
+samohl
+semest
+sev
+severoamer
+severových
+severozáp
+sg
+skr
+skup
+sl
+Sloven
+soc
+soch
+sociol
+sp
+spol
+Spol
+spoloč
+spoluhl
+správ
+spôs
+st
+star
+starogréc
+starorím
+s.r.o
+stol
+stor
+str
+stredoamer
+stredoškol
+subj
+subst
+superl
+sv
+sz
+súkr
+súp
+súvzť
+tal
+Tal
+tech
+tel
+Tel
+telef
+teles
+telev
+teol
+trans
+turist
+tuzem
+typogr
+tzn
+tzv
+ukaz
+ul
+Ul
+umel
+univ
+ust
+ved
+vedľ
+verb
+veter
+vin
+viď
+vl
+vod
+vodohosp
+pnl
+vulg
+vyj
+vys
+vysokoškol
+vzťaž
+vôb
+vých
+výd
+výrob
+výsk
+výsl
+výtv
+výtvar
+význ
+včel
+vš
+všeob
+zahr
+zar
+zariad
+zast
+zastar
+zastaráv
+zb
+zdravot
+združ
+zjemn
+zlat
+zn
+Zn
+zool
+zr
+zried
+zv
+záhr
+zák
+zákl
+zám
+záp
+západoeur
+zázn
+územ
+účt
+čast
+čes
+Čes
+čl
+čísl
+živ
+pr
+fak
+Kr
+p.n.l
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl
new file mode 100644
index 000000000..230062c69
--- /dev/null
+++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl
@@ -0,0 +1,78 @@
+dr
+Dr
+itd
+itn
+št #NUMERIC_ONLY#
+Št #NUMERIC_ONLY#
+d
+jan
+Jan
+feb
+Feb
+mar
+Mar
+apr
+Apr
+jun
+Jun
+jul
+Jul
+avg
+Avg
+sept
+Sept
+sep
+Sep
+okt
+Okt
+nov
+Nov
+dec
+Dec
+tj
+Tj
+npr
+Npr
+sl
+Sl
+op
+Op
+gl
+Gl
+oz
+Oz
+prev
+dipl
+ing
+prim
+Prim
+cf
+Cf
+gl
+Gl
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z