diff options
author | phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-02-16 13:44:26 +0300 |
---|---|---|
committer | phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> | 2011-02-16 13:44:26 +0300 |
commit | df901e7ce69641c06b239d29a34f784c748cd69d (patch) | |
tree | 9e66abc8b3e2e14961932914970582dbcac5e066 /scripts/tokenizer | |
parent | 76174ccd4bb95a775c1fa4a89a74e9d95da81a4e (diff) |
added files from Tom Hoar
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3881 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/tokenizer')
9 files changed, 1592 insertions, 29 deletions
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca new file mode 100644 index 000000000..2f4fdfc67 --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ca @@ -0,0 +1,75 @@ +Dr
+Dra
+pàg
+p
+c
+av
+Sr
+Sra
+adm
+esq
+Prof
+S.A
+S.L
+p.e
+ptes
+Sta
+St
+pl
+màx
+cast
+dir
+nre
+fra
+admdora
+Emm
+Excma
+espf
+dc
+admdor
+tel
+angl
+aprox
+ca
+dept
+dj
+dl
+dt
+ds
+dg
+dv
+ed
+entl
+al
+i.e
+maj
+smin
+n
+núm
+pta
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es index cc5bc580d..d8b275518 100644 --- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es @@ -31,54 +31,88 @@ Y Z # Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm + A.C +Apdo +Av +Bco +CC.AA +Da +Dep +Dn +Dr +Dra +EE.UU +Excmo +FF.CC +Fil +Gral +J.C +Let +Lic +N.B +P.D +P.V.P +Prof +Pts +Rte +S.A +S.A.R +S.E +S.L +S.R.C +Sr +Sra +Srta +Sta +Sto +T.V.E +Tel +Ud +Uds +V.B +V.E +Vd +Vds a/c adj +admón afmo -J.C apdo -Av -cap +av +c c.f c.g +cap cm +cta +dcha doc -Dr -EE.UU +ej +entlo esq etc f.c gr grs -Gral +izq kg km -Lic -Fil -Let mg mm -N.B núm +núm +p +p.a +p.ej +ptas pág págs -P.D -ej -Prof -ptas +pág +págs q.e.g.e q.e.s.m -S.A -Sr -Sra -S.R.C -Srta +s s.s.s -Tel -T.V.E -Ud -Uds -Vd -Vds vid - +vol diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is new file mode 100644 index 000000000..5b8a71086 --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.is @@ -0,0 +1,251 @@ +no #NUMERIC_ONLY# +No #NUMERIC_ONLY# +nr #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# +nR #NUMERIC_ONLY# +NR #NUMERIC_ONLY# +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +^ +í +á +ó +æ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +ab.fn +a.fn +afs +al +alm +alg +andh +ath +aths +atr +ao +au +aukaf +áfn +áhrl.s +áhrs +ákv.gr +ákv +bh +bls +dr +e.Kr +et +ef +efn +ennfr +eink +end +e.st +erl +fél +fskj +fh +f.hl +físl +fl +fn +fo +forl +frb +frl +frh +frt +fsl +fsh +fs +fsk +fst +f.Kr +ft +fv +fyrrn +fyrrv +germ +gm +gr +hdl +hdr +hf +hl +hlsk +hljsk +hljv +hljóðv +hr +hv +hvk +holl +Hos +höf +hk +hrl +ísl +kaf +kap +Khöfn +kk +kg +kk +km +kl +klst +kr +kt +kgúrsk +kvk +leturbr +lh +lh.nt +lh.þt +lo +ltr +mlja +mljó +millj +mm +mms +m.fl +miðm +mgr +mst +mín +nf +nh +nhm +nl +nk +nmgr +no +núv +nt +o.áfr +o.m.fl +ohf +o.fl +o.s.frv +ófn +ób +óákv.gr +óákv +pfn +PR +pr +Ritstj +Rvík +Rvk +samb +samhlj +samn +samn +sbr +sek +sérn +sf +sfn +sh +sfn +sh +s.hl +sk +skv +sl +sn +so +ss.us +s.st +samþ +sbr +shlj +sign +skál +st +st.s +stk +sþ +teg +tbl +tfn +tl +tvíhlj +tvt +till +to +umr +uh +us +uppl +útg +vb +Vf +vh +vkf +Vl +vl +vlf +vmf +8vo +vsk +vth +þt +þf +þjs +þgf +þlt +þolm +þm +þml +þýð diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it index ad4d355c7..992b9ecd4 100644 --- a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.it @@ -31,32 +31,43 @@ Y Z #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks -dott -Sig Adj Adm Adv +Amn +Arch Asst +Avv Bart +Bcc Bldg Brig Bros +C.A.P +C.P Capt +Cc Cmdr +Co Col Comdr Con Corp Cpl DR +Dott Dr Drs +Egr Ens Gen +Geom Gov Hon -Hr Hosp +Hr +Id +Ing Insp Lt MM @@ -67,29 +78,89 @@ Maj Messrs Mlle Mme +Mo +Mons Mr Mrs Ms Msgr +N.B Op Ord +P.S +P.T Pfc Ph Prof Pvt +RP +RSVP +Rag Rep Reps Res Rev +Rif Rt +S.A +S.B.F +S.P.M +S.p.A +S.r.l Sen Sens Sfc Sgt +Sig +Sigg +Soc +Spett Sr St Supt Surg +V.P + +# other +a.c +acc +all +banc +c.a +c.c.p +c.m +c.p +c.s +c.v +corr +dott +e.p.c +ecc +es +fatt +gg +int +lett +ogg +on +p.c +p.c.c +p.es +p.f +p.r +p.v +post +pp +racc +ric +s.n.c +seg +sgg +ss +tel +u.s +v.r +v.s #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) v diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl new file mode 100644 index 000000000..6b7c106e6 --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pl @@ -0,0 +1,283 @@ +adw +afr +akad +al +Al +am +amer +arch +art +Art +artyst +astr +austr +bałt +bdb +bł +bm +br +bryg +bryt +centr +ces +chem +chiń +chir +c.k +c.o +cyg +cyw +cyt +czes +czw +cd +Cd +czyt +ćw +ćwicz +daw +dcn +dekl +demokr +det +diec +dł +dn +dot +dol +dop +dost +dosł +h.c +ds +dst +duszp +dypl +egz +ekol +ekon +elektr +em +ew +fab +farm +fot +fr +gat +gastr +geogr +geol +gimn +głęb +gm +godz +górn +gosp +gr +gram +hist +hiszp +hr +Hr +hot +id +in +im +iron +jn +kard +kat +katol +k.k +kk +kol +kl +k.p.a +kpc +k.p.c +kpt +kr +k.r +krak +k.r.o +kryt +kult +laic +łac +niem +woj +nb +np +Nb +Np +pol +pow +m.in +pt +ps +Pt +Ps +cdn +jw +ryc +rys +Ryc +Rys +tj +tzw +Tzw +tzn +zob +ang +ub +ul +pw +pn +pl +al +k +n +nr #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# +ww +wł +ur +zm +żyd +żarg +żyw +wył +bp +bp +wyst +tow +Tow +o +sp +Sp +st +spółdz +Spółdz +społ +spółgł +stoł +stow +Stoł +Stow +zn +zew +zewn +zdr +zazw +zast +zaw +zał +zal +zam +zak +zakł +zagr +zach +adw +Adw +lek +Lek +med +mec +Mec +doc +Doc +dyw +dyr +Dyw +Dyr +inż +Inż +mgr +Mgr +dh +dr +Dh +Dr +p +P +red +Red +prof +prok +Prof +Prok +hab +płk +Płk +nadkom +Nadkom +podkom +Podkom +ks +Ks +gen +Gen +por +Por +reż +Reż +przyp +Przyp +śp +św +śW +Śp +Św +ŚW +szer +Szer +pkt #NUMERIC_ONLY# +str #NUMERIC_ONLY# +tab #NUMERIC_ONLY# +Tab #NUMERIC_ONLY# +tel +ust #NUMERIC_ONLY# +par #NUMERIC_ONLY# +poz +pok +oo +oO +Oo +OO +r #NUMERIC_ONLY# +l #NUMERIC_ONLY# +s #NUMERIC_ONLY# +najśw +Najśw +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +Ś +Ć +Ż +Ź +Dz diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro new file mode 100644 index 000000000..d489f4654 --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ro @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +dpdv +etc +șamd +M.Ap.N +dl +Dl +d-na +D-na +dvs +Dvs +pt +Pt diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru new file mode 100644 index 000000000..444465b35 --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.ru @@ -0,0 +1,259 @@ +TBD: Russian uppercase alphabet [А-Я]
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+0гг
+1гг
+2гг
+3гг
+4гг
+5гг
+6гг
+7гг
+8гг
+9гг
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+Xвв
+Vвв
+Iвв
+Lвв
+Mвв
+Cвв
+Xв
+Vв
+Iв
+Lв
+Mв
+Cв
+0м
+1м
+2м
+3м
+4м
+5м
+6м
+7м
+8м
+9м
+0мм
+1мм
+2мм
+3мм
+4мм
+5мм
+6мм
+7мм
+8мм
+9мм
+0см
+1см
+2см
+3см
+4см
+5см
+6см
+7см
+8см
+9см
+0дм
+1дм
+2дм
+3дм
+4дм
+5дм
+6дм
+7дм
+8дм
+9дм
+0л
+1л
+2л
+3л
+4л
+5л
+6л
+7л
+8л
+9л
+0км
+1км
+2км
+3км
+4км
+5км
+6км
+7км
+8км
+9км
+0га
+1га
+2га
+3га
+4га
+5га
+6га
+7га
+8га
+9га
+0кг
+1кг
+2кг
+3кг
+4кг
+5кг
+6кг
+7кг
+8кг
+9кг
+0т
+1т
+2т
+3т
+4т
+5т
+6т
+7т
+8т
+9т
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+0мг
+1мг
+2мг
+3мг
+4мг
+5мг
+6мг
+7мг
+8мг
+9мг
+бульв
+в
+вв
+г
+га
+гг
+гл
+гос
+д
+дм
+доп
+др
+е
+ед
+ед
+зам
+и
+инд
+исп
+Исп
+к
+кап
+кг
+кв
+кл
+км
+кол
+комн
+коп
+куб
+л
+лиц
+лл
+м
+макс
+мг
+мин
+мл
+млн
+млрд
+мм
+н
+наб
+нач
+неуд
+ном
+о
+обл
+обр
+общ
+ок
+ост
+отл
+п
+пер
+перераб
+пл
+пос
+пр
+просп
+проф
+р
+ред
+руб
+с
+сб
+св
+см
+соч
+ср
+ст
+стр
+т
+тел
+Тел
+тех
+тт
+туп
+тыс
+уд
+ул
+уч
+физ
+х
+хор
+ч
+чел
+шт
+экз
+э
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk new file mode 100644 index 000000000..1198d4829 --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sk @@ -0,0 +1,474 @@ +Bc +Mgr +RNDr +PharmDr +PhDr +JUDr +PaedDr +ThDr +Ing +MUDr +MDDr +MVDr +Dr +ThLic +PhD +ArtD +ThDr +Dr +DrSc +CSs +prof +obr +Obr +Č +č +absol +adj +admin +adr +Adr +adv +advok +afr +ak +akad +akc +akuz +et +al +alch +amer +anat +angl +Angl +anglosas +anorg +ap +apod +arch +archeol +archit +arg +art +astr +astrol +astron +atp +atď +austr +Austr +aut +belg +Belg +bibl +Bibl +biol +bot +bud +bás +býv +cest +chem +cirk +csl +čs +Čs +dat +dep +det +dial +diaľ +dipl +distrib +dokl +dosl +dopr +dram +duš +dv +dvojčl +dór +ekol +ekon +el +elektr +elektrotech +energet +epic +est +etc +etonym +eufem +európ +Európ +ev +evid +expr +fa +fam +farm +fem +feud +fil +filat +filoz +fi +fon +form +fot +fr +Fr +franc +Franc +fraz +fut +fyz +fyziol +garb +gen +genet +genpor +geod +geogr +geol +geom +germ +gr +Gr +gréc +Gréc +gréckokat +hebr +herald +hist +hlav +hosp +hromad +hud +hypok +ident +i.e +ident +imp +impf +indoeur +inf +inform +instr +int +interj +inšt +inštr +iron +jap +Jap +jaz +jedn +juhoamer +juhových +juhozáp +juž +kanad +Kanad +kanc +kapit +kpt +kart +katastr +knih +kniž +komp +konj +konkr +kozmet +krajč +kresť +kt +kuch +lat +latinskoamer +lek +lex +lingv +lit +litur +log +lok +max +Max +maď +Maď +medzinár +mest +metr +mil +Mil +min +Min +miner +ml +mld +mn +mod +mytol +napr +nar +Nar +nasl +nedok +neg +negat +neklas +nem +Nem +neodb +neos +neskl +nesklon +nespis +nespráv +neved +než +niekt +niž +nom +náb +nákl +námor +nár +obch +obj +obv +obyč +obč +občian +odb +odd +ods +ojed +okr +Okr +opt +opyt +org +os +osob +ot +ovoc +par +part +pejor +pers +pf +Pf +P.f +p.f +pl +Plk +pod +podst +pokl +polit +politol +polygr +pomn +popl +por +porad +porov +posch +potrav +použ +poz +pozit +poľ +poľno +poľnohosp +poľov +pošt +pož +prac +predl +pren +prep +preuk +priezv +Priezv +privl +prof +práv +príd +príj +prík +príp +prír +prísl +príslov +príč +psych +publ +pís +písm +pôv +refl +reg +rep +resp +rozk +rozlič +rozpráv +roč +Roč +ryb +rádiotech +rím +samohl +semest +sev +severoamer +severových +severozáp +sg +skr +skup +sl +Sloven +soc +soch +sociol +sp +spol +Spol +spoloč +spoluhl +správ +spôs +st +star +starogréc +starorím +s.r.o +stol +stor +str +stredoamer +stredoškol +subj +subst +superl +sv +sz +súkr +súp +súvzť +tal +Tal +tech +tel +Tel +telef +teles +telev +teol +trans +turist +tuzem +typogr +tzn +tzv +ukaz +ul +Ul +umel +univ +ust +ved +vedľ +verb +veter +vin +viď +vl +vod +vodohosp +pnl +vulg +vyj +vys +vysokoškol +vzťaž +vôb +vých +výd +výrob +výsk +výsl +výtv +výtvar +význ +včel +vš +všeob +zahr +zar +zariad +zast +zastar +zastaráv +zb +zdravot +združ +zjemn +zlat +zn +Zn +zool +zr +zried +zv +záhr +zák +zákl +zám +záp +západoeur +zázn +územ +účt +čast +čes +Čes +čl +čísl +živ +pr +fak +Kr +p.n.l +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl new file mode 100644 index 000000000..230062c69 --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.sl @@ -0,0 +1,78 @@ +dr
+Dr
+itd
+itn
+št #NUMERIC_ONLY#
+Št #NUMERIC_ONLY#
+d
+jan
+Jan
+feb
+Feb
+mar
+Mar
+apr
+Apr
+jun
+Jun
+jul
+Jul
+avg
+Avg
+sept
+Sept
+sep
+Sep
+okt
+Okt
+nov
+Nov
+dec
+Dec
+tj
+Tj
+npr
+Npr
+sl
+Sl
+op
+Op
+gl
+Gl
+oz
+Oz
+prev
+dipl
+ing
+prim
+Prim
+cf
+Cf
+gl
+Gl
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
|