diff options
author | phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> | 2010-09-15 20:06:04 +0400 |
---|---|---|
committer | phkoehn <phkoehn@1f5c12ca-751b-0410-a591-d2e778427230> | 2010-09-15 20:06:04 +0400 |
commit | fb8b0eb180b576d71d0cc9534e78b37fd50271cb (patch) | |
tree | 9726a27a3c9c22a6298da34fc74dfe1f24250cc6 /scripts/tokenizer | |
parent | d0aecca9fe464e5ba1ef3308c721a1918dfe4801 (diff) |
new prefix files for tokenizer
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@3467 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'scripts/tokenizer')
3 files changed, 447 insertions, 0 deletions
diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es new file mode 100644 index 000000000..cc5bc580d --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.es @@ -0,0 +1,84 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm +A.C +a/c +adj +afmo +J.C +apdo +Av +cap +c.f +c.g +cm +doc +Dr +EE.UU +esq +etc +f.c +gr +grs +Gral +kg +km +Lic +Fil +Let +mg +mm +N.B +núm +pág +págs +P.D +ej +Prof +ptas +q.e.g.e +q.e.s.m +S.A +Sr +Sra +S.R.C +Srta +s.s.s +Tel +T.V.E +Ud +Uds +Vd +Vds +vid + diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.fr b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.fr new file mode 100644 index 000000000..28126fa57 --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.fr @@ -0,0 +1,153 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. +# +#any single upper case letter followed by a period is not a sentence ender +#usually upper case letters are initials in a name +#no French words end in single lower-case letters, so we throw those in too? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + +# Period-final abbreviation list for French +A.C.N +A.M +art +ann +apr +av +auj +lib +B.P +boul +ca +c.-à-d +cf +ch.-l +chap +contr +C.P.I +C.Q.F.D +C.N +C.N.S +C.S +dir +éd +e.g +env +al +etc +E.V +ex +fasc +fém +fig +fr +hab +ibid +id +i.e +inf +LL.AA +LL.AA.II +LL.AA.RR +LL.AA.SS +L.D +LL.EE +LL.MM +LL.MM.II.RR +loc.cit +masc +MM +ms +N.B +N.D.A +N.D.L.R +N.D.T +n/réf +NN.SS +N.S +N.D +N.P.A.I +p.c.c +pl +pp +p.ex +p.j +P.S +R.A.S +R.-V +R.P +R.I.P +SS +S.S +S.A +S.A.I +S.A.R +S.A.S +S.E +sec +sect +sing +S.M +S.M.I.R +sq +sqq +suiv +sup +suppl +tél +T.S.V.P +vb +vol +vs +X.O +Z.I diff --git a/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pt b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pt new file mode 100644 index 000000000..5d65bf25a --- /dev/null +++ b/scripts/tokenizer/nonbreaking_prefixes/nonbreaking_prefix.pt @@ -0,0 +1,210 @@ +#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + + +#Roman Numerals. A dot after one of these is not a sentence break in Portuguese. +I +II +III +IV +V +VI +VII +VIII +IX +X +XI +XII +XIII +XIV +XV +XVI +XVII +XVIII +XIX +XX +i +ii +iii +iv +v +vi +vii +viii +ix +x +xi +xii +xiii +xiv +xv +xvi +xvii +xviii +xix +xx + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Adj +Adm +Adv +Art +Ca +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +DRA +Dr +Dra +Dras +Drs +Eng +Enga +Engas +Engos +Ex +Exo +Exmo +Fig +Gen +Hosp +Insp +Lda +MM +MR +MRS +MS +Maj +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +Sra +Sras +Srs +Sto +Supt +Surg +adj +adm +adv +art +cit +col +con +corp +cpl +dr +dra +dras +drs +eng +enga +engas +engos +ex +exo +exmo +fig +op +prof +sr +sra +sras +srs +sto + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +v +vs +i.e +rev +e.g + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nos +Art #NUMERIC_ONLY# +Nr +p #NUMERIC_ONLY# +pp #NUMERIC_ONLY# + |