github.com/moses-smt/mosesdecoder.git
author     Kenneth Heafield <github@kheafield.com>        2015-01-21 20:23:44 +0300
committer  Marcin Junczys-Dowmunt <junczys@amu.edu.pl>    2015-01-22 14:25:02 +0300
commit     1dce55f41367e54da9f6740a5dfae1104c6aa531 (patch)
tree       05bb609edda881b40c5e760912bd933bbb1417e7 /contrib
parent     ad6f3a802622ba6aff9ce31b82dc81315dd32755 (diff)
C++ tokenizer based on RE2. Not by me.
Some differences from the Moses tokenizer: fraction characters count as numbers, _ handling, URLs. Currently 3x slower than the Perl tokenizer :'(. Looking to make it faster by composing regex substitutions. TODO: eliminate sprintf and fixed-size buffers.
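
For orientation, here is a minimal sketch (not part of the patch) of driving the new class, assuming the API declared in contrib/c++tokenizer/tokenizer.h below; the config directory and sample line are hypothetical.

#include "tokenizer.h"
#include <iostream>

int main() {
  // Hypothetical location of the nonbreaking_prefix.* and protected_pattern.* files.
  Tokenizer::set_config_dir("/path/to/share");
  Tokenizer tok("en");   // defaults: skip tag lines, escape entities, no aggressive hyphens
  tok.init();            // loads prefix/pattern files; throws if none can be found
  // tokenize() expects a space-padded single line with no newline.
  std::cout << tok.tokenize(" Mr. Smith isn't here. ") << std::endl;
  return 0;
}
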
Diffstat (limited to 'contrib')
-rw-r--r--  contrib/c++tokenizer/Jamfile             |    2
-rw-r--r--  contrib/c++tokenizer/tokenizer.cpp       |  736
-rw-r--r--  contrib/c++tokenizer/tokenizer.h         |  115
-rw-r--r--  contrib/c++tokenizer/tokenizer_main.cpp  |  212
4 files changed, 1065 insertions(+), 0 deletions(-)
diff --git a/contrib/c++tokenizer/Jamfile b/contrib/c++tokenizer/Jamfile
new file mode 100644
index 000000000..f6a74a9df
--- /dev/null
+++ b/contrib/c++tokenizer/Jamfile
@@ -0,0 +1,2 @@
+external-lib re2 ;
+exe tokenizer : tokenizer.cpp tokenizer_main.cpp re2 : <cflags>-std=c++11 ;
diff --git a/contrib/c++tokenizer/tokenizer.cpp b/contrib/c++tokenizer/tokenizer.cpp
new file mode 100644
index 000000000..ca5065046
--- /dev/null
+++ b/contrib/c++tokenizer/tokenizer.cpp
@@ -0,0 +1,736 @@
+#include "tokenizer.h"
+#include <sstream>
+#include <iterator>
+#include <memory>
+#include <vector>
+#include <algorithm>
+#include <cstring>   // strcat in init()
+#include <cstdio>    // snprintf
+
+namespace {
+
+// frequently used regexps are pre-compiled here:
+
+RE2 genl_tags_x("<[/!\\p{L}]+[^>]*>");
+RE2 mult_spc_x(" +"); // multiple spaces
+RE2 tag_line_x("^<.+>$"); // lines beginning and ending with open/close angle-bracket pairs
+RE2 white_line_x("^\\s*$"); // lines entirely composed of whitespace
+RE2 ctrls_x("[\\000-\\037]*"); // match any control characters
+RE2 head_spc_x("^ "); // match a leading space on a line
+RE2 tail_spc_x(" $"); // match a trailing space on a line
+RE2 genl_spc_x("\\s+"); // any sequence of one or more whitespace characters
+RE2 specials_x("([^_\\p{L}\\p{N}\\s\\.\\'\\`\\,\\-])"); // any surely non-token character
+RE2 hyphen_x("([\\p{L}\\p{N}])(-)([\\p{L}\\p{N}])"); // any hyphenated pronounceable sequence
+RE2 slash_x("([\\p{L}\\p{N}])(/)([\\p{L}\\p{N}])"); // any slash-conjoined alphanumeric sequence
+RE2 final_x("([^.])([.])([\\]\\)}>\"']*) ?$"); // sentence-final punctuation sequence (other than ? or !)
+RE2 qx_x("([?!])"); // a question mark or exclamation mark
+RE2 braces_x("([\\]\\[\\(\\){}<>])"); // any open or close of a pair
+RE2 endq_x("([^'])' "); // post-token single-quote or doubled single-quote
+RE2 postncomma_x("([^\\p{N}]),"); // comma after non-number
+RE2 prencomma_x(",([^\\p{N}])"); // comma before non-number
+RE2 nanaapos_x("([^\\p{L}])'([^\\p{L}])"); // non-letter'non-letter contraction form
+RE2 nxpaapos_x("([^\\p{L}\\p{N}])'([\\p{L}])"); // non-alphanumeric'letter contraction form
+RE2 napaapos_x("([^\\p{L}])'([\\p{L}])"); // non-letter'letter contraction form
+RE2 panaapos_x("([\\p{L}])'([^\\p{L}])"); // letter'non-letter contraction form
+RE2 papaapos_x("([\\p{L}])'([\\p{L}])"); // letter'letter contraction form
+RE2 pnsapos_x("([\\p{N}])[']s"); // plural number
+RE2 letter_x("\\p{L}"); // a letter
+RE2 lower_x("^\\p{Ll}"); // a lower-case letter
+RE2 sinteger_x("^\\p{N}"); // a leading digit
+RE2 dotskey_x("MANYDOTS(\\d+)"); // token for a dot sequence parameterized by seq length
+RE2 numprefixed_x("[-+/.@\\\\#\\%&\\p{Sc}\\p{N}]*[\\p{N}]+-[-'`\"\\p{L}]*\\p{L}");
+RE2 quasinumeric_x("[-.;:@\\\\#\\%&\\p{Sc}\\p{So}\\p{N}]*[\\p{N}]+");
+RE2 numscript_x("([\\p{N}\\p{L}])([\\p{No}]+)(\\p{Ll})");
+RE2 nonbreak_x("-\\p{L}"); // where not to break a protected form
+
+RE2 x1_v_d("([ ([{<])\""); // a valid non-letter preceding a double-quote
+RE2 x1_v_gg("([ ([{<])``"); // a valid non-letter preceding directional doubled open single-quote
+RE2 x1_v_g("([ ([{<])`([^`])"); // a valid non-letter preceding a directional unitary single-quote
+RE2 x1_v_q("([ ([{<])'"); // a valid non-letter preceding undirected embedded quotes
+RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])"); // non-digit,non-digit
+RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])"); // digit,non-digit
+RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])"); // non-digit,digit
+RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])"); // usable punctuation mark not a quote or a brace
+RE2 contract_x("'([sSmMdD]) "); // english single letter contraction forms
+RE2 right_x("[\\p{Sc}({¿¡]+"); // detokenizer: punctuation that attaches to the following word
+RE2 left_x("[,.?!:;\\%})]+"); // detokenizer: punctuation that attaches to the preceding word
+RE2 curr_en_x("^[\'][\\p{L}]"); // apostrophe-initial token (English contraction suffix)
+RE2 pre_en_x("[\\p{L}\\p{N}]$"); // token ending in a letter or digit
+RE2 curr_fr_x("[\\p{L}\\p{N}][\']$"); // token ending in an apostrophe (French/Italian elision)
+RE2 post_fr_x("^[\\p{L}\\p{N}]"); // token starting with a letter or digit
+RE2 quotes_x("^[\'\"]+$"); // token consisting only of quote characters
+RE2 endnum_x("[-\'\"]"); // hyphen or quote character
+
+// anything rarely used will just be given as a string and compiled on demand by RE2
+
+}; // end anonymous namespace
+
+
+#ifdef TOKENIZER_NAMESPACE
+namespace TOKENIZER_NAMESPACE {
+#endif
+
+// where to load nonbreaking_prefix.XX files
+// and protected_pattern.XX files
+
+std::string Tokenizer::cfg_dir(".");
+
+
+// static method
+void
+Tokenizer::set_config_dir(const std::string& dir) {
+ if (dir.empty()) {
+ cfg_dir = ".";
+ } else {
+ cfg_dir.assign(dir);
+ }
+}
+
+
+Tokenizer::Tokenizer(const std::string& _lang_iso,
+ bool _skip_xml_p,
+ bool _skip_alltags_p,
+ bool _non_escape_p,
+ bool _aggressive_hyphen_p,
+ bool _penn_p,
+ bool _verbose_p)
+ : lang_iso(_lang_iso)
+ , english_p(_lang_iso.compare("en")==0)
+ , latin_p((!english_p) && (_lang_iso.compare("fr")==0 || _lang_iso.compare("it")==0))
+ , skip_xml_p(_skip_xml_p)
+ , skip_alltags_p(_skip_alltags_p)
+ , non_escape_p(_non_escape_p)
+ , aggressive_hyphen_p(_aggressive_hyphen_p)
+ , penn_p(_penn_p)
+ , verbose_p(_verbose_p)
+{
+}
+
+
+//
+// dtor deletes dynamically allocated per-language RE2 compiled expressions
+//
+Tokenizer::~Tokenizer()
+{
+ for (auto& ptr : prot_pat_vec) {
+ if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
+ continue;
+ delete ptr;
+ }
+}
+
+
+//
+// stuffs numeric-only prefixes into nbpre_num_set,
+// others into nbpre_gen_set
+//
+std::pair<int,int>
+Tokenizer::load_prefixes(std::ifstream& ifs)
+{
+ RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
+ std::string line;
+ int nnon = 0;
+ int nnum = 0;
+
+ while (std::getline(ifs,line)) {
+ if (!line.empty() && line.at(0) != '#') {
+ std::string prefix;
+ if (RE2::PartialMatch(line,numonly,&prefix)) {
+ nbpre_num_set.insert(prefix);
+ nnum++;
+ } else {
+ nbpre_gen_set.insert(line);
+ nnon++;
+ }
+ }
+ }
+ return std::make_pair(nnon,nnum);
+}
+
+
+//
+// load nonbreaking-prefix and protected-pattern files
+// (call set_config_dir first if a non-default directory is needed)
+//
+void
+Tokenizer::init() {
+ std::string nbpre_path(cfg_dir);
+ nbpre_path.append("/nonbreaking_prefix.").append(lang_iso);
+ // default to generic version
+ if (::access(nbpre_path.c_str(),R_OK))
+ nbpre_path = nbpre_path.substr(0,nbpre_path.size()-lang_iso.size()-1);
+
+ if (::access(nbpre_path.c_str(),R_OK) == 0) {
+ std::ifstream cfg(nbpre_path.c_str());
+ try {
+ std::pair<int,int> counts = load_prefixes(cfg);
+ if (verbose_p) {
+ std::cerr << "loaded " << counts.first << " non-numeric, "
+ << counts.second << " numeric prefixes from "
+ << nbpre_path << std::endl;
+ }
+ } catch (...) {
+ std::ostringstream ess;
+ ess << "I/O error reading " << nbpre_path << " in " << __FILE__ << " at " << __LINE__;
+ throw std::runtime_error(ess.str());
+ }
+ } else if (verbose_p) {
+ std::cerr << "no prefix file found: " << nbpre_path << std::endl;
+ }
+
+ if (nbpre_gen_set.empty() && nbpre_num_set.empty()) {
+ std::ostringstream ess;
+ ess << "Error at " << __FILE__ << ":" << __LINE__ << " : "
+ << "No known abbreviations for language " << lang_iso;
+ throw std::runtime_error(ess.str());
+ }
+
+ std::string protpat_path(cfg_dir);
+ protpat_path.append("/protected_pattern.").append(lang_iso);
+ // default to generic version
+ if (::access(protpat_path.c_str(),R_OK))
+ protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
+
+ prot_pat_vec.push_back(&numprefixed_x);
+ prot_pat_vec.push_back(&quasinumeric_x);
+
+ if (::access(protpat_path.c_str(),R_OK) == 0) {
+ std::ifstream cfg(protpat_path.c_str());
+ char linebuf[1028];
+ int npat = 0;
+ try {
+ linebuf[0]='(';
+ while (cfg.good()) {
+ cfg.getline(linebuf+1,1024);
+ if (linebuf[1] && linebuf[1] != '#') {
+ strcat(linebuf,")");
+ prot_pat_vec.push_back(new RE2(linebuf));
+ npat++;
+ }
+ }
+ } catch (...) {
+ std::ostringstream ess;
+ ess << "I/O error reading " << protpat_path << " in " << __FILE__ << " at " << __LINE__;
+ throw std::runtime_error(ess.str());
+ }
+ if (verbose_p) {
+ std::cerr << "loaded " << npat << " protected patterns from "
+ << protpat_path << std::endl;
+ }
+ } else if (verbose_p) {
+ std::cerr << "no protected file found: " << protpat_path << std::endl;
+ }
+}
+
+
+//
+// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
+// assumes protections are applied already, some invariants are in place
+//
+void
+Tokenizer::protected_tokenize(std::string& text) {
+ std::vector<std::string> words;
+ size_t pos = 0;
+ if (text.at(pos) == ' ')
+ ++pos;
+ size_t next = text.find(' ',pos);
+ while (next != std::string::npos) {
+ if (next - pos)
+ words.push_back(text.substr(pos,next-pos));
+ pos = next + 1;
+ while (pos < text.size() && text.at(pos) == ' ')
+ ++pos;
+ next = text.find(' ',pos);
+ }
+ if (pos < text.size() && text.at(pos) != ' ')
+ words.push_back(text.substr(pos,text.size()-pos));
+
+ text.clear();
+
+ // regurgitate words with look-ahead handling for tokens with final .
+ for (size_t ii = 0; ii < words.size(); ++ii) {
+ size_t len = words[ii].size();
+
+ if (len > 1 && words[ii].at(len-1) == '.') {
+ std::string prefix(words[ii].substr(0,len-1));
+ bool gen_prefix_p = nbpre_gen_set.find(prefix) != nbpre_gen_set.end();
+ bool embeds_p = prefix.find('.') != std::string::npos;
+ bool letter_p = RE2::PartialMatch(prefix.c_str(),letter_x);
+ bool more_p = ii < words.size() - 1;
+ bool nlower_p = more_p && RE2::PartialMatch(words[ii+1].c_str(),lower_x);
+ bool num_prefix_p = (!gen_prefix_p) && nbpre_num_set.find(prefix) != nbpre_num_set.end();
+ bool nint_p = more_p && RE2::PartialMatch(words[ii+1].c_str(),sinteger_x);
+ bool isolate_p = true;
+ if (gen_prefix_p) {
+ isolate_p = false;
+ } else if (num_prefix_p && nint_p) {
+ isolate_p = false;
+ } else if (embeds_p && letter_p) {
+ isolate_p = false;
+ } else if (nlower_p) {
+ isolate_p = false;
+ }
+ if (isolate_p) {
+ words[ii].assign(prefix);
+ words[ii].append(" .");
+ }
+ }
+
+ text.append(words[ii]);
+ if (ii < words.size() - 1)
+ text.append(" ");
+ }
+}
+
+
+bool
+Tokenizer::escape(std::string& text) {
+ static const char escaping[] = "&|<>'\"[]";
+ static const char *replacements[] = {
+ "&amp;",
+ "&#124;",
+ "&lt;",
+ "&gt;",
+ "&apos;",
+ "&quot;",
+ "&#91;",
+ "&#93;"
+ };
+ bool modified = false;
+ const char *next = escaping;
+
+ for (int ii = 0; *next; ++ii, ++next) {
+ size_t pos = 0;
+ for (pos = text.find(*next,pos); pos != std::string::npos;
+ pos = (++pos < text.size() ? text.find(*next,pos) : std::string::npos)) {
+ std::string replacement(replacements[ii]);
+ if (*next != '\'') {
+ if (pos > 0 && text.at(pos-1) == ' ' && pos < text.size()-1 && text.at(pos+1) != ' ')
+ replacement.append(" ");
+ }
+ text.replace(pos,1,replacement);
+ modified = true;
+ }
+ }
+
+ return modified;
+}
+
+
+std::string
+Tokenizer::tokenize(const std::string& buf)
+{
+ static const char *apos_refs = "\\1 ' \\2";
+ static const char *right_refs = "\\1 '\\2";
+ static const char *left_refs = "\\1' \\2";
+ static const char *comma_refs = "\\1 , \\2";
+ static const char *isolate_ref = " \\1 ";
+ static const char *special_refs = "\\1 @\\2@ \\3";
+
+ std::string outs;
+ std::string text(buf);
+
+ if (skip_alltags_p) {
+ RE2::GlobalReplace(&text,genl_tags_x," ");
+ }
+
+ RE2::GlobalReplace(&text,genl_spc_x," ");
+ RE2::GlobalReplace(&text,ctrls_x,"");
+
+ size_t pos;
+ int num = 0;
+
+ if (!penn_p) {
+ // this is the main moses-compatible tokenizer
+
+ // push all the prefixes matching protected patterns
+ std::vector<std::string> prot_stack;
+ std::string match;
+ for (auto& pat : prot_pat_vec) {
+ pos = 0;
+ while (RE2::PartialMatch(text.substr(pos),*pat,&match)) {
+ pos = text.find(match,pos);
+ if (pos == std::string::npos)
+ break;
+ size_t len = match.size();
+ if (text[pos-1] == ' ' || text[pos-1] == '\'' || text[pos-1] == '`'|| text[pos-1] == '"') {
+ char subst[32];
+ int nsubst = snprintf(subst,sizeof(subst)," THISISPROTECTED%.3d ",num++);
+ text.replace(pos,len,subst,nsubst);
+ prot_stack.push_back(match);
+ pos += nsubst;
+ } else {
+ pos += len;
+ }
+ }
+ }
+
+ // collapse spaces
+ RE2::GlobalReplace(&text,mult_spc_x," ");
+
+ // strip leading space
+ if (text.at(0) == ' ')
+ text = text.substr(1);
+
+ // strip trailing space
+ if (text.at(text.size()-1) == ' ')
+ text = text.substr(0,text.size()-1);
+
+ // isolate hyphens, if non-default option is set
+ if (aggressive_hyphen_p)
+ RE2::GlobalReplace(&text,hyphen_x,special_refs);
+
+ // find successive dots, protect them
+ pos = text.find("..");
+ while (pos != std::string::npos && pos < text.size()) {
+ char subst[12];
+ size_t lim = pos + 2;
+ while (lim < text.size() && text.at(lim) == '.') ++lim;
+ snprintf(subst,sizeof(subst),"MANYDOTS%.3d",(int)(lim-pos));
+ text.replace(pos,lim-pos,subst,11);
+ pos = text.find("..",pos+11);
+
+ }
+
+ // terminate token at superscript or subscript sequence when followed by lower-case
+ RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3");
+
+ // isolate commas after non-digits
+ RE2::GlobalReplace(&text,postncomma_x,"\\1 , ");
+
+ // isolate commas before non-digits
+ RE2::GlobalReplace(&text,prencomma_x," , \\1");
+
+ // replace backtick with single-quote
+ pos = text.find("`");
+ while (pos != std::string::npos) {
+ text.replace(pos,1,"'",1);
+ pos = text.find("`");
+ }
+
+ // replace doubled single-quotes with double-quotes
+ pos = text.find("''");
+ while (pos != std::string::npos) {
+ text.replace(pos,2,"\"",1);
+ pos = text.find("''",pos+1);
+ }
+
+ // isolate special characters
+ RE2::GlobalReplace(&text,specials_x,isolate_ref);
+
+ if (english_p) {
+ // english contractions to the right
+ RE2::GlobalReplace(&text,nanaapos_x,apos_refs);
+ RE2::GlobalReplace(&text,nxpaapos_x,apos_refs);
+ RE2::GlobalReplace(&text,panaapos_x,apos_refs);
+ RE2::GlobalReplace(&text,papaapos_x,right_refs);
+ RE2::GlobalReplace(&text,pnsapos_x,"\\1 's");
+ } else if (latin_p) {
+ // italian,french contractions to the left
+ RE2::GlobalReplace(&text,nanaapos_x,apos_refs);
+ RE2::GlobalReplace(&text,napaapos_x,apos_refs);
+ RE2::GlobalReplace(&text,panaapos_x,apos_refs);
+ RE2::GlobalReplace(&text,papaapos_x,left_refs);
+ }
+
+ protected_tokenize(text);
+
+ // restore prefix-protected strings
+ num = 0;
+ for (auto& prot : prot_stack) {
+ char subst[32];
+ snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++);
+ size_t loc = text.find(subst);
+ while (loc != std::string::npos) {
+ text.replace(loc,18,prot);
+ loc = text.find(subst,loc+18);
+ }
+ }
+
+ // restore dot-sequences with correct length
+ std::string numstr;
+ pos = 0;
+ while (RE2::PartialMatch(text,dotskey_x,&numstr)) {
+ int count = std::strtoul(numstr.c_str(),0,0);
+ int loc = text.find("MANYDOTS",pos);
+ std::ostringstream fss;
+ fss << text.substr(0,loc);
+ if (loc > 0 && text.at(loc-1) != ' ')
+ fss << ' ';
+ for (int ii = 0; ii < count; ++ii)
+ fss << '.';
+ int sublen = 8 + numstr.size();
+ pos = loc + sublen;
+ if (pos < text.size() && text.at(pos) != ' ')
+ fss << ' ';
+ fss << text.substr(pos);
+ pos = loc;
+ text.assign(fss.str());
+ }
+
+ // escape moses mark-up
+ if (!non_escape_p)
+ escape(text);
+
+ // return value
+ outs.assign(text);
+
+ } else {
+ // tokenize_penn case
+
+ // directed quote patches
+ size_t len = text.size();
+ if (len > 2 && text.substr(0,2) == "``")
+ text.replace(0,2,"`` ",3);
+ else if (text.at(0) == '"')
+ text.replace(0,1,"`` ",3);
+ else if (text.at(0) == '`' || text.at(0) == '\'')
+ text.replace(0,1,"` ",2);
+ static char one_gg[] = "\\1 ``";
+ RE2::GlobalReplace(&text,x1_v_d,one_gg);
+ RE2::GlobalReplace(&text,x1_v_gg,one_gg);
+ RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
+ RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
+
+ // protect ellipsis
+ for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
+ text.replace(pos,3,"MANYELIPSIS",11);
+
+ // numeric commas
+ RE2::GlobalReplace(&text,ndndcomma_x,comma_refs);
+ RE2::GlobalReplace(&text,pdndcomma_x,comma_refs);
+ RE2::GlobalReplace(&text,ndpdcomma_x,comma_refs);
+
+ // isolable symbols
+ RE2::GlobalReplace(&text,symbol_x,isolate_ref);
+
+ // isolable slash
+ RE2::GlobalReplace(&text,slash_x,special_refs);
+
+ // isolate final period
+ RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
+
+ // isolate q.m., e.m.
+ RE2::GlobalReplace(&text,qx_x,isolate_ref);
+
+ // isolate braces
+ RE2::GlobalReplace(&text,braces_x,isolate_ref);
+
+ // convert open/close punctuation
+ RE2::GlobalReplace(&text,"\\(","-LRB-");
+ RE2::GlobalReplace(&text,"\\[","-LSB-");
+ RE2::GlobalReplace(&text,"\\{","-LCB-");
+ RE2::GlobalReplace(&text,"\\)","-RRB-");
+ RE2::GlobalReplace(&text,"\\]","-RSB-");
+ RE2::GlobalReplace(&text,"\\}","-RCB-");
+
+ // isolate double-dash hyphen
+ RE2::GlobalReplace(&text,"--"," -- ");
+
+ // ensure leading and trailing space on line, to simplify exprs
+ // also make sure final . has one space on each side
+ len = text.size();
+ while (len > 1 && text.at(len-1) == ' ') --len;
+ if (len < text.size())
+ text.assign(text.substr(0,len));
+ if (len > 2 && text.at(len-1) == '.') {
+ if (text.at(len-2) != ' ') {
+ text.assign(text.substr(0,len-1));
+ text.append(" . ");
+ } else {
+ text.assign(text.substr(0,len-1));
+ text.append(". ");
+ }
+ } else {
+ text.append(" ");
+ }
+ std::string ntext(" ");
+ ntext.append(text);
+
+ // convert double quote to paired single-quotes
+ RE2::GlobalReplace(&ntext,"\""," '' ");
+
+ // deal with contractions in penn style
+ RE2::GlobalReplace(&ntext,endq_x,"\\1 ' ");
+ RE2::GlobalReplace(&ntext,contract_x," '\\1 ");
+ RE2::GlobalReplace(&ntext,"'ll "," 'll ");
+ RE2::GlobalReplace(&ntext,"'re "," 're ");
+ RE2::GlobalReplace(&ntext,"'ve "," 've ");
+ RE2::GlobalReplace(&ntext,"n't "," n't ");
+ RE2::GlobalReplace(&ntext,"'LL "," 'LL ");
+ RE2::GlobalReplace(&ntext,"'RE "," 'RE ");
+ RE2::GlobalReplace(&ntext,"'VE "," 'VE ");
+ RE2::GlobalReplace(&ntext,"N'T "," N'T ");
+ RE2::GlobalReplace(&ntext," ([Cc])annot "," \\1an not ");
+ RE2::GlobalReplace(&ntext," ([Dd])'ye "," \\1' ye ");
+ RE2::GlobalReplace(&ntext," ([Gg])imme "," \\1im me ");
+ RE2::GlobalReplace(&ntext," ([Gg])onna "," \\1on na ");
+ RE2::GlobalReplace(&ntext," ([Gg])otta "," \\1ot ta ");
+ RE2::GlobalReplace(&ntext," ([Ll])emme "," \\1em me ");
+ RE2::GlobalReplace(&ntext," ([Mm])ore'n "," \\1ore 'n ");
+ RE2::GlobalReplace(&ntext," '([Tt])is "," '\\1 is 'n ");
+ RE2::GlobalReplace(&ntext," '([Tt])was "," '\\1 was 'n ");
+ RE2::GlobalReplace(&ntext," '([Tt])were "," '\\1 were 'n ");
+ RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");
+
+ protected_tokenize(ntext);
+
+ // restore ellipsis
+ RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
+
+ // collapse spaces
+ RE2::GlobalReplace(&ntext,mult_spc_x," ");
+
+ // escape moses meta-characters
+ if (!non_escape_p)
+ escape(ntext);
+
+ // strip out wrapping spaces from line in result string
+ outs.assign(ntext.substr(1,ntext.size()-2));
+ }
+
+ return outs;
+}
+
+
+std::size_t
+Tokenizer::tokenize(std::istream& is, std::ostream& os)
+{
+ size_t line_no = 0;
+ while (is.good() && os.good()) {
+ std::string istr;
+ std::getline(is,istr);
+ line_no ++;
+ if (istr.empty())
+ continue;
+ if ((skip_xml_p && RE2::FullMatch(istr,tag_line_x)) || RE2::FullMatch(istr,white_line_x)) {
+ os << istr << std::endl;
+ } else {
+ std::string bstr(" ");
+ bstr.append(istr).append(" ");
+ os << tokenize(bstr) << std::endl;
+ }
+ if (verbose_p && ((line_no % 1000) == 0)) {
+ std::cerr << line_no << ' ';
+ std::cerr.flush();
+ }
+ }
+ return line_no;
+}
+
+
+namespace {
+
+std::string trim(const std::string& in)
+{
+ std::size_t start = 0;
+ std::size_t limit = in.size();
+ while (start < limit && in.at(start) < '!') ++start;
+ while (start < limit && in.at(limit-1) < '!') --limit;
+ if (start == limit) return std::string("");
+ if (start > 0 || limit < in.size())
+ return in.substr(start,limit-start);
+ return std::string(in);
+}
+
+
+std::vector<std::string> split(const std::string& in)
+{
+ std::vector<std::string> outv;
+ std::istringstream iss(in);
+ std::copy(std::istream_iterator<std::string>(iss),
+ std::istream_iterator<std::string>(),
+ std::back_inserter(outv));
+ return outv;
+}
+
+};
+
+
+std::string
+Tokenizer::detokenize(const std::string& buf)
+{
+ std::vector<std::string> words = split(trim(buf));
+
+ std::size_t squotes = 0;
+ std::size_t dquotes = 0;
+ std::string prepends(" ");
+
+ std::ostringstream oss;
+
+ std::size_t nwords = words.size();
+ std::size_t iword = 0;
+
+ for (auto word: words) {
+ if (RE2::FullMatch(word,right_x)) {
+ oss << prepends << word;
+ prepends.clear();
+ } else if (RE2::FullMatch(word,left_x)) {
+ oss << word;
+ prepends = " ";
+ } else if (english_p && iword && RE2::FullMatch(word,curr_en_x) && RE2::FullMatch(words[iword-1],pre_en_x)) {
+ oss << word;
+ prepends = " ";
+ } else if (latin_p && iword + 2 < nwords && RE2::FullMatch(word,curr_fr_x) && RE2::FullMatch(words[iword+1],post_fr_x)) {
+ oss << prepends << word;
+ prepends.clear();
+ } else if (word.size() == 1) {
+ if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
+ (word.at(0) == '"' && ((dquotes % 2) == 0))) {
+ if (english_p && iword && word.at(0) == '\'' && words[iword-1].at(words[iword-1].size()-1) == 's') {
+ oss << word;
+ prepends = " ";
+ } else {
+ oss << prepends << word;
+ prepends.clear();
+ if (word.at(0) == '\'')
+ squotes++;
+ else
+ dquotes++;
+ }
+ } else {
+ oss << word;
+ prepends = " ";
+ if (word.at(0) == '\'')
+ squotes++;
+ else if (word.at(0) == '"')
+ dquotes++;
+ }
+ } else {
+ oss << prepends << word;
+ prepends.clear();
+ }
+ iword++;
+ }
+
+
+ std::string text(oss.str());
+ RE2::GlobalReplace(&text," +"," ");
+ RE2::GlobalReplace(&text,"\n ","\n");
+ RE2::GlobalReplace(&text," \n","\n");
+ return trim(text);
+}
+
+
+std::size_t
+Tokenizer::detokenize(std::istream& is, std::ostream& os)
+{
+ size_t line_no = 0;
+ while (is.good() && os.good()) {
+ std::string istr;
+ std::getline(is,istr);
+ line_no ++;
+ if (istr.empty())
+ continue;
+ if ((skip_xml_p && RE2::FullMatch(istr,tag_line_x)) || RE2::FullMatch(istr,white_line_x)) {
+ os << istr << std::endl;
+ } else {
+ os << detokenize(istr) << std::endl;
+ }
+ }
+ return line_no;
+}
+
+
+#ifdef TOKENIZER_NAMESPACE
+}; // namespace
+#endif
+
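
The body above leans almost entirely on RE2::GlobalReplace with capture-group rewrites. As a rough, standalone illustration of that idiom (not part of the patch), the following reproduces the specials_x / mult_spc_x steps on a made-up line:

#include <re2/re2.h>
#include <iostream>
#include <string>

int main() {
  std::string text("tokens (like these) get split!");       // made-up example line
  RE2 specials("([^_\\p{L}\\p{N}\\s\\.\\'\\`\\,\\-])");      // same pattern as specials_x above
  RE2::GlobalReplace(&text, specials, " \\1 ");              // isolate each matched character
  RE2 mult_spc(" +");
  RE2::GlobalReplace(&text, mult_spc, " ");                  // collapse the doubled spaces
  std::cout << text << std::endl;  // prints roughly: "tokens ( like these ) get split ! "
  return 0;
}
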
diff --git a/contrib/c++tokenizer/tokenizer.h b/contrib/c++tokenizer/tokenizer.h
new file mode 100644
index 000000000..aab673cc0
--- /dev/null
+++ b/contrib/c++tokenizer/tokenizer.h
@@ -0,0 +1,115 @@
+#include <string>
+#include <iostream>
+#include <cstdlib>
+#include <fstream>
+#include <sstream>
+#include <unordered_map>
+#include <set>
+#include <vector>
+#include <iterator>
+#include <stdexcept>
+
+#include <re2/re2.h>
+#include <unistd.h>
+
+#ifdef TOKENIZER_NAMESPACE
+namespace TOKENIZER_NAMESPACE {
+#endif
+
+//
+// @about
+// Tokenizer implements the process of Koehn's tokenizer.perl via RE2
+//
+class Tokenizer {
+
+private:
+
+ static std::string cfg_dir;
+
+ std::set<std::string> nbpre_num_set;
+ std::set<std::string> nbpre_gen_set;
+ std::vector<re2::RE2 *> prot_pat_vec;
+
+protected:
+
+ // language
+ std::string lang_iso;
+ bool english_p; // is lang_iso "en"
+ bool latin_p; // is lang_iso "fr" or "it"
+ bool skip_xml_p;
+ bool skip_alltags_p;
+ bool non_escape_p;
+ bool aggressive_hyphen_p;
+ bool penn_p;
+ bool verbose_p;
+
+ std::pair<int,int> load_prefixes(std::ifstream& ifs); // used by init(), parameterized by lang_iso
+
+ // escapes specials into entities from the set &|<>'"[] (after tokenization, when enabled)
+ bool escape(std::string& inplace);
+
+ // in-place 1 line tokenizer, replaces input string, depends on wrapper to set-up invariants
+ void protected_tokenize(std::string& inplace);
+
+public:
+
+ // cfg_dir is assumed shared by all languages
+ static void set_config_dir(const std::string& _cfg_dir);
+
+ // no throw
+ Tokenizer(const std::string& _lang_iso = "en",
+ bool _skip_xml_p = true, // skips isolated (linewise) tags in any case
+ bool _skip_alltags_p = true, // skip all xml style tags
+ bool _non_escape_p = false, // default is to call escape method before return
+ bool _aggressive_hyphen_p = false, // hyphens become tokens when true
+ bool _penn_p = false, // Treebank-3 compatible tokenization when true
+ bool _verbose_p = false);
+
+ // frees dynamically compiled expressions
+ ~Tokenizer();
+
+ // required before other methods, may throw
+ void init();
+
+ // streaming tokenizer reads from is, writes to os, preserving line breaks
+ std::size_t tokenize(std::istream& is, std::ostream& os);
+
+ // tokenize padded line buffer to return string
+ std::string tokenize(const std::string& buf);
+
+ void tokenize(const std::string& buf, std::string& outs) {
+ outs = tokenize(buf);
+ }
+
+ // tokenize to a vector
+ std::vector<std::string> tokens(const std::string& in) {
+ std::istringstream tokss(tokenize(in));
+ std::vector<std::string> outv;
+ std::copy(std::istream_iterator<std::string>(tokss),
+ std::istream_iterator<std::string>(),
+ std::back_inserter(outv));
+ return outv;
+ }
+
+ // streaming detokenizer reads from is, writes to os, preserving breaks
+ std::size_t detokenize(std::istream& is, std::ostream &os);
+
+ // detokenize padded line buffer to return string
+ std::string detokenize(const std::string& buf);
+
+ void detokenize(const std::string& buf, std::string& outs) {
+ outs = detokenize(buf);
+ }
+
+ // detokenize from a vector
+ std::string detokenize(const std::vector<std::string>& inv) {
+ std::ostringstream oss;
+ std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," "));
+ return detokenize(oss.str());
+ }
+
+}; // end class Tokenizer
+
+#ifdef TOKENIZER_NAMESPACE
+};
+#endif
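
The inline helpers at the end of the class (tokens() and the vector overload of detokenize()) give a convenient round trip. A small sketch under the same hypothetical setup as the earlier example (config files reachable from the current directory, or via set_config_dir):

#include "tokenizer.h"
#include <iostream>
#include <vector>

int main() {
  Tokenizer tok("en");
  tok.init();  // assumes nonbreaking_prefix.en etc. are findable; otherwise call set_config_dir first
  std::vector<std::string> toks = tok.tokens(" A line, with (some) punctuation. ");
  for (const std::string& t : toks)
    std::cout << '[' << t << "] ";
  std::cout << std::endl;
  // detokenize() re-attaches punctuation; a best-effort inverse, roughly the original line.
  std::cout << tok.detokenize(toks) << std::endl;
  return 0;
}
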
diff --git a/contrib/c++tokenizer/tokenizer_main.cpp b/contrib/c++tokenizer/tokenizer_main.cpp
new file mode 100644
index 000000000..a4fc8f97b
--- /dev/null
+++ b/contrib/c++tokenizer/tokenizer_main.cpp
@@ -0,0 +1,212 @@
+#include "tokenizer.h"
+#include <memory>
+#include <vector>
+#include <cctype>
+#include <cstring>   // strlen
+
+#ifdef TOKENIZER_NAMESPACE
+using namespace TOKENIZER_NAMESPACE ;
+#endif
+
+
+void
+usage(const char *path)
+{
+ std::cerr << "Usage: " << path << "[-{v|x|p|a|e|]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
+ std::cerr << " -v -- verbose" << std::endl;
+ std::cerr << " -w -- word filter" << std::endl;
+ std::cerr << " -x -- skip xml tag lines" << std::endl;
+ std::cerr << " -y -- skip all xml tags" << std::endl;
+ std::cerr << " -e -- escape entities" << std::endl;
+ std::cerr << " -a -- aggressive hyphenization" << std::endl;
+ std::cerr << " -p -- treebank-3 style" << std::endl;
+ std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
+ std::cerr << " -o OUT -- output file path" << std::endl;
+ std::cerr << "Default is -c ., stdin, stdout." << std::endl;
+ std::cerr << "LL in en,fr,it affect contraction." << std::endl;
+}
+
+
+std::string token_word(const std::string& in) {
+ int pos = -1;
+ int digits_prefixed = 0;
+ int nalpha = 0;
+ int len = in.size();
+ std::vector<char> cv;
+ int last_quirk = -1;
+ while (++pos < len) {
+ char ch = in.at(pos);
+ if (std::isdigit(ch)) {
+ if (digits_prefixed > 0) {
+ last_quirk = pos;
+ break;
+ }
+ digits_prefixed--;
+ cv.push_back(std::tolower(ch));
+ } else if (std::isalpha(ch)) {
+ if (digits_prefixed < 0)
+ digits_prefixed = -digits_prefixed;
+ cv.push_back(std::tolower(ch));
+ nalpha++;
+ } else {
+ if (digits_prefixed < 0)
+ digits_prefixed = -digits_prefixed;
+ last_quirk = pos;
+ if ((ch == '-' || ch == '\'') && pos != 0) {
+ cv.push_back(ch);
+ } else {
+ break;
+ }
+ }
+ }
+ if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
+ cv.clear(); // invalid word
+ return std::string(cv.begin(),cv.end());
+}
+
+
+int
+copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
+ int nlines = 0;
+ std::string line;
+ while (ifs.good() && std::getline(ifs,line)) {
+ if (line.empty()) continue;
+ std::vector<std::string> tokens(tize.tokens(line));
+ int count = 0;
+ for (auto& token: tokens) {
+ std::string word(token_word(token));
+ if (word.empty()) continue;
+ ofs << word << ' ';
+ count++;
+ }
+ if (count) {
+ ofs << std::endl;
+ nlines++;
+ }
+ }
+ return nlines;
+}
+
+
+int main(int ac, char **av)
+{
+ int rc = 0;
+ std::string lang_iso;
+ std::vector<std::string> args;
+ std::string out_path;
+ char *cfg_path = 0;
+ bool next_cfg_p = false;
+ bool next_output_p = false;
+ bool verbose_p = false;
+ bool detag_p = false;
+ bool alltag_p = false;
+ bool escape_p = true;
+ bool aggro_p = false;
+ bool penn_p = false;
+ bool words_p = false;
+
+ const char *prog = av[0];
+ while (++av,--ac) {
+ if (**av == '-') {
+ switch (av[0][1]) {
+ case 'h':
+ usage(prog);
+ exit(0);
+ case 'c':
+ next_cfg_p = true;
+ break;
+ case 'o':
+ next_output_p = true;
+ break;
+ case 'v':
+ verbose_p = true;
+ break;
+ case 'e':
+ escape_p = false;
+ break;
+ case 'w':
+ words_p = true;
+ break;
+ case 'x':
+ detag_p = true;
+ break;
+ case 'y':
+ alltag_p = true;
+ break;
+ case 'a':
+ aggro_p = true;
+ break;
+ case 'l':
+ // ignored
+ break;
+ case 'p':
+ penn_p = true;
+ break;
+ default:
+ std::cerr << "Unknown option: " << *av << std::endl;
+ ::exit(1);
+ }
+ } else if (lang_iso.empty() && strlen(*av) == 2) {
+ lang_iso = *av;
+ } else if (**av == '-') {
+ ++*av;
+ } else if (next_output_p) {
+ next_output_p = false;
+ out_path = *av;
+ } else if (next_cfg_p) {
+ next_cfg_p = false;
+ cfg_path = *av;
+ } else {
+ args.push_back(std::string(*av));
+ }
+ }
+
+ if (!cfg_path) {
+ cfg_path = getenv("TOKENIZER_SHARED_DIR");
+ }
+ if (cfg_path) {
+ Tokenizer::set_config_dir(std::string(cfg_path));
+ }
+
+ std::unique_ptr<std::ofstream> pofs = 0;
+ if (!out_path.empty()) {
+ pofs.reset(new std::ofstream(out_path.c_str()));
+ }
+ std::ostream& ofs(pofs ? *pofs : std::cout);
+
+ Tokenizer tize(lang_iso,detag_p,alltag_p,!escape_p,aggro_p,penn_p,verbose_p);
+ tize.init();
+ size_t nlines = 0;
+
+ if (words_p) {
+ if (args.empty()) {
+ nlines += copy_words(tize,std::cin,ofs);
+ } else {
+ for (std::string& arg : args) {
+ try {
+ std::ifstream ifs(arg.c_str());
+ nlines += copy_words(tize,ifs,ofs);
+ } catch (...) {
+ std::cerr << "Exception extracting words from path " << arg << std::endl;
+ }
+ }
+ }
+ } else if (args.empty()) {
+ nlines = tize.tokenize(std::cin,ofs);
+ } else {
+ for (std::string& arg : args) {
+ try {
+ std::ifstream ifs(arg.c_str());
+ nlines = tize.tokenize(ifs,ofs);
+ } catch (...) {
+ std::cerr << "Exception tokenizing from path " << arg << std::endl;
+ }
+ }
+ }
+
+ if (verbose_p)
+ std::cerr << "%%% tokenized lines: " << nlines << std::endl;
+
+ return rc;
+}
+
+