#include "tokenizer.h"

// NOTE(review): the header names that followed "tokenizer.h" were lost (the
// source shows eleven bare `#include` tokens).  The list below is rebuilt from
// the names this file actually uses; reconcile it with version control.
#include <unistd.h>     // ::access, R_OK, X_OK
#include <glib.h>       // gunichar, GUnicodeType, g_unichar_type, g_utf8_to_ucs4_fast, ...
#include <re2/re2.h>    // RE2, re2::StringPiece

#include <algorithm>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <map>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

namespace { // anonymous namespace

// frequently used regexp's are pre-compiled thus:
RE2 genl_tags_x("<[/!\\p{L}]+[^>]*>");  // something shaped like a markup tag
RE2 mult_spc_x(" +");                   // multiple spaces
RE2 tag_line_x("^<.+>$");               // lines beginning and ending with open/close angle-bracket pairs
RE2 white_line_x("^\\s*$");             // lines entirely composed of whitespace
RE2 slash_x("([\\p{L}\\p{N}])(/)([\\p{L}\\p{N}])"); // a slash conjoining two letters/digits
RE2 final_x("([^.])([.])([\\]\\)}>\"']*) ?$");      // sentence-final punctuation sequence (non qm em)
RE2 qx_x("([?!])");                     // one qm/em mark
// The parentheses must be written "\\(" "\\)": the "\$\$" found here before
// is not a valid C++ escape and put '$' in the class instead of '(' and ')'.
RE2 braces_x("([\\]\\[\\(\\){}<>])");   // any open or close of a pair
RE2 endq_x("([^'])' ");                 // post-token single-quote or doubled single-quote
RE2 letter_x("\\p{L}");                 // a letter
RE2 lower_x("^\\p{Ll}");                // text starting with a lower-case letter
RE2 sinteger_x("^\\p{N}");              // text starting with a numeric character
RE2 numprefixed_x("[-+/.@\\\\#\\%&\\p{Sc}\\p{N}]*[\\p{N}]+-[-'`\"\\p{L}]*\\p{L}");
// "\\%" (as in numprefixed_x) replaces the invalid C++ escape "\%"; both
// spellings denote a literal '%' inside the class.
RE2 quasinumeric_x("[-.;:@\\\\#\\%&\\p{Sc}\\p{So}\\p{N}]*[\\p{N}]+");
RE2 numscript_x("([\\p{N}\\p{L}])([\\p{No}]+)(\\p{Ll})");

RE2 x1_v_d("([ ([{<])\"");              // a valid non-letter preceeding a double-quote
RE2 x1_v_gg("([ ([{<])``");             // a valid non-letter preceeding directional doubled open single-quote
RE2 x1_v_g("([ ([{<])`([^`])");         // a valid non-letter preceeding directional unitary single-quote
RE2 x1_v_q("([ ([{<])'");               // a valid non-letter preceeding undirected embedded quotes

RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])"); // non-digit,non-digit
RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])");  // digit,non-digit
RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])");  // non-digit,digit
RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])"); // usable punctuation mark not a quote or a brace

RE2 contract_x("'([sSmMdD]) ");         // english single letter contraction forms, as embedded
RE2 right_x("[({¿¡]+");                 // symbols which conjoin to the right
RE2 left_x("[,.?!:;\\%\\p{Sc}})]+");    // symbols conjoin
to the left RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to the left RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes // anything rarely used will just be given as a string and compiled on demand by RE2 const char * SPC_BYTE = " "; //const char * //URL_VALID_SYM_CHARS = "-._~:/?#[]@!$&'()*+,;="; inline bool class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) { while (s < e) { GUnicodeType tclass = g_unichar_type(*s); if (tclass == gclass) return true; switch (tclass) { case G_UNICODE_SPACING_MARK: case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_PARAGRAPH_SEPARATOR: case G_UNICODE_SPACE_SEPARATOR: ++s; continue; break; default: return false; } } return false; } const char *ESCAPE_MOSES[] = { "|", // | 0 "[", // [ 1 "]", // ] 2 "&", // & 3 (26) "<", // < 4 (3c) ">", // > 5 (3e) "'", // ' 6 (27) """, // " 7 (22) }; const std::set ESCAPE_SET = { std::string(ESCAPE_MOSES[0]), std::string(ESCAPE_MOSES[1]), std::string(ESCAPE_MOSES[2]), std::string(ESCAPE_MOSES[3]), std::string(ESCAPE_MOSES[4]), std::string(ESCAPE_MOSES[5]), std::string(ESCAPE_MOSES[6]), std::string(ESCAPE_MOSES[7]), }; const std::map ENTITY_MAP = { { std::wstring(L"""), L'"' }, { std::wstring(L"&"), L'&' }, { std::wstring(L"'"), L'\'' }, { std::wstring(L"<"), L'<' }, { std::wstring(L">"), L'>' }, { std::wstring(L" "), L'\u00A0' }, { std::wstring(L"¡"), L'\u00A1' }, { std::wstring(L"¢"), L'\u00A2' }, { std::wstring(L"£"), L'\u00A3' }, { std::wstring(L"¤"), L'\u00A4' }, { std::wstring(L"¥"), L'\u00A5' }, { std::wstring(L"¦"), L'\u00A6' }, { std::wstring(L"§"), L'\u00A7' }, { std::wstring(L"¨"), L'\u00A8' }, { std::wstring(L"©"), L'\u00A9' }, { std::wstring(L"ª"), L'\u00AA' }, { std::wstring(L"«"), L'\u00AB' }, { std::wstring(L"¬"), L'\u00AC' }, { std::wstring(L""), 
L'\u00AD' }, { std::wstring(L"®"), L'\u00AE' }, { std::wstring(L"¯"), L'\u00AF' }, { std::wstring(L"°"), L'\u00B0' }, { std::wstring(L"±"), L'\u00B1' }, { std::wstring(L"²"), L'\u00B2' }, { std::wstring(L"³"), L'\u00B3' }, { std::wstring(L"´"), L'\u00B4' }, { std::wstring(L"µ"), L'\u00B5' }, { std::wstring(L"¶"), L'\u00B6' }, { std::wstring(L"·"), L'\u00B7' }, { std::wstring(L"¸"), L'\u00B8' }, { std::wstring(L"¹"), L'\u00B9' }, { std::wstring(L"º"), L'\u00BA' }, { std::wstring(L"»"), L'\u00BB' }, { std::wstring(L"¼"), L'\u00BC' }, { std::wstring(L"½"), L'\u00BD' }, { std::wstring(L"¾"), L'\u00BE' }, { std::wstring(L"¿"), L'\u00BF' }, { std::wstring(L"À"), L'\u00C0' }, { std::wstring(L"Á"), L'\u00C1' }, { std::wstring(L"Â"), L'\u00C2' }, { std::wstring(L"Ã"), L'\u00C3' }, { std::wstring(L"Ä"), L'\u00C4' }, { std::wstring(L"Å"), L'\u00C5' }, { std::wstring(L"Æ"), L'\u00C6' }, { std::wstring(L"Ç"), L'\u00C7' }, { std::wstring(L"È"), L'\u00C8' }, { std::wstring(L"É"), L'\u00C9' }, { std::wstring(L"Ê"), L'\u00CA' }, { std::wstring(L"Ë"), L'\u00CB' }, { std::wstring(L"Ì"), L'\u00CC' }, { std::wstring(L"Í"), L'\u00CD' }, { std::wstring(L"Î"), L'\u00CE' }, { std::wstring(L"Ï"), L'\u00CF' }, { std::wstring(L"Ð"), L'\u00D0' }, { std::wstring(L"Ñ"), L'\u00D1' }, { std::wstring(L"Ò"), L'\u00D2' }, { std::wstring(L"Ó"), L'\u00D3' }, { std::wstring(L"Ô"), L'\u00D4' }, { std::wstring(L"Õ"), L'\u00D5' }, { std::wstring(L"Ö"), L'\u00D6' }, { std::wstring(L"×"), L'\u00D7' }, { std::wstring(L"Ø"), L'\u00D8' }, { std::wstring(L"Ù"), L'\u00D9' }, { std::wstring(L"Ú"), L'\u00DA' }, { std::wstring(L"Û"), L'\u00DB' }, { std::wstring(L"Ü"), L'\u00DC' }, { std::wstring(L"Ý"), L'\u00DD' }, { std::wstring(L"Þ"), L'\u00DE' }, { std::wstring(L"ß"), L'\u00DF' }, { std::wstring(L"à"), L'\u00E0' }, { std::wstring(L"á"), L'\u00E1' }, { std::wstring(L"â"), L'\u00E2' }, { std::wstring(L"ã"), L'\u00E3' }, { std::wstring(L"ä"), L'\u00E4' }, { std::wstring(L"å"), L'\u00E5' }, { std::wstring(L"æ"), 
L'\u00E6' }, { std::wstring(L"ç"), L'\u00E7' }, { std::wstring(L"è"), L'\u00E8' }, { std::wstring(L"é"), L'\u00E9' }, { std::wstring(L"ê"), L'\u00EA' }, { std::wstring(L"ë"), L'\u00EB' }, { std::wstring(L"ì"), L'\u00EC' }, { std::wstring(L"í"), L'\u00ED' }, { std::wstring(L"î"), L'\u00EE' }, { std::wstring(L"ï"), L'\u00EF' }, { std::wstring(L"ð"), L'\u00F0' }, { std::wstring(L"ñ"), L'\u00F1' }, { std::wstring(L"ò"), L'\u00F2' }, { std::wstring(L"ó"), L'\u00F3' }, { std::wstring(L"ô"), L'\u00F4' }, { std::wstring(L"õ"), L'\u00F5' }, { std::wstring(L"ö"), L'\u00F6' }, { std::wstring(L"÷"), L'\u00F7' }, { std::wstring(L"ø"), L'\u00F8' }, { std::wstring(L"ù"), L'\u00F9' }, { std::wstring(L"ú"), L'\u00FA' }, { std::wstring(L"û"), L'\u00FB' }, { std::wstring(L"ü"), L'\u00FC' }, { std::wstring(L"ý"), L'\u00FD' }, { std::wstring(L"þ"), L'\u00FE' }, { std::wstring(L"ÿ"), L'\u00FF' }, { std::wstring(L"Œ"), L'\u0152' }, { std::wstring(L"œ"), L'\u0153' }, { std::wstring(L"Š"), L'\u0160' }, { std::wstring(L"š"), L'\u0161' }, { std::wstring(L"Ÿ"), L'\u0178' }, { std::wstring(L"ƒ"), L'\u0192' }, { std::wstring(L"ˆ"), L'\u02C6' }, { std::wstring(L"˜"), L'\u02DC' }, { std::wstring(L"Α"), L'\u0391' }, { std::wstring(L"Β"), L'\u0392' }, { std::wstring(L"Γ"), L'\u0393' }, { std::wstring(L"Δ"), L'\u0394' }, { std::wstring(L"Ε"), L'\u0395' }, { std::wstring(L"Ζ"), L'\u0396' }, { std::wstring(L"Η"), L'\u0397' }, { std::wstring(L"Θ"), L'\u0398' }, { std::wstring(L"Ι"), L'\u0399' }, { std::wstring(L"Κ"), L'\u039A' }, { std::wstring(L"Λ"), L'\u039B' }, { std::wstring(L"Μ"), L'\u039C' }, { std::wstring(L"Ν"), L'\u039D' }, { std::wstring(L"Ξ"), L'\u039E' }, { std::wstring(L"Ο"), L'\u039F' }, { std::wstring(L"Π"), L'\u03A0' }, { std::wstring(L"Ρ"), L'\u03A1' }, { std::wstring(L"Σ"), L'\u03A3' }, { std::wstring(L"Τ"), L'\u03A4' }, { std::wstring(L"Υ"), L'\u03A5' }, { std::wstring(L"Φ"), L'\u03A6' }, { std::wstring(L"Χ"), L'\u03A7' }, { std::wstring(L"Ψ"), L'\u03A8' }, { std::wstring(L"Ω"), 
L'\u03A9' }, { std::wstring(L"α"), L'\u03B1' }, { std::wstring(L"β"), L'\u03B2' }, { std::wstring(L"γ"), L'\u03B3' }, { std::wstring(L"δ"), L'\u03B4' }, { std::wstring(L"ε"), L'\u03B5' }, { std::wstring(L"ζ"), L'\u03B6' }, { std::wstring(L"η"), L'\u03B7' }, { std::wstring(L"θ"), L'\u03B8' }, { std::wstring(L"ι"), L'\u03B9' }, { std::wstring(L"κ"), L'\u03BA' }, { std::wstring(L"λ"), L'\u03BB' }, { std::wstring(L"μ"), L'\u03BC' }, { std::wstring(L"ν"), L'\u03BD' }, { std::wstring(L"ξ"), L'\u03BE' }, { std::wstring(L"ο"), L'\u03BF' }, { std::wstring(L"π"), L'\u03C0' }, { std::wstring(L"ρ"), L'\u03C1' }, { std::wstring(L"ς"), L'\u03C2' }, { std::wstring(L"σ"), L'\u03C3' }, { std::wstring(L"τ"), L'\u03C4' }, { std::wstring(L"υ"), L'\u03C5' }, { std::wstring(L"φ"), L'\u03C6' }, { std::wstring(L"χ"), L'\u03C7' }, { std::wstring(L"ψ"), L'\u03C8' }, { std::wstring(L"ω"), L'\u03C9' }, { std::wstring(L"ϑ"), L'\u03D1' }, { std::wstring(L"ϒ"), L'\u03D2' }, { std::wstring(L"ϖ"), L'\u03D6' }, { std::wstring(L" "), L'\u2002' }, { std::wstring(L" "), L'\u2003' }, { std::wstring(L" "), L'\u2009' }, { std::wstring(L"‌"), L'\u200C' }, { std::wstring(L"‍"), L'\u200D' }, { std::wstring(L"‎"), L'\u200E' }, { std::wstring(L"‏"), L'\u200F' }, { std::wstring(L"–"), L'\u2013' }, { std::wstring(L"—"), L'\u2014' }, { std::wstring(L"‘"), L'\u2018' }, { std::wstring(L"’"), L'\u2019' }, { std::wstring(L"‚"), L'\u201A' }, { std::wstring(L"“"), L'\u201C' }, { std::wstring(L"”"), L'\u201D' }, { std::wstring(L"„"), L'\u201E' }, { std::wstring(L"†"), L'\u2020' }, { std::wstring(L"‡"), L'\u2021' }, { std::wstring(L"•"), L'\u2022' }, { std::wstring(L"…"), L'\u2026' }, { std::wstring(L"‰"), L'\u2030' }, { std::wstring(L"′"), L'\u2032' }, { std::wstring(L"″"), L'\u2033' }, { std::wstring(L"‹"), L'\u2039' }, { std::wstring(L"›"), L'\u203A' }, { std::wstring(L"‾"), L'\u203E' }, { std::wstring(L"⁄"), L'\u2044' }, { std::wstring(L"€"), L'\u20AC' }, { std::wstring(L"ℑ"), L'\u2111' }, { std::wstring(L"℘"), 
L'\u2118' }, { std::wstring(L"ℜ"), L'\u211C' }, { std::wstring(L"™"), L'\u2122' }, { std::wstring(L"ℵ"), L'\u2135' }, { std::wstring(L"←"), L'\u2190' }, { std::wstring(L"↑"), L'\u2191' }, { std::wstring(L"→"), L'\u2192' }, { std::wstring(L"↓"), L'\u2193' }, { std::wstring(L"↔"), L'\u2194' }, { std::wstring(L"↵"), L'\u21B5' }, { std::wstring(L"⇐"), L'\u21D0' }, { std::wstring(L"⇑"), L'\u21D1' }, { std::wstring(L"⇒"), L'\u21D2' }, { std::wstring(L"⇓"), L'\u21D3' }, { std::wstring(L"⇔"), L'\u21D4' }, { std::wstring(L"∀"), L'\u2200' }, { std::wstring(L"∂"), L'\u2202' }, { std::wstring(L"∃"), L'\u2203' }, { std::wstring(L"∅"), L'\u2205' }, { std::wstring(L"∇"), L'\u2207' }, { std::wstring(L"∈"), L'\u2208' }, { std::wstring(L"∉"), L'\u2209' }, { std::wstring(L"∋"), L'\u220B' }, { std::wstring(L"∏"), L'\u220F' }, { std::wstring(L"∑"), L'\u2211' }, { std::wstring(L"−"), L'\u2212' }, { std::wstring(L"∗"), L'\u2217' }, { std::wstring(L"√"), L'\u221A' }, { std::wstring(L"∝"), L'\u221D' }, { std::wstring(L"∞"), L'\u221E' }, { std::wstring(L"∠"), L'\u2220' }, { std::wstring(L"∧"), L'\u2227' }, { std::wstring(L"∨"), L'\u2228' }, { std::wstring(L"∩"), L'\u2229' }, { std::wstring(L"∪"), L'\u222A' }, { std::wstring(L"∫"), L'\u222B' }, { std::wstring(L"∴"), L'\u2234' }, { std::wstring(L"∼"), L'\u223C' }, { std::wstring(L"≅"), L'\u2245' }, { std::wstring(L"≈"), L'\u2248' }, { std::wstring(L"≠"), L'\u2260' }, { std::wstring(L"≡"), L'\u2261' }, { std::wstring(L"≤"), L'\u2264' }, { std::wstring(L"≥"), L'\u2265' }, { std::wstring(L"⊂"), L'\u2282' }, { std::wstring(L"⊃"), L'\u2283' }, { std::wstring(L"⊄"), L'\u2284' }, { std::wstring(L"⊆"), L'\u2286' }, { std::wstring(L"⊇"), L'\u2287' }, { std::wstring(L"⊕"), L'\u2295' }, { std::wstring(L"⊗"), L'\u2297' }, { std::wstring(L"⊥"), L'\u22A5' }, { std::wstring(L"⋅"), L'\u22C5' }, { std::wstring(L"⌈"), L'\u2308' }, { std::wstring(L"⌉"), L'\u2309' }, { std::wstring(L"⌊"), L'\u230A' }, { std::wstring(L"⌋"), L'\u230B' }, { std::wstring(L"⟨"), 
L'\u2329' }, { std::wstring(L"⟩"), L'\u232A' }, { std::wstring(L"◊"), L'\u25CA' }, { std::wstring(L"♠"), L'\u2660' }, { std::wstring(L"♣"), L'\u2663' }, { std::wstring(L"♥"), L'\u2665' }, { std::wstring(L"♦"), L'\u2666' } }; inline gunichar get_entity(gunichar *ptr, size_t len) { // try hex, decimal entity first gunichar ech(0); if (ptr[1] == gunichar(L'#') && len > 3) { std::wstringstream wss; int wch = 0; try { wss << std::hex << std::wstring((wchar_t *)(ptr+2),len-3); wss >> wch; ech = gunichar(wch); } catch (...) { ech = 0; } } else if (g_unichar_type(ptr[1]) == G_UNICODE_DECIMAL_NUMBER) { std::wstringstream wss; int wch = 0; try { wss << std::dec << std::wstring((wchar_t *)(ptr+1),len-2); wss >> wch; ech = gunichar(wch); } catch (...) { ech = 0; } } if (ech) return ech; std::map::const_iterator it = ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len)); return it != ENTITY_MAP.end() ? it->second : gunichar(0); } inline gunichar get_entity(char *ptr, size_t len) { glong ulen = 0; gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &ulen); gunichar gch = get_entity(gtmp,ulen); g_free(gtmp); return gch; } inline std::string trim(const std::string& in) { std::size_t start = 0; std::size_t limit = in.size(); while (start < limit && in.at(start) < '!') ++start; while (start < limit && in.at(limit-1) < '!') --limit; if (start == limit) return std::string(""); if (start > 0 || limit < in.size()) return in.substr(start,limit-start); return std::string(in); } inline std::vector split(const std::string& in) { std::vector outv; std::istringstream iss(in); std::copy(std::istream_iterator(iss), std::istream_iterator(), std::back_inserter(outv)); return outv; } }; // end anonymous namespace #ifdef TOKENIZER_NAMESPACE namespace TOKENIZER_NAMESPACE { #endif void Tokenizer::set_config_dir(const std::string& dir) { if (dir.empty()) { cfg_dir = "."; } else { cfg_dir.assign(dir); } } Tokenizer::Tokenizer(const Parameters& _) : nthreads(_.nthreads ? 
_.nthreads : 1) , chunksize(_.chunksize) , lang_iso(_.lang_iso) , english_p(_.lang_iso.compare("en")==0) , latin_p((!english_p) && (_.lang_iso.compare("fr")==0 || _.lang_iso.compare("it")==0)) , skip_xml_p(_.detag_p) , skip_alltags_p(_.alltag_p) , entities_p(_.entities_p) , escape_p(_.escape_p) , unescape_p(_.unescape_p) , aggressive_hyphen_p(_.aggro_p) , supersub_p(_.supersub_p) , url_p(_.url_p) , downcase_p(_.downcase_p) , normalize_p(_.normalize_p) , penn_p(_.penn_p) , narrow_latin_p(_.narrow_latin_p) , narrow_kana_p(_.narrow_kana_p) , refined_p(_.refined_p) , drop_bad_p(_.drop_bad_p) , splits_p(_.split_p) , verbose_p(_.verbose_p) , para_marks_p(_.para_marks_p) , split_breaks_p(_.split_breaks_p) { if (_.cfg_path) set_config_dir(_.cfg_path); } // // dtor deletes dynamically allocated per-language RE2 compiled expressions // Tokenizer::~Tokenizer() { for (auto& ptr : prot_pat_vec) { if (ptr == &numprefixed_x || ptr == &quasinumeric_x) continue; delete ptr; } } // // stuffs numeric-only prefixes into nbpre_num_set, // others into nbpre_gen_set // std::pair Tokenizer::load_prefixes(std::ifstream& ifs) { RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)"); std::string line; int nnon = 0; int nnum = 0; while (std::getline(ifs,line)) { if (!line.empty() && line[0] != '#') { std::string prefix; if (RE2::PartialMatch(line,numonly,&prefix)) { nbpre_num_set.insert(prefix); gunichar * x=g_utf8_to_ucs4_fast((const gchar *)prefix.c_str(),prefix.size(),0); nbpre_num_ucs4.insert(std::wstring((wchar_t *)x)); g_free(x); nnum++; } else { nbpre_gen_set.insert(line); gunichar * x=g_utf8_to_ucs4_fast((const gchar *)line.c_str(),line.size(),0); nbpre_gen_ucs4.insert(std::wstring((wchar_t *)x)); g_free(x); nnon++; } } } return std::make_pair(nnon,nnum); } // // load files (make sure to call set_config_dir before, if ever // for nonbreaking prefixes and protected patterns // void Tokenizer::init(const char *cfg_dir_optional) { if (cfg_dir_optional) 
set_config_dir(std::string(cfg_dir_optional)); std::string dir_path(cfg_dir); dir_path.append("/nonbreaking_prefixes"); if (::access(dir_path.c_str(),X_OK)) { dir_path = cfg_dir; } std::string nbpre_path(dir_path); nbpre_path.append("/nonbreaking_prefix.").append(lang_iso); // default to generic version if (::access(nbpre_path.c_str(),R_OK)) nbpre_path = nbpre_path.substr(0,nbpre_path.size()-lang_iso.size()-1); if (::access(nbpre_path.c_str(),R_OK) == 0) { std::ifstream cfg(nbpre_path.c_str()); try { std::pair counts = load_prefixes(cfg); if (verbose_p) { std::cerr << "loaded " << counts.first << " non-numeric, " << counts.second << " numeric prefixes from " << nbpre_path << std::endl; } } catch (...) { std::ostringstream ess; ess << "I/O error reading " << nbpre_path << " in " << __FILE__ << " at " << __LINE__; throw std::runtime_error(ess.str()); } } else if (verbose_p) { std::cerr << "no prefix file found: " << nbpre_path << std::endl; } if (nbpre_gen_set.empty() && nbpre_num_set.empty()) { std::ostringstream ess; ess << "Error at " << __FILE__ << ":" << __LINE__ << " : " << "No known abbreviations for language " << lang_iso; throw std::runtime_error(ess.str()); } std::string protpat_path(cfg_dir); protpat_path.append("/protected_pattern.").append(lang_iso); // default to generic version if (::access(protpat_path.c_str(),R_OK)) protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1); prot_pat_vec.push_back(&numprefixed_x); prot_pat_vec.push_back(&quasinumeric_x); if (::access(protpat_path.c_str(),R_OK) == 0) { std::ifstream cfg(protpat_path.c_str()); char linebuf[1028]; int npat = 0; try { linebuf[0]='('; while (cfg.good()) { cfg.getline(linebuf+1,1024); if (linebuf[1] && linebuf[1] != '#') { strcat(linebuf,")"); prot_pat_vec.push_back(new RE2(linebuf)); npat++; } } } catch (...) 
{ std::ostringstream ess; ess << "I/O error reading " << protpat_path << " in " << __FILE__ << " at " << __LINE__; throw std::runtime_error(ess.str()); } if (verbose_p) { std::cerr << "loaded " << npat << " protected patterns from " << protpat_path << std::endl; } } else if (verbose_p) { std::cerr << "no protected file found: " << protpat_path << std::endl; } } void Tokenizer::reset() { } // // apply ctor-selected tokenization to a string, in-place, no newlines allowed, // assumes protections are applied already, some invariants are in place, // e.g. that successive chars <= ' ' have been normalized to a single ' ' // void Tokenizer::protected_tokenize(std::string& text) { std::vector words; re2::StringPiece textpc(text); int pos = 0; if (textpc[pos] == ' ') ++pos; size_t next = text.find(' ',pos); while (next != std::string::npos) { if (next - pos) words.push_back(textpc.substr(pos,next-pos)); pos = next + 1; while (pos < textpc.size() && textpc[pos] == ' ') ++pos; next = textpc.find(' ',pos); } if (pos < textpc.size() && textpc[pos] != ' ') words.push_back(textpc.substr(pos,textpc.size()-pos)); // regurgitate words with look-ahead handling for tokens with final mumble std::string outs; std::size_t nwords(words.size()); for (size_t ii = 0; ii < nwords; ++ii) { bool more_p = ii < nwords - 1; size_t len = words[ii].size(); bool sentence_break_p = len > 1 && words[ii][len-1] == '.'; // suppress break if it is an non-breaking prefix if (sentence_break_p) { re2::StringPiece pfx(words[ii].substr(0,len-1)); std::string pfxs(pfx.as_string()); if (nbpre_gen_set.find(pfxs) != nbpre_gen_set.end()) { // general non-breaking prefix sentence_break_p = false; } else if (more_p && nbpre_num_set.find(pfxs) != nbpre_num_set.end() && RE2::PartialMatch(words[ii+1],sinteger_x)) { // non-breaking before numeric sentence_break_p = false; } else if (pfxs.find('.') != std::string::npos && RE2::PartialMatch(pfx,letter_x)) { // terminal isolated letter does not break sentence_break_p = 
false; } else if (more_p && RE2::PartialMatch(words[ii+1],lower_x)) { // lower-case look-ahead does not break sentence_break_p = false; } } outs.append(words[ii].data(),len); if (sentence_break_p) outs.append(" ."); if (more_p) outs.append(SPC_BYTE,1); } text.assign(outs.begin(),outs.end()); } bool Tokenizer::unescape(std::string& word) { std::ostringstream oss; std::size_t was = 0; // last processed std::size_t pos = 0; // last unprocessed std::size_t len = 0; // processed length bool hit = false; for (std::size_t endp=0; (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos; was = endp == std::string::npos ? pos : 1+endp) { len = endp - pos + 1; glong ulen(0); gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen); gunichar gbuf[2] = { 0 }; if ((gbuf[0] = get_entity(gtmp,ulen)) != gunichar(0)) { gchar *gstr = g_ucs4_to_utf8(gbuf,ulen,0,0,0); if (escape_p && ESCAPE_SET.find(std::string(gstr)) != ESCAPE_SET.end()) { // do not unescape moses escapes when escape flag is turned on oss << word.substr(was,1+endp-was); } else { if (was < pos) oss << word.substr(was,pos-was); oss << gstr; was += ulen; hit = true; } g_free(gstr); } else { oss << word.substr(was,1+endp-was); } g_free(gtmp); } if (was < word.size()) oss << word.substr(was); if (hit) word = oss.str(); return hit; } bool Tokenizer::escape(std::string& text) { bool mod_p = false; std::string outs; const char *pp = text.c_str(); // from pp to pt is uncopied const char *ep = pp + text.size(); const char *pt = pp; while (pt < ep) { if (*pt & 0x80) { const char *mk = (const char *)g_utf8_find_next_char((const gchar *)pt,(const gchar *)ep); if (!mk) { if (mod_p) outs.append(pp,pt-pp+1); } else { if (mod_p) outs.append(pp,mk-pp); pt = --mk; } pp = ++pt; continue; } const char *sequence_p = 0; if (*pt < '?') { if (*pt == '&') { // check for a pre-existing escape const char *sc = strchr(pt,';'); if (!sc || sc-pt < 2 || sc-pt > 9) { sequence_p = 
ESCAPE_MOSES[3]; } } else if (*pt == '\'') { sequence_p = ESCAPE_MOSES[6]; } else if (*pt == '"') { sequence_p = ESCAPE_MOSES[7]; } } else if (*pt > ']') { if (*pt =='|') { // 7c sequence_p = ESCAPE_MOSES[0]; } } else if (*pt > 'Z') { if (*pt == '<') { // 3e sequence_p = ESCAPE_MOSES[4]; } else if (*pt == '>') { // 3c sequence_p = ESCAPE_MOSES[5]; } else if (*pt == '[') { // 5b sequence_p = ESCAPE_MOSES[1]; } else if (*pt == ']') { // 5d sequence_p = ESCAPE_MOSES[2]; } } if (sequence_p) { if (pt > pp) outs.append(pp,pt-pp); outs.append(sequence_p); mod_p = true; pp = ++pt; } else { ++pt; } } if (mod_p) { if (pp < pt) { outs.append(pp,pt-pp); } text.assign(outs.begin(),outs.end()); } return mod_p; } std::string Tokenizer::penn_tokenize(const std::string& buf) { static const char *comma_refs = "\\1 , \\2"; static const char *isolate_ref = " \\1 "; static const char *special_refs = "\\1 @\\2@ \\3"; std::string text(buf); std::string outs; if (skip_alltags_p) RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE); // directed quote patches size_t len = text.size(); if (len > 2 && text.substr(0,2) == "``") text.replace(0,2,"`` ",3); else if (text[0] == '"') text.replace(0,1,"`` ",3); else if (text[0] == '`' || text[0] == '\'') text.replace(0,1,"` ",2); static char one_gg[] = "\\1 ``"; RE2::GlobalReplace(&text,x1_v_d,one_gg); RE2::GlobalReplace(&text,x1_v_gg,one_gg); RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2"); RE2::GlobalReplace(&text,x1_v_q,"\\1 ` "); // protect ellipsis for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11)) text.replace(pos,3,"MANYELIPSIS",11); // numeric commas RE2::GlobalReplace(&text,ndndcomma_x,comma_refs); RE2::GlobalReplace(&text,pdndcomma_x,comma_refs); RE2::GlobalReplace(&text,ndpdcomma_x,comma_refs); // isolable symbols RE2::GlobalReplace(&text,symbol_x,isolate_ref); // isolable slash RE2::GlobalReplace(&text,slash_x,special_refs); // isolate final period RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3"); // 
isolate q.m., e.m. RE2::GlobalReplace(&text,qx_x,isolate_ref); // isolate braces RE2::GlobalReplace(&text,braces_x,isolate_ref); // convert open/close punctuation RE2::GlobalReplace(&text,"\$","-LRB-"); RE2::GlobalReplace(&text,"\\[","-LSB-"); RE2::GlobalReplace(&text,"\\{","-LCB-"); RE2::GlobalReplace(&text,"\$","-RRB-"); RE2::GlobalReplace(&text,"\\]","-RSB-"); RE2::GlobalReplace(&text,"\\}","-RCB-"); // isolate double-dash hyphen RE2::GlobalReplace(&text,"--"," -- "); // insure leading and trailing space on line, to simplify exprs // also make sure final . has one space on each side len = text.size(); while (len > 1 && text[len-1] == ' ') --len; if (len < text.size()) text.assign(text.substr(0,len)); if (len > 2 && text[len-1] == '.') { if (text[len-2] != ' ') { text.assign(text.substr(0,len-1)); text.append(" . "); } else { text.assign(text.substr(0,len-1)); text.append(". "); } } else { text.append(SPC_BYTE,1); } std::string ntext(SPC_BYTE); ntext.append(text); // convert double quote to paired single-quotes RE2::GlobalReplace(&ntext,"\""," '' "); // deal with contractions in penn style RE2::GlobalReplace(&ntext,endq_x,"\\1 ' "); RE2::GlobalReplace(&ntext,contract_x," '\\1 "); RE2::GlobalReplace(&ntext,"'ll "," 'll "); RE2::GlobalReplace(&ntext,"'re "," 're "); RE2::GlobalReplace(&ntext,"'ve "," 've "); RE2::GlobalReplace(&ntext,"n't "," n't "); RE2::GlobalReplace(&ntext,"'LL "," 'LL "); RE2::GlobalReplace(&ntext,"'RE "," 'RE "); RE2::GlobalReplace(&ntext,"'VE "," 'VE "); RE2::GlobalReplace(&ntext,"N'T "," N'T "); RE2::GlobalReplace(&ntext," ([Cc])annot "," \\1an not "); RE2::GlobalReplace(&ntext," ([Dd])'ye "," \\1' ye "); RE2::GlobalReplace(&ntext," ([Gg])imme "," \\1im me "); RE2::GlobalReplace(&ntext," ([Gg])onna "," \\1on na "); RE2::GlobalReplace(&ntext," ([Gg])otta "," \\1ot ta "); RE2::GlobalReplace(&ntext," ([Ll])emme "," \\1em me "); RE2::GlobalReplace(&ntext," ([Mm])ore'n "," \\1ore 'n "); RE2::GlobalReplace(&ntext," '([Tt])is "," '\\1 is 'n "); 
RE2::GlobalReplace(&ntext," '([Tt])was "," '\\1 was 'n "); RE2::GlobalReplace(&ntext," '([Tt])were "," '\\1 were 'n "); RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na "); protected_tokenize(ntext); // restore ellipsis RE2::GlobalReplace(&ntext,"MANYELIPSIS","..."); // collapse spaces RE2::GlobalReplace(&ntext,mult_spc_x,SPC_BYTE); // escape moses meta-characters if (escape_p) escape(ntext); // strip out wrapping spaces from line in result string outs.assign(ntext.substr(1,ntext.size()-2)); return outs; } std::string Tokenizer::quik_tokenize(const std::string& buf) { std::string text(buf); size_t pos; int num = 0; // this is the main moses-compatible tokenizer // push all the prefixes matching protected patterns std::vector prot_stack; std::string match; for (auto& pat : prot_pat_vec) { pos = 0; while (RE2::PartialMatch(text.substr(pos),*pat,&match)) { pos = text.find(match,pos); if (pos == std::string::npos) break; size_t len = match.size(); if (text[pos-1] == ' ' || text[pos-1] == '\'' || text[pos-1] == '`'|| text[pos-1] == '"') { char subst[32]; int nsubst = snprintf(subst,sizeof(subst)," THISISPROTECTED%.3d ",num++); text.replace(pos,len,subst,nsubst); prot_stack.push_back(match); pos += nsubst; } else { pos += len; } } } const char *pt(text.c_str()); const char *ep(pt + text.size()); while (pt < ep && *pt >= 0 && *pt <= ' ') ++pt; glong ulen(0); gunichar *usrc(g_utf8_to_ucs4_fast((const gchar *)pt,ep - pt, &ulen)); // g_free gunichar *ucs4(usrc); gunichar *lim4(ucs4 + ulen); gunichar *nxt4 = ucs4; gunichar *ubuf(g_new0(gunichar,ulen*6+1)); // g_free gunichar *uptr(ubuf); gunichar prev_uch(0); gunichar next_uch(*ucs4); gunichar curr_uch(0); GUnicodeType curr_type(G_UNICODE_UNASSIGNED); GUnicodeType next_type((ucs4 && *ucs4) ? 
g_unichar_type(*ucs4) : G_UNICODE_UNASSIGNED); GUnicodeType prev_type(G_UNICODE_UNASSIGNED); bool post_break_p = false; bool in_num_p = next_uch <= gunichar(L'9') && next_uch >= gunichar(L'0'); bool in_url_p = false; int since_start = 0; int alpha_prefix = 0; int bad_length = 0; while (ucs4 < lim4) { prev_uch = curr_uch; prev_type = curr_type; curr_uch = next_uch; curr_type = next_type; if (++nxt4 >= lim4) { next_uch = 0; next_type = G_UNICODE_UNASSIGNED; } else { next_uch = *nxt4; next_type = g_unichar_type(next_uch); } if (url_p) { if (!in_url_p && *ucs4 < 0x80L) { // url chars must be in the basic plane if (!since_start) { if (std::isalpha(char(*ucs4))) alpha_prefix++; } else if (alpha_prefix == since_start && char(*ucs4) == ':' && next_type != G_UNICODE_SPACE_SEPARATOR) { in_url_p = true; } } } bool pre_break_p = false; const wchar_t *substitute_p = 0; if (post_break_p) { *uptr++ = gunichar(L' '); since_start = bad_length = 0; in_url_p = in_num_p = post_break_p = false; } retry: switch (curr_type) { case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_TITLECASE_LETTER: if (in_url_p || in_num_p) pre_break_p = true; // fallthough case G_UNICODE_UPPERCASE_LETTER: case G_UNICODE_LOWERCASE_LETTER: if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER) curr_uch = g_unichar_tolower(*ucs4); break; case G_UNICODE_SPACING_MARK: pre_break_p = true; in_num_p = false; curr_uch = 0; break; case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_LETTER_NUMBER: case G_UNICODE_OTHER_NUMBER: if (!in_num_p && !in_url_p) { switch (prev_type) { case G_UNICODE_DASH_PUNCTUATION: case G_UNICODE_FORMAT: case G_UNICODE_OTHER_PUNCTUATION: case G_UNICODE_UPPERCASE_LETTER: case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_DECIMAL_NUMBER: break; default: pre_break_p = true; } } in_num_p = true; break; case G_UNICODE_CONNECT_PUNCTUATION: if (curr_uch != gunichar(L'_')) { if (in_url_p) { in_url_p = false; post_break_p = pre_break_p = true; } } if (in_num_p) { post_break_p = 
pre_break_p = true; } else { switch (next_type) { case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_TITLECASE_LETTER: break; default: post_break_p = pre_break_p = true; } switch (prev_type) { case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_TITLECASE_LETTER: break; default: post_break_p = pre_break_p = true; } } break; case G_UNICODE_FORMAT: in_url_p = in_num_p = false; break; case G_UNICODE_DASH_PUNCTUATION: if (aggressive_hyphen_p && !in_url_p && curr_uch != next_uch && prev_uch != curr_uch && (!(prev_uch == L' ' || !prev_uch) && !(next_uch == L' ' || !next_uch))) { substitute_p = L"@-@"; post_break_p = pre_break_p = true; } else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) || ( curr_uch > gunichar(L'\u2011') && curr_uch != gunichar(L'\u30A0') && curr_uch < gunichar(L'\uFE63') ) ) { // dash, not a hyphen post_break_p = pre_break_p = true; } else if (next_type == G_UNICODE_SPACE_SEPARATOR) { } else { if (prev_type == curr_type) { if (next_type != curr_type) { post_break_p = !in_url_p; } } else if (next_type == curr_type) { pre_break_p = !in_url_p; } else if ((prev_type == G_UNICODE_UPPERCASE_LETTER || prev_type == G_UNICODE_LOWERCASE_LETTER) && next_type == G_UNICODE_DECIMAL_NUMBER) { in_num_p = false; } else if (in_num_p || since_start == 0) { switch (next_type) { case G_UNICODE_UPPERCASE_LETTER: case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_TITLECASE_LETTER: case G_UNICODE_SPACE_SEPARATOR: in_num_p = false; break; case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_LETTER_NUMBER: case G_UNICODE_OTHER_NUMBER: case G_UNICODE_OTHER_PUNCTUATION: break; default: post_break_p = true; pre_break_p = prev_uch != curr_uch; } } else if (in_url_p) { pre_break_p = curr_uch != gunichar(L'-'); } else { switch (prev_type) { case G_UNICODE_UPPERCASE_LETTER: case 
G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_TITLECASE_LETTER: case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_LETTER_NUMBER: case G_UNICODE_OTHER_NUMBER: case G_UNICODE_OTHER_PUNCTUATION: switch (next_type) { case G_UNICODE_UPPERCASE_LETTER: case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: case G_UNICODE_TITLECASE_LETTER: case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_LETTER_NUMBER: case G_UNICODE_OTHER_NUMBER: break; case G_UNICODE_OTHER_PUNCTUATION: if (prev_type != next_type) break; default: post_break_p = pre_break_p = prev_uch != curr_uch; } break; default: post_break_p = pre_break_p = prev_uch != curr_uch; break; } } } break; case G_UNICODE_OTHER_PUNCTUATION: switch (curr_uch) { case gunichar(L':'): case gunichar(L'/'): if (refined_p && !in_url_p && prev_type == G_UNICODE_DECIMAL_NUMBER && next_type == G_UNICODE_DECIMAL_NUMBER) { break; } // fall-through case gunichar(L'!'): case gunichar(L'#'): case gunichar(L';'): case gunichar(L'?'): case gunichar(L'@'): post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR; break; case gunichar(L'+'): post_break_p = pre_break_p = !in_num_p && since_start > 0; in_num_p = in_num_p || since_start == 0; break; case gunichar(L'&'): if (unescape_p) { if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) { gunichar *eptr = nxt4; GUnicodeType eptr_type(G_UNICODE_UNASSIGNED); for (++eptr; eptr < lim4 && *eptr != gunichar(L';'); ++eptr) { eptr_type = g_unichar_type(*eptr); if (eptr_type != G_UNICODE_LOWERCASE_LETTER && eptr_type != G_UNICODE_UPPERCASE_LETTER && eptr_type != G_UNICODE_DECIMAL_NUMBER) break; } gunichar ech(0); if (*eptr == gunichar(L';') && (ech = get_entity(ucs4,eptr-ucs4+1))) { curr_uch = ech; curr_type = g_unichar_type(ech); ucs4 = eptr; nxt4 = ++eptr; next_uch = *nxt4; next_type = 
nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED; goto retry; } } } if (entities_p && !in_url_p) { gunichar *cur4 = nxt4; if (*cur4 == gunichar('#')) ++cur4; while (g_unichar_isalnum(*cur4)) ++cur4; if (cur4 > nxt4 && *cur4 == gunichar(';')) { if (since_start) { *uptr++ = gunichar(L' '); since_start = 0; } ++cur4; memcpy(uptr,ucs4,cur4-ucs4); uptr += cur4-ucs4; ucs4 = cur4; *uptr++ = gunichar(L' '); pre_break_p = post_break_p = false; curr_uch = *ucs4; curr_type = ucs4 < lim4 ? g_unichar_type(curr_uch) : G_UNICODE_UNASSIGNED; nxt4 = ++cur4; next_uch = *nxt4; next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED; goto retry; } } post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR; if (escape_p) substitute_p = L"&"; break; case gunichar(L'\''): if (english_p) { if (!in_url_p) { bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER; pre_break_p = true; if (next_letter_p && refined_p) { // break sha n't instead of shan 't: if (prev_uch == gunichar(L'n') || prev_uch == gunichar(L'N')) { *(uptr - 1) = gunichar(L' '); *(uptr++) = prev_uch; pre_break_p = false; } } post_break_p = since_start == 0 || (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER); } } else if (latin_p) { post_break_p = !in_url_p; pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER; } else { post_break_p = pre_break_p = !in_url_p; } if (escape_p) substitute_p = L"'"; break; case gunichar(L'"'): post_break_p = pre_break_p = true; if (escape_p) substitute_p = L"""; break; case gunichar(L','): pre_break_p = !in_num_p || next_type != G_UNICODE_DECIMAL_NUMBER; post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER; break; case gunichar(L'%'): if (refined_p) { pre_break_p = !in_num_p; post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER; } else { post_break_p = pre_break_p = true; } break; case gunichar(L'.'): if 
(prev_uch != '.') { if (!in_num_p) { switch (next_type) { case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_UPPERCASE_LETTER: break; default: if (since_start > 0) { switch (prev_type) { case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_UPPERCASE_LETTER: { std::wstring k((wchar_t *)(uptr-since_start),since_start); if (nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) { // general non-breaking prefix } else if (nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end() && class_follows_p(nxt4,lim4,G_UNICODE_DECIMAL_NUMBER)) { // non-breaking before numeric } else if (k.find(curr_uch) != std::wstring::npos) { if (since_start > 1) { GUnicodeType tclass = g_unichar_type(*(uptr-2)); switch (tclass) { case G_UNICODE_UPPERCASE_LETTER: case G_UNICODE_LOWERCASE_LETTER: pre_break_p = true; break; default: break; } } // terminal isolated letter does not break } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) || g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) { // lower-case look-ahead does not break } else { pre_break_p = true; } break; } default: pre_break_p = true; break; } } break; } } else { switch (next_type) { case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_UPPERCASE_LETTER: break; default: pre_break_p = true; } } } else if (next_uch != '.') { post_break_p = true; } break; default: post_break_p = pre_break_p = true; break; } break; case G_UNICODE_CLOSE_PUNCTUATION: case G_UNICODE_FINAL_PUNCTUATION: case G_UNICODE_INITIAL_PUNCTUATION: case G_UNICODE_OPEN_PUNCTUATION: switch (curr_uch) { case gunichar(L'('): case gunichar(L')'): break; case gunichar(L'['): if (escape_p) substitute_p = L"["; break; case gunichar(L']'): if (escape_p) substitute_p = L"]"; break; default: in_url_p = false; } post_break_p = pre_break_p = !in_url_p; break; case G_UNICODE_CURRENCY_SYMBOL: if (refined_p) { post_break_p = in_num_p; // was in number, so break it pre_break_p = !in_num_p; in_num_p = in_num_p || next_type == 
G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'.') || next_uch == gunichar(L','); } else { post_break_p = pre_break_p = true; in_num_p = false; } if (curr_uch != gunichar(L'$')) in_url_p = false; break; case G_UNICODE_MODIFIER_SYMBOL: case G_UNICODE_MATH_SYMBOL: switch (curr_uch) { case gunichar(L'`'): if (english_p) { if (!in_url_p) { pre_break_p = true; post_break_p = since_start == 0 || (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER); } } else if (latin_p) { post_break_p = !in_url_p; pre_break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER; } else { post_break_p = pre_break_p = !in_url_p; } if (escape_p) substitute_p = L"'"; else curr_uch = gunichar(L'\''); break; case gunichar(L'|'): if (escape_p) substitute_p = L"|"; post_break_p = pre_break_p = true; break; case gunichar(L'<'): if (escape_p) substitute_p = L"<"; post_break_p = pre_break_p = true; break; case gunichar(L'>'): if (escape_p) substitute_p = L">"; post_break_p = pre_break_p = true; break; case gunichar(L'%'): post_break_p = in_num_p; pre_break_p = !in_num_p && !in_url_p; in_num_p = false; break; case gunichar(L'='): case gunichar(L'~'): in_num_p = false; post_break_p = pre_break_p = !in_url_p; break; case gunichar(L'+'): post_break_p = pre_break_p = !in_url_p; if (in_url_p) { in_num_p = false; } else if (refined_p) { // handle floating point as e.g. 
1.2e+3.4 bool next_digit_p = next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'.'); pre_break_p = !in_num_p; in_num_p = next_digit_p && prev_type != G_UNICODE_DECIMAL_NUMBER; post_break_p = !in_num_p; } else { in_num_p = in_num_p || since_start == 0; } break; default: post_break_p = pre_break_p = true; break; } break; case G_UNICODE_OTHER_SYMBOL: post_break_p = pre_break_p = true; break; case G_UNICODE_CONTROL: if (drop_bad_p) { curr_uch = gunichar(L' '); } else if (curr_uch < gunichar(L' ')) { curr_uch = gunichar(L' '); } else if (curr_uch == gunichar(L'\u0092') && (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) { // observed corpus corruption case if (english_p) { pre_break_p = true; post_break_p = since_start == 0 || (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER); } else if (latin_p) { post_break_p = true; pre_break_p = prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER; } else { post_break_p = pre_break_p = true; } if (escape_p) substitute_p = L"'"; else curr_uch = gunichar(L'\''); } else { post_break_p = pre_break_p = true; } in_url_p = in_num_p = false; break; case G_UNICODE_LINE_SEPARATOR: case G_UNICODE_SPACE_SEPARATOR: curr_uch = gunichar(L' '); in_url_p = in_num_p = false; break; case G_UNICODE_ENCLOSING_MARK: in_url_p = false; break; case G_UNICODE_NON_SPACING_MARK: case G_UNICODE_PRIVATE_USE: case G_UNICODE_SURROGATE: in_url_p = in_num_p = false; break; case G_UNICODE_UNASSIGNED: default: // malformed bytes are dropped (invalid utf8 unicode) if (drop_bad_p) { curr_uch = 0; } else { pre_break_p = since_start > 0 && bad_length == 0; curr_type = G_UNICODE_UNASSIGNED; } in_url_p = in_num_p = false; break; } if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) { if (since_start) { // non-empty token emitted previously, so pre-break must emit token 
separator *uptr++ = gunichar(L' '); since_start = bad_length = 0; } if (curr_uch == gunichar(L' ')) // suppress emission below, fall-through to substitute logic curr_uch = 0; } if (substitute_p) { for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) { *uptr++ = *sptr; since_start++; } in_url_p = in_num_p = false; } else if (curr_uch) { *uptr++ = curr_uch; since_start++; if (curr_type == G_UNICODE_UNASSIGNED) bad_length++; } ucs4 = nxt4; } glong nbytes = 0; gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free if (utf8[nbytes-1] == ' ') --nbytes; text.assign((const char *)utf8,(const char *)(utf8 + nbytes)); g_free(utf8); g_free(usrc); g_free(ubuf); // terminate token at superscript or subscript sequence when followed by lower-case if (supersub_p) RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3"); // restore prefix-protected strings num = 0; for (auto& prot : prot_stack) { char subst[32]; snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++); size_t loc = text.find(subst); while (loc != std::string::npos) { text.replace(loc,18,prot.data(),prot.size()); loc = text.find(subst,loc+18); } } // escape moses meta-characters if (escape_p) escape(text); return text; } std::size_t Tokenizer::tokenize(std::istream& is, std::ostream& os) { std::size_t line_no = 0; std::size_t perchunk = chunksize ? 
chunksize : 2000; std::vector< std::vector< std::string > > lines(nthreads); std::vector< std::vector< std::string > > results(nthreads); std::vector< boost::thread > workers(nthreads); bool done_p = !(is.good() && os.good()); for (std::size_t tranche = 0; !done_p; ++tranche) { // for loop starting threads for chunks of input for (std::size_t ithread = 0; ithread < nthreads; ++ithread) { lines[ithread].resize(perchunk); std::size_t line_pos = 0; for ( ; line_pos < perchunk; ++line_pos) { std::string istr; std::getline(is,istr); if (skip_alltags_p) { RE2::GlobalReplace(&istr,genl_tags_x,SPC_BYTE); istr = trim(istr); } line_no++; if (istr.empty()) { if (is.eof()) { done_p = true; lines[ithread].resize(line_pos); results[ithread].resize(line_pos); break; } lines[ithread][line_pos].clear(); } else if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) { lines[ithread][line_pos].clear(); } else { lines[ithread][line_pos] = std::string(SPC_BYTE).append(istr).append(SPC_BYTE); } } if (line_pos) { workers[ithread] = boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread])); } } // end for loop starting threads for (std::size_t ithread = 0; ithread < nthreads; ++ithread) { if (!workers[ithread].joinable()) continue; workers[ithread].join(); std::size_t nres = results[ithread].size(); std::size_t nlin = lines[ithread].size(); if (nlin != nres) { std::ostringstream emsg; emsg << "Tranche " << tranche << " worker " << ithread << "/" << nthreads << " |lines|==" << nlin << " != |results|==" << nres; throw std::runtime_error(emsg.str()); } for (std::size_t ires = 0; ires < nres; ++ires) os << results[ithread][ires] << std::endl; } // end loop over joined results if (verbose_p) { std::cerr << line_no << ' '; std::cerr.flush(); } } // end loop over chunks return line_no; } std::string Tokenizer::detokenize(const std::string& buf) { std::vector words = split(trim(buf)); std::size_t squotes = 0; std::size_t dquotes = 0; 
std::string prepends(""); std::ostringstream oss; std::size_t nwords = words.size(); std::size_t iword = 0; if (unescape_p) for (auto &word: words) unescape(word); for (auto &word: words) { if (RE2::FullMatch(word,right_x)) { if (iword) oss << SPC_BYTE; oss << word; prepends.clear(); } else if (RE2::FullMatch(word,left_x)) { oss << word; prepends = SPC_BYTE; } else if (english_p && iword && RE2::FullMatch(word,curr_en_x) && RE2::FullMatch(words[iword-1],pre_en_x)) { oss << word; prepends = SPC_BYTE; } else if (latin_p && iword < nwords - 2 && RE2::FullMatch(word,curr_fr_x) && RE2::FullMatch(words[iword+1],post_fr_x)) { oss << prepends << word; prepends.clear(); } else if (word.size() == 1) { if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) || (word.at(0) == '"' && ((dquotes % 2) == 0))) { if (english_p && iword && word.at(0) == '\'' && std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') { oss << word; prepends = SPC_BYTE; } else { oss << prepends << word; prepends.clear(); if (word.at(0) == '\'') squotes++; else dquotes++; } } else { if (std::isalnum(word.at(0))) oss << prepends; oss << word; prepends = SPC_BYTE; if (word.at(0) == '\'') squotes++; else if (word.at(0) == '"') dquotes++; } } else { oss << prepends << word; prepends = SPC_BYTE; } iword++; } std::string text(oss.str()); RE2::GlobalReplace(&text," +",SPC_BYTE); RE2::GlobalReplace(&text,"\n ","\n"); RE2::GlobalReplace(&text," \n","\n"); return trim(text); } std::size_t Tokenizer::detokenize(std::istream& is, std::ostream& os) { size_t line_no = 0; while (is.good() && os.good()) { std::string istr; std::getline(is,istr); line_no ++; if (istr.empty()) continue; if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) { os << istr << std::endl; } else { os << detokenize(istr) << std::endl; } } return line_no; } std::vector Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) { std::vector parts; glong ncp = 0; glong ocp = 0; glong icp = 0; 
gunichar *ucs4 = g_utf8_to_ucs4_fast((gchar *)istr.c_str(),istr.size(),&ncp); if (ncp == 0) { g_free(ucs4); return parts; } gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar)); const wchar_t GENL_HYPH = L'\u2010'; const wchar_t IDEO_STOP = L'\u3002'; const wchar_t KANA_MDOT = L'\u30FB'; const wchar_t WAVE_DASH = L'\u301C'; //const wchar_t WAVY_DASH = L'\u3030'; const wchar_t KANA_DHYP = L'\u30A0'; const wchar_t SMAL_HYPH = L'\uFE63'; const wchar_t WIDE_EXCL = L'\uFF01'; const wchar_t WIDE_PCTS = L'\uFF05'; //const wchar_t WIDE_HYPH = L'\uFF0D'; const wchar_t WIDE_STOP = L'\uFF0E'; const wchar_t WIDE_QUES = L'\uFF1F'; const wchar_t INVERT_QM = L'\u00BF'; const wchar_t INVERT_EX = L'\u00A1'; wchar_t currwc = 0; std::size_t init_word = 0; std::size_t fini_word = 0; std::size_t finilen = 0; std::size_t dotslen = 0; const std::size_t SEQ_LIM = 6; charclass_t prev_class = empty; charclass_t curr_class = empty; std::vector seq(SEQ_LIM, empty); std::vector pos(SEQ_LIM, 0); std::size_t seqpos = 0; GUnicodeType curr_type = G_UNICODE_UNASSIGNED; //bool prev_word_p = false; bool curr_word_p = false; std::vector breaks; std::set suppress; for (; icp <= ncp; ++icp) { currwc = wchar_t(ucs4[icp]); curr_type = g_unichar_type(currwc); prev_class = curr_class; //prev_word_p = curr_word_p; switch (curr_type) { case G_UNICODE_DECIMAL_NUMBER: case G_UNICODE_OTHER_NUMBER: curr_class = numba; curr_word_p = true; break; case G_UNICODE_LOWERCASE_LETTER: case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: curr_class = letta; curr_word_p = true; break; case G_UNICODE_UPPERCASE_LETTER: case G_UNICODE_TITLECASE_LETTER: curr_class = upper; curr_word_p = true; break; case G_UNICODE_OPEN_PUNCTUATION: case G_UNICODE_INITIAL_PUNCTUATION: curr_class = pinit; curr_word_p = false; break; case G_UNICODE_DASH_PUNCTUATION: curr_class = hyphn; if (currwc <= GENL_HYPH) { curr_word_p = true; } else if (currwc >= SMAL_HYPH) { curr_word_p = true; } else { curr_word_p = (currwc >= WAVE_DASH) 
&& (currwc <= KANA_DHYP); } break; case G_UNICODE_CLOSE_PUNCTUATION: case G_UNICODE_FINAL_PUNCTUATION: curr_class = pfini; curr_word_p = false; break; case G_UNICODE_OTHER_PUNCTUATION: if (currwc == L'\'' || currwc == L'"') { curr_class = quote; curr_word_p = false; } else if (currwc == L'.' || currwc == IDEO_STOP || currwc == WIDE_STOP || currwc == KANA_MDOT) { curr_class = stops; curr_word_p = true; } else if (currwc == L'?' || currwc == '!' || currwc == WIDE_EXCL || currwc == WIDE_QUES) { curr_class = marks; curr_word_p = false; } else if (currwc == INVERT_QM || currwc == INVERT_EX) { curr_class = pinit; curr_word_p = false; } else if ( currwc == L'%' || currwc == WIDE_PCTS) { curr_class = pfpct; curr_word_p = true; } else { curr_class = empty; curr_word_p = false; } break; default: if (!g_unichar_isgraph(currwc)) { curr_class = blank; } else { curr_class = empty; } curr_word_p = false; break; } // # condition for prefix test // $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/ // $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/ bool check_abbr_p = false; if (curr_class == stops) { if (prev_class != stops) { dotslen = 1; } else { dotslen++; } } else if (curr_word_p) { if (!fini_word) { init_word = ocp; } fini_word = ocp+1; dotslen = finilen = 0; } else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) { finilen++; dotslen = 0; init_word = fini_word = 0; } else if (dotslen) { if (fini_word > init_word) { if (prev_class!=stops || seqpos<1 || (ocp-pos[seqpos-1])= SEQ_LIM) { seqpos = 0; } if (curr_class == stops || curr_class == marks) { if (!seqpos) { seq[seqpos] = curr_class; pos[seqpos] = ocp; seqpos++; uout[ocp++] = gunichar(currwc); continue; } else if (seqpos>1 && (seq[seqpos-1]==blank || seq[seqpos-1]==quote || seq[seqpos-1]==pfini)) { // handle "[?!.] ..." 
which is common in some corpora if (seq[seqpos-2] == curr_class || seq[seqpos-2] == marks) { seqpos--; uout[ocp++] = gunichar(currwc); continue; } seqpos = 0; } else if (seq[seqpos-1] != curr_class) { seqpos = 0; } else if (curr_class == marks) { seqpos = 0; } else { uout[ocp++] = gunichar(currwc); continue; } } if (!seqpos) { if (curr_class != blank) { uout[ocp++] = gunichar(currwc); } else if (curr_class != prev_class) { uout[ocp++] = L' '; } continue; } if (curr_class == blank) { if (prev_class != blank) { seq[seqpos] = blank; pos[seqpos] = ocp; seqpos++; uout[ocp++] = L' '; } if (icp < ncp) continue; } if (curr_class >= quote && curr_class <= pfini) { if (prev_class < quote || prev_class > pfini) { seq[seqpos] = curr_class; pos[seqpos] = ocp; seqpos++; } else if (curr_class == quote && prev_class != curr_class) { curr_class = prev_class; } else if (prev_class == quote) { seq[seqpos] = prev_class = curr_class; } uout[ocp++] = gunichar(currwc); continue; } // $text =~ s/([?!]) +([\'\"$\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; // #multi-dots followed by sentence starters 2 // $text =~ s/(\.[\.]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\p{IsUpper}])/$1\n$2/g; // # add breaks for sentences that end with some sort of punctuation inside a quote or parenthetical and are followed by a possible sentence starter punctuation and upper case 4 // $text =~ s/([?!\.][\ ]*[\'\"$\]\p{IsPf}]+) +([\'\"\(\[\¿\¡\p{IsPi}]*[\ ]*[\p{IsUpper}])/$1\n$2/g; // # add breaks for sentences that end with some sort of punctuation are followed by a sentence starter punctuation and upper case 8 // $text =~ s/([?!\.]) +([\'\"\(\[\¿\¡\p{IsPi}]+[\ ]*[\p{IsUpper}])/$1\n$2/g; std::size_t iblank = 0; if (curr_class == upper || icp == ncp) { if (seqpos && (seq[0] == stops || seq[0] == marks)) { switch (seqpos) { case 2: if (seq[1] == blank) iblank = 1; break; case 3: switch (seq[1]) { case blank: if (seq[2] == quote || seq[2] == pinit) iblank = 1; break; case quote: case pfini: if (seq[2] == blank) iblank = 2; break; 
default: break; } break; case 4: switch (seq[1]) { case blank: iblank = 1; switch (seq[2]) { case quote: switch (seq[3]) { case quote: case pinit: break; case blank: iblank = 3; break; default: iblank = 0; // invalid break; } break; case pinit: if (seq[3] != blank) iblank = 0; // invalid break; case pfini: if (seq[3] == blank) iblank = 3; break; default: iblank = 0; // invalid break; } break; case quote: case pfini: iblank = (seq[2] == blank && (seq[3] == quote || seq[3] == pinit)) ? 2 : 0; break; default: iblank = 0; // invalid break; } break; case 5: iblank = (seq[1] == blank) ? 2 : 1; if (seq[iblank] == quote || seq[iblank] == pfini) iblank++; if (seq[iblank] != blank) { iblank = 0; // invalid } else { if (seq[iblank+1] != quote && seq[iblank+1] != pinit) { iblank = 0; // invalid } else if (iblank+2 < seqpos) { if (seq[iblank+2] != blank) iblank = 0; // invalid } } break; } } if (iblank && suppress.find(pos[iblank]) == suppress.end()) { breaks.push_back(pos[iblank]); suppress.insert(pos[iblank]); } } uout[ocp++] = gunichar(currwc); seqpos = 0; } std::vector::iterator it = breaks.begin(); glong iop = 0; while (iop < ocp) { glong endpos = it == breaks.end() ? 
ocp : *it++; glong nextpos = endpos + 1; while (endpos > iop) { std::size_t chkpos = endpos-1; if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') { endpos = chkpos; continue; } if (g_unichar_isgraph(uout[chkpos])) break; endpos = chkpos; } if (endpos > iop) { gchar *pre = g_ucs4_to_utf8(uout+iop,endpos-iop,0,0,0); parts.push_back(std::string(pre)); g_free(pre); } if (continuation_ptr) *continuation_ptr = endpos > iop; iop = nextpos; } g_free(uout); g_free(ucs4); return parts; } std::pair Tokenizer::splitter(std::istream& is, std::ostream& os) { std::pair counts = { 0, 0 }; bool continuation_p = false; bool pending_gap = false; bool paragraph_p = false; while (is.good() && os.good()) { std::string istr; std::getline(is,istr); counts.first++; if (istr.empty() && (is.eof() ||!para_marks_p)) continue; if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) continue; std::vector sentences(splitter(istr,&continuation_p)); if (sentences.empty()) { if (!paragraph_p) { if (pending_gap) os << std::endl; pending_gap = false; if (para_marks_p) os << "

" << std::endl; paragraph_p = true; } continue; } paragraph_p = false; std::size_t nsents = sentences.size(); counts.second += nsents; if (pending_gap) { os << " "; pending_gap = false; } for (std::size_t ii = 0; ii < nsents-1; ++ii) os << sentences[ii] << std::endl; os << sentences[nsents-1]; if (continuation_p) pending_gap = !split_breaks_p; if (!pending_gap) os << std::endl; } if (pending_gap) os << std::endl; return counts; } #ifdef TOKENIZER_NAMESPACE }; // namespace #endif