From 915c29b0dd3a0e8333d29d2a1ab9b6b542ed8295 Mon Sep 17 00:00:00 2001 From: akimbal1 Date: Sun, 15 Feb 2015 17:19:47 -0500 Subject: detokenization fixes and features --- contrib/c++tokenizer/tokenizer.cpp | 442 ++++++++++++++++++++++++++++++-- contrib/c++tokenizer/tokenizer_main.cpp | 26 +- 2 files changed, 443 insertions(+), 25 deletions(-) (limited to 'contrib') diff --git a/contrib/c++tokenizer/tokenizer.cpp b/contrib/c++tokenizer/tokenizer.cpp index f9c32389e..40332ce85 100644 --- a/contrib/c++tokenizer/tokenizer.cpp +++ b/contrib/c++tokenizer/tokenizer.cpp @@ -52,13 +52,14 @@ RE2 ndndcomma_x("([^\\p{N}]),([^\\p{N}])"); // non-digit,non-digit RE2 pdndcomma_x("([\\p{N}]),([^\\p{N}])"); // digit,non-digit RE2 ndpdcomma_x("([^\\p{N}]),([\\p{N}])"); // non-digit,digit RE2 symbol_x("([;:@\\#\\$%&\\p{Sc}\\p{So}])"); // usable punctuation mark not a quote or a brace -RE2 contract_x("'([sSmMdD]) "); // english single letter contraction forms -RE2 right_x("[\\p{Sc}({¿¡]+"); // -RE2 left_x("[,.?!:;\\%})]+"); // -RE2 curr_en_x("^[\'][\\p{L}]"); // -RE2 pre_en_x("[\\p{L}\\p{N}]$"); // -RE2 curr_fr_x("[\\p{L}\\p{N}][\']$"); // -RE2 post_fr_x("^[\\p{L}\\p{N}]"); // +RE2 contract_x("'([sSmMdD]) "); // english single letter contraction forms, as embedded +RE2 contract_t_x("'([sSmMdD])$"); // english single letter contraction forms, as terminal +RE2 right_x("[({¿¡]+"); // symbols which conjoin to the right +RE2 left_x("[,.?!:;\\%\\p{Sc}})]+"); // symbols conjoin to the left +RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to the left +RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes +RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right +RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes RE2 quotes_x("^[\'\"]+$"); // RE2 endnum_x("[-\'\"]"); // @@ -88,6 +89,334 @@ class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) { return false; } +std::map entity_map; + +void init_entity_map() { + entity_map.insert(std::make_pair(std::wstring(L"""),L'"')); + entity_map.insert(std::make_pair(std::wstring(L"&"),L'&')); + entity_map.insert(std::make_pair(std::wstring(L"'"),L'\'')); + entity_map.insert(std::make_pair(std::wstring(L"<"),L'<')); + entity_map.insert(std::make_pair(std::wstring(L">"),L'>')); + entity_map.insert(std::make_pair(std::wstring(L" "),L'\u00A0')); + entity_map.insert(std::make_pair(std::wstring(L"¡"),L'\u00A1')); + entity_map.insert(std::make_pair(std::wstring(L"¢"),L'\u00A2')); + entity_map.insert(std::make_pair(std::wstring(L"£"),L'\u00A3')); + entity_map.insert(std::make_pair(std::wstring(L"¤"),L'\u00A4')); + entity_map.insert(std::make_pair(std::wstring(L"¥"),L'\u00A5')); + entity_map.insert(std::make_pair(std::wstring(L"¦"),L'\u00A6')); + entity_map.insert(std::make_pair(std::wstring(L"§"),L'\u00A7')); + entity_map.insert(std::make_pair(std::wstring(L"¨"),L'\u00A8')); + entity_map.insert(std::make_pair(std::wstring(L"©"),L'\u00A9')); + entity_map.insert(std::make_pair(std::wstring(L"ª"),L'\u00AA')); + entity_map.insert(std::make_pair(std::wstring(L"«"),L'\u00AB')); + entity_map.insert(std::make_pair(std::wstring(L"¬"),L'\u00AC')); + entity_map.insert(std::make_pair(std::wstring(L"­"),L'\u00AD')); + entity_map.insert(std::make_pair(std::wstring(L"®"),L'\u00AE')); + entity_map.insert(std::make_pair(std::wstring(L"¯"),L'\u00AF')); + entity_map.insert(std::make_pair(std::wstring(L"°"),L'\u00B0')); + entity_map.insert(std::make_pair(std::wstring(L"±"),L'\u00B1')); + entity_map.insert(std::make_pair(std::wstring(L"²"),L'\u00B2')); + entity_map.insert(std::make_pair(std::wstring(L"³"),L'\u00B3')); + entity_map.insert(std::make_pair(std::wstring(L"´"),L'\u00B4')); + entity_map.insert(std::make_pair(std::wstring(L"µ"),L'\u00B5')); + entity_map.insert(std::make_pair(std::wstring(L"¶"),L'\u00B6')); + entity_map.insert(std::make_pair(std::wstring(L"·"),L'\u00B7')); + entity_map.insert(std::make_pair(std::wstring(L"¸"),L'\u00B8')); + entity_map.insert(std::make_pair(std::wstring(L"¹"),L'\u00B9')); + entity_map.insert(std::make_pair(std::wstring(L"º"),L'\u00BA')); + entity_map.insert(std::make_pair(std::wstring(L"»"),L'\u00BB')); + entity_map.insert(std::make_pair(std::wstring(L"¼"),L'\u00BC')); + entity_map.insert(std::make_pair(std::wstring(L"½"),L'\u00BD')); + entity_map.insert(std::make_pair(std::wstring(L"¾"),L'\u00BE')); + entity_map.insert(std::make_pair(std::wstring(L"¿"),L'\u00BF')); + entity_map.insert(std::make_pair(std::wstring(L"À"),L'\u00C0')); + entity_map.insert(std::make_pair(std::wstring(L"Á"),L'\u00C1')); + entity_map.insert(std::make_pair(std::wstring(L"Â"),L'\u00C2')); + entity_map.insert(std::make_pair(std::wstring(L"Ã"),L'\u00C3')); + entity_map.insert(std::make_pair(std::wstring(L"Ä"),L'\u00C4')); + entity_map.insert(std::make_pair(std::wstring(L"Å"),L'\u00C5')); + entity_map.insert(std::make_pair(std::wstring(L"Æ"),L'\u00C6')); + entity_map.insert(std::make_pair(std::wstring(L"Ç"),L'\u00C7')); + entity_map.insert(std::make_pair(std::wstring(L"È"),L'\u00C8')); + entity_map.insert(std::make_pair(std::wstring(L"É"),L'\u00C9')); + entity_map.insert(std::make_pair(std::wstring(L"Ê"),L'\u00CA')); + entity_map.insert(std::make_pair(std::wstring(L"Ë"),L'\u00CB')); + entity_map.insert(std::make_pair(std::wstring(L"Ì"),L'\u00CC')); + entity_map.insert(std::make_pair(std::wstring(L"Í"),L'\u00CD')); + entity_map.insert(std::make_pair(std::wstring(L"Î"),L'\u00CE')); + entity_map.insert(std::make_pair(std::wstring(L"Ï"),L'\u00CF')); + entity_map.insert(std::make_pair(std::wstring(L"Ð"),L'\u00D0')); + entity_map.insert(std::make_pair(std::wstring(L"Ñ"),L'\u00D1')); + entity_map.insert(std::make_pair(std::wstring(L"Ò"),L'\u00D2')); + entity_map.insert(std::make_pair(std::wstring(L"Ó"),L'\u00D3')); + entity_map.insert(std::make_pair(std::wstring(L"Ô"),L'\u00D4')); + entity_map.insert(std::make_pair(std::wstring(L"Õ"),L'\u00D5')); + entity_map.insert(std::make_pair(std::wstring(L"Ö"),L'\u00D6')); + entity_map.insert(std::make_pair(std::wstring(L"×"),L'\u00D7')); + entity_map.insert(std::make_pair(std::wstring(L"Ø"),L'\u00D8')); + entity_map.insert(std::make_pair(std::wstring(L"Ù"),L'\u00D9')); + entity_map.insert(std::make_pair(std::wstring(L"Ú"),L'\u00DA')); + entity_map.insert(std::make_pair(std::wstring(L"Û"),L'\u00DB')); + entity_map.insert(std::make_pair(std::wstring(L"Ü"),L'\u00DC')); + entity_map.insert(std::make_pair(std::wstring(L"Ý"),L'\u00DD')); + entity_map.insert(std::make_pair(std::wstring(L"Þ"),L'\u00DE')); + entity_map.insert(std::make_pair(std::wstring(L"ß"),L'\u00DF')); + entity_map.insert(std::make_pair(std::wstring(L"à"),L'\u00E0')); + entity_map.insert(std::make_pair(std::wstring(L"á"),L'\u00E1')); + entity_map.insert(std::make_pair(std::wstring(L"â"),L'\u00E2')); + entity_map.insert(std::make_pair(std::wstring(L"ã"),L'\u00E3')); + entity_map.insert(std::make_pair(std::wstring(L"ä"),L'\u00E4')); + entity_map.insert(std::make_pair(std::wstring(L"å"),L'\u00E5')); + entity_map.insert(std::make_pair(std::wstring(L"æ"),L'\u00E6')); + entity_map.insert(std::make_pair(std::wstring(L"ç"),L'\u00E7')); + entity_map.insert(std::make_pair(std::wstring(L"è"),L'\u00E8')); + entity_map.insert(std::make_pair(std::wstring(L"é"),L'\u00E9')); + entity_map.insert(std::make_pair(std::wstring(L"ê"),L'\u00EA')); + entity_map.insert(std::make_pair(std::wstring(L"ë"),L'\u00EB')); + entity_map.insert(std::make_pair(std::wstring(L"ì"),L'\u00EC')); + entity_map.insert(std::make_pair(std::wstring(L"í"),L'\u00ED')); + entity_map.insert(std::make_pair(std::wstring(L"î"),L'\u00EE')); + entity_map.insert(std::make_pair(std::wstring(L"ï"),L'\u00EF')); + entity_map.insert(std::make_pair(std::wstring(L"ð"),L'\u00F0')); + entity_map.insert(std::make_pair(std::wstring(L"ñ"),L'\u00F1')); + entity_map.insert(std::make_pair(std::wstring(L"ò"),L'\u00F2')); + entity_map.insert(std::make_pair(std::wstring(L"ó"),L'\u00F3')); + entity_map.insert(std::make_pair(std::wstring(L"ô"),L'\u00F4')); + entity_map.insert(std::make_pair(std::wstring(L"õ"),L'\u00F5')); + entity_map.insert(std::make_pair(std::wstring(L"ö"),L'\u00F6')); + entity_map.insert(std::make_pair(std::wstring(L"÷"),L'\u00F7')); + entity_map.insert(std::make_pair(std::wstring(L"ø"),L'\u00F8')); + entity_map.insert(std::make_pair(std::wstring(L"ù"),L'\u00F9')); + entity_map.insert(std::make_pair(std::wstring(L"ú"),L'\u00FA')); + entity_map.insert(std::make_pair(std::wstring(L"û"),L'\u00FB')); + entity_map.insert(std::make_pair(std::wstring(L"ü"),L'\u00FC')); + entity_map.insert(std::make_pair(std::wstring(L"ý"),L'\u00FD')); + entity_map.insert(std::make_pair(std::wstring(L"þ"),L'\u00FE')); + entity_map.insert(std::make_pair(std::wstring(L"ÿ"),L'\u00FF')); + entity_map.insert(std::make_pair(std::wstring(L"Œ"),L'\u0152')); + entity_map.insert(std::make_pair(std::wstring(L"œ"),L'\u0153')); + entity_map.insert(std::make_pair(std::wstring(L"Š"),L'\u0160')); + entity_map.insert(std::make_pair(std::wstring(L"š"),L'\u0161')); + entity_map.insert(std::make_pair(std::wstring(L"Ÿ"),L'\u0178')); + entity_map.insert(std::make_pair(std::wstring(L"ƒ"),L'\u0192')); + entity_map.insert(std::make_pair(std::wstring(L"ˆ"),L'\u02C6')); + entity_map.insert(std::make_pair(std::wstring(L"˜"),L'\u02DC')); + entity_map.insert(std::make_pair(std::wstring(L"Α"),L'\u0391')); + entity_map.insert(std::make_pair(std::wstring(L"Β"),L'\u0392')); + entity_map.insert(std::make_pair(std::wstring(L"Γ"),L'\u0393')); + entity_map.insert(std::make_pair(std::wstring(L"Δ"),L'\u0394')); + entity_map.insert(std::make_pair(std::wstring(L"Ε"),L'\u0395')); + entity_map.insert(std::make_pair(std::wstring(L"Ζ"),L'\u0396')); + entity_map.insert(std::make_pair(std::wstring(L"Η"),L'\u0397')); + entity_map.insert(std::make_pair(std::wstring(L"Θ"),L'\u0398')); + entity_map.insert(std::make_pair(std::wstring(L"Ι"),L'\u0399')); + entity_map.insert(std::make_pair(std::wstring(L"Κ"),L'\u039A')); + entity_map.insert(std::make_pair(std::wstring(L"Λ"),L'\u039B')); + entity_map.insert(std::make_pair(std::wstring(L"Μ"),L'\u039C')); + entity_map.insert(std::make_pair(std::wstring(L"Ν"),L'\u039D')); + entity_map.insert(std::make_pair(std::wstring(L"Ξ"),L'\u039E')); + entity_map.insert(std::make_pair(std::wstring(L"Ο"),L'\u039F')); + entity_map.insert(std::make_pair(std::wstring(L"Π"),L'\u03A0')); + entity_map.insert(std::make_pair(std::wstring(L"Ρ"),L'\u03A1')); + entity_map.insert(std::make_pair(std::wstring(L"Σ"),L'\u03A3')); + entity_map.insert(std::make_pair(std::wstring(L"Τ"),L'\u03A4')); + entity_map.insert(std::make_pair(std::wstring(L"Υ"),L'\u03A5')); + entity_map.insert(std::make_pair(std::wstring(L"Φ"),L'\u03A6')); + entity_map.insert(std::make_pair(std::wstring(L"Χ"),L'\u03A7')); + entity_map.insert(std::make_pair(std::wstring(L"Ψ"),L'\u03A8')); + entity_map.insert(std::make_pair(std::wstring(L"Ω"),L'\u03A9')); + entity_map.insert(std::make_pair(std::wstring(L"α"),L'\u03B1')); + entity_map.insert(std::make_pair(std::wstring(L"β"),L'\u03B2')); + entity_map.insert(std::make_pair(std::wstring(L"γ"),L'\u03B3')); + entity_map.insert(std::make_pair(std::wstring(L"δ"),L'\u03B4')); + entity_map.insert(std::make_pair(std::wstring(L"ε"),L'\u03B5')); + entity_map.insert(std::make_pair(std::wstring(L"ζ"),L'\u03B6')); + entity_map.insert(std::make_pair(std::wstring(L"η"),L'\u03B7')); + entity_map.insert(std::make_pair(std::wstring(L"θ"),L'\u03B8')); + entity_map.insert(std::make_pair(std::wstring(L"ι"),L'\u03B9')); + entity_map.insert(std::make_pair(std::wstring(L"κ"),L'\u03BA')); + entity_map.insert(std::make_pair(std::wstring(L"λ"),L'\u03BB')); + entity_map.insert(std::make_pair(std::wstring(L"μ"),L'\u03BC')); + entity_map.insert(std::make_pair(std::wstring(L"ν"),L'\u03BD')); + entity_map.insert(std::make_pair(std::wstring(L"ξ"),L'\u03BE')); + entity_map.insert(std::make_pair(std::wstring(L"ο"),L'\u03BF')); + entity_map.insert(std::make_pair(std::wstring(L"π"),L'\u03C0')); + entity_map.insert(std::make_pair(std::wstring(L"ρ"),L'\u03C1')); + entity_map.insert(std::make_pair(std::wstring(L"ς"),L'\u03C2')); + entity_map.insert(std::make_pair(std::wstring(L"σ"),L'\u03C3')); + entity_map.insert(std::make_pair(std::wstring(L"τ"),L'\u03C4')); + entity_map.insert(std::make_pair(std::wstring(L"υ"),L'\u03C5')); + entity_map.insert(std::make_pair(std::wstring(L"φ"),L'\u03C6')); + entity_map.insert(std::make_pair(std::wstring(L"χ"),L'\u03C7')); + entity_map.insert(std::make_pair(std::wstring(L"ψ"),L'\u03C8')); + entity_map.insert(std::make_pair(std::wstring(L"ω"),L'\u03C9')); + entity_map.insert(std::make_pair(std::wstring(L"ϑ"),L'\u03D1')); + entity_map.insert(std::make_pair(std::wstring(L"ϒ"),L'\u03D2')); + entity_map.insert(std::make_pair(std::wstring(L"ϖ"),L'\u03D6')); + entity_map.insert(std::make_pair(std::wstring(L" "),L'\u2002')); + entity_map.insert(std::make_pair(std::wstring(L" "),L'\u2003')); + entity_map.insert(std::make_pair(std::wstring(L" "),L'\u2009')); + entity_map.insert(std::make_pair(std::wstring(L"‌"),L'\u200C')); + entity_map.insert(std::make_pair(std::wstring(L"‍"),L'\u200D')); + entity_map.insert(std::make_pair(std::wstring(L"‎"),L'\u200E')); + entity_map.insert(std::make_pair(std::wstring(L"‏"),L'\u200F')); + entity_map.insert(std::make_pair(std::wstring(L"–"),L'\u2013')); + entity_map.insert(std::make_pair(std::wstring(L"—"),L'\u2014')); + entity_map.insert(std::make_pair(std::wstring(L"‘"),L'\u2018')); + entity_map.insert(std::make_pair(std::wstring(L"’"),L'\u2019')); + entity_map.insert(std::make_pair(std::wstring(L"‚"),L'\u201A')); + entity_map.insert(std::make_pair(std::wstring(L"“"),L'\u201C')); + entity_map.insert(std::make_pair(std::wstring(L"”"),L'\u201D')); + entity_map.insert(std::make_pair(std::wstring(L"„"),L'\u201E')); + entity_map.insert(std::make_pair(std::wstring(L"†"),L'\u2020')); + entity_map.insert(std::make_pair(std::wstring(L"‡"),L'\u2021')); + entity_map.insert(std::make_pair(std::wstring(L"•"),L'\u2022')); + entity_map.insert(std::make_pair(std::wstring(L"…"),L'\u2026')); + entity_map.insert(std::make_pair(std::wstring(L"‰"),L'\u2030')); + entity_map.insert(std::make_pair(std::wstring(L"′"),L'\u2032')); + entity_map.insert(std::make_pair(std::wstring(L"″"),L'\u2033')); + entity_map.insert(std::make_pair(std::wstring(L"‹"),L'\u2039')); + entity_map.insert(std::make_pair(std::wstring(L"›"),L'\u203A')); + entity_map.insert(std::make_pair(std::wstring(L"‾"),L'\u203E')); + entity_map.insert(std::make_pair(std::wstring(L"⁄"),L'\u2044')); + entity_map.insert(std::make_pair(std::wstring(L"€"),L'\u20AC')); + entity_map.insert(std::make_pair(std::wstring(L"ℑ"),L'\u2111')); + entity_map.insert(std::make_pair(std::wstring(L"℘"),L'\u2118')); + entity_map.insert(std::make_pair(std::wstring(L"ℜ"),L'\u211C')); + entity_map.insert(std::make_pair(std::wstring(L"™"),L'\u2122')); + entity_map.insert(std::make_pair(std::wstring(L"ℵ"),L'\u2135')); + entity_map.insert(std::make_pair(std::wstring(L"←"),L'\u2190')); + entity_map.insert(std::make_pair(std::wstring(L"↑"),L'\u2191')); + entity_map.insert(std::make_pair(std::wstring(L"→"),L'\u2192')); + entity_map.insert(std::make_pair(std::wstring(L"↓"),L'\u2193')); + entity_map.insert(std::make_pair(std::wstring(L"↔"),L'\u2194')); + entity_map.insert(std::make_pair(std::wstring(L"↵"),L'\u21B5')); + entity_map.insert(std::make_pair(std::wstring(L"⇐"),L'\u21D0')); + entity_map.insert(std::make_pair(std::wstring(L"⇑"),L'\u21D1')); + entity_map.insert(std::make_pair(std::wstring(L"⇒"),L'\u21D2')); + entity_map.insert(std::make_pair(std::wstring(L"⇓"),L'\u21D3')); + entity_map.insert(std::make_pair(std::wstring(L"⇔"),L'\u21D4')); + entity_map.insert(std::make_pair(std::wstring(L"∀"),L'\u2200')); + entity_map.insert(std::make_pair(std::wstring(L"∂"),L'\u2202')); + entity_map.insert(std::make_pair(std::wstring(L"∃"),L'\u2203')); + entity_map.insert(std::make_pair(std::wstring(L"∅"),L'\u2205')); + entity_map.insert(std::make_pair(std::wstring(L"∇"),L'\u2207')); + entity_map.insert(std::make_pair(std::wstring(L"∈"),L'\u2208')); + entity_map.insert(std::make_pair(std::wstring(L"∉"),L'\u2209')); + entity_map.insert(std::make_pair(std::wstring(L"∋"),L'\u220B')); + entity_map.insert(std::make_pair(std::wstring(L"∏"),L'\u220F')); + entity_map.insert(std::make_pair(std::wstring(L"∑"),L'\u2211')); + entity_map.insert(std::make_pair(std::wstring(L"−"),L'\u2212')); + entity_map.insert(std::make_pair(std::wstring(L"∗"),L'\u2217')); + entity_map.insert(std::make_pair(std::wstring(L"√"),L'\u221A')); + entity_map.insert(std::make_pair(std::wstring(L"∝"),L'\u221D')); + entity_map.insert(std::make_pair(std::wstring(L"∞"),L'\u221E')); + entity_map.insert(std::make_pair(std::wstring(L"∠"),L'\u2220')); + entity_map.insert(std::make_pair(std::wstring(L"∧"),L'\u2227')); + entity_map.insert(std::make_pair(std::wstring(L"∨"),L'\u2228')); + entity_map.insert(std::make_pair(std::wstring(L"∩"),L'\u2229')); + entity_map.insert(std::make_pair(std::wstring(L"∪"),L'\u222A')); + entity_map.insert(std::make_pair(std::wstring(L"∫"),L'\u222B')); + entity_map.insert(std::make_pair(std::wstring(L"∴"),L'\u2234')); + entity_map.insert(std::make_pair(std::wstring(L"∼"),L'\u223C')); + entity_map.insert(std::make_pair(std::wstring(L"≅"),L'\u2245')); + entity_map.insert(std::make_pair(std::wstring(L"≈"),L'\u2248')); + entity_map.insert(std::make_pair(std::wstring(L"≠"),L'\u2260')); + entity_map.insert(std::make_pair(std::wstring(L"≡"),L'\u2261')); + entity_map.insert(std::make_pair(std::wstring(L"≤"),L'\u2264')); + entity_map.insert(std::make_pair(std::wstring(L"≥"),L'\u2265')); + entity_map.insert(std::make_pair(std::wstring(L"⊂"),L'\u2282')); + entity_map.insert(std::make_pair(std::wstring(L"⊃"),L'\u2283')); + entity_map.insert(std::make_pair(std::wstring(L"⊄"),L'\u2284')); + entity_map.insert(std::make_pair(std::wstring(L"⊆"),L'\u2286')); + entity_map.insert(std::make_pair(std::wstring(L"⊇"),L'\u2287')); + entity_map.insert(std::make_pair(std::wstring(L"⊕"),L'\u2295')); + entity_map.insert(std::make_pair(std::wstring(L"⊗"),L'\u2297')); + entity_map.insert(std::make_pair(std::wstring(L"⊥"),L'\u22A5')); + entity_map.insert(std::make_pair(std::wstring(L"⋅"),L'\u22C5')); + entity_map.insert(std::make_pair(std::wstring(L"⌈"),L'\u2308')); + entity_map.insert(std::make_pair(std::wstring(L"⌉"),L'\u2309')); + entity_map.insert(std::make_pair(std::wstring(L"⌊"),L'\u230A')); + entity_map.insert(std::make_pair(std::wstring(L"⌋"),L'\u230B')); + entity_map.insert(std::make_pair(std::wstring(L"⟨"),L'\u2329')); + entity_map.insert(std::make_pair(std::wstring(L"⟩"),L'\u232A')); + entity_map.insert(std::make_pair(std::wstring(L"◊"),L'\u25CA')); + entity_map.insert(std::make_pair(std::wstring(L"♠"),L'\u2660')); + entity_map.insert(std::make_pair(std::wstring(L"♣"),L'\u2663')); + entity_map.insert(std::make_pair(std::wstring(L"♥"),L'\u2665')); + entity_map.insert(std::make_pair(std::wstring(L"♦"),L'\u2666')); +} + +inline gunichar get_entity(gunichar *ptr, size_t len) { + // try hex, decimal entity first + gunichar ech(0); + if (ptr[1] == gunichar(L'#') && len > 3) { + std::wstringstream wss; + int wch = 0; + try { + wss << std::hex << std::wstring((wchar_t *)(ptr+2),len-3); + wss >> wch; + ech = gunichar(wch); + } catch (...) { + ech = 0; + } + } else if (g_unichar_type(ptr[1]) == G_UNICODE_DECIMAL_NUMBER) { + std::wstringstream wss; + int wch = 0; + try { + wss << std::dec << std::wstring((wchar_t *)(ptr+1),len-2); + wss >> wch; + ech = gunichar(wch); + } catch (...) { + ech = 0; + } + } + if (ech) + return ech; + + // otherwise require well-known name map + if (entity_map.empty()) { + init_entity_map(); + } + std::map::iterator it = entity_map.find(std::wstring((wchar_t *)(ptr),len)); + return it != entity_map.end() ? it->second : gunichar(0); +} + + +bool unescape(std::string& word) { + std::ostringstream oss; + std::size_t was = 0; // last processed + std::size_t pos = 0; // last unprocessed + std::size_t len = 0; // processed length + bool hit = false; + for (std::size_t endp=0; + (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos; + was = endp == std::string::npos ? pos : 1+endp) { + len = endp - pos + 1; + glong ulen(0); + gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)word.c_str()+pos, len, &ulen); + gunichar gbuf[2] = { 0 }; + if ((gbuf[0] = get_entity(gtmp,ulen)) != gunichar(0)) { + gchar *gstr = g_ucs4_to_utf8(gbuf,ulen,0,0,0); + if (was < pos) + oss << word.substr(was,pos-was); + oss << gstr; + g_free(gstr); + was += ulen; + hit = true; + } else { + oss << word.substr(was,1+endp-was); + } + g_free(gtmp); + } + if (was < word.size()) + oss << word.substr(was); + if (hit) + word = oss.str(); + return hit; +} + + }; // end anonymous namespace @@ -510,6 +839,8 @@ Tokenizer::tokenize(const std::string& buf) in_url_p = in_num_p = post_break_p = false; } + retry: + switch (curr_type) { case G_UNICODE_MODIFIER_LETTER: case G_UNICODE_OTHER_LETTER: @@ -654,10 +985,16 @@ Tokenizer::tokenize(const std::string& buf) break; case G_UNICODE_OTHER_PUNCTUATION: switch (curr_uch) { + case gunichar(L':'): + case gunichar(L'/'): + if (refined_p && !in_url_p + && prev_type == G_UNICODE_DECIMAL_NUMBER + && next_type == G_UNICODE_DECIMAL_NUMBER) { + break; + } + // fall-through case gunichar(L'!'): case gunichar(L'#'): - case gunichar(L'/'): - case gunichar(L':'): case gunichar(L';'): case gunichar(L'?'): case gunichar(L'@'): @@ -668,6 +1005,30 @@ Tokenizer::tokenize(const std::string& buf) in_num_p = in_num_p || since_start == 0; break; case gunichar(L'&'): + if (unescape_p) { + if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER + || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) { + gunichar *eptr = nxt4; + GUnicodeType eptr_type(G_UNICODE_UNASSIGNED); + for (++eptr; eptr < lim4 && *eptr != gunichar(L';'); ++eptr) { + eptr_type = g_unichar_type(*eptr); + if (eptr_type != G_UNICODE_LOWERCASE_LETTER + && eptr_type != G_UNICODE_UPPERCASE_LETTER + && eptr_type != G_UNICODE_DECIMAL_NUMBER) + break; + } + gunichar ech(0); + if (*eptr == gunichar(L';') && (ech = get_entity(ucs4,eptr-ucs4+1))) { + curr_uch = ech; + curr_type = g_unichar_type(ech); + ucs4 = eptr; + nxt4 = ++eptr; + next_uch = *nxt4; + next_type = nxt4 < lim4 ? g_unichar_type(*nxt4) : G_UNICODE_UNASSIGNED; + goto retry; + } + } + } post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR; if (escape_p) substitute_p = L"&"; @@ -675,9 +1036,19 @@ Tokenizer::tokenize(const std::string& buf) case gunichar(L'\''): if (english_p) { if (!in_url_p) { + bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER + || next_type == G_UNICODE_UPPERCASE_LETTER; pre_break_p = true; - post_break_p = since_start == 0 || - (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER); + if (next_letter_p && refined_p) { + // break sha n't instead of shan 't: + if (prev_uch == gunichar(L'n') || prev_uch == gunichar(L'N')) { + *(uptr - 1) = gunichar(L' '); + *(uptr++) = prev_uch; + pre_break_p = false; + } + } + post_break_p = since_start == 0 + || (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER); } } else if (latin_p) { post_break_p = !in_url_p; @@ -697,6 +1068,14 @@ Tokenizer::tokenize(const std::string& buf) pre_break_p = !in_num_p || next_type != G_UNICODE_DECIMAL_NUMBER; post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER; break; + case gunichar(L'%'): + if (refined_p) { + pre_break_p = !in_num_p; + post_break_p = !in_num_p && next_type != G_UNICODE_DECIMAL_NUMBER; + } else { + post_break_p = pre_break_p = true; + } + break; case gunichar(L'.'): if (prev_uch != '.') { if (!in_num_p) { @@ -841,8 +1220,19 @@ Tokenizer::tokenize(const std::string& buf) post_break_p = pre_break_p = !in_url_p; break; case gunichar(L'+'): - in_num_p = in_num_p || since_start == 0; - post_break_p = pre_break_p = !in_url_p; + post_break_p = pre_break_p = !in_url_p; + if (in_url_p) { + in_num_p = false; + } else if (refined_p) { + // handle floating point as e.g. 1.2e+3.4 + bool next_digit_p = next_type == G_UNICODE_DECIMAL_NUMBER || + next_uch == gunichar(L'.'); + pre_break_p = !in_num_p; + in_num_p = next_digit_p && prev_type != G_UNICODE_DECIMAL_NUMBER; + post_break_p = !in_num_p; + } else { + in_num_p = in_num_p || since_start == 0; + } break; default: post_break_p = pre_break_p = true; @@ -1142,30 +1532,40 @@ Tokenizer::detokenize(const std::string& buf) std::size_t squotes = 0; std::size_t dquotes = 0; - std::string prepends(SPC_BYTE); + std::string prepends(""); std::ostringstream oss; std::size_t nwords = words.size(); std::size_t iword = 0; - for (auto word: words) { + if (unescape_p) for (auto &word: words) unescape(word); + + for (auto &word: words) { if (RE2::FullMatch(word,right_x)) { - oss << prepends << word; + if (iword) + oss << SPC_BYTE; + oss << word; prepends.clear(); } else if (RE2::FullMatch(word,left_x)) { oss << word; prepends = SPC_BYTE; - } else if (english_p && iword && RE2::FullMatch(word,curr_en_x) && RE2::FullMatch(words[iword-1],pre_en_x)) { + } else if (english_p && iword + && RE2::FullMatch(word,curr_en_x) + && RE2::FullMatch(words[iword-1],pre_en_x)) { oss << word; prepends = SPC_BYTE; - } else if (latin_p && iword < nwords - 2 && RE2::FullMatch(word,curr_fr_x) && RE2::FullMatch(words[iword+1],post_fr_x)) { + } else if (latin_p && iword < nwords - 2 + && RE2::FullMatch(word,curr_fr_x) + && RE2::FullMatch(words[iword+1],post_fr_x)) { oss << prepends << word; prepends.clear(); } else if (word.size() == 1) { if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) || (word.at(0) == '"' && ((dquotes % 2) == 0))) { - if (english_p && iword && word.at(0) == '\'' && words[iword-1].at(words[iword-1].size()-1) == 's') { + if (english_p && iword + && word.at(0) == '\'' + && std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') { oss << word; prepends = SPC_BYTE; } else { @@ -1177,6 +1577,8 @@ Tokenizer::detokenize(const std::string& buf) dquotes++; } } else { + if (std::isalnum(word.at(0))) + oss << prepends; oss << word; prepends = SPC_BYTE; if (word.at(0) == '\'') @@ -1186,7 +1588,7 @@ Tokenizer::detokenize(const std::string& buf) } } else { oss << prepends << word; - prepends.clear(); + prepends = SPC_BYTE; } iword++; } diff --git a/contrib/c++tokenizer/tokenizer_main.cpp b/contrib/c++tokenizer/tokenizer_main.cpp index 7a6554a83..41496622c 100644 --- a/contrib/c++tokenizer/tokenizer_main.cpp +++ b/contrib/c++tokenizer/tokenizer_main.cpp @@ -18,6 +18,7 @@ usage(const char *path) std::cerr << " -b -- drop bad bytes" << std::endl; std::cerr << " -c DIR -- config (pattern) file directory" << std::endl; std::cerr << " -d -- downcase" << std::endl; + std::cerr << " -D -- detokenize" << std::endl; std::cerr << " -e -- do not escape entities during tokenization" << std::endl; std::cerr << " -k -- narrow kana" << std::endl; std::cerr << " -n -- narrow latin" << std::endl; @@ -27,7 +28,7 @@ usage(const char *path) std::cerr << " -r -- refined contraction and quantity conjoining" << std::endl; std::cerr << " -s -- super- and sub-script conjoining" << std::endl; std::cerr << " -u -- disable url handling" << std::endl; - std::cerr << " -U -- unescape entities before tokenization" << std::endl; + std::cerr << " -U -- unescape entities before tokenization, after detokenization" << std::endl; std::cerr << " -v -- verbose" << std::endl; std::cerr << " -w -- word filter" << std::endl; std::cerr << " -x -- skip xml tag lines" << std::endl; @@ -108,7 +109,8 @@ int main(int ac, char **av) const char *prog = av[0]; bool next_cfg_p = false; bool next_output_p = false; - + bool detokenize_p = std::strstr(av[0],"detokenize") != 0; + while (++av,--ac) { if (**av == '-') { switch (av[0][1]) { @@ -124,6 +126,9 @@ int main(int ac, char **av) case 'd': params.downcase_p = true; break; + case 'D': + detokenize_p = true; + break; case 'e': params.escape_p = false; break; @@ -234,6 +239,9 @@ int main(int ac, char **av) } std::ostream& ofs(pofs ? *pofs : std::cout); + if (params.lang_iso.empty()) + params.lang_iso = "en"; + Tokenizer tize(params); tize.init(); size_t nlines = 0; @@ -252,12 +260,20 @@ int main(int ac, char **av) } } } else if (params.args.empty()) { - nlines = tize.tokenize(std::cin,ofs); + if (detokenize_p) { + nlines = tize.detokenize(std::cin,ofs); + } else { + nlines = tize.tokenize(std::cin,ofs); + } } else { for (std::string& arg : params.args) { try { std::ifstream ifs(arg.c_str()); - nlines = tize.tokenize(ifs,ofs); + if (detokenize_p) { + nlines = tize.detokenize(ifs,ofs); + } else { + nlines = tize.tokenize(ifs,ofs); + } } catch (...) { std::cerr << "Exception tokenizing from path " << arg << std::endl; } @@ -265,7 +281,7 @@ int main(int ac, char **av) } if (params.verbose_p) - std::cerr << "%%% tokenized lines: " << nlines << std::endl; + std::cerr << "%%% " << nlines << " lines." << std::endl; return rc; } -- cgit v1.2.3