github.com/moses-smt/mosesdecoder.git
author     akimbal1 <akimball2@bloomberg.net>  2015-01-23 21:35:09 +0300
committer  akimbal1 <akimball2@bloomberg.net>  2015-01-23 21:35:09 +0300
commit     d38dcd89bbe3f3d1342d30c0c0778d8afa3bf938 (patch)
tree       c1cb01e2c63dc31858f6439e1abc3e7c825b0359 /contrib
parent     e30065072e1f43ce725c96ae17e7d59a85295173 (diff)
add glib-2.0 for better unicodification and faster implementation
Diffstat (limited to 'contrib')
-rw-r--r--  contrib/c++tokenizer/Jamfile               4
-rw-r--r--  contrib/c++tokenizer/tokenizer.cpp       785
-rw-r--r--  contrib/c++tokenizer/tokenizer.h          10
-rw-r--r--  contrib/c++tokenizer/tokenizer_main.cpp   91
4 files changed, 691 insertions(+), 199 deletions(-)
diff --git a/contrib/c++tokenizer/Jamfile b/contrib/c++tokenizer/Jamfile
index f6a74a9df..78b2dd531 100644
--- a/contrib/c++tokenizer/Jamfile
+++ b/contrib/c++tokenizer/Jamfile
@@ -1,2 +1,4 @@
external-lib re2 ;
-exe tokenizer : tokenizer.cpp tokenizer_main.cpp re2 : <cflags>-std=c++11 ;
+external-lib glib-2.0 ;
+glib-cflags = [ _shell "pkg-config --cflags glib-2.0" ] ;
+exe tokenizer : tokenizer.cpp tokenizer_main.cpp re2 glib-2.0 : <cflags>-std=c++11 <cflags>$(glib-cflags) ;
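The Jamfile change registers glib-2.0 as an external library and splices the pkg-config compile flags into the tokenizer target. A throwaway probe like the one below (hypothetical, not part of this commit) can confirm that the glib Unicode API the new tokenizer relies on resolves with those flags; build it with g++ -std=c++11 probe.cpp $(pkg-config --cflags --libs glib-2.0).

    // probe.cpp -- minimal sketch: checks that <glib.h> and the Unicode
    // classification calls used by the tokenizer compile and behave as expected.
    #include <glib.h>
    #include <cstdio>

    int main() {
        gunichar u = 0x00C9;                    // U+00C9, 'É'
        GUnicodeType t = g_unichar_type(u);     // expect G_UNICODE_UPPERCASE_LETTER
        std::printf("type=%d lower=U+%04X\n", (int)t, g_unichar_tolower(u));
        return (t == G_UNICODE_UPPERCASE_LETTER) ? 0 : 1;
    }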
diff --git a/contrib/c++tokenizer/tokenizer.cpp b/contrib/c++tokenizer/tokenizer.cpp
index ca5065046..f016cd2f1 100644
--- a/contrib/c++tokenizer/tokenizer.cpp
+++ b/contrib/c++tokenizer/tokenizer.cpp
@@ -1,9 +1,12 @@
#include "tokenizer.h"
+#include <re2/stringpiece.h>
#include <sstream>
#include <iterator>
#include <memory>
#include <vector>
#include <algorithm>
+#include <cstring>
+#include <glib.h>
namespace {
@@ -61,6 +64,30 @@ RE2 endnum_x("[-\'\"]"); //
// anything rarely used will just be given as a string and compiled on demand by RE2
+const char *SPC_BYTE = " ";
+//const char *URL_VALID_SYM_CHARS = "-._~:/?#[]@!$&'()*+,;=";
+
+inline bool
+class_follows_p(gunichar *s, gunichar *e, GUnicodeType gclass) {
+ while (s < e) {
+ GUnicodeType tclass = g_unichar_type(*s);
+ if (tclass == gclass)
+ return true;
+ switch (tclass) {
+ case G_UNICODE_SPACING_MARK:
+ case G_UNICODE_LINE_SEPARATOR:
+ case G_UNICODE_PARAGRAPH_SEPARATOR:
+ case G_UNICODE_SPACE_SEPARATOR:
+ ++s;
+ continue;
+ break;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
}; // end anonymous namespace
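class_follows_p scans forward from s, skipping space, line and paragraph separators (and spacing marks), and reports whether the first other code point it reaches has the requested GUnicodeType; the '.' handling later in tokenize() uses it for the "non-breaking before numeric" and "lower-case look-ahead" tests. An illustrative call (buffer names are hypothetical):

    // Ask whether a decimal digit follows the "." in "No. 42", ignoring the space.
    glong ulen = 0;
    gunichar *buf = g_utf8_to_ucs4_fast("No. 42", -1, &ulen);
    gunichar *end = buf + ulen;
    bool digit_next = class_follows_p(buf + 3, end, G_UNICODE_DECIMAL_NUMBER);  // true
    g_free(buf);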
@@ -90,6 +117,10 @@ Tokenizer::Tokenizer(const std::string& _lang_iso,
bool _skip_alltags_p,
bool _non_escape_p,
bool _aggressive_hyphen_p,
+ bool _supersub_p,
+ bool _url_p,
+ bool _downcase_p,
+ bool _normalize_p,
bool _penn_p,
bool _verbose_p)
: lang_iso(_lang_iso)
@@ -99,6 +130,10 @@ Tokenizer::Tokenizer(const std::string& _lang_iso,
, skip_alltags_p(_skip_alltags_p)
, non_escape_p(_non_escape_p)
, aggressive_hyphen_p(_aggressive_hyphen_p)
+ , supersub_p(_supersub_p)
+ , url_p(_url_p)
+ , downcase_p(_downcase_p)
+ , normalize_p(_normalize_p)
, penn_p(_penn_p)
, verbose_p(_verbose_p)
{
@@ -131,13 +166,19 @@ Tokenizer::load_prefixes(std::ifstream& ifs)
int nnum = 0;
while (std::getline(ifs,line)) {
- if (!line.empty() && line.at(0) != '#') {
+ if (!line.empty() && line[0] != '#') {
std::string prefix;
if (RE2::PartialMatch(line,numonly,&prefix)) {
nbpre_num_set.insert(prefix);
+ gunichar * x=g_utf8_to_ucs4_fast((const gchar *)prefix.c_str(),prefix.size(),0);
+ nbpre_num_ucs4.insert(std::wstring((wchar_t *)x));
+ g_free(x);
nnum++;
} else {
nbpre_gen_set.insert(line);
+ gunichar * x=g_utf8_to_ucs4_fast((const gchar *)line.c_str(),line.size(),0);
+ nbpre_gen_ucs4.insert(std::wstring((wchar_t *)x));
+ g_free(x);
nnon++;
}
}
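load_prefixes now keeps each non-breaking prefix twice: in the existing UTF-8 std::string sets and, via g_utf8_to_ucs4_fast, in the parallel UCS-4 std::wstring sets (nbpre_gen_ucs4 / nbpre_num_ucs4), so the code-point-based tokenizer loop can test prefixes without re-decoding. The (wchar_t *) cast assumes wchar_t is 32 bits wide, which holds on glibc but not on every platform. The conversion in isolation (to_ucs4 is a hypothetical helper, not in the patch):

    #include <glib.h>
    #include <string>

    // UTF-8 -> UCS-4 wstring, mirroring what the loader does per prefix;
    // relies on sizeof(wchar_t) == sizeof(gunichar).
    static std::wstring to_ucs4(const std::string& utf8) {
        gunichar *u = g_utf8_to_ucs4_fast((const gchar *)utf8.c_str(), utf8.size(), 0);
        std::wstring w((wchar_t *)u);
        g_free(u);
        return w;
    }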
@@ -223,117 +264,158 @@ Tokenizer::init() {
//
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
-// assumes protections are applied already, some invariants are in place
+// assumes protections are applied already, some invariants are in place,
+// e.g. that successive chars <= ' ' have been normalized to a single ' '
//
void
Tokenizer::protected_tokenize(std::string& text) {
- std::vector<std::string> words;
- size_t pos = 0;
- if (text.at(pos) == ' ')
+ std::vector<re2::StringPiece> words;
+ re2::StringPiece textpc(text);
+ int pos = 0;
+ if (textpc[pos] == ' ')
++pos;
size_t next = text.find(' ',pos);
while (next != std::string::npos) {
if (next - pos)
- words.push_back(text.substr(pos,next-pos));
+ words.push_back(textpc.substr(pos,next-pos));
pos = next + 1;
- while (pos < text.size() && text.at(pos) == ' ')
+ while (pos < textpc.size() && textpc[pos] == ' ')
++pos;
- next = text.find(' ',pos);
+ next = textpc.find(' ',pos);
}
- if (pos < text.size() && text.at(pos) != ' ')
- words.push_back(text.substr(pos,text.size()-pos));
+ if (pos < textpc.size() && textpc[pos] != ' ')
+ words.push_back(textpc.substr(pos,textpc.size()-pos));
- text.clear();
-
- // regurgitate words with look-ahead handling for tokens with final .
- for (size_t ii = 0; ii < words.size(); ++ii) {
+ // regurgitate words with look-ahead handling for tokens with final mumble
+ std::string outs;
+ std::size_t nwords(words.size());
+ for (size_t ii = 0; ii < nwords; ++ii) {
+ bool more_p = ii < nwords - 1;
size_t len = words[ii].size();
-
- if (len > 1 && words[ii].at(len-1) == '.') {
- std::string prefix(words[ii].substr(0,len-1));
- bool gen_prefix_p = nbpre_gen_set.find(prefix) != nbpre_gen_set.end();
- bool embeds_p = prefix.find('.') != std::string::npos;
- bool letter_p = RE2::PartialMatch(prefix.c_str(),letter_x);
- bool more_p = ii < words.size() - 1;
- bool nlower_p = more_p && RE2::PartialMatch(words[ii+1].c_str(),lower_x);
- bool num_prefix_p = (!gen_prefix_p) && nbpre_num_set.find(prefix) != nbpre_num_set.end();
- bool nint_p = more_p && RE2::PartialMatch(words[ii+1].c_str(),sinteger_x);
- bool isolate_p = true;
- if (gen_prefix_p) {
- isolate_p = false;
- } else if (num_prefix_p && nint_p) {
- isolate_p = false;
- } else if (embeds_p && letter_p) {
- isolate_p = false;
- } else if (nlower_p) {
- isolate_p = false;
- }
- if (isolate_p) {
- words[ii].assign(prefix);
- words[ii].append(" .");
+ bool sentence_break_p = len > 1 && words[ii][len-1] == '.';
+
+ // suppress break if it is an non-breaking prefix
+ if (sentence_break_p) {
+ re2::StringPiece pfx(words[ii].substr(0,len-1));
+ std::string pfxs(pfx.as_string());
+ if (nbpre_gen_set.find(pfxs) != nbpre_gen_set.end()) {
+ // general non-breaking prefix
+ sentence_break_p = false;
+ } else if (more_p && nbpre_num_set.find(pfxs) != nbpre_num_set.end() && RE2::PartialMatch(words[ii+1],sinteger_x)) {
+ // non-breaking before numeric
+ sentence_break_p = false;
+ } else if (pfxs.find('.') != std::string::npos && RE2::PartialMatch(pfx,letter_x)) {
+ // terminal isolated letter does not break
+ sentence_break_p = false;
+ } else if (more_p && RE2::PartialMatch(words[ii+1],lower_x)) {
+ // lower-case look-ahead does not break
+ sentence_break_p = false;
}
}
- text.append(words[ii]);
- if (ii < words.size() - 1)
- text.append(" ");
+ outs.append(words[ii].data(),len);
+ if (sentence_break_p)
+ outs.append(" .");
+ if (more_p)
+ outs.append(SPC_BYTE,1);
}
+ text.assign(outs.begin(),outs.end());
}
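The protected_tokenize rewrite splits on spaces into re2::StringPiece views of the original buffer instead of copying each word into its own std::string, and assembles the result in a scratch string before assigning it back. The splitting idiom in isolation (a sketch, not the member function itself):

    #include <re2/stringpiece.h>
    #include <string>
    #include <vector>

    // Zero-copy whitespace split: each token is a view into 'text'.
    static std::vector<re2::StringPiece> split_spaces(const std::string& text) {
        std::vector<re2::StringPiece> words;
        re2::StringPiece pc(text);
        size_t pos = 0;
        size_t next = pc.find(' ', pos);
        while (next != re2::StringPiece::npos) {
            if (next > pos)
                words.push_back(pc.substr(pos, next - pos));
            pos = next + 1;
            next = pc.find(' ', pos);
        }
        if (pos < pc.size())
            words.push_back(pc.substr(pos));
        return words;
    }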
bool
Tokenizer::escape(std::string& text) {
- static const char escaping[] = "&|<>'\"[]";
+ bool mod_p = false;
+ std::string outs;
+
static const char *replacements[] = {
- "&amp;",
- "&#124;",
- "&lt;",
- "&gt;",
- "&apos;",
- "&quot;",
- "&#91;",
- "&#93;"
+ "&#124;", // | 0
+ "&#91;", // [ 1
+ "&#93;", // ] 2
+ "&amp;", // & 3
+ "&lt;", // < 4
+ "&gt;", // > 5
+ "&apos;", // ' 6
+ "&quot;", // " 7
};
- bool modified = false;
- const char *next = escaping;
- for (int ii = 0; *next; ++ii, ++next) {
- size_t pos = 0;
- for (pos = text.find(*next,pos); pos != std::string::npos;
- pos = (++pos < text.size() ? text.find(*next,pos) : std::string::npos)) {
- std::string replacement(replacements[ii]);
- if (*next != '\'') {
- if (pos > 0 && text.at(pos-1) == ' ' && pos < text.size()-1 && text.at(pos+1) != ' ')
- replacement.append(" ");
+ const char *pp = text.c_str(); // from pp to pt is uncopied
+ const char *ep = pp + text.size();
+ const char *pt = pp;
+
+ while (pt < ep) {
+ if (*pt & 0x80) {
+ const char *mk = (const char *)g_utf8_find_next_char((const gchar *)pt,(const gchar *)ep);
+ if (!mk) {
+ if (mod_p)
+ outs.append(pp,pt-pp+1);
+ } else {
+ if (mod_p)
+ outs.append(pp,mk-pp);
+ pt = --mk;
+ }
+ pp = ++pt;
+ continue;
+ }
+
+ const char *sequence_p = 0;
+ if (*pt < '?') {
+ if (*pt == '&') {
+ sequence_p = replacements[3];
+ } else if (*pt == '\'') {
+ sequence_p = replacements[6];
+ } else if (*pt == '"') {
+ sequence_p = replacements[7];
}
- text.replace(pos,1,replacement);
- modified = true;
+ } else if (*pt > ']') {
+ if (*pt =='|') { // 7c
+ sequence_p = replacements[0];
+ }
+ } else if (*pt > 'Z') {
+ if (*pt == '<') { // 3e
+ sequence_p = replacements[4];
+ } else if (*pt == '>') { // 3c
+ sequence_p = replacements[5];
+ } else if (*pt == '[') { // 5b
+ sequence_p = replacements[1];
+ } else if (*pt == ']') { // 5d
+ sequence_p = replacements[2];
+ }
+ }
+
+ if (sequence_p) {
+ if (pt > pp)
+ outs.append(pp,pt-pp);
+ outs.append(sequence_p);
+ mod_p = true;
+ pp = ++pt;
+ } else {
+ ++pt;
}
}
- return modified;
+ if (mod_p) {
+ if (pp < pt) {
+ outs.append(pp,pt-pp);
+ }
+ text.assign(outs.begin(),outs.end());
+ }
+
+ return mod_p;
}
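escape() is now a single left-to-right pass over the bytes: multi-byte UTF-8 sequences are stepped over with g_utf8_find_next_char so their continuation bytes cannot be mistaken for ASCII metacharacters, and the output string is only built up once a substitution has actually been seen, so the common no-metacharacter case avoids the copy. The deferred-copy pattern in isolation (a sketch handling only '&'):

    #include <string>

    // Copy untouched runs lazily; return whether anything was replaced.
    static bool escape_amp_only(std::string& text) {
        std::string outs;
        bool mod_p = false;
        const char *pp = text.c_str(), *pt = pp, *ep = pp + text.size();
        while (pt < ep) {
            if (*pt == '&') {
                outs.append(pp, pt - pp);   // flush the pending unmodified run
                outs.append("&amp;");
                mod_p = true;
                pp = ++pt;
            } else {
                ++pt;
            }
        }
        if (mod_p) {
            outs.append(pp, pt - pp);
            text.assign(outs);
        }
        return mod_p;
    }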
std::string
Tokenizer::tokenize(const std::string& buf)
{
- static const char *apos_refs = "\\1 ' \\2";
- static const char *right_refs = "\\1 '\\2";
- static const char *left_refs = "\\1' \\2";
static const char *comma_refs = "\\1 , \\2";
static const char *isolate_ref = " \\1 ";
static const char *special_refs = "\\1 @\\2@ \\3";
- std::string outs;
std::string text(buf);
-
- if (skip_alltags_p) {
- RE2::GlobalReplace(&text,genl_tags_x," ");
- }
-
- RE2::GlobalReplace(&text,genl_spc_x," ");
- RE2::GlobalReplace(&text,ctrls_x,"");
+ std::string outs;
+ if (skip_alltags_p)
+ RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);
size_t pos;
int num = 0;
@@ -344,6 +426,7 @@ Tokenizer::tokenize(const std::string& buf)
// push all the prefixes matching protected patterns
std::vector<std::string> prot_stack;
std::string match;
+
for (auto& pat : prot_pat_vec) {
pos = 0;
while (RE2::PartialMatch(text.substr(pos),*pat,&match)) {
@@ -363,75 +446,446 @@ Tokenizer::tokenize(const std::string& buf)
}
}
- // collapse spaces
- RE2::GlobalReplace(&text,mult_spc_x," ");
-
- // strip leading space
- if (text.at(0) == ' ')
- text = text.substr(1);
-
- // strip trailing space
- if (text.at(text.size()-1) == ' ')
- text = text.substr(0,text.size()-1);
-
- // isolate hyphens, if non-default option is set
- if (aggressive_hyphen_p)
- RE2::GlobalReplace(&text,hyphen_x,special_refs);
-
- // find successive dots, protect them
- pos = text.find("..");
- while (pos != std::string::npos && pos < text.size()) {
- char subst[12];
- size_t lim = pos + 2;
- while (lim < text.size() && text.at(lim) == '.') ++lim;
- snprintf(subst,sizeof(subst),"MANYDOTS%.3d",lim-pos);
- text.replace(pos,lim-pos,subst,11);
- pos = text.find("..",pos+11);
-
- }
+ const char *pt(text.c_str());
+ const char *ep(pt + text.size());
+ while (pt < ep && *pt >= 0 && *pt <= ' ')
+ ++pt;
+ glong ulen(0);
+ gunichar *usrc(g_utf8_to_ucs4_fast((const gchar *)pt,ep - pt, &ulen)); // g_free
+ gunichar *ucs4(usrc);
+ gunichar *lim4(ucs4 + ulen);
+
+ gunichar *nxt4 = ucs4;
+ gunichar *ubuf(g_new0(gunichar,ulen*6+1)); // g_free
+ gunichar *uptr(ubuf);
+
+ gunichar prev_uch(0L);
+ gunichar next_uch(*ucs4);
+ gunichar curr_uch(0L);
+
+ GUnicodeType curr_type(G_UNICODE_UNASSIGNED);
+ GUnicodeType next_type((ucs4 && *ucs4) ? g_unichar_type(*ucs4) : G_UNICODE_UNASSIGNED);
+ GUnicodeType prev_type(G_UNICODE_UNASSIGNED);
+
+ bool post_break_p = false;
+ bool in_num = next_uch <= gunichar('9') && next_uch >= gunichar('0');
+ bool in_url_p = false;
+ bool final_p = false;
+ int since_start = 0;
+ int alpha_prefix = 0;
+
+ while (ucs4 < lim4) {
+ prev_uch = curr_uch;
+ prev_type = curr_type;
+ curr_uch = next_uch;
+ curr_type = next_type;
+
+ final_p = ++nxt4 >= lim4;
+
+ if (final_p) {
+ next_uch = gunichar(0L);
+ next_type = G_UNICODE_UNASSIGNED;
+ } else {
+ next_uch = *nxt4;
+ next_type = g_unichar_type(next_uch);
+ }
- // terminate token at superscript or subscript sequence when followed by lower-case
- RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3");
+ bool is_basic = *ucs4 < 0x80L;
- // isolate commas after non-digits
- RE2::GlobalReplace(&text,postncomma_x,"\\1 , ");
+ if (url_p) {
+ if (!in_url_p) {
+ if (!since_start) {
+ if (is_basic && std::isalpha(char(*ucs4)))
+ alpha_prefix++;
+ } else if (alpha_prefix == since_start && is_basic && char(*ucs4) == ':' && next_type != G_UNICODE_SPACE_SEPARATOR) {
+ in_url_p = true;
+ }
+ }
+ }
- // isolate commas before non-digits
- RE2::GlobalReplace(&text,prencomma_x," , \\1");
+ bool break_p = false;
+ const wchar_t *substitute_p = 0;
- // replace backtick with single-quote
- pos = text.find("`");
- while (pos != std::string::npos) {
- text.replace(pos,1,"'",1);
- pos = text.find("`");
- }
+ if (post_break_p) {
+ *uptr++ = gunichar(' ');
+ since_start = 0;
+ in_url_p = in_num = post_break_p = false;
+ }
- // replace doubled single-quotes with double-quotes
- pos = text.find("''");
- while (pos != std::string::npos) {
- text.replace(pos,2,"\"",1);
- pos = text.find("''",pos+1);
- }
+ switch (curr_type) {
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ if (in_url_p || in_num)
+ break_p = true;
+ // fallthough
+ case G_UNICODE_UPPERCASE_LETTER:
+ case G_UNICODE_LOWERCASE_LETTER:
+ if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
+ curr_uch = g_unichar_tolower(*ucs4);
+ break;
+ case G_UNICODE_SPACING_MARK:
+ break_p = true;
+ in_num = false;
+ curr_uch = gunichar(0L);
+ break;
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ if (!in_num && !in_url_p) {
+ switch (prev_type) {
+ case G_UNICODE_DASH_PUNCTUATION:
+ case G_UNICODE_FORMAT:
+ case G_UNICODE_OTHER_PUNCTUATION:
+ case G_UNICODE_UPPERCASE_LETTER:
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_DECIMAL_NUMBER:
+ break;
+ default:
+ break_p = true;
+ }
+ }
+ in_num = true;
+ break;
+ case G_UNICODE_CONNECT_PUNCTUATION:
+ if (curr_uch != gunichar(L'_')) {
+ if (in_url_p) {
+ in_url_p = false;
+ post_break_p = break_p = true;
+ }
+ }
+ if (in_num) {
+ post_break_p = break_p = true;
+ } else {
+ switch (next_type) {
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ break;
+ default:
+ post_break_p = break_p = true;
+ }
+ switch (prev_type) {
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ break;
+ default:
+ post_break_p = break_p = true;
+ }
+ }
+ break;
+ case G_UNICODE_DASH_PUNCTUATION:
+ case G_UNICODE_FORMAT:
+ if (aggressive_hyphen_p) {
+ substitute_p = L"@-@";
+ break_p = post_break_p = !in_url_p;
+ } else if (next_type == G_UNICODE_SPACE_SEPARATOR) {
+ } else if (prev_type == curr_type) {
+ if (next_type != curr_type) {
+ post_break_p = !in_url_p;
+ }
+ } else if (next_type == curr_type) {
+ break_p = !in_url_p;
+ } else if ((prev_type == G_UNICODE_UPPERCASE_LETTER ||
+ prev_type == G_UNICODE_LOWERCASE_LETTER) &&
+ next_type == G_UNICODE_DECIMAL_NUMBER) {
+ in_num = false;
+ } else if (in_num || since_start == 0) {
+ switch (next_type) {
+ case G_UNICODE_UPPERCASE_LETTER:
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ case G_UNICODE_SPACE_SEPARATOR:
+ break;
+ default:
+ post_break_p = break_p = prev_uch != curr_uch;
+ }
+ } else if (in_url_p) {
+ break_p = curr_uch != gunichar('-');
+ } else {
+ switch (prev_type) {
+ case G_UNICODE_UPPERCASE_LETTER:
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ case G_UNICODE_OTHER_PUNCTUATION:
+ switch (next_type) {
+ case G_UNICODE_UPPERCASE_LETTER:
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_MODIFIER_LETTER:
+ case G_UNICODE_OTHER_LETTER:
+ case G_UNICODE_TITLECASE_LETTER:
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LETTER_NUMBER:
+ case G_UNICODE_OTHER_NUMBER:
+ break;
+ default:
+ post_break_p = break_p = prev_uch != curr_uch;
+ }
+ break;
+ default:
+ post_break_p = break_p = prev_uch != curr_uch;
+ break;
+ }
+ }
+ break;
+ case G_UNICODE_OTHER_PUNCTUATION:
+ switch (curr_uch) {
+ case gunichar('!'):
+ case gunichar('#'):
+ case gunichar('/'):
+ case gunichar(':'):
+ case gunichar(';'):
+ case gunichar('?'):
+ case gunichar('@'):
+ post_break_p = break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
+ break;
+ case gunichar('+'):
+ post_break_p = break_p = !in_num && since_start > 0;
+ in_num = in_num || since_start == 0;
+ break;
+ case gunichar('&'):
+ post_break_p = break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
+ if (!non_escape_p)
+ substitute_p = L"&amp;";
+ break;
+ case gunichar('\''):
+ if (english_p) {
+ if (!in_url_p) {
+ break_p = true;
+ post_break_p = since_start == 0 ||
+ (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
+ }
+ } else if (latin_p) {
+ post_break_p = !in_url_p;
+ break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
+ } else {
+ post_break_p = break_p = !in_url_p;
+ }
+ if (!non_escape_p)
+ substitute_p = L"&apos;";
+ break;
+ case gunichar('"'):
+ post_break_p = break_p = true;
+ if (!non_escape_p)
+ substitute_p = L"&quot;";
+ break;
+ case gunichar(','):
+ break_p = !in_num || next_type != G_UNICODE_DECIMAL_NUMBER;
+ break;
+ case gunichar('.'):
+ if (prev_uch != '.') {
+ if (!in_num) {
+ switch (next_type) {
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER:
+ break;
+ default:
+ if (since_start > 0) {
+ switch (prev_type) {
+ case G_UNICODE_LOWERCASE_LETTER:
+ case G_UNICODE_UPPERCASE_LETTER: {
+ std::wstring k((wchar_t *)(uptr-since_start),since_start);
+ if (nbpre_gen_ucs4.find(k) != nbpre_gen_ucs4.end()) {
+ // general non-breaking prefix
+ } else if (nbpre_num_ucs4.find(k) != nbpre_num_ucs4.end() && class_follows_p(nxt4,lim4,G_UNICODE_DECIMAL_NUMBER)) {
+ // non-breaking before numeric
+ } else if (k.find(curr_uch) != std::wstring::npos) {
+ if (since_start > 1) {
+ GUnicodeType tclass = g_unichar_type(*(uptr-2));
+ switch (tclass) {
+ case G_UNICODE_UPPERCASE_LETTER:
+ case G_UNICODE_LOWERCASE_LETTER:
+ break_p = true;
+ break;
+ default:
+ break;
+ }
+ }
+ // terminal isolated letter does not break
+ } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
+ g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
+ // lower-case look-ahead does not break
+ } else {
+ break_p = true;
+ }
+ break;
+ }
+ default:
+ break_p = true;
+ break;
+ }
+ }
+ break;
+ }
+ } else {
+ switch (next_type) {
+ case G_UNICODE_DECIMAL_NUMBER:
+ case G_UNICODE_LOWERCASE_LETTER:
+ break;
+ default:
+ break_p = true;
+ }
+ }
+ } else if (next_uch != '.') {
+ post_break_p = true;
+ }
+ break;
+ default:
+ post_break_p = break_p = true;
+ break;
+ }
+ break;
+ case G_UNICODE_CLOSE_PUNCTUATION:
+ case G_UNICODE_FINAL_PUNCTUATION:
+ case G_UNICODE_INITIAL_PUNCTUATION:
+ case G_UNICODE_OPEN_PUNCTUATION:
+ switch (curr_uch) {
+ case gunichar('('):
+ case gunichar(')'):
+ break;
+ case gunichar('['):
+ if (!non_escape_p)
+ substitute_p = L"&#91;";
+ break;
+ case gunichar(']'):
+ if (!non_escape_p)
+ substitute_p = L"&#93;";
+ break;
+ default:
+ in_url_p = false;
+ }
+ post_break_p = break_p = !in_url_p;
+ break;
+ case G_UNICODE_CURRENCY_SYMBOL:
+ post_break_p = in_num; // was in number, so break it
+ break_p = !in_num;
+ in_num = in_num || next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar('.') || next_uch == gunichar(',');
+ if (curr_uch != gunichar('$'))
+ in_url_p = false;
+ break;
+ case G_UNICODE_MODIFIER_SYMBOL:
+ case G_UNICODE_MATH_SYMBOL:
+ switch (curr_uch) {
+ case gunichar('`'):
+ if (english_p) {
+ if (!in_url_p) {
+ break_p = true;
+ post_break_p = since_start == 0 ||
+ (next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
+ }
+ } else if (latin_p) {
+ post_break_p = !in_url_p;
+ break_p = !in_url_p && prev_type != G_UNICODE_LOWERCASE_LETTER && prev_type != G_UNICODE_UPPERCASE_LETTER;
+ } else {
+ post_break_p = break_p = !in_url_p;
+ }
+ if (!non_escape_p)
+ substitute_p = L"&apos;";
+ else
+ curr_uch = gunichar('\'');
+ break;
+ case gunichar('|'):
+ if (!non_escape_p)
+ substitute_p = L"&#124;";
+ post_break_p = break_p = true;
+ break;
+ case gunichar('<'):
+ if (!non_escape_p)
+ substitute_p = L"&lt;";
+ post_break_p = break_p = true;
+ break;
+ case gunichar('>'):
+ if (!non_escape_p)
+ substitute_p = L"&gt;";
+ post_break_p = break_p = true;
+ break;
+ case gunichar('%'):
+ post_break_p = in_num;
+ break_p = !in_num && !in_url_p;
+ in_num = false;
+ break;
+ case gunichar('='):
+ case gunichar('~'):
+ in_num = false;
+ post_break_p = break_p = !in_url_p;
+ break;
+ case gunichar('+'):
+ in_num = in_num || since_start == 0;
+ post_break_p = break_p = !in_url_p;
+ break;
+ default:
+ post_break_p = break_p = true;
+ break;
+ }
+ break;
+ case G_UNICODE_OTHER_SYMBOL:
+ post_break_p = break_p = true;
+ break;
+ case G_UNICODE_LINE_SEPARATOR:
+ curr_uch = gunichar(' ');
+ in_url_p = in_num = false;
+ break;
+ case G_UNICODE_SPACE_SEPARATOR:
+ curr_uch = gunichar(' ');
+ in_url_p = in_num = false;
+ break;
+ default:
+ curr_uch = 0;
+ in_url_p = in_num = false;
+ break;
+ }
+
+ if ((break_p || curr_uch == gunichar(' '))) {
+ if (since_start) {
+ *uptr++ = gunichar(' ');
+ in_url_p = false;
+ in_num = in_num && !post_break_p;
+ since_start = 0;
+ }
+ if (curr_uch == gunichar(' '))
+ curr_uch = gunichar(0L);
+ }
+
+ if (substitute_p) {
+ for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
+ *uptr++ = *sptr;
+ since_start++;
+ }
+ in_url_p = in_num = false;
+ } else if (curr_uch) {
+ *uptr++ = curr_uch;
+ since_start++;
+ }
- // isolate special characters
- RE2::GlobalReplace(&text,specials_x,isolate_ref);
-
- if (english_p) {
- // english contractions to the right
- RE2::GlobalReplace(&text,nanaapos_x,apos_refs);
- RE2::GlobalReplace(&text,nxpaapos_x,apos_refs);
- RE2::GlobalReplace(&text,panaapos_x,apos_refs);
- RE2::GlobalReplace(&text,papaapos_x,right_refs);
- RE2::GlobalReplace(&text,pnsapos_x,"\\1 's");
- } else if (latin_p) {
- // italian,french contractions to the left
- RE2::GlobalReplace(&text,nanaapos_x,apos_refs);
- RE2::GlobalReplace(&text,napaapos_x,apos_refs);
- RE2::GlobalReplace(&text,panaapos_x,apos_refs);
- RE2::GlobalReplace(&text,papaapos_x,left_refs);
+ ucs4 = nxt4;
}
- protected_tokenize(text);
+ glong nbytes = 0;
+ gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
+ if (utf8[nbytes-1] == ' ')
+ --nbytes;
+ text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
+ g_free(utf8);
+ g_free(usrc);
+ g_free(ubuf);
+
+ // terminate token at superscript or subscript sequence when followed by lower-case
+ if (supersub_p)
+ RE2::GlobalReplace(&text,numscript_x,"\\1\\2 \\3");
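The rewritten tokenize() decodes the whitespace-trimmed line to UCS-4 once with g_utf8_to_ucs4_fast, walks it with a prev/curr/next window classified by g_unichar_type, writes into a scratch buffer sized ulen*6+1 gunichars (room for the longest entity substitution, e.g. "&#124;"), and re-encodes with g_ucs4_to_utf8. The decode-edit-re-encode skeleton in isolation (names are illustrative and the per-character logic is only a stand-in for the real rules above):

    #include <glib.h>
    #include <string>

    static std::string ucs4_roundtrip(const std::string& in) {
        glong ulen = 0;
        gunichar *src = g_utf8_to_ucs4_fast((const gchar *)in.c_str(), in.size(), &ulen);
        gunichar *buf = g_new0(gunichar, ulen * 6 + 1);  // worst case: 6 code points per input char
        gunichar *out = buf;
        for (glong i = 0; i < ulen; ++i)
            *out++ = g_unichar_tolower(src[i]);          // stand-in for the tokenization rules
        glong nbytes = 0;
        gchar *utf8 = g_ucs4_to_utf8(buf, out - buf, 0, &nbytes, 0);
        std::string result(utf8, utf8 + nbytes);
        g_free(utf8);
        g_free(src);
        g_free(buf);
        return result;
    }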
// restore prefix-protected strings
num = 0;
@@ -440,36 +894,11 @@ Tokenizer::tokenize(const std::string& buf)
snprintf(subst,sizeof(subst),"THISISPROTECTED%.3d",num++);
size_t loc = text.find(subst);
while (loc != std::string::npos) {
- text.replace(loc,18,prot);
+ text.replace(loc,18,prot.data(),prot.size());
loc = text.find(subst,loc+18);
}
}
- // restore dot-sequences with correct length
- std::string numstr;
- pos = 0;
- while (RE2::PartialMatch(text,dotskey_x,&numstr)) {
- int count = std::strtoul(numstr.c_str(),0,0);
- int loc = text.find("MANYDOTS",pos);
- std::ostringstream fss;
- fss << text.substr(0,loc);
- if (loc > 0 && text.at(loc-1) != ' ')
- fss << ' ';
- for (int ii = 0; ii < count; ++ii)
- fss << '.';
- int sublen = 8 + numstr.size();
- pos = loc + sublen;
- if (pos < text.size() && text.at(pos) != ' ')
- fss << ' ';
- fss << text.substr(pos);
- pos = loc;
- text.assign(fss.str());
- }
-
- // escape moses mark-up
- if (!non_escape_p)
- escape(text);
-
// return value
outs.assign(text);
@@ -480,9 +909,9 @@ Tokenizer::tokenize(const std::string& buf)
size_t len = text.size();
if (len > 2 && text.substr(0,2) == "``")
text.replace(0,2,"`` ",3);
- else if (text.at(0) == '"')
+ else if (text[0] == '"')
text.replace(0,1,"`` ",3);
- else if (text.at(0) == '`' || text.at(0) == '\'')
+ else if (text[0] == '`' || text[0] == '\'')
text.replace(0,1,"` ",2);
static char one_gg[] = "\\1 ``";
RE2::GlobalReplace(&text,x1_v_d,one_gg);
@@ -528,11 +957,11 @@ Tokenizer::tokenize(const std::string& buf)
// insure leading and trailing space on line, to simplify exprs
// also make sure final . has one space on each side
len = text.size();
- while (len > 1 && text.at(len-1) == ' ') --len;
+ while (len > 1 && text[len-1] == ' ') --len;
if (len < text.size())
text.assign(text.substr(0,len));
- if (len > 2 && text.at(len-1) == '.') {
- if (text.at(len-2) != ' ') {
+ if (len > 2 && text[len-1] == '.') {
+ if (text[len-2] != ' ') {
text.assign(text.substr(0,len-1));
text.append(" . ");
} else {
@@ -540,9 +969,9 @@ Tokenizer::tokenize(const std::string& buf)
text.append(". ");
}
} else {
- text.append(" ");
+ text.append(SPC_BYTE,1);
}
- std::string ntext(" ");
+ std::string ntext(SPC_BYTE);
ntext.append(text);
// convert double quote to paired single-quotes
@@ -577,7 +1006,7 @@ Tokenizer::tokenize(const std::string& buf)
RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
// collapse spaces
- RE2::GlobalReplace(&ntext,mult_spc_x," ");
+ RE2::GlobalReplace(&ntext,mult_spc_x,SPC_BYTE);
// escape moses meta-characters
if (!non_escape_p)
@@ -601,11 +1030,11 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
line_no ++;
if (istr.empty())
continue;
- if (skip_xml_p && RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)) {
+ if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
os << istr << std::endl;
} else {
- std::string bstr(" ");
- bstr.append(istr).append(" ");
+ std::string bstr(SPC_BYTE);
+ bstr.append(istr).append(SPC_BYTE);
os << tokenize(bstr) << std::endl;
}
if (verbose_p && ((line_no % 1000) == 0)) {
@@ -652,7 +1081,7 @@ Tokenizer::detokenize(const std::string& buf)
std::size_t squotes = 0;
std::size_t dquotes = 0;
- std::string prepends(" ");
+ std::string prepends(SPC_BYTE);
std::ostringstream oss;
@@ -665,10 +1094,10 @@ Tokenizer::detokenize(const std::string& buf)
prepends.clear();
} else if (RE2::FullMatch(word,left_x)) {
oss << word;
- prepends = " ";
+ prepends = SPC_BYTE;
} else if (english_p && iword && RE2::FullMatch(word,curr_en_x) && RE2::FullMatch(words[iword-1],pre_en_x)) {
oss << word;
- prepends = " ";
+ prepends = SPC_BYTE;
} else if (latin_p && iword < nwords - 2 && RE2::FullMatch(word,curr_fr_x) && RE2::FullMatch(words[iword+1],post_fr_x)) {
oss << prepends << word;
prepends.clear();
@@ -677,7 +1106,7 @@ Tokenizer::detokenize(const std::string& buf)
(word.at(0) == '"' && ((dquotes % 2) == 0))) {
if (english_p && iword && word.at(0) == '\'' && words[iword-1].at(words[iword-1].size()-1) == 's') {
oss << word;
- prepends = " ";
+ prepends = SPC_BYTE;
} else {
oss << prepends << word;
prepends.clear();
@@ -688,7 +1117,7 @@ Tokenizer::detokenize(const std::string& buf)
}
} else {
oss << word;
- prepends = " ";
+ prepends = SPC_BYTE;
if (word.at(0) == '\'')
squotes++;
else if (word.at(0) == '"')
@@ -703,7 +1132,7 @@ Tokenizer::detokenize(const std::string& buf)
std::string text(oss.str());
- RE2::GlobalReplace(&text," +"," ");
+ RE2::GlobalReplace(&text," +",SPC_BYTE);
RE2::GlobalReplace(&text,"\n ","\n");
RE2::GlobalReplace(&text," \n","\n");
return trim(text);
@@ -720,7 +1149,7 @@ Tokenizer::detokenize(std::istream& is, std::ostream& os)
line_no ++;
if (istr.empty())
continue;
- if (skip_xml_p && RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)) {
+ if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
os << istr << std::endl;
} else {
os << detokenize(istr) << std::endl;
diff --git a/contrib/c++tokenizer/tokenizer.h b/contrib/c++tokenizer/tokenizer.h
index aab673cc0..017d38a95 100644
--- a/contrib/c++tokenizer/tokenizer.h
+++ b/contrib/c++tokenizer/tokenizer.h
@@ -28,6 +28,8 @@ private:
std::set<std::string> nbpre_num_set;
std::set<std::string> nbpre_gen_set;
+ std::set<std::wstring> nbpre_num_ucs4;
+ std::set<std::wstring> nbpre_gen_ucs4;
std::vector<re2::RE2 *> prot_pat_vec;
protected:
@@ -40,6 +42,10 @@ protected:
bool skip_alltags_p;
bool non_escape_p;
bool aggressive_hyphen_p;
+ bool supersub_p;
+ bool url_p;
+ bool downcase_p;
+ bool normalize_p;
bool penn_p;
bool verbose_p;
@@ -62,6 +68,10 @@ public:
bool _skip_alltags_p = true, // skip all xml style tags
bool _non_escape_p = false, // default is to call escape method before return
bool _aggressive_hyphen_p = false, // hyphens become tokens when true
+ bool _supersub_p = false, // handle super/subscript numerics
+ bool _url_p = true,
+ bool _downcase_p = false,
+ bool _normalize_p = true,
bool _penn_p = false, // Treebank-3 compatible tokenization when true
bool _verbose_p = false);
diff --git a/contrib/c++tokenizer/tokenizer_main.cpp b/contrib/c++tokenizer/tokenizer_main.cpp
index a4fc8f97b..addd533d3 100644
--- a/contrib/c++tokenizer/tokenizer_main.cpp
+++ b/contrib/c++tokenizer/tokenizer_main.cpp
@@ -2,6 +2,7 @@
#include <memory>
#include <vector>
#include <cctype>
+#include <cstring>
#ifdef TOKENIZER_NAMESPACE
using namespace TOKENIZER_NAMESPACE ;
@@ -11,16 +12,19 @@ using namespace TOKENIZER_NAMESPACE ;
void
usage(const char *path)
{
- std::cerr << "Usage: " << path << "[-{v|x|p|a|e|]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
+ std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
+ std::cerr << " -a -- aggressive hyphenization" << std::endl;
+ std::cerr << " -e -- escape entities" << std::endl;
+ std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
+ std::cerr << " -d -- downcase" << std::endl;
+ std::cerr << " -o OUT -- output file path" << std::endl;
+ std::cerr << " -p -- penn treebank style" << std::endl;
+ std::cerr << " -s -- super- and sub-script conjoining" << std::endl;
+ std::cerr << " -u -- disable url handling" << std::endl;
std::cerr << " -v -- verbose" << std::endl;
std::cerr << " -w -- word filter" << std::endl;
std::cerr << " -x -- skip xml tag lines" << std::endl;
std::cerr << " -y -- skip all xml tags" << std::endl;
- std::cerr << " -e -- escape entities" << std::endl;
- std::cerr << " -a -- aggressive hyphenization" << std::endl;
- std::cerr << " -p -- treebank-3 style" << std::endl;
- std::cerr << " -c DIR -- config (pattern) file directory" << std::endl;
- std::cerr << " -o OUT -- output file path" << std::endl;
std::cerr << "Default is -c ., stdin, stdout." << std::endl;
std::cerr << "LL in en,fr,it affect contraction." << std::endl;
}
@@ -58,7 +62,7 @@ std::string token_word(const std::string& in) {
}
}
}
- if (last_quirk == pos || digits_prefixed > 0 && nalpha == 0)
+ if (last_quirk == pos || (digits_prefixed > 0 && nalpha == 0))
cv.clear(); // invalid word
return std::string(cv.begin(),cv.end());
}
@@ -93,7 +97,7 @@ int main(int ac, char **av)
std::string lang_iso;
std::vector<std::string> args;
std::string out_path;
- char *cfg_path = 0;
+ const char *cfg_path = 0;
bool next_cfg_p = false;
bool next_output_p = false;
bool verbose_p = false;
@@ -101,28 +105,47 @@ int main(int ac, char **av)
bool alltag_p = false;
bool escape_p = true;
bool aggro_p = false;
+ bool supersub_p = false;
+ bool url_p = true;
+ bool downcase_p = false;
bool penn_p = false;
bool words_p = false;
const char *prog = av[0];
+
while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
+ case 'a':
+ aggro_p = true;
+ break;
case 'h':
usage(prog);
exit(0);
case 'c':
next_cfg_p = true;
break;
+ case 'd':
+ downcase_p = true;
+ break;
+ case 'e':
+ escape_p = false;
+ break;
case 'o':
next_output_p = true;
break;
+ case 'p':
+ penn_p = true;
+ break;
+ case 's':
+ supersub_p = true;
+ break;
+ case 'u':
+ url_p = false;
+ break;
case 'v':
verbose_p = true;
break;
- case 'e':
- escape_p = false;
- break;
case 'w':
words_p = true;
break;
@@ -132,23 +155,15 @@ int main(int ac, char **av)
case 'y':
alltag_p = true;
break;
- case 'a':
- aggro_p = true;
- break;
case 'l':
// ignored
break;
- case 'p':
- penn_p = true;
- break;
default:
std::cerr << "Unknown option: " << *av << std::endl;
::exit(1);
}
} else if (lang_iso.empty() && strlen(*av) == 2) {
lang_iso = *av;
- } else if (**av == '-') {
- ++*av;
} else if (next_output_p) {
next_output_p = false;
out_path = *av;
@@ -163,7 +178,43 @@ int main(int ac, char **av)
if (!cfg_path) {
cfg_path = getenv("TOKENIZER_SHARED_DIR");
}
+ if (!cfg_path) {
+ if (!::access("../shared/.",X_OK)) {
+ if (!::access("../shared/moses/.",X_OK)) {
+ cfg_path = "../shared/moses";
+ } else {
+ cfg_path = "../shared";
+ }
+ } else if (!::access("./shared/.",X_OK)) {
+ if (!::access("./shared/moses/.",X_OK)) {
+ cfg_path = "./shared/moses";
+ } else {
+ cfg_path = "./shared";
+ }
+ } else if (!::access("./nonbreaking_prefix.en",R_OK)) {
+ cfg_path = ".";
+ } else {
+ const char *slash = std::strrchr(prog,'/');
+ if (slash) {
+ std::string cfg_dir_str(prog,slash-prog);
+ std::string cfg_shr_str(cfg_dir_str);
+ cfg_shr_str.append("/shared");
+ std::string cfg_mos_str(cfg_shr_str);
+ cfg_mos_str.append("/moses");
+ if (!::access(cfg_mos_str.c_str(),X_OK)) {
+ cfg_path = strdup(cfg_mos_str.c_str());
+ } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
+ cfg_path = strdup(cfg_shr_str.c_str());
+ } else if (!::access(cfg_dir_str.c_str(),X_OK)) {
+ cfg_path = strdup(cfg_dir_str.c_str());
+ }
+ }
+ }
+ }
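When no -c option or TOKENIZER_SHARED_DIR is given, the driver now probes a few likely locations for the nonbreaking-prefix files with ::access() (X_OK for candidate directories, R_OK for a prefix file in the current directory), finally deriving candidates from the executable's own path. The probing idiom in isolation (paths here are illustrative):

    #include <unistd.h>
    #include <cstdio>

    int main() {
        const char *candidates[] = { "../shared/moses", "../shared", "." };
        for (const char *dir : candidates) {
            if (!::access(dir, X_OK)) {       // access() returns 0 on success
                std::printf("config dir: %s\n", dir);
                return 0;
            }
        }
        std::fprintf(stderr, "no config dir found\n");
        return 1;
    }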
if (cfg_path) {
+ if (verbose_p) {
+ std::cerr << "config path: " << cfg_path << std::endl;
+ }
Tokenizer::set_config_dir(std::string(cfg_path));
}
@@ -173,7 +224,7 @@ int main(int ac, char **av)
}
std::ostream& ofs(pofs ? *pofs : std::cout);
- Tokenizer tize(lang_iso,detag_p,alltag_p,!escape_p,aggro_p,penn_p,verbose_p);
+ Tokenizer tize(lang_iso,detag_p,alltag_p,!escape_p,aggro_p,supersub_p,url_p,downcase_p,penn_p,verbose_p);
tize.init();
size_t nlines = 0;