Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJeroen Vermeulen <jtv@precisiontranslationtools.com>2015-04-30 08:05:11 +0300
committerJeroen Vermeulen <jtv@precisiontranslationtools.com>2015-04-30 08:05:11 +0300
commiteca582410006443d0b101a9ae188e302f34f8a03 (patch)
tree35212762fbe666330205e2a9ef09d16a918d077c /contrib/c++tokenizer
parent85acdc62b1548863a6db18bebb538406cfcfa038 (diff)
Remove trailing whitespace in C++ files.
Diffstat (limited to 'contrib/c++tokenizer')
-rw-r--r--contrib/c++tokenizer/tokenizer.cpp254
-rw-r--r--contrib/c++tokenizer/tokenizer.h16
-rw-r--r--contrib/c++tokenizer/tokenizer_main.cpp18
3 files changed, 144 insertions, 144 deletions
diff --git a/contrib/c++tokenizer/tokenizer.cpp b/contrib/c++tokenizer/tokenizer.cpp
index 035ba2e97..6d3dd7046 100644
--- a/contrib/c++tokenizer/tokenizer.cpp
+++ b/contrib/c++tokenizer/tokenizer.cpp
@@ -46,7 +46,7 @@ RE2 curr_en_x("^[Nn]?[\'][\\p{L}]"); // english contraction suffixes conjoin to
RE2 pre_en_x(".*[\\p{L}\\p{N}]+$"); // valid english contraction prefixes
RE2 curr_fr_x(".*[\\p{L}\\p{N}]+[\']"); // french/italian contraction prefixes conjoin to the right
RE2 post_fr_x("^[\\p{L}\\p{N}]*"); // valid french/italian contraction suffixes
-// anything rarely used will just be given as a string and compiled on demand by RE2
+// anything rarely used will just be given as a string and compiled on demand by RE2
const char *
SPC_BYTE = " ";
@@ -85,8 +85,8 @@ const char *ESCAPE_MOSES[] = {
"&apos;", // ' 6 (27)
"&quot;", // " 7 (22)
};
-
-const std::set<std::string>
+
+const std::set<std::string>
ESCAPE_SET = {
std::string(ESCAPE_MOSES[0]),
std::string(ESCAPE_MOSES[1]),
@@ -98,7 +98,7 @@ ESCAPE_SET = {
std::string(ESCAPE_MOSES[7]),
};
-const std::map<std::wstring,gunichar>
+const std::map<std::wstring,gunichar>
ENTITY_MAP = {
{ std::wstring(L"&quot;"), L'"' },
{ std::wstring(L"&amp;"), L'&' },
@@ -355,7 +355,7 @@ ENTITY_MAP = {
{ std::wstring(L"&diams;"), L'\u2666' }
};
-inline gunichar
+inline gunichar
get_entity(gunichar *ptr, size_t len) {
// try hex, decimal entity first
gunichar ech(0);
@@ -380,16 +380,16 @@ get_entity(gunichar *ptr, size_t len) {
ech = 0;
}
}
- if (ech)
+ if (ech)
return ech;
- std::map<std::wstring,gunichar>::const_iterator it =
+ std::map<std::wstring,gunichar>::const_iterator it =
ENTITY_MAP.find(std::wstring((wchar_t *)(ptr),len));
return it != ENTITY_MAP.end() ? it->second : gunichar(0);
}
-inline gunichar
+inline gunichar
get_entity(char *ptr, size_t len) {
glong ulen = 0;
gunichar *gtmp = g_utf8_to_ucs4_fast((const gchar *)ptr, len, &ulen);
@@ -399,7 +399,7 @@ get_entity(char *ptr, size_t len) {
}
-inline std::string
+inline std::string
trim(const std::string& in)
{
std::size_t start = 0;
@@ -413,7 +413,7 @@ trim(const std::string& in)
}
-inline std::vector<std::string>
+inline std::vector<std::string>
split(const std::string& in)
{
std::vector<std::string> outv;
@@ -476,7 +476,7 @@ Tokenizer::Tokenizer(const Parameters& _)
//
// dtor deletes dynamically allocated per-language RE2 compiled expressions
//
-Tokenizer::~Tokenizer()
+Tokenizer::~Tokenizer()
{
for (auto& ptr : prot_pat_vec) {
if (ptr == &numprefixed_x || ptr == &quasinumeric_x)
@@ -491,7 +491,7 @@ Tokenizer::~Tokenizer()
// others into nbpre_gen_set
//
std::pair<int,int>
-Tokenizer::load_prefixes(std::ifstream& ifs)
+Tokenizer::load_prefixes(std::ifstream& ifs)
{
RE2 numonly("(.*)[\\s]+(\\#NUMERIC_ONLY\\#)");
std::string line;
@@ -547,7 +547,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
try {
std::pair<int,int> counts = load_prefixes(cfg);
if (verbose_p) {
- std::cerr << "loaded " << counts.first << " non-numeric, "
+ std::cerr << "loaded " << counts.first << " non-numeric, "
<< counts.second << " numeric prefixes from "
<< nbpre_path << std::endl;
}
@@ -570,7 +570,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
std::string protpat_path(cfg_dir);
protpat_path.append("/protected_pattern.").append(lang_iso);
// default to generic version
- if (::access(protpat_path.c_str(),R_OK))
+ if (::access(protpat_path.c_str(),R_OK))
protpat_path = protpat_path.substr(0,protpat_path.size()-lang_iso.size()-1);
prot_pat_vec.push_back(&numprefixed_x);
@@ -596,7 +596,7 @@ Tokenizer::init(const char *cfg_dir_optional) {
throw std::runtime_error(ess.str());
}
if (verbose_p) {
- std::cerr << "loaded " << npat << " protected patterns from "
+ std::cerr << "loaded " << npat << " protected patterns from "
<< protpat_path << std::endl;
}
} else if (verbose_p) {
@@ -612,7 +612,7 @@ Tokenizer::reset() {
//
// apply ctor-selected tokenization to a string, in-place, no newlines allowed,
-// assumes protections are applied already, some invariants are in place,
+// assumes protections are applied already, some invariants are in place,
// e.g. that successive chars <= ' ' have been normalized to a single ' '
//
void
@@ -633,7 +633,7 @@ Tokenizer::protected_tokenize(std::string& text) {
}
if (pos < textpc.size() && textpc[pos] != ' ')
words.push_back(textpc.substr(pos,textpc.size()-pos));
-
+
// regurgitate words with look-ahead handling for tokens with final mumble
std::string outs;
std::size_t nwords(words.size());
@@ -659,7 +659,7 @@ Tokenizer::protected_tokenize(std::string& text) {
// lower-case look-ahead does not break
sentence_break_p = false;
}
- }
+ }
outs.append(words[ii].data(),len);
if (sentence_break_p)
@@ -671,15 +671,15 @@ Tokenizer::protected_tokenize(std::string& text) {
}
-bool
+bool
Tokenizer::unescape(std::string& word) {
std::ostringstream oss;
std::size_t was = 0; // last processed
std::size_t pos = 0; // last unprocessed
std::size_t len = 0; // processed length
bool hit = false;
- for (std::size_t endp=0;
- (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
+ for (std::size_t endp=0;
+ (pos = word.find('&',was)) != std::string::npos && (endp = word.find(';',pos)) != std::string::npos;
was = endp == std::string::npos ? pos : 1+endp) {
len = endp - pos + 1;
glong ulen(0);
@@ -703,7 +703,7 @@ Tokenizer::unescape(std::string& word) {
}
g_free(gtmp);
}
- if (was < word.size())
+ if (was < word.size())
oss << word.substr(was);
if (hit)
word = oss.str();
@@ -727,7 +727,7 @@ Tokenizer::escape(std::string& text) {
if (mod_p)
outs.append(pp,pt-pp+1);
} else {
- if (mod_p)
+ if (mod_p)
outs.append(pp,mk-pp);
pt = --mk;
}
@@ -751,7 +751,7 @@ Tokenizer::escape(std::string& text) {
} else if (*pt > ']') {
if (*pt =='|') { // 7c
sequence_p = ESCAPE_MOSES[0];
- }
+ }
} else if (*pt > 'Z') {
if (*pt == '<') { // 3e
sequence_p = ESCAPE_MOSES[4];
@@ -761,11 +761,11 @@ Tokenizer::escape(std::string& text) {
sequence_p = ESCAPE_MOSES[1];
} else if (*pt == ']') { // 5d
sequence_p = ESCAPE_MOSES[2];
- }
+ }
}
if (sequence_p) {
- if (pt > pp)
+ if (pt > pp)
outs.append(pp,pt-pp);
outs.append(sequence_p);
mod_p = true;
@@ -774,7 +774,7 @@ Tokenizer::escape(std::string& text) {
++pt;
}
}
-
+
if (mod_p) {
if (pp < pt) {
outs.append(pp,pt-pp);
@@ -795,13 +795,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
std::string text(buf);
std::string outs;
- if (skip_alltags_p)
+ if (skip_alltags_p)
RE2::GlobalReplace(&text,genl_tags_x,SPC_BYTE);
// directed quote patches
size_t len = text.size();
- if (len > 2 && text.substr(0,2) == "``")
- text.replace(0,2,"`` ",3);
+ if (len > 2 && text.substr(0,2) == "``")
+ text.replace(0,2,"`` ",3);
else if (text[0] == '"')
text.replace(0,1,"`` ",3);
else if (text[0] == '`' || text[0] == '\'')
@@ -811,9 +811,9 @@ Tokenizer::penn_tokenize(const std::string& buf)
RE2::GlobalReplace(&text,x1_v_gg,one_gg);
RE2::GlobalReplace(&text,x1_v_g,"\\1 ` \\2");
RE2::GlobalReplace(&text,x1_v_q,"\\1 ` ");
-
+
// protect ellipsis
- for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
+ for (size_t pos = text.find("..."); pos != std::string::npos; pos = text.find("...",pos+11))
text.replace(pos,3,"MANYELIPSIS",11);
// numeric commas
@@ -826,13 +826,13 @@ Tokenizer::penn_tokenize(const std::string& buf)
// isolable slash
RE2::GlobalReplace(&text,slash_x,special_refs);
-
+
// isolate final period
RE2::GlobalReplace(&text,final_x,"\\1 \\2\\3");
-
+
// isolate q.m., e.m.
RE2::GlobalReplace(&text,qx_x,isolate_ref);
-
+
// isolate braces
RE2::GlobalReplace(&text,braces_x,isolate_ref);
@@ -866,7 +866,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
}
std::string ntext(SPC_BYTE);
ntext.append(text);
-
+
// convert double quote to paired single-quotes
RE2::GlobalReplace(&ntext,"\""," '' ");
@@ -894,7 +894,7 @@ Tokenizer::penn_tokenize(const std::string& buf)
RE2::GlobalReplace(&ntext," ([Ww])anna "," \\1an na ");
protected_tokenize(ntext);
-
+
// restore ellipsis
RE2::GlobalReplace(&ntext,"MANYELIPSIS","...");
@@ -919,7 +919,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
int num = 0;
// this is the main moses-compatible tokenizer
-
+
// push all the prefixes matching protected patterns
std::vector<std::string> prot_stack;
std::string match;
@@ -942,7 +942,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
}
}
-
+
const char *pt(text.c_str());
const char *ep(pt + text.size());
while (pt < ep && *pt >= 0 && *pt <= ' ')
@@ -990,8 +990,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (!since_start) {
if (std::isalpha(char(*ucs4)))
alpha_prefix++;
- } else if (alpha_prefix == since_start
- && char(*ucs4) == ':'
+ } else if (alpha_prefix == since_start
+ && char(*ucs4) == ':'
&& next_type != G_UNICODE_SPACE_SEPARATOR) {
in_url_p = true;
}
@@ -1018,7 +1018,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
// fallthough
case G_UNICODE_UPPERCASE_LETTER:
case G_UNICODE_LOWERCASE_LETTER:
- if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
+ if (downcase_p && curr_type == G_UNICODE_UPPERCASE_LETTER)
curr_uch = g_unichar_tolower(*ucs4);
break;
case G_UNICODE_SPACING_MARK:
@@ -1082,8 +1082,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
substitute_p = L"@-@";
post_break_p = pre_break_p = true;
} else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
- ( curr_uch > gunichar(L'\u2011')
- && curr_uch != gunichar(L'\u30A0')
+ ( curr_uch > gunichar(L'\u2011')
+ && curr_uch != gunichar(L'\u30A0')
&& curr_uch < gunichar(L'\uFE63') ) ) {
// dash, not a hyphen
post_break_p = pre_break_p = true;
@@ -1151,7 +1151,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
default:
post_break_p = pre_break_p = prev_uch != curr_uch;
break;
- }
+ }
}
}
break;
@@ -1159,8 +1159,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
switch (curr_uch) {
case gunichar(L':'):
case gunichar(L'/'):
- if (refined_p && !in_url_p
- && prev_type == G_UNICODE_DECIMAL_NUMBER
+ if (refined_p && !in_url_p
+ && prev_type == G_UNICODE_DECIMAL_NUMBER
&& next_type == G_UNICODE_DECIMAL_NUMBER) {
break;
}
@@ -1178,7 +1178,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
break;
case gunichar(L'&'):
if (unescape_p) {
- if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
+ if (next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER
|| next_type == G_UNICODE_DECIMAL_NUMBER || next_uch == gunichar(L'#')) {
gunichar *eptr = nxt4;
GUnicodeType eptr_type(G_UNICODE_UNASSIGNED);
@@ -1223,16 +1223,16 @@ Tokenizer::quik_tokenize(const std::string& buf)
next_type = nxt4 < lim4 ? g_unichar_type(next_uch) : G_UNICODE_UNASSIGNED;
goto retry;
}
-
+
}
post_break_p = pre_break_p = !in_url_p || next_type != G_UNICODE_SPACE_SEPARATOR;
- if (escape_p)
+ if (escape_p)
substitute_p = L"&amp;";
break;
case gunichar(L'\''):
if (english_p) {
if (!in_url_p) {
- bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
+ bool next_letter_p = next_type == G_UNICODE_LOWERCASE_LETTER
|| next_type == G_UNICODE_UPPERCASE_LETTER;
pre_break_p = true;
if (next_letter_p && refined_p) {
@@ -1241,9 +1241,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
*(uptr - 1) = gunichar(L' ');
*(uptr++) = prev_uch;
pre_break_p = false;
- }
+ }
}
- post_break_p = since_start == 0
+ post_break_p = since_start == 0
|| (!next_letter_p && next_type != G_UNICODE_DECIMAL_NUMBER);
}
} else if (latin_p) {
@@ -1252,12 +1252,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else {
post_break_p = pre_break_p = !in_url_p;
}
- if (escape_p)
+ if (escape_p)
substitute_p = L"&apos;";
break;
case gunichar(L'"'):
post_break_p = pre_break_p = true;
- if (escape_p)
+ if (escape_p)
substitute_p = L"&quot;";
break;
case gunichar(L','):
@@ -1303,7 +1303,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
}
// terminal isolated letter does not break
- } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
+ } else if (class_follows_p(nxt4,lim4,G_UNICODE_LOWERCASE_LETTER) ||
g_unichar_type(*nxt4) == G_UNICODE_DASH_PUNCTUATION) {
// lower-case look-ahead does not break
} else {
@@ -1315,7 +1315,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
pre_break_p = true;
break;
}
- }
+ }
break;
}
} else {
@@ -1346,11 +1346,11 @@ Tokenizer::quik_tokenize(const std::string& buf)
case gunichar(L')'):
break;
case gunichar(L'['):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&#91;";
break;
case gunichar(L']'):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&#93;";
break;
default:
@@ -1377,7 +1377,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (english_p) {
if (!in_url_p) {
pre_break_p = true;
- post_break_p = since_start == 0 ||
+ post_break_p = since_start == 0 ||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
}
} else if (latin_p) {
@@ -1386,23 +1386,23 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else {
post_break_p = pre_break_p = !in_url_p;
}
- if (escape_p)
+ if (escape_p)
substitute_p = L"&apos;";
- else
+ else
curr_uch = gunichar(L'\'');
break;
case gunichar(L'|'):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&#124;";
post_break_p = pre_break_p = true;
break;
case gunichar(L'<'):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&lt;";
post_break_p = pre_break_p = true;
break;
case gunichar(L'>'):
- if (escape_p)
+ if (escape_p)
substitute_p = L"&gt;";
post_break_p = pre_break_p = true;
break;
@@ -1414,7 +1414,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
case gunichar(L'='):
case gunichar(L'~'):
in_num_p = false;
- post_break_p = pre_break_p = !in_url_p;
+ post_break_p = pre_break_p = !in_url_p;
break;
case gunichar(L'+'):
post_break_p = pre_break_p = !in_url_p;
@@ -1444,12 +1444,12 @@ Tokenizer::quik_tokenize(const std::string& buf)
curr_uch = gunichar(L' ');
} else if (curr_uch < gunichar(L' ')) {
curr_uch = gunichar(L' ');
- } else if (curr_uch == gunichar(L'\u0092') &&
+ } else if (curr_uch == gunichar(L'\u0092') &&
(next_type == G_UNICODE_LOWERCASE_LETTER || next_type == G_UNICODE_UPPERCASE_LETTER)) {
// observed corpus corruption case
if (english_p) {
pre_break_p = true;
- post_break_p = since_start == 0 ||
+ post_break_p = since_start == 0 ||
(next_type != G_UNICODE_LOWERCASE_LETTER && next_type != G_UNICODE_UPPERCASE_LETTER && next_type != G_UNICODE_DECIMAL_NUMBER);
} else if (latin_p) {
post_break_p = true;
@@ -1457,9 +1457,9 @@ Tokenizer::quik_tokenize(const std::string& buf)
} else {
post_break_p = pre_break_p = true;
}
- if (escape_p)
+ if (escape_p)
substitute_p = L"&apos;";
- else
+ else
curr_uch = gunichar(L'\'');
} else {
post_break_p = pre_break_p = true;
@@ -1491,7 +1491,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
in_url_p = in_num_p = false;
break;
}
-
+
if (pre_break_p || curr_uch == gunichar(L' ') || (bad_length && curr_type != G_UNICODE_UNASSIGNED)) {
if (since_start) {
// non-empty token emitted previously, so pre-break must emit token separator
@@ -1501,8 +1501,8 @@ Tokenizer::quik_tokenize(const std::string& buf)
if (curr_uch == gunichar(L' '))
// suppress emission below, fall-through to substitute logic
curr_uch = 0;
- }
-
+ }
+
if (substitute_p) {
for (gunichar *sptr = (gunichar *)substitute_p; *sptr; ++sptr) {
*uptr++ = *sptr;
@@ -1521,7 +1521,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
glong nbytes = 0;
gchar *utf8 = g_ucs4_to_utf8(ubuf,uptr-ubuf,0,&nbytes,0); // g_free
- if (utf8[nbytes-1] == ' ')
+ if (utf8[nbytes-1] == ' ')
--nbytes;
text.assign((const char *)utf8,(const char *)(utf8 + nbytes));
g_free(utf8);
@@ -1552,7 +1552,7 @@ Tokenizer::quik_tokenize(const std::string& buf)
}
-std::size_t
+std::size_t
Tokenizer::tokenize(std::istream& is, std::ostream& os)
{
std::size_t line_no = 0;
@@ -1561,10 +1561,10 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
std::vector< std::vector< std::string > > results(nthreads);
std::vector< boost::thread > workers(nthreads);
bool done_p = !(is.good() && os.good());
-
+
for (std::size_t tranche = 0; !done_p; ++tranche) {
-
+
// for loop starting threads for chunks of input
for (std::size_t ithread = 0; ithread < nthreads; ++ithread) {
@@ -1589,19 +1589,19 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
results[ithread].resize(line_pos);
break;
}
- lines[ithread][line_pos].clear();
- } else if (skip_xml_p &&
- (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
- lines[ithread][line_pos].clear();
+ lines[ithread][line_pos].clear();
+ } else if (skip_xml_p &&
+ (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
+ lines[ithread][line_pos].clear();
} else {
- lines[ithread][line_pos] =
- std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
+ lines[ithread][line_pos] =
+ std::string(SPC_BYTE).append(istr).append(SPC_BYTE);
}
- }
+ }
if (line_pos) {
- workers[ithread] =
- boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
+ workers[ithread] =
+ boost::thread(VectorTokenizerCallable(this,lines[ithread],results[ithread]));
}
} // end for loop starting threads
@@ -1616,22 +1616,22 @@ Tokenizer::tokenize(std::istream& is, std::ostream& os)
if (nlin != nres) {
std::ostringstream emsg;
- emsg << "Tranche " << tranche
- << " worker " << ithread << "/" << nthreads
+ emsg << "Tranche " << tranche
+ << " worker " << ithread << "/" << nthreads
<< " |lines|==" << nlin << " != |results|==" << nres;
throw std::runtime_error(emsg.str());
}
- for (std::size_t ires = 0; ires < nres; ++ires)
+ for (std::size_t ires = 0; ires < nres; ++ires)
os << results[ithread][ires] << std::endl;
} // end loop over joined results
-
+
if (verbose_p) {
std::cerr << line_no << ' ';
std::cerr.flush();
}
-
+
} // end loop over chunks
return line_no;
@@ -1642,18 +1642,18 @@ std::string
Tokenizer::detokenize(const std::string& buf)
{
std::vector<std::string> words = split(trim(buf));
-
+
std::size_t squotes = 0;
std::size_t dquotes = 0;
std::string prepends("");
std::ostringstream oss;
-
+
std::size_t nwords = words.size();
std::size_t iword = 0;
- if (unescape_p)
- for (auto &word: words)
+ if (unescape_p)
+ for (auto &word: words)
unescape(word);
for (auto &word: words) {
@@ -1665,13 +1665,13 @@ Tokenizer::detokenize(const std::string& buf)
} else if (RE2::FullMatch(word,left_x)) {
oss << word;
prepends = SPC_BYTE;
- } else if (english_p && iword
- && RE2::FullMatch(word,curr_en_x)
+ } else if (english_p && iword
+ && RE2::FullMatch(word,curr_en_x)
&& RE2::FullMatch(words[iword-1],pre_en_x)) {
oss << word;
prepends = SPC_BYTE;
- } else if (latin_p && iword < nwords - 2
- && RE2::FullMatch(word,curr_fr_x)
+ } else if (latin_p && iword < nwords - 2
+ && RE2::FullMatch(word,curr_fr_x)
&& RE2::FullMatch(words[iword+1],post_fr_x)) {
oss << prepends << word;
prepends.clear();
@@ -1679,7 +1679,7 @@ Tokenizer::detokenize(const std::string& buf)
if ((word.at(0) == '\'' && ((squotes % 2) == 0 )) ||
(word.at(0) == '"' && ((dquotes % 2) == 0))) {
if (english_p && iword
- && word.at(0) == '\''
+ && word.at(0) == '\''
&& std::tolower(words[iword-1].at(words[iword-1].size()-1)) == 's') {
oss << word;
prepends = SPC_BYTE;
@@ -1698,7 +1698,7 @@ Tokenizer::detokenize(const std::string& buf)
prepends = SPC_BYTE;
if (word.at(0) == '\'')
squotes++;
- else if (word.at(0) == '"')
+ else if (word.at(0) == '"')
dquotes++;
}
} else {
@@ -1707,8 +1707,8 @@ Tokenizer::detokenize(const std::string& buf)
}
iword++;
}
-
-
+
+
std::string text(oss.str());
RE2::GlobalReplace(&text," +",SPC_BYTE);
RE2::GlobalReplace(&text,"\n ","\n");
@@ -1718,14 +1718,14 @@ Tokenizer::detokenize(const std::string& buf)
std::size_t
-Tokenizer::detokenize(std::istream& is, std::ostream& os)
+Tokenizer::detokenize(std::istream& is, std::ostream& os)
{
size_t line_no = 0;
while (is.good() && os.good()) {
std::string istr;
std::getline(is,istr);
line_no ++;
- if (istr.empty())
+ if (istr.empty())
continue;
if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x))) {
os << istr << std::endl;
@@ -1749,7 +1749,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
return parts;
}
gunichar *uout = (gunichar *)g_malloc0(2*ncp*sizeof(gunichar));
-
+
const wchar_t GENL_HYPH = L'\u2010';
const wchar_t IDEO_STOP = L'\u3002';
const wchar_t KANA_MDOT = L'\u30FB';
@@ -1786,7 +1786,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
std::vector<std::size_t> breaks;
std::set<std::size_t> suppress;
-
+
for (; icp <= ncp; ++icp) {
currwc = wchar_t(ucs4[icp]);
curr_type = g_unichar_type(currwc);
@@ -1798,7 +1798,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
case G_UNICODE_OTHER_NUMBER:
curr_class = numba;
curr_word_p = true;
- break;
+ break;
case G_UNICODE_LOWERCASE_LETTER:
case G_UNICODE_MODIFIER_LETTER:
case G_UNICODE_OTHER_LETTER:
@@ -1822,7 +1822,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else if (currwc >= SMAL_HYPH) {
curr_word_p = true;
} else {
- curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
+ curr_word_p = (currwc >= WAVE_DASH) && (currwc <= KANA_DHYP);
}
break;
case G_UNICODE_CLOSE_PUNCTUATION:
@@ -1860,7 +1860,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
curr_word_p = false;
break;
}
-
+
// # condition for prefix test
// $words[$i] =~ /([\p{IsAlnum}\.\-]*)([\'\"\)\]\%\p{IsPf}]*)(\.+)$/
// $words[$i+1] =~ /^([ ]*[\'\"\(\[\¿\¡\p{IsPi}]*[ ]*[\p{IsUpper}0-9])/
@@ -1875,7 +1875,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else if (curr_word_p) {
if (!fini_word) {
init_word = ocp;
- }
+ }
fini_word = ocp+1;
dotslen = finilen = 0;
} else if (curr_class >= quote && curr_class <= pfpct && curr_class != pinit) {
@@ -1893,7 +1893,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
} else {
init_word = fini_word = 0;
}
-
+
if (check_abbr_p) {
// not a valid word character or post-word punctuation character: check word
std::wstring k((wchar_t *)uout+init_word,fini_word-init_word);
@@ -1986,7 +1986,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
}
init_word = fini_word = 0;
}
-
+
if (seqpos >= SEQ_LIM) {
seqpos = 0;
}
@@ -2015,7 +2015,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
continue;
}
}
-
+
if (!seqpos) {
if (curr_class != blank) {
uout[ocp++] = gunichar(currwc);
@@ -2024,7 +2024,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
}
continue;
}
-
+
if (curr_class == blank) {
if (prev_class != blank) {
seq[seqpos] = blank;
@@ -2034,7 +2034,7 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
}
if (icp < ncp)
continue;
- }
+ }
if (curr_class >= quote && curr_class <= pfini) {
if (prev_class < quote || prev_class > pfini) {
@@ -2158,8 +2158,8 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
if (uout[chkpos] == L'\n' || uout[chkpos] == L' ') {
endpos = chkpos;
continue;
- }
- if (g_unichar_isgraph(uout[chkpos]))
+ }
+ if (g_unichar_isgraph(uout[chkpos]))
break;
endpos = chkpos;
}
@@ -2171,17 +2171,17 @@ Tokenizer::splitter(const std::string &istr, bool *continuation_ptr) {
if (continuation_ptr)
*continuation_ptr = endpos > iop;
iop = nextpos;
- }
-
+ }
+
g_free(uout);
g_free(ucs4);
-
+
return parts;
}
std::pair<std::size_t,std::size_t>
-Tokenizer::splitter(std::istream& is, std::ostream& os)
+Tokenizer::splitter(std::istream& is, std::ostream& os)
{
std::pair<std::size_t,std::size_t> counts = { 0, 0 };
bool continuation_p = false;
@@ -2197,7 +2197,7 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
if (istr.empty() && (is.eof() ||!para_marks_p))
continue;
- if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
+ if (skip_xml_p && (RE2::FullMatch(istr,tag_line_x) || RE2::FullMatch(istr,white_line_x)))
continue;
std::vector<std::string> sentences(splitter(istr,&continuation_p));
@@ -2221,13 +2221,13 @@ Tokenizer::splitter(std::istream& is, std::ostream& os)
os << " ";
pending_gap = false;
}
-
- for (std::size_t ii = 0; ii < nsents-1; ++ii)
+
+ for (std::size_t ii = 0; ii < nsents-1; ++ii)
os << sentences[ii] << std::endl;
-
+
os << sentences[nsents-1];
- if (continuation_p)
+ if (continuation_p)
pending_gap = !split_breaks_p;
if (!pending_gap)
os << std::endl;
diff --git a/contrib/c++tokenizer/tokenizer.h b/contrib/c++tokenizer/tokenizer.h
index cc1de2770..978f20197 100644
--- a/contrib/c++tokenizer/tokenizer.h
+++ b/contrib/c++tokenizer/tokenizer.h
@@ -26,7 +26,7 @@ class Tokenizer {
private:
- typedef enum {
+ typedef enum {
empty = 0,
blank,
upper, // upper case
@@ -56,7 +56,7 @@ private:
// non-breaking prefixes (other) ucs4
std::set<std::wstring> nbpre_gen_ucs4;
- // compiled protected patterns
+ // compiled protected patterns
std::vector<re2::RE2 *> prot_pat_vec;
protected:
@@ -96,10 +96,10 @@ protected:
Tokenizer *tokenizer;
std::vector<std::string>& in;
std::vector<std::string>& out;
-
- VectorTokenizerCallable(Tokenizer *_tokenizer,
- std::vector<std::string>& _in,
- std::vector<std::string>& _out)
+
+ VectorTokenizerCallable(Tokenizer *_tokenizer,
+ std::vector<std::string>& _in,
+ std::vector<std::string>& _out)
: tokenizer(_tokenizer)
, in(_in)
, out(_out) {
@@ -107,10 +107,10 @@ protected:
void operator()() {
out.resize(in.size());
- for (std::size_t ii = 0; ii < in.size(); ++ii)
+ for (std::size_t ii = 0; ii < in.size(); ++ii)
if (in[ii].empty())
out[ii] = in[ii];
- else if (tokenizer->penn_p)
+ else if (tokenizer->penn_p)
out[ii] = tokenizer->penn_tokenize(in[ii]);
else
out[ii] = tokenizer->quik_tokenize(in[ii]);
diff --git a/contrib/c++tokenizer/tokenizer_main.cpp b/contrib/c++tokenizer/tokenizer_main.cpp
index 7adb599e7..358a68cc3 100644
--- a/contrib/c++tokenizer/tokenizer_main.cpp
+++ b/contrib/c++tokenizer/tokenizer_main.cpp
@@ -10,8 +10,8 @@ using namespace TOKENIZER_NAMESPACE ;
#endif
-void
-usage(const char *path)
+void
+usage(const char *path)
{
std::cerr << "Usage: " << path << "[-{v|x|p|a|e|s|u|n|N]* [LL] [-{c|o} PATH]* INFILE*" << std::endl;
std::cerr << " -a -- aggressive hyphenization" << std::endl;
@@ -89,7 +89,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
int nlines = 0;
std::string line;
while (ifs.good() && std::getline(ifs,line)) {
- if (line.empty())
+ if (line.empty())
continue;
std::vector<std::string> tokens(tize.tokens(line));
int count = 0;
@@ -127,7 +127,7 @@ copy_words(Tokenizer& tize, std::istream& ifs, std::ostream& ofs) {
}
-int main(int ac, char **av)
+int main(int ac, char **av)
{
int rc = 0;
Parameters params;
@@ -140,7 +140,7 @@ int main(int ac, char **av)
if (!detokenize_p)
params.split_p = std::strstr(av[0],"splitter") != 0;
- while (++av,--ac) {
+ while (++av,--ac) {
if (**av == '-') {
switch (av[0][1]) {
case 'a':
@@ -244,7 +244,7 @@ int main(int ac, char **av)
if (comma) {
*comma++ = 0;
params.chunksize = std::strtoul(comma,0,0);
- }
+ }
params.nthreads = std::strtoul(*av,0,0);
} else {
params.args.push_back(std::string(*av));
@@ -275,7 +275,7 @@ int main(int ac, char **av)
cfg_mos_str.append("/moses");
if (!::access(cfg_mos_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_mos_str.c_str());
- } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
+ } else if (!::access(cfg_shr_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_shr_str.c_str());
} else if (!::access(cfg_dir_str.c_str(),X_OK)) {
params.cfg_path = strdup(cfg_dir_str.c_str());
@@ -287,7 +287,7 @@ int main(int ac, char **av)
if (params.verbose_p) {
std::cerr << "config path: " << params.cfg_path << std::endl;
}
- }
+ }
std::unique_ptr<std::ofstream> pofs = 0;
if (!params.out_path.empty()) {
@@ -345,7 +345,7 @@ int main(int ac, char **av)
if (plines.second) {
std::cerr << "%%% " << plines.second << " sentences." << std::endl;
}
- }
+ }
return rc;
}