Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: akimbal1 <akimball2@bloomberg.net> 2015-02-15 08:23:29 +0300
committer: akimbal1 <akimball2@bloomberg.net> 2015-02-15 08:23:29 +0300
commit: eff60db207991dbadbb7a653a0a0dff7cb65b158 (patch)
tree: d3be99ed9616f666f9456c728c43cdf375fb8875
parent: f307e56273065e93cb798698dbc7434acf92adbc (diff)
stop treating dash like hyphen
-rw-r--r-- contrib/c++tokenizer/tokenizer.cpp 6
1 file changed, 6 insertions, 0 deletions
diff --git a/contrib/c++tokenizer/tokenizer.cpp b/contrib/c++tokenizer/tokenizer.cpp
index 04a362757..f9c32389e 100644
--- a/contrib/c++tokenizer/tokenizer.cpp
+++ b/contrib/c++tokenizer/tokenizer.cpp
@@ -582,6 +582,12 @@ Tokenizer::tokenize(const std::string& buf)
if (aggressive_hyphen_p && !in_url_p) {
substitute_p = L"@-@";
post_break_p = pre_break_p = true;
+ } else if ( ( curr_uch > gunichar(L'\u002D') && curr_uch < gunichar(L'\u2010') ) ||
+ ( curr_uch > gunichar(L'\u2011')
+ && curr_uch != gunichar(L'\u30A0')
+ && curr_uch < gunichar(L'\uFE63') ) ) {
+ // dash, not a hyphen
+ post_break_p = pre_break_p = true;
} else if (next_type == G_UNICODE_SPACE_SEPARATOR) {
} else {
if (prev_type == curr_type) {