diff options
author | Alex Zolotarev <alex@maps.me> | 2016-04-04 17:15:45 +0300 |
---|---|---|
committer | Alex Zolotarev <alex@maps.me> | 2016-04-05 13:55:12 +0300 |
commit | baf4a0c0cd07bea2472e9934970a0f16778898cd (patch) | |
tree | 53d772228bb20bf9f20110d2381679c2cd5da85c /base | |
parent | 32a50c6a59b012024cc59fb71a7bcad5fe360f43 (diff) |
Speed improvement from Yury Gorshenin.
Diffstat (limited to 'base')
-rw-r--r-- | base/string_utils.cpp | 39 |
1 files changed, 31 insertions, 8 deletions
```diff
diff --git a/base/string_utils.cpp b/base/string_utils.cpp
index 6e643d18f2..857c190c9b 100644
--- a/base/string_utils.cpp
+++ b/base/string_utils.cpp
@@ -113,19 +113,42 @@ UniString Normalize(UniString const & s)
   return result;
 }
 
-void NormalizeDigits(string & utf8)
-{
-  for (size_t i = 0; i + 2 < utf8.size(); ++i)
+void NormalizeDigits(string &utf8) {
+  size_t const n = utf8.size();
+  size_t const m = n >= 2 ? n - 2 : 0;
+
+  size_t i = 0;
+  while (i < n && utf8[i] != '\xEF')
+    ++i;
+  size_t j = i;
+
+  // Following invariant holds before/between/after loop iterations below:
+  // * utf8[0, i) represents a checked part of the input string.
+  // * utf8[0, j) represents a normalized version of the utf8[0, i).
+  while (i < m)
   {
     if (utf8[i] == '\xEF' && utf8[i + 1] == '\xBC')
     {
-      char const n = utf8[i + 2];
-      if (n < '\x90' || n > '\x99')
-        continue;
-      utf8[i] = n - 0x90 + '0';
-      utf8.erase(i + 1, 2);
+      auto const n = utf8[i + 2];
+      if (n >= '\x90' && n <= '\x99')
+      {
+        utf8[j++] = n - 0x90 + '0';
+        i += 3;
+      }
+      else
+      {
+        utf8[j++] = utf8[i++];
+        utf8[j++] = utf8[i++];
+      }
+    }
+    else
+    {
+      utf8[j++] = utf8[i++];
     }
   }
+  while (i < n)
+    utf8[j++] = utf8[i++];
+  utf8.resize(j);
 }
 
 namespace
```