Speed improvement from Yury Gorshenin.

author: Alex Zolotarev <alex@maps.me> 2016-04-04 17:15:45 +0300
committer: Alex Zolotarev <alex@maps.me> 2016-04-05 13:55:12 +0300
commit: baf4a0c0cd07bea2472e9934970a0f16778898cd (patch)
tree: 53d772228bb20bf9f20110d2381679c2cd5da85c /base
parent: 32a50c6a59b012024cc59fb71a7bcad5fe360f43 (diff)
1 files changed, 31 insertions, 8 deletions
diff --git a/base/string_utils.cpp b/base/string_utils.cpp
index 6e643d18f2..857c190c9b 100644
--- a/base/string_utils.cpp
+++ b/base/string_utils.cpp
@@ -113,19 +113,42 @@ UniString Normalize(UniString const & s)
   return result;
 }
 
-void NormalizeDigits(string & utf8)
-{
-  for (size_t i = 0; i + 2 < utf8.size(); ++i)
+void NormalizeDigits(string &utf8) {
+  size_t const n = utf8.size();
+  size_t const m = n >= 2 ? n - 2 : 0;  // utf8[i + 2] is in range only for i < m.
+
+  size_t i = 0;
+  while (i < n && utf8[i] != '\xEF')  // Bytes before the first 0xEF stay in place.
+    ++i;
+  size_t j = i;
+
+  // Invariants that hold before, between and after the loop iterations below:
+  // * utf8[0, i) is the part of the input string that has already been checked.
+  // * utf8[0, j) is the normalized version of utf8[0, i).
+  while (i < m)
   {
     if (utf8[i] == '\xEF' && utf8[i + 1] == '\xBC')
     {
-      char const n = utf8[i + 2];
-      if (n < '\x90' || n > '\x99')
-        continue;
-      utf8[i] = n - 0x90 + '0';
-      utf8.erase(i + 1, 2);
+      auto const n = utf8[i + 2];  // NOTE: shadows the outer n (string size).
+      if (n >= '\x90' && n <= '\x99')  // EF BC 90..99 is UTF-8 for U+FF10..U+FF19.
+      {
+        utf8[j++] = n - 0x90 + '0';  // Third byte 0x90..0x99 -> '0'..'9'.
+        i += 3;
+      }
+      else
+      {
+        utf8[j++] = utf8[i++];  // Not a digit: keep both matched bytes;
+        utf8[j++] = utf8[i++];  // the third byte is handled by a later step.
+      }
+    }
+    else
+    {
+      utf8[j++] = utf8[i++];
     }
   }
+  while (i < n)  // Copy the remaining tail unchanged.
+    utf8[j++] = utf8[i++];
+  utf8.resize(j);  // Drop the stale bytes past the compacted result.
 }
 
 namespace
author	Alex Zolotarev <alex@maps.me>	2016-04-04 17:15:45 +0300
committer	Alex Zolotarev <alex@maps.me>	2016-04-05 13:55:12 +0300
commit	baf4a0c0cd07bea2472e9934970a0f16778898cd (patch)
tree	53d772228bb20bf9f20110d2381679c2cd5da85c /base
parent	32a50c6a59b012024cc59fb71a7bcad5fe360f43 (diff)