Merge pull request #7 from rjai/mjd/casing mjd/casing

Fix regex_error exceptions in regex_search on VSC compiled binaries
author: Rohit Jain <rjai@microsoft.com> 2021-05-25 19:10:31 +0300
committer: GitHub <noreply@github.com> 2021-05-25 19:10:31 +0300
commit: 658030d285654a17e7fe7eee47659d1954ab7fef (patch)
tree: c88be22433fb7da0d02b7a3d39d4e2c3b621a761
parent: 3c93260124d39613e31d63443dbb594197e69607 (diff)
parent: 8eccf99b372b6dd3452a30b4ea00d8aaeeda2e29 (diff)
1 files changed, 36 insertions, 24 deletions
diff --git a/src/case_encoder.h b/src/case_encoder.h
index d73f92a..f5c8845 100644
--- a/src/case_encoder.h
+++ b/src/case_encoder.h
@@ -15,6 +15,13 @@
 #ifndef NORMALIZER_CASE_ENCODER_H_
 #define NORMALIZER_CASE_ENCODER_H_
 
+// Reduce likelihood of regex_error exceptions on VSC compiled bins by increasing
+// library specific stack size and complexity limits
+#ifdef _MSC_VER
+#define _REGEX_MAX_STACK_COUNT 200000
+#define _REGEX_MAX_COMPLEXITY_COUNT 0
+#endif
+
 #include <memory>
 #include <set>
 #include <string>
@@ -179,34 +186,39 @@ public:
     auto nrm_it = normalized->cbegin();
     auto n2o_it = norm_to_orig->cbegin();
 
-    while(std::regex_search(sig_it, signature_.cend(), m, e)) {
-      auto span = m[0];
-      size_t len = std::distance(sig_it, span.first);
-      normalized_temp.insert(normalized_temp.end(), nrm_it, nrm_it + len);
-      norm_to_orig_temp.insert(norm_to_orig_temp.end(), n2o_it, n2o_it + len);
-
-      sig_it += len; 
-      nrm_it += len;
-      n2o_it += len;
-      normalized_temp.push_back(cAllUppercase);
-      norm_to_orig_temp.push_back(*n2o_it);
+    try {
+      while(std::regex_search(sig_it, signature_.cend(), m, e)) {
+        auto span = m[0];
+        size_t len = std::distance(sig_it, span.first);
+        normalized_temp.insert(normalized_temp.end(), nrm_it, nrm_it + len);
+        norm_to_orig_temp.insert(norm_to_orig_temp.end(), n2o_it, n2o_it + len);
+
+        sig_it += len; 
+        nrm_it += len;
+        n2o_it += len;
+        normalized_temp.push_back(cAllUppercase);
+        norm_to_orig_temp.push_back(*n2o_it);
             
-      while(sig_it != span.second) {
-        if(*sig_it == cUppercase) {
-          sig_it++; 
-          nrm_it++;
-          n2o_it++;
+        while(sig_it != span.second) {
+          if(*sig_it == cUppercase) {
+            sig_it++; 
+            nrm_it++;
+            n2o_it++;
+          }
+          sig_it++;
+          normalized_temp.push_back(*nrm_it++);
+          norm_to_orig_temp.push_back(*n2o_it++);
         }
-        sig_it++;
-        normalized_temp.push_back(*nrm_it++);
-        norm_to_orig_temp.push_back(*n2o_it++);
-      }
-      if(sig_it != signature_.cend()) { 
-        if(*sig_it != cUppercase) {
-          normalized_temp.push_back(cLowercase);
-          norm_to_orig_temp.push_back(*n2o_it);
+        if(sig_it != signature_.cend()) { 
+          if(*sig_it != cUppercase) {
+            normalized_temp.push_back(cLowercase);
+            norm_to_orig_temp.push_back(*n2o_it);
+          }
         }
       }
+    } catch (std::regex_error&) {
+        LOG(WARNING) << "regex_error with unicode case encoding; rejecting sentence";
+        return;
     }
 
     if(nrm_it != normalized->cend())
author	Rohit Jain <rjai@microsoft.com>	2021-05-25 19:10:31 +0300
committer	GitHub <noreply@github.com>	2021-05-25 19:10:31 +0300
commit	658030d285654a17e7fe7eee47659d1954ab7fef (patch)
tree	c88be22433fb7da0d02b7a3d39d4e2c3b621a761
parent	3c93260124d39613e31d63443dbb594197e69607 (diff)
parent	8eccf99b372b6dd3452a30b4ea00d8aaeeda2e29 (diff)