diff options
author | Rohit Jain <rjai@microsoft.com> | 2021-05-25 19:10:31 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-05-25 19:10:31 +0300 |
commit | 658030d285654a17e7fe7eee47659d1954ab7fef (patch) | |
tree | c88be22433fb7da0d02b7a3d39d4e2c3b621a761 | |
parent | 3c93260124d39613e31d63443dbb594197e69607 (diff) | |
parent | 8eccf99b372b6dd3452a30b4ea00d8aaeeda2e29 (diff) |
Merge pull request #7 from rjai/mjd/casing
Fix regex_error exceptions in regex_search on VSC compiled binaries
-rw-r--r-- | src/case_encoder.h | 60 |
1 files changed, 36 insertions, 24 deletions
diff --git a/src/case_encoder.h b/src/case_encoder.h index d73f92a..f5c8845 100644 --- a/src/case_encoder.h +++ b/src/case_encoder.h @@ -15,6 +15,13 @@ #ifndef NORMALIZER_CASE_ENCODER_H_ #define NORMALIZER_CASE_ENCODER_H_ +// Reduce liklihood of regex_error exceptions on VSC compiled bins by increasing +// library specific stack size and complexity limits +#ifdef _MSC_VER +#define _REGEX_MAX_STACK_COUNT 200000 +#define _REGEX_MAX_COMPLEXITY_COUNT 0 +#endif + #include <memory> #include <set> #include <string> @@ -179,34 +186,39 @@ public: auto nrm_it = normalized->cbegin(); auto n2o_it = norm_to_orig->cbegin(); - while(std::regex_search(sig_it, signature_.cend(), m, e)) { - auto span = m[0]; - size_t len = std::distance(sig_it, span.first); - normalized_temp.insert(normalized_temp.end(), nrm_it, nrm_it + len); - norm_to_orig_temp.insert(norm_to_orig_temp.end(), n2o_it, n2o_it + len); - - sig_it += len; - nrm_it += len; - n2o_it += len; - normalized_temp.push_back(cAllUppercase); - norm_to_orig_temp.push_back(*n2o_it); + try { + while(std::regex_search(sig_it, signature_.cend(), m, e)) { + auto span = m[0]; + size_t len = std::distance(sig_it, span.first); + normalized_temp.insert(normalized_temp.end(), nrm_it, nrm_it + len); + norm_to_orig_temp.insert(norm_to_orig_temp.end(), n2o_it, n2o_it + len); + + sig_it += len; + nrm_it += len; + n2o_it += len; + normalized_temp.push_back(cAllUppercase); + norm_to_orig_temp.push_back(*n2o_it); - while(sig_it != span.second) { - if(*sig_it == cUppercase) { - sig_it++; - nrm_it++; - n2o_it++; + while(sig_it != span.second) { + if(*sig_it == cUppercase) { + sig_it++; + nrm_it++; + n2o_it++; + } + sig_it++; + normalized_temp.push_back(*nrm_it++); + norm_to_orig_temp.push_back(*n2o_it++); } - sig_it++; - normalized_temp.push_back(*nrm_it++); - norm_to_orig_temp.push_back(*n2o_it++); - } - if(sig_it != signature_.cend()) { - if(*sig_it != cUppercase) { - normalized_temp.push_back(cLowercase); - norm_to_orig_temp.push_back(*n2o_it); 
+ if(sig_it != signature_.cend()) { + if(*sig_it != cUppercase) { + normalized_temp.push_back(cLowercase); + norm_to_orig_temp.push_back(*n2o_it); + } } } + } catch (std::regex_error&) { + LOG(WARNING) << "regex_error with unicode case encoding; rejecting sentence"; + return; } if(nrm_it != normalized->cend()) |