#5605 fix Chinese search index (#5611)

Generate the search index for Latin words correctly when the search language is Chinese.
author: Timotheus Kampik <timotheus.kampik@gmail.com> 2018-12-25 20:41:52 +0300
committer: GitHub <noreply@github.com> 2018-12-25 20:41:52 +0300
commit: 221614654fed1ce7e07e6bb4e242e8c60c4caa67 (patch)
tree: c98a3930e86b22913d1bfef1b4ad0c3029dc4e12 /sphinx/search
parent: 920aafaee60413f4e536f3168f3122779df01683 (diff)
1 files changed, 16 insertions, 2 deletions
diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py
index 6c5b65d6b..3753bc990 100644
--- a/sphinx/search/zh.py
+++ b/sphinx/search/zh.py
@@ -233,7 +233,8 @@ class SearchChinese(SearchLanguage):
     language_name = 'Chinese'
     js_stemmer_code = js_porter_stemmer
     stopwords = english_stopwords
-    latin1_letters = re.compile(u'(?u)\\w+[\u0000-\u00ff]')
+    latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
+    latin_terms = []  # type: List[unicode]
 
     def init(self, options):
         # type: (Dict) -> None
@@ -250,7 +251,9 @@ class SearchChinese(SearchLanguage):
         if JIEBA:
             chinese = list(jieba.cut_for_search(input))
 
-        latin1 = self.latin1_letters.findall(input)
+        latin1 = \
+            [term.strip() for term in self.latin1_letters.findall(input)]
+        self.latin_terms.extend(latin1)
         return chinese + latin1
 
     def word_filter(self, stemmed_word):
@@ -259,4 +262,15 @@ class SearchChinese(SearchLanguage):
 
     def stem(self, word):
         # type: (unicode) -> unicode
+
+        # Don't stem Latin words that are long enough to be relevant for search
+        # when left unstemmed but would become too short once stemmed.
+        # This avoids some issues with acronyms.
+        should_not_be_stemmed = (
+            word in self.latin_terms and
+            len(word) >= 3 and
+            len(self.stemmer.stem(word.lower())) < 3
+        )
+        if should_not_be_stemmed:
+            return word.lower()
         return self.stemmer.stem(word.lower())
author	Timotheus Kampik <timotheus.kampik@gmail.com>	2018-12-25 20:41:52 +0300
committer	GitHub <noreply@github.com>	2018-12-25 20:41:52 +0300
commit	221614654fed1ce7e07e6bb4e242e8c60c4caa67 (patch)
tree	c98a3930e86b22913d1bfef1b4ad0c3029dc4e12 /sphinx/search
parent	920aafaee60413f4e536f3168f3122779df01683 (diff)