diff options
author | Timotheus Kampik <timotheus.kampik@gmail.com> | 2018-12-25 20:41:52 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-12-25 20:41:52 +0300 |
commit | 221614654fed1ce7e07e6bb4e242e8c60c4caa67 (patch) | |
tree | c98a3930e86b22913d1bfef1b4ad0c3029dc4e12 /sphinx/search | |
parent | 920aafaee60413f4e536f3168f3122779df01683 (diff) |
#5605 fix Chinese search index (#5611)
generate search index for Latin words correctly if search language is Chinese
Diffstat (limited to 'sphinx/search')
-rw-r--r-- | sphinx/search/zh.py | 18 |
1 files changed, 16 insertions, 2 deletions
diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py
index 6c5b65d6b..3753bc990 100644
--- a/sphinx/search/zh.py
+++ b/sphinx/search/zh.py
@@ -233,7 +233,8 @@ class SearchChinese(SearchLanguage):
     language_name = 'Chinese'
     js_stemmer_code = js_porter_stemmer
     stopwords = english_stopwords
-    latin1_letters = re.compile(u'(?u)\\w+[\u0000-\u00ff]')
+    latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
+    latin_terms = []  # type: List[unicode]
 
     def init(self, options):
         # type: (Dict) -> None
@@ -250,7 +251,9 @@ class SearchChinese(SearchLanguage):
         if JIEBA:
             chinese = list(jieba.cut_for_search(input))
 
-        latin1 = self.latin1_letters.findall(input)
+        latin1 = \
+            [term.strip() for term in self.latin1_letters.findall(input)]
+        self.latin_terms.extend(latin1)
         return chinese + latin1
 
     def word_filter(self, stemmed_word):
@@ -259,4 +262,15 @@ class SearchChinese(SearchLanguage):
 
     def stem(self, word):
         # type: (unicode) -> unicode
+
+        # Don't stem Latin words that are long enough to be relevant for search
+        # if not stemmed, but would be too short after being stemmed
+        # avoids some issues with acronyms
+        should_not_be_stemmed = (
+            word in self.latin_terms and
+            len(word) >= 3 and
+            len(self.stemmer.stem(word.lower())) < 3
+        )
+        if should_not_be_stemmed:
+            return word.lower()
         return self.stemmer.stem(word.lower())