# -*- coding: utf-8 -*-
"""
    sphinx.search.zh
    ~~~~~~~~~~~~~~~~

    Chinese search language: includes routine to split words.

    :copyright: Copyright 2007-2018 by the Sphinx team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import os
import re

from sphinx.search import SearchLanguage
from sphinx.util.stemmer import get_stemmer

try:
    import jieba
    JIEBA = True
except ImportError:
    JIEBA = False

if False:
    # For type annotation
    from typing import Dict, List  # NOQA

english_stopwords = set(u"""
a  and  are  as  at
be  but  by
for
if  in  into  is  it
near  no  not
of  on  or
such
that  the  their  then  there  these  they  this  to
was  will  with
""".split())

js_porter_stemmer = """
/**
 * Porter Stemmer
 */
var Stemmer = function() {

  var step2list = {
    ational: 'ate',
    tional: 'tion',
    enci: 'ence',
    anci: 'ance',
    izer: 'ize',
    bli: 'ble',
    alli: 'al',
    entli: 'ent',
    eli: 'e',
    ousli: 'ous',
    ization: 'ize',
    ation: 'ate',
    ator: 'ate',
    alism: 'al',
    iveness: 'ive',
    fulness: 'ful',
    ousness: 'ous',
    aliti: 'al',
    iviti: 'ive',
    biliti: 'ble',
    logi: 'log'
  };

  var step3list = {
    icate: 'ic',
    ative: '',
    alize: 'al',
    iciti: 'ic',
    ical: 'ic',
    ful: '',
    ness: ''
  };

  var c = "[^aeiou]";          // consonant
  var v = "[aeiouy]";          // vowel
  var C = c + "[^aeiouy]*";    // consonant sequence
  var V = v + "[aeiou]*";      // vowel sequence

  var mgr0 = "^(" + C + ")?" + V + C;                      // [C]VC... is m>0
  var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$";    // [C]VC[V] is m=1
  var mgr1 = "^(" + C + ")?" + V + C + V + C;              // [C]VCVC... is m>1
  var s_v  = "^(" + C + ")?" + v;                          // vowel in stem

  this.stemWord = function (w) {
    var stem;
    var suffix;
    var firstch;
    var origword = w;

    if (w.length < 3)
      return w;

    var re;
    var re2;
    var re3;
    var re4;

    firstch = w.substr(0,1);
    if (firstch == "y")
      w = firstch.toUpperCase() + w.substr(1);

    // Step 1a
    re = /^(.+?)(ss|i)es$/;
    re2 = /^(.+?)([^s])s$/;

    if (re.test(w))
      w = w.replace(re,"$1$2");
    else if (re2.test(w))
      w = w.replace(re2,"$1$2");

    // Step 1b
    re = /^(.+?)eed$/;
    re2 = /^(.+?)(ed|ing)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      re = new RegExp(mgr0);
      if (re.test(fp[1])) {
        re = /.$/;
        w = w.replace(re,"");
      }
    }
    else if (re2.test(w)) {
      var fp = re2.exec(w);
      stem = fp[1];
      re2 = new RegExp(s_v);
      if (re2.test(stem)) {
        w = stem;
        re2 = /(at|bl|iz)$/;
        re3 = new RegExp("([^aeiouylsz])\\\\1$");
        re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
        if (re2.test(w))
          w = w + "e";
        else if (re3.test(w)) {
          re = /.$/;
          w = w.replace(re,"");
        }
        else if (re4.test(w))
          w = w + "e";
      }
    }

    // Step 1c
    re = /^(.+?)y$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = new RegExp(s_v);
      if (re.test(stem))
        w = stem + "i";
    }

    // Step 2
    re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      suffix = fp[2];
      re = new RegExp(mgr0);
      if (re.test(stem))
        w = stem + step2list[suffix];
    }

    // Step 3
    re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      suffix = fp[2];
      re = new RegExp(mgr0);
      if (re.test(stem))
        w = stem + step3list[suffix];
    }

    // Step 4
    re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\
iti|ous|ive|ize)$/;
    re2 = /^(.+?)(s|t)(ion)$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = new RegExp(mgr1);
      if (re.test(stem))
        w = stem;
    }
    else if (re2.test(w)) {
      var fp = re2.exec(w);
      stem = fp[1] + fp[2];
      re2 = new RegExp(mgr1);
      if (re2.test(stem))
        w = stem;
    }

    // Step 5
    re = /^(.+?)e$/;
    if (re.test(w)) {
      var fp = re.exec(w);
      stem = fp[1];
      re = new RegExp(mgr1);
      re2 = new RegExp(meq1);
      re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
      if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
        w = stem;
    }
    re = /ll$/;
    re2 = new RegExp(mgr1);
    if (re.test(w) && re2.test(w)) {
      re = /.$/;
      w = w.replace(re,"");
    }

    // and turn initial Y back to y
    if (firstch == "y")
      w = firstch.toLowerCase() + w.substr(1);

    return w;
  }
}
"""


class SearchChinese(SearchLanguage):
    """
    Chinese search implementation
    """

    lang = 'zh'
    language_name = 'Chinese'
    js_stemmer_code = js_porter_stemmer
    stopwords = english_stopwords
    # Matches runs of word characters whose last character falls in the
    # Latin-1 range; used to pull Latin-script words out of mixed input.
    latin1_letters = re.compile(u'(?u)\\w+[\u0000-\u00ff]')

    def init(self, options):
        # type: (Dict) -> None
        if JIEBA:
            # A custom jieba dictionary can be supplied through the 'dict'
            # key of the search options; it is loaded only if it exists.
            dict_path = options.get('dict')
            if dict_path and os.path.isfile(dict_path):
                jieba.load_userdict(dict_path)

        self.stemmer = get_stemmer()

    def split(self, input):
        # type: (unicode) -> List[unicode]
        chinese = []  # type: List[unicode]
        if JIEBA:
            # Segment with jieba's search-engine mode, which also emits
            # shorter sub-words of long words for better recall.
            chinese = list(jieba.cut_for_search(input))

        latin1 = self.latin1_letters.findall(input)
        return chinese + latin1

    def word_filter(self, stemmed_word):
        # type: (unicode) -> bool
        # Single-character tokens are too noisy to index.
        return len(stemmed_word) > 1

    def stem(self, word):
        # type: (unicode) -> unicode
        # Lowercase before stemming so Latin words normalize consistently;
        # Chinese tokens are left effectively unchanged by the stemmer.
        return self.stemmer.stem(word.lower())
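
# ---------------------------------------------------------------------------
# Configuration sketch: the ``options`` dict received by ``init`` above is
# populated from ``html_search_options`` in a project's conf.py.  To load a
# custom jieba dictionary (the path below is a hypothetical example):
#
#     html_search_options = {'dict': '/path/to/jieba_userdict.txt'}
# ---------------------------------------------------------------------------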
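
if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the module proper: it exercises
    # split() and stem() the way the Sphinx indexer would.  Assumes jieba is
    # installed; without it, only the Latin-script words are returned.
    lang = SearchChinese({})
    for token in lang.split(u'Sphinx 是一个文档生成工具 documentation tool'):
        print(token)
    print(lang.stem(u'Searching'))  # expected: u'search' via the stemmer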