diff options
author | shimizukawa <shimizukawa@gmail.com> | 2016-02-14 13:13:38 +0300 |
---|---|---|
committer | shimizukawa <shimizukawa@gmail.com> | 2016-02-14 13:23:51 +0300 |
commit | 0992ce542bb7364649df9f41787fc5a9d026b1b0 (patch) | |
tree | ebc8af2f75c89306b328f87f696642fe70b13865 /sphinx/search/ja.py | |
parent | e1394b54272c695a5e3d2ff840b97af29a62f592 (diff) |
Closes #1853: support custom text splitter on html search with `language='ja'`.
Diffstat (limited to 'sphinx/search/ja.py')
-rw-r--r-- | sphinx/search/ja.py | 52 |
1 files changed, 40 insertions, 12 deletions
diff --git a/sphinx/search/ja.py b/sphinx/search/ja.py index 125021208..ba6951f39 100644 --- a/sphinx/search/ja.py +++ b/sphinx/search/ja.py @@ -35,12 +35,29 @@ try: except ImportError: janome_module = False -from sphinx.errors import SphinxError +from sphinx.errors import SphinxError, ExtensionError from sphinx.search import SearchLanguage +from sphinx.util import import_object -class MecabBinder(object): +class BaseSplitter(object): + + def __init__(self, options): + self.options = options + + def split(self, input): + """ + + :param str input: + :return: + :rtype: list[str] + """ + raise NotImplementedError + + +class MecabSplitter(BaseSplitter): def __init__(self, options): + super(MecabSplitter, self).__init__(options) self.ctypes_libmecab = None self.ctypes_mecab = None if not native_module: @@ -108,9 +125,12 @@ class MecabBinder(object): if self.ctypes_libmecab: self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab) +MeCabBinder = MecabSplitter # keep backward compatibility until Sphinx-1.6 + -class JanomeBinder(object): +class JanomeSplitter(BaseSplitter): def __init__(self, options): + super(JanomeSplitter, self).__init__(options) self.user_dict = options.get('user_dic') self.user_dict_enc = options.get('user_dic_enc', 'utf8') self.init_tokenizer() @@ -125,7 +145,7 @@ class JanomeBinder(object): return result.split(u' ') -class TinySegmenter(object): +class DefaultSplitter(BaseSplitter): patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({ u'[一二三四五六七八九十百千万億兆]': u'M', u'[一-龠々〆ヵヶ]': u'H', @@ -501,6 +521,9 @@ class TinySegmenter(object): return result +TinySegmenter = DefaultSplitter # keep backward compatibility until Sphinx-1.6 + + class SearchJapanese(SearchLanguage): """ Japanese search implementation: uses no stemmer, but word splitting is quite @@ -508,18 +531,23 @@ class SearchJapanese(SearchLanguage): """ lang = 'ja' language_name = 'Japanese' + splitters = { + 'default': 'sphinx.search.ja.DefaultSplitter', + 'mecab': 'sphinx.sarch.ja.MecabSplitter', + 'janome': 'sphinx.search.ja.JanomeSplitter', + } def init(self, options): type = options.get('type', 'default') - if type not in ('mecab', 'janome', 'default'): - raise ValueError(("Japanese tokenizer's type should be 'mecab' or 'janome'" - " or 'default'")) - if type == 'mecab': - self.splitter = MecabBinder(options) - if type == 'janome': - self.splitter = JanomeBinder(options) + if type in self.splitters: + dotted_path = self.splitters[type] else: - self.splitter = TinySegmenter() + dotted_path = type + try: + self.splitter = import_object(dotted_path)(options) + except ExtensionError: + raise ExtensionError("Splitter module %r can't be imported" % + dotted_path) def split(self, input): return self.splitter.split(input) |