Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/sphinx-doc/sphinx.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
author shimizukawa <shimizukawa@gmail.com> 2016-02-14 13:13:38 +0300
committer shimizukawa <shimizukawa@gmail.com> 2016-02-14 13:23:51 +0300
commit 0992ce542bb7364649df9f41787fc5a9d026b1b0 (patch)
tree ebc8af2f75c89306b328f87f696642fe70b13865 /sphinx/search/ja.py
parent e1394b54272c695a5e3d2ff840b97af29a62f592 (diff)
Closes #1853: support custom text splitter on html search with `language='ja'`.
Diffstat (limited to 'sphinx/search/ja.py')
-rw-r--r-- sphinx/search/ja.py 52
1 files changed, 40 insertions, 12 deletions
diff --git a/sphinx/search/ja.py b/sphinx/search/ja.py
index 125021208..ba6951f39 100644
--- a/sphinx/search/ja.py
+++ b/sphinx/search/ja.py
@@ -35,12 +35,29 @@ try:
except ImportError:
janome_module = False
-from sphinx.errors import SphinxError
+from sphinx.errors import SphinxError, ExtensionError
from sphinx.search import SearchLanguage
+from sphinx.util import import_object
-class MecabBinder(object):
+class BaseSplitter(object):
+
+ def __init__(self, options):
+ self.options = options
+
+ def split(self, input):
+ """
+
+ :param str input:
+ :return:
+ :rtype: list[str]
+ """
+ raise NotImplementedError
+
+
+class MecabSplitter(BaseSplitter):
def __init__(self, options):
+ super(MecabSplitter, self).__init__(options)
self.ctypes_libmecab = None
self.ctypes_mecab = None
if not native_module:
@@ -108,9 +125,12 @@ class MecabBinder(object):
if self.ctypes_libmecab:
self.ctypes_libmecab.mecab_destroy(self.ctypes_mecab)
+MecabBinder = MecabSplitter # keep backward compatibility until Sphinx-1.6
+
-class JanomeBinder(object):
+class JanomeSplitter(BaseSplitter):
def __init__(self, options):
+ super(JanomeSplitter, self).__init__(options)
self.user_dict = options.get('user_dic')
self.user_dict_enc = options.get('user_dic_enc', 'utf8')
self.init_tokenizer()
@@ -125,7 +145,7 @@ class JanomeBinder(object):
return result.split(u' ')
-class TinySegmenter(object):
+class DefaultSplitter(BaseSplitter):
patterns_ = dict([(re.compile(pattern), value) for pattern, value in iteritems({
u'[一二三四五六七八九十百千万億兆]': u'M',
u'[一-龠々〆ヵヶ]': u'H',
@@ -501,6 +521,9 @@ class TinySegmenter(object):
return result
+TinySegmenter = DefaultSplitter # keep backward compatibility until Sphinx-1.6
+
+
class SearchJapanese(SearchLanguage):
"""
Japanese search implementation: uses no stemmer, but word splitting is quite
@@ -508,18 +531,23 @@ class SearchJapanese(SearchLanguage):
"""
lang = 'ja'
language_name = 'Japanese'
+ splitters = {
+ 'default': 'sphinx.search.ja.DefaultSplitter',
+ 'mecab': 'sphinx.search.ja.MecabSplitter',
+ 'janome': 'sphinx.search.ja.JanomeSplitter',
+ }
def init(self, options):
type = options.get('type', 'default')
- if type not in ('mecab', 'janome', 'default'):
- raise ValueError(("Japanese tokenizer's type should be 'mecab' or 'janome'"
- " or 'default'"))
- if type == 'mecab':
- self.splitter = MecabBinder(options)
- if type == 'janome':
- self.splitter = JanomeBinder(options)
+ if type in self.splitters:
+ dotted_path = self.splitters[type]
else:
- self.splitter = TinySegmenter()
+ dotted_path = type
+ try:
+ self.splitter = import_object(dotted_path)(options)
+ except ExtensionError:
+ raise ExtensionError("Splitter module %r can't be imported" %
+ dotted_path)
def split(self, input):
return self.splitter.split(input)