Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/sphinx-doc/sphinx.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Adam Turner <9087854+AA-Turner@users.noreply.github.com> 2022-06-16 21:47:09 +0300
committer: GitHub <noreply@github.com> 2022-06-16 21:47:09 +0300
commit: 881f66c5573cec1b3333868effb10cec5c62c7b4 (patch)
tree: 51b890ff465a9080c133c99cc05a339e7ccadc21
parent: 956cddb7d406a81edf26d80ff408f76aa01d0f24 (diff)
Simplify Sphinx's Stemmer (#10467)
-rw-r--r-- AUTHORS 2
-rw-r--r-- CHANGES 3
-rw-r--r-- doc/extdev/deprecated.rst 5
-rw-r--r-- sphinx/search/en.py 7
-rw-r--r-- sphinx/search/zh.py 9
-rw-r--r-- sphinx/util/stemmer/__init__.py 63
-rw-r--r-- sphinx/util/stemmer/porter.py 406
7 files changed, 61 insertions, 434 deletions
diff --git a/AUTHORS b/AUTHORS
index c870e863e..c3f306672 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -96,5 +96,3 @@ authors and projects:
* sphinx.util.jsdump uses the basestring encoding from simplejson,
written by Bob Ippolito, released under the MIT license
-* sphinx.util.stemmer was written by Vivake Gupta, placed in the
- Public Domain
diff --git a/CHANGES b/CHANGES
index 4fb96d00b..c155cff34 100644
--- a/CHANGES
+++ b/CHANGES
@@ -10,6 +10,9 @@ Incompatible changes
Deprecated
----------
+* #10467: Deprecated ``sphinx.util.stemmer`` in favour of ``snowballstemmer``.
+ Patch by Adam Turner.
+
Features added
--------------
diff --git a/doc/extdev/deprecated.rst b/doc/extdev/deprecated.rst
index 81167cd4d..d88eb27b0 100644
--- a/doc/extdev/deprecated.rst
+++ b/doc/extdev/deprecated.rst
@@ -22,6 +22,11 @@ The following is a list of deprecated interfaces.
- (will be) Removed
- Alternatives
+ * - ``sphinx.util.stemmer``
+ - 5.1
+ - 7.0
+ - ``snowballstemmer``
+
* - ``sphinx.util.jsdump``
- 5.0
- 7.0
diff --git a/sphinx/search/en.py b/sphinx/search/en.py
index 53cd917dc..19bd9f019 100644
--- a/sphinx/search/en.py
+++ b/sphinx/search/en.py
@@ -2,8 +2,9 @@
from typing import Dict
+import snowballstemmer
+
from sphinx.search import SearchLanguage
-from sphinx.util.stemmer import get_stemmer
english_stopwords = set("""
a and are as at
@@ -211,7 +212,7 @@ class SearchEnglish(SearchLanguage):
stopwords = english_stopwords
def init(self, options: Dict) -> None:
- self.stemmer = get_stemmer()
+ self.stemmer = snowballstemmer.stemmer('porter')
def stem(self, word: str) -> str:
- return self.stemmer.stem(word.lower())
+ return self.stemmer.stemWord(word.lower())
diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py
index 700c2683f..86f612d5d 100644
--- a/sphinx/search/zh.py
+++ b/sphinx/search/zh.py
@@ -4,8 +4,9 @@ import os
import re
from typing import Dict, List
+import snowballstemmer
+
from sphinx.search import SearchLanguage
-from sphinx.util.stemmer import get_stemmer
try:
import jieba
@@ -230,7 +231,7 @@ class SearchChinese(SearchLanguage):
if dict_path and os.path.isfile(dict_path):
jieba.load_userdict(dict_path)
- self.stemmer = get_stemmer()
+ self.stemmer = snowballstemmer.stemmer('english')
def split(self, input: str) -> List[str]:
chinese: List[str] = []
@@ -252,8 +253,8 @@ class SearchChinese(SearchLanguage):
should_not_be_stemmed = (
word in self.latin_terms and
len(word) >= 3 and
- len(self.stemmer.stem(word.lower())) < 3
+ len(self.stemmer.stemWord(word.lower())) < 3
)
if should_not_be_stemmed:
return word.lower()
- return self.stemmer.stem(word.lower())
+ return self.stemmer.stemWord(word.lower())
diff --git a/sphinx/util/stemmer/__init__.py b/sphinx/util/stemmer/__init__.py
index ff6c365c7..6d27592d8 100644
--- a/sphinx/util/stemmer/__init__.py
+++ b/sphinx/util/stemmer/__init__.py
@@ -1,37 +1,62 @@
"""Word stemming utilities for Sphinx."""
-from sphinx.util.stemmer.porter import PorterStemmer
+import warnings
-try:
- from Stemmer import Stemmer as _PyStemmer
- PYSTEMMER = True
-except ImportError:
- PYSTEMMER = False
+import snowballstemmer
+
+from sphinx.deprecation import RemovedInSphinx70Warning
+
+
+class PorterStemmer:
+ def __init__(self):
+ warnings.warn(f"{self.__class__.__name__} is deprecated, use "
+ "snowballstemmer.stemmer('porter') instead.",
+ RemovedInSphinx70Warning, stacklevel=2)
+ self.stemmer = snowballstemmer.stemmer('porter')
+
+ def stem(self, p: str, i: int, j: int) -> str:
+ warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+ "snowballstemmer.stemmer('porter').stemWord() instead.",
+ RemovedInSphinx70Warning, stacklevel=2)
+ return self.stemmer.stemWord(p)
class BaseStemmer:
+ def __init__(self):
+ warnings.warn(f"{self.__class__.__name__} is deprecated, use "
+ "snowballstemmer.stemmer('porter') instead.",
+ RemovedInSphinx70Warning, stacklevel=3)
+
def stem(self, word: str) -> str:
- raise NotImplementedError()
+ raise NotImplementedError
class PyStemmer(BaseStemmer):
- def __init__(self) -> None:
- self.stemmer = _PyStemmer('porter')
+ def __init__(self): # NoQA
+ super().__init__()
+ self.stemmer = snowballstemmer.stemmer('porter')
def stem(self, word: str) -> str:
+ warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+ "snowballstemmer.stemmer('porter').stemWord() instead.",
+ RemovedInSphinx70Warning, stacklevel=2)
return self.stemmer.stemWord(word)
-class StandardStemmer(PorterStemmer, BaseStemmer):
- """All those porter stemmer implementations look hideous;
- make at least the stem method nicer.
- """
- def stem(self, word: str) -> str: # type: ignore
- return super().stem(word, 0, len(word) - 1)
+class StandardStemmer(BaseStemmer):
+ def __init__(self): # NoQA
+ super().__init__()
+ self.stemmer = snowballstemmer.stemmer('porter')
+
+ def stem(self, word: str) -> str:
+ warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+ "snowballstemmer.stemmer('porter').stemWord() instead.",
+ RemovedInSphinx70Warning, stacklevel=2)
+ return self.stemmer.stemWord(word)
def get_stemmer() -> BaseStemmer:
- if PYSTEMMER:
- return PyStemmer()
- else:
- return StandardStemmer()
+ warnings.warn("get_stemmer() is deprecated, use "
+ "snowballstemmer.stemmer('porter') instead.",
+ RemovedInSphinx70Warning, stacklevel=2)
+ return PyStemmer()
diff --git a/sphinx/util/stemmer/porter.py b/sphinx/util/stemmer/porter.py
deleted file mode 100644
index c4f89eb95..000000000
--- a/sphinx/util/stemmer/porter.py
+++ /dev/null
@@ -1,406 +0,0 @@
-"""Porter Stemming Algorithm
-
-This is the Porter stemming algorithm, ported to Python from the
-version coded up in ANSI C by the author. It may be be regarded
-as canonical, in that it follows the algorithm presented in
-
-Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
-no. 3, pp 130-137,
-
-only differing from it at the points made --DEPARTURE-- below.
-
-See also https://tartarus.org/martin/PorterStemmer/
-
-The algorithm as described in the paper could be exactly replicated
-by adjusting the points of DEPARTURE, but this is barely necessary,
-because (a) the points of DEPARTURE are definitely improvements, and
-(b) no encoding of the Porter stemmer I have seen is anything like
-as exact as this version, even with the points of DEPARTURE!
-
-Release 1: January 2001
-
-:author: Vivake Gupta <v@nano.com>.
-:license: Public Domain ("can be used free of charge for any purpose").
-"""
-
-
-class PorterStemmer:
-
- def __init__(self) -> None:
- """The main part of the stemming algorithm starts here.
- b is a buffer holding a word to be stemmed. The letters are in b[k0],
- b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
- readjusted downwards as the stemming progresses. Zero termination is
- not in fact used in the algorithm.
-
- Note that only lower case sequences are stemmed. Forcing to lower case
- should be done before stem(...) is called.
- """
-
- self.b = "" # buffer for word to be stemmed
- self.k = 0
- self.k0 = 0
- self.j = 0 # j is a general offset into the string
-
- def cons(self, i: int) -> int:
- """cons(i) is TRUE <=> b[i] is a consonant."""
- if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \
- or self.b[i] == 'o' or self.b[i] == 'u':
- return 0
- if self.b[i] == 'y':
- if i == self.k0:
- return 1
- else:
- return (not self.cons(i - 1))
- return 1
-
- def m(self) -> int:
- """m() measures the number of consonant sequences between k0 and j.
- if c is a consonant sequence and v a vowel sequence, and <..>
- indicates arbitrary presence,
-
- <c><v> gives 0
- <c>vc<v> gives 1
- <c>vcvc<v> gives 2
- <c>vcvcvc<v> gives 3
- ....
- """
- n = 0
- i = self.k0
- while 1:
- if i > self.j:
- return n
- if not self.cons(i):
- break
- i = i + 1
- i = i + 1
- while 1:
- while 1:
- if i > self.j:
- return n
- if self.cons(i):
- break
- i = i + 1
- i = i + 1
- n = n + 1
- while 1:
- if i > self.j:
- return n
- if not self.cons(i):
- break
- i = i + 1
- i = i + 1
-
- def vowelinstem(self) -> int:
- """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
- for i in range(self.k0, self.j + 1):
- if not self.cons(i):
- return 1
- return 0
-
- def doublec(self, j: int) -> int:
- """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
- if j < (self.k0 + 1):
- return 0
- if (self.b[j] != self.b[j - 1]):
- return 0
- return self.cons(j)
-
- def cvc(self, i: int) -> int:
- """cvc(i) is TRUE <=> i-2,i-1,i has the form
- consonant - vowel - consonant
- and also if the second c is not w,x or y. this is used when trying to
- restore an e at the end of a short e.g.
-
- cav(e), lov(e), hop(e), crim(e), but
- snow, box, tray.
- """
- if i < (self.k0 + 2) or not self.cons(i) or self.cons(i - 1) \
- or not self.cons(i - 2):
- return 0
- ch = self.b[i]
- if ch in ('w', 'x', 'y'):
- return 0
- return 1
-
- def ends(self, s: str) -> int:
- """ends(s) is TRUE <=> k0,...k ends with the string s."""
- length = len(s)
- if s[length - 1] != self.b[self.k]: # tiny speed-up
- return 0
- if length > (self.k - self.k0 + 1):
- return 0
- if self.b[self.k - length + 1:self.k + 1] != s:
- return 0
- self.j = self.k - length
- return 1
-
- def setto(self, s: str) -> None:
- """setto(s) sets (j+1),...k to the characters in the string s,
- readjusting k."""
- length = len(s)
- self.b = self.b[:self.j + 1] + s + self.b[self.j + length + 1:]
- self.k = self.j + length
-
- def r(self, s: str) -> None:
- """r(s) is used further down."""
- if self.m() > 0:
- self.setto(s)
-
- def step1ab(self) -> None:
- """step1ab() gets rid of plurals and -ed or -ing. e.g.
-
- caresses -> caress
- ponies -> poni
- ties -> ti
- caress -> caress
- cats -> cat
-
- feed -> feed
- agreed -> agree
- disabled -> disable
-
- matting -> mat
- mating -> mate
- meeting -> meet
- milling -> mill
- messing -> mess
-
- meetings -> meet
- """
- if self.b[self.k] == 's':
- if self.ends("sses"):
- self.k = self.k - 2
- elif self.ends("ies"):
- self.setto("i")
- elif self.b[self.k - 1] != 's':
- self.k = self.k - 1
- if self.ends("eed"):
- if self.m() > 0:
- self.k = self.k - 1
- elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
- self.k = self.j
- if self.ends("at"):
- self.setto("ate")
- elif self.ends("bl"):
- self.setto("ble")
- elif self.ends("iz"):
- self.setto("ize")
- elif self.doublec(self.k):
- self.k = self.k - 1
- ch = self.b[self.k]
- if ch in ('l', 's', 'z'):
- self.k = self.k + 1
- elif (self.m() == 1 and self.cvc(self.k)):
- self.setto("e")
-
- def step1c(self) -> None:
- """step1c() turns terminal y to i when there is another vowel in
- the stem."""
- if (self.ends("y") and self.vowelinstem()):
- self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:]
-
- def step2(self) -> None:
- """step2() maps double suffices to single ones.
- so -ization ( = -ize plus -ation) maps to -ize etc. note that the
- string before the suffix must give m() > 0.
- """
- if self.b[self.k - 1] == 'a':
- if self.ends("ational"):
- self.r("ate")
- elif self.ends("tional"):
- self.r("tion")
- elif self.b[self.k - 1] == 'c':
- if self.ends("enci"):
- self.r("ence")
- elif self.ends("anci"):
- self.r("ance")
- elif self.b[self.k - 1] == 'e':
- if self.ends("izer"):
- self.r("ize")
- elif self.b[self.k - 1] == 'l':
- if self.ends("bli"):
- self.r("ble") # --DEPARTURE--
- # To match the published algorithm, replace this phrase with
- # if self.ends("abli"): self.r("able")
- elif self.ends("alli"):
- self.r("al")
- elif self.ends("entli"):
- self.r("ent")
- elif self.ends("eli"):
- self.r("e")
- elif self.ends("ousli"):
- self.r("ous")
- elif self.b[self.k - 1] == 'o':
- if self.ends("ization"):
- self.r("ize")
- elif self.ends("ation"):
- self.r("ate")
- elif self.ends("ator"):
- self.r("ate")
- elif self.b[self.k - 1] == 's':
- if self.ends("alism"):
- self.r("al")
- elif self.ends("iveness"):
- self.r("ive")
- elif self.ends("fulness"):
- self.r("ful")
- elif self.ends("ousness"):
- self.r("ous")
- elif self.b[self.k - 1] == 't':
- if self.ends("aliti"):
- self.r("al")
- elif self.ends("iviti"):
- self.r("ive")
- elif self.ends("biliti"):
- self.r("ble")
- elif self.b[self.k - 1] == 'g': # --DEPARTURE--
- if self.ends("logi"):
- self.r("log")
- # To match the published algorithm, delete this phrase
-
- def step3(self) -> None:
- """step3() dels with -ic-, -full, -ness etc. similar strategy
- to step2."""
- if self.b[self.k] == 'e':
- if self.ends("icate"):
- self.r("ic")
- elif self.ends("ative"):
- self.r("")
- elif self.ends("alize"):
- self.r("al")
- elif self.b[self.k] == 'i':
- if self.ends("iciti"):
- self.r("ic")
- elif self.b[self.k] == 'l':
- if self.ends("ical"):
- self.r("ic")
- elif self.ends("ful"):
- self.r("")
- elif self.b[self.k] == 's':
- if self.ends("ness"):
- self.r("")
-
- def step4(self) -> None:
- """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
- if self.b[self.k - 1] == 'a':
- if self.ends("al"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 'c':
- if self.ends("ance"):
- pass
- elif self.ends("ence"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 'e':
- if self.ends("er"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 'i':
- if self.ends("ic"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 'l':
- if self.ends("able"):
- pass
- elif self.ends("ible"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 'n':
- if self.ends("ant"):
- pass
- elif self.ends("ement"):
- pass
- elif self.ends("ment"):
- pass
- elif self.ends("ent"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 'o':
- if self.ends("ion") and (self.b[self.j] == 's' or
- self.b[self.j] == 't'):
- pass
- elif self.ends("ou"):
- pass
- # takes care of -ous
- else:
- return
- elif self.b[self.k - 1] == 's':
- if self.ends("ism"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 't':
- if self.ends("ate"):
- pass
- elif self.ends("iti"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 'u':
- if self.ends("ous"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 'v':
- if self.ends("ive"):
- pass
- else:
- return
- elif self.b[self.k - 1] == 'z':
- if self.ends("ize"):
- pass
- else:
- return
- else:
- return
- if self.m() > 1:
- self.k = self.j
-
- def step5(self) -> None:
- """step5() removes a final -e if m() > 1, and changes -ll to -l if
- m() > 1.
- """
- self.j = self.k
- if self.b[self.k] == 'e':
- a = self.m()
- if a > 1 or (a == 1 and not self.cvc(self.k - 1)):
- self.k = self.k - 1
- if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
- self.k = self.k - 1
-
- def stem(self, p: str, i: int, j: int) -> str:
- """In stem(p,i,j), p is a char pointer, and the string to be stemmed
- is from p[i] to p[j] inclusive. Typically i is zero and j is the
- offset to the last character of a string, (p[j+1] == '\0'). The
- stemmer adjusts the characters p[i] ... p[j] and returns the new
- end-point of the string, k. Stemming never increases word length, so
- i <= k <= j. To turn the stemmer into a module, declare 'stem' as
- extern, and delete the remainder of this file.
- """
- # copy the parameters into statics
- self.b = p
- self.k = j
- self.k0 = i
- if self.k <= self.k0 + 1:
- return self.b # --DEPARTURE--
-
- # With this line, strings of length 1 or 2 don't go through the
- # stemming process, although no mention is made of this in the
- # published algorithm. Remove the line to match the published
- # algorithm.
-
- self.step1ab()
- self.step1c()
- self.step2()
- self.step3()
- self.step4()
- self.step5()
- return self.b[self.k0:self.k + 1]