Simplify Sphinx's Stemmer (#10467)

author: Adam Turner <9087854+AA-Turner@users.noreply.github.com> 2022-06-16 21:47:09 +0300
committer: GitHub <noreply@github.com> 2022-06-16 21:47:09 +0300
commit: 881f66c5573cec1b3333868effb10cec5c62c7b4 (patch)
tree: 51b890ff465a9080c133c99cc05a339e7ccadc21
parent: 956cddb7d406a81edf26d80ff408f76aa01d0f24 (diff)
7 files changed, 61 insertions, 434 deletions
diff --git a/AUTHORS b/AUTHORS
index c870e863e..c3f306672 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -96,5 +96,3 @@ authors and projects:
 
 * sphinx.util.jsdump uses the basestring encoding from simplejson,
   written by Bob Ippolito, released under the MIT license
-* sphinx.util.stemmer was written by Vivake Gupta, placed in the
-  Public Domain
diff --git a/CHANGES b/CHANGES
index 4fb96d00b..c155cff34 100644
--- a/CHANGES
+++ b/CHANGES
@@ -10,6 +10,9 @@ Incompatible changes
 Deprecated
 ----------
 
+* #10467: Deprecated ``sphinx.util.stemmer`` in favour of ``snowballstemmer``.
+  Patch by Adam Turner.
+
 Features added
 --------------
 
diff --git a/doc/extdev/deprecated.rst b/doc/extdev/deprecated.rst
index 81167cd4d..d88eb27b0 100644
--- a/doc/extdev/deprecated.rst
+++ b/doc/extdev/deprecated.rst
@@ -22,6 +22,11 @@ The following is a list of deprecated interfaces.
      - (will be) Removed
      - Alternatives
 
+   * - ``sphinx.util.stemmer``
+     - 5.1
+     - 7.0
+     - ``snowballstemmer``
+
    * - ``sphinx.util.jsdump``
      - 5.0
      - 7.0
diff --git a/sphinx/search/en.py b/sphinx/search/en.py
index 53cd917dc..19bd9f019 100644
--- a/sphinx/search/en.py
+++ b/sphinx/search/en.py
@@ -2,8 +2,9 @@
 
 from typing import Dict
 
+import snowballstemmer
+
 from sphinx.search import SearchLanguage
-from sphinx.util.stemmer import get_stemmer
 
 english_stopwords = set("""
 a  and  are  as  at
@@ -211,7 +212,7 @@ class SearchEnglish(SearchLanguage):
     stopwords = english_stopwords
 
     def init(self, options: Dict) -> None:
-        self.stemmer = get_stemmer()
+        self.stemmer = snowballstemmer.stemmer('porter')
 
     def stem(self, word: str) -> str:
-        return self.stemmer.stem(word.lower())
+        return self.stemmer.stemWord(word.lower())
diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py
index 700c2683f..86f612d5d 100644
--- a/sphinx/search/zh.py
+++ b/sphinx/search/zh.py
@@ -4,8 +4,9 @@ import os
 import re
 from typing import Dict, List
 
+import snowballstemmer
+
 from sphinx.search import SearchLanguage
-from sphinx.util.stemmer import get_stemmer
 
 try:
     import jieba
@@ -230,7 +231,7 @@ class SearchChinese(SearchLanguage):
             if dict_path and os.path.isfile(dict_path):
                 jieba.load_userdict(dict_path)
 
-        self.stemmer = get_stemmer()
+        self.stemmer = snowballstemmer.stemmer('english')
 
     def split(self, input: str) -> List[str]:
         chinese: List[str] = []
@@ -252,8 +253,8 @@ class SearchChinese(SearchLanguage):
         should_not_be_stemmed = (
             word in self.latin_terms and
             len(word) >= 3 and
-            len(self.stemmer.stem(word.lower())) < 3
+            len(self.stemmer.stemWord(word.lower())) < 3
         )
         if should_not_be_stemmed:
             return word.lower()
-        return self.stemmer.stem(word.lower())
+        return self.stemmer.stemWord(word.lower())
diff --git a/sphinx/util/stemmer/__init__.py b/sphinx/util/stemmer/__init__.py
index ff6c365c7..6d27592d8 100644
--- a/sphinx/util/stemmer/__init__.py
+++ b/sphinx/util/stemmer/__init__.py
@@ -1,37 +1,62 @@
 """Word stemming utilities for Sphinx."""
 
-from sphinx.util.stemmer.porter import PorterStemmer
+import warnings
 
-try:
-    from Stemmer import Stemmer as _PyStemmer
-    PYSTEMMER = True
-except ImportError:
-    PYSTEMMER = False
+import snowballstemmer
+
+from sphinx.deprecation import RemovedInSphinx70Warning
+
+
+class PorterStemmer:
+    def __init__(self):
+        warnings.warn(f"{self.__class__.__name__} is deprecated, use "
+                      "snowballstemmer.stemmer('porter') instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        self.stemmer = snowballstemmer.stemmer('porter')
+
+    def stem(self, p: str, i: int, j: int) -> str:
+        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+                      "snowballstemmer.stemmer('porter').stemWord() instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        return self.stemmer.stemWord(p)
 
 
 class BaseStemmer:
+    def __init__(self):
+        warnings.warn(f"{self.__class__.__name__} is deprecated, use "
+                      "snowballstemmer.stemmer('porter') instead.",
+                      RemovedInSphinx70Warning, stacklevel=3)
+
     def stem(self, word: str) -> str:
-        raise NotImplementedError()
+        raise NotImplementedError
 
 
 class PyStemmer(BaseStemmer):
-    def __init__(self) -> None:
-        self.stemmer = _PyStemmer('porter')
+    def __init__(self):  # NoQA
+        super().__init__()
+        self.stemmer = snowballstemmer.stemmer('porter')
 
     def stem(self, word: str) -> str:
+        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+                      "snowballstemmer.stemmer('porter').stemWord() instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
         return self.stemmer.stemWord(word)
 
 
-class StandardStemmer(PorterStemmer, BaseStemmer):
-    """All those porter stemmer implementations look hideous;
-    make at least the stem method nicer.
-    """
-    def stem(self, word: str) -> str:  # type: ignore
-        return super().stem(word, 0, len(word) - 1)
+class StandardStemmer(BaseStemmer):
+    def __init__(self):  # NoQA
+        super().__init__()
+        self.stemmer = snowballstemmer.stemmer('porter')
+
+    def stem(self, word: str) -> str:
+        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+                      "snowballstemmer.stemmer('porter').stemWord() instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        return self.stemmer.stemWord(word)
 
 
 def get_stemmer() -> BaseStemmer:
-    if PYSTEMMER:
-        return PyStemmer()
-    else:
-        return StandardStemmer()
+    warnings.warn("get_stemmer() is deprecated, use "
+                  "snowballstemmer.stemmer('porter') instead.",
+                  RemovedInSphinx70Warning, stacklevel=2)
+    return PyStemmer()
diff --git a/sphinx/util/stemmer/porter.py b/sphinx/util/stemmer/porter.py
deleted file mode 100644
index c4f89eb95..000000000
--- a/sphinx/util/stemmer/porter.py
+++ /dev/null
@@ -1,406 +0,0 @@
-"""Porter Stemming Algorithm
-
-This is the Porter stemming algorithm, ported to Python from the
-version coded up in ANSI C by the author. It may be be regarded
-as canonical, in that it follows the algorithm presented in
-
-Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
-no. 3, pp 130-137,
-
-only differing from it at the points made --DEPARTURE-- below.
-
-See also https://tartarus.org/martin/PorterStemmer/
-
-The algorithm as described in the paper could be exactly replicated
-by adjusting the points of DEPARTURE, but this is barely necessary,
-because (a) the points of DEPARTURE are definitely improvements, and
-(b) no encoding of the Porter stemmer I have seen is anything like
-as exact as this version, even with the points of DEPARTURE!
-
-Release 1: January 2001
-
-:author: Vivake Gupta <v@nano.com>.
-:license: Public Domain ("can be used free of charge for any purpose").
-"""
-
-
-class PorterStemmer:
-
-    def __init__(self) -> None:
-        """The main part of the stemming algorithm starts here.
-        b is a buffer holding a word to be stemmed. The letters are in b[k0],
-        b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
-        readjusted downwards as the stemming progresses. Zero termination is
-        not in fact used in the algorithm.
-
-        Note that only lower case sequences are stemmed. Forcing to lower case
-        should be done before stem(...) is called.
-        """
-
-        self.b = ""     # buffer for word to be stemmed
-        self.k = 0
-        self.k0 = 0
-        self.j = 0      # j is a general offset into the string
-
-    def cons(self, i: int) -> int:
-        """cons(i) is TRUE <=> b[i] is a consonant."""
-        if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \
-           or self.b[i] == 'o' or self.b[i] == 'u':
-            return 0
-        if self.b[i] == 'y':
-            if i == self.k0:
-                return 1
-            else:
-                return (not self.cons(i - 1))
-        return 1
-
-    def m(self) -> int:
-        """m() measures the number of consonant sequences between k0 and j.
-        if c is a consonant sequence and v a vowel sequence, and <..>
-        indicates arbitrary presence,
-
-           <c><v>       gives 0
-           <c>vc<v>     gives 1
-           <c>vcvc<v>   gives 2
-           <c>vcvcvc<v> gives 3
-           ....
-        """
-        n = 0
-        i = self.k0
-        while 1:
-            if i > self.j:
-                return n
-            if not self.cons(i):
-                break
-            i = i + 1
-        i = i + 1
-        while 1:
-            while 1:
-                if i > self.j:
-                    return n
-                if self.cons(i):
-                    break
-                i = i + 1
-            i = i + 1
-            n = n + 1
-            while 1:
-                if i > self.j:
-                    return n
-                if not self.cons(i):
-                    break
-                i = i + 1
-            i = i + 1
-
-    def vowelinstem(self) -> int:
-        """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
-        for i in range(self.k0, self.j + 1):
-            if not self.cons(i):
-                return 1
-        return 0
-
-    def doublec(self, j: int) -> int:
-        """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
-        if j < (self.k0 + 1):
-            return 0
-        if (self.b[j] != self.b[j - 1]):
-            return 0
-        return self.cons(j)
-
-    def cvc(self, i: int) -> int:
-        """cvc(i) is TRUE <=> i-2,i-1,i has the form
-             consonant - vowel - consonant
-        and also if the second c is not w,x or y. this is used when trying to
-        restore an e at the end of a short  e.g.
-
-           cav(e), lov(e), hop(e), crim(e), but
-           snow, box, tray.
-        """
-        if i < (self.k0 + 2) or not self.cons(i) or self.cons(i - 1) \
-           or not self.cons(i - 2):
-            return 0
-        ch = self.b[i]
-        if ch in ('w', 'x', 'y'):
-            return 0
-        return 1
-
-    def ends(self, s: str) -> int:
-        """ends(s) is TRUE <=> k0,...k ends with the string s."""
-        length = len(s)
-        if s[length - 1] != self.b[self.k]:  # tiny speed-up
-            return 0
-        if length > (self.k - self.k0 + 1):
-            return 0
-        if self.b[self.k - length + 1:self.k + 1] != s:
-            return 0
-        self.j = self.k - length
-        return 1
-
-    def setto(self, s: str) -> None:
-        """setto(s) sets (j+1),...k to the characters in the string s,
-        readjusting k."""
-        length = len(s)
-        self.b = self.b[:self.j + 1] + s + self.b[self.j + length + 1:]
-        self.k = self.j + length
-
-    def r(self, s: str) -> None:
-        """r(s) is used further down."""
-        if self.m() > 0:
-            self.setto(s)
-
-    def step1ab(self) -> None:
-        """step1ab() gets rid of plurals and -ed or -ing. e.g.
-
-           caresses  ->  caress
-           ponies    ->  poni
-           ties      ->  ti
-           caress    ->  caress
-           cats      ->  cat
-
-           feed      ->  feed
-           agreed    ->  agree
-           disabled  ->  disable
-
-           matting   ->  mat
-           mating    ->  mate
-           meeting   ->  meet
-           milling   ->  mill
-           messing   ->  mess
-
-           meetings  ->  meet
-        """
-        if self.b[self.k] == 's':
-            if self.ends("sses"):
-                self.k = self.k - 2
-            elif self.ends("ies"):
-                self.setto("i")
-            elif self.b[self.k - 1] != 's':
-                self.k = self.k - 1
-        if self.ends("eed"):
-            if self.m() > 0:
-                self.k = self.k - 1
-        elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
-            self.k = self.j
-            if self.ends("at"):
-                self.setto("ate")
-            elif self.ends("bl"):
-                self.setto("ble")
-            elif self.ends("iz"):
-                self.setto("ize")
-            elif self.doublec(self.k):
-                self.k = self.k - 1
-                ch = self.b[self.k]
-                if ch in ('l', 's', 'z'):
-                    self.k = self.k + 1
-            elif (self.m() == 1 and self.cvc(self.k)):
-                self.setto("e")
-
-    def step1c(self) -> None:
-        """step1c() turns terminal y to i when there is another vowel in
-        the stem."""
-        if (self.ends("y") and self.vowelinstem()):
-            self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:]
-
-    def step2(self) -> None:
-        """step2() maps double suffices to single ones.
-        so -ization ( = -ize plus -ation) maps to -ize etc. note that the
-        string before the suffix must give m() > 0.
-        """
-        if self.b[self.k - 1] == 'a':
-            if self.ends("ational"):
-                self.r("ate")
-            elif self.ends("tional"):
-                self.r("tion")
-        elif self.b[self.k - 1] == 'c':
-            if self.ends("enci"):
-                self.r("ence")
-            elif self.ends("anci"):
-                self.r("ance")
-        elif self.b[self.k - 1] == 'e':
-            if self.ends("izer"):
-                self.r("ize")
-        elif self.b[self.k - 1] == 'l':
-            if self.ends("bli"):
-                self.r("ble")  # --DEPARTURE--
-            # To match the published algorithm, replace this phrase with
-            #   if self.ends("abli"):      self.r("able")
-            elif self.ends("alli"):
-                self.r("al")
-            elif self.ends("entli"):
-                self.r("ent")
-            elif self.ends("eli"):
-                self.r("e")
-            elif self.ends("ousli"):
-                self.r("ous")
-        elif self.b[self.k - 1] == 'o':
-            if self.ends("ization"):
-                self.r("ize")
-            elif self.ends("ation"):
-                self.r("ate")
-            elif self.ends("ator"):
-                self.r("ate")
-        elif self.b[self.k - 1] == 's':
-            if self.ends("alism"):
-                self.r("al")
-            elif self.ends("iveness"):
-                self.r("ive")
-            elif self.ends("fulness"):
-                self.r("ful")
-            elif self.ends("ousness"):
-                self.r("ous")
-        elif self.b[self.k - 1] == 't':
-            if self.ends("aliti"):
-                self.r("al")
-            elif self.ends("iviti"):
-                self.r("ive")
-            elif self.ends("biliti"):
-                self.r("ble")
-        elif self.b[self.k - 1] == 'g':  # --DEPARTURE--
-            if self.ends("logi"):
-                self.r("log")
-        # To match the published algorithm, delete this phrase
-
-    def step3(self) -> None:
-        """step3() dels with -ic-, -full, -ness etc. similar strategy
-        to step2."""
-        if self.b[self.k] == 'e':
-            if self.ends("icate"):
-                self.r("ic")
-            elif self.ends("ative"):
-                self.r("")
-            elif self.ends("alize"):
-                self.r("al")
-        elif self.b[self.k] == 'i':
-            if self.ends("iciti"):
-                self.r("ic")
-        elif self.b[self.k] == 'l':
-            if self.ends("ical"):
-                self.r("ic")
-            elif self.ends("ful"):
-                self.r("")
-        elif self.b[self.k] == 's':
-            if self.ends("ness"):
-                self.r("")
-
-    def step4(self) -> None:
-        """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
-        if self.b[self.k - 1] == 'a':
-            if self.ends("al"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 'c':
-            if self.ends("ance"):
-                pass
-            elif self.ends("ence"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 'e':
-            if self.ends("er"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 'i':
-            if self.ends("ic"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 'l':
-            if self.ends("able"):
-                pass
-            elif self.ends("ible"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 'n':
-            if self.ends("ant"):
-                pass
-            elif self.ends("ement"):
-                pass
-            elif self.ends("ment"):
-                pass
-            elif self.ends("ent"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 'o':
-            if self.ends("ion") and (self.b[self.j] == 's' or
-                                     self.b[self.j] == 't'):
-                pass
-            elif self.ends("ou"):
-                pass
-            # takes care of -ous
-            else:
-                return
-        elif self.b[self.k - 1] == 's':
-            if self.ends("ism"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 't':
-            if self.ends("ate"):
-                pass
-            elif self.ends("iti"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 'u':
-            if self.ends("ous"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 'v':
-            if self.ends("ive"):
-                pass
-            else:
-                return
-        elif self.b[self.k - 1] == 'z':
-            if self.ends("ize"):
-                pass
-            else:
-                return
-        else:
-            return
-        if self.m() > 1:
-            self.k = self.j
-
-    def step5(self) -> None:
-        """step5() removes a final -e if m() > 1, and changes -ll to -l if
-        m() > 1.
-        """
-        self.j = self.k
-        if self.b[self.k] == 'e':
-            a = self.m()
-            if a > 1 or (a == 1 and not self.cvc(self.k - 1)):
-                self.k = self.k - 1
-        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
-            self.k = self.k - 1
-
-    def stem(self, p: str, i: int, j: int) -> str:
-        """In stem(p,i,j), p is a char pointer, and the string to be stemmed
-        is from p[i] to p[j] inclusive. Typically i is zero and j is the
-        offset to the last character of a string, (p[j+1] == '\0'). The
-        stemmer adjusts the characters p[i] ... p[j] and returns the new
-        end-point of the string, k. Stemming never increases word length, so
-        i <= k <= j. To turn the stemmer into a module, declare 'stem' as
-        extern, and delete the remainder of this file.
-        """
-        # copy the parameters into statics
-        self.b = p
-        self.k = j
-        self.k0 = i
-        if self.k <= self.k0 + 1:
-            return self.b  # --DEPARTURE--
-
-        # With this line, strings of length 1 or 2 don't go through the
-        # stemming process, although no mention is made of this in the
-        # published algorithm. Remove the line to match the published
-        # algorithm.
-
-        self.step1ab()
-        self.step1c()
-        self.step2()
-        self.step3()
-        self.step4()
-        self.step5()
-        return self.b[self.k0:self.k + 1]
author	Adam Turner <9087854+AA-Turner@users.noreply.github.com>	2022-06-16 21:47:09 +0300
committer	GitHub <noreply@github.com>	2022-06-16 21:47:09 +0300
commit	881f66c5573cec1b3333868effb10cec5c62c7b4 (patch)
tree	51b890ff465a9080c133c99cc05a339e7ccadc21
parent	956cddb7d406a81edf26d80ff408f76aa01d0f24 (diff)