Convert latin_terms to a set (#12995)

sphinx-doc · Oct 10, 2024 · 705d5dd · 705d5dd
1 parent dcd276d
commit 705d5dd
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 5 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -146,6 +146,9 @@ Bugs fixed
   and ensure deterministic resolution of global toctree in parallel builds
   by choosing the lexicographically greatest parent document.
   Patch by A. Rafey Khan
+* #12995: Significantly improve performance when building the search index
+  for Chinese languages.
+  Patch by Adam Turner.
 
 
 Testing

diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py
@@ -227,7 +227,10 @@ class SearchChinese(SearchLanguage):
     js_stemmer_code = js_porter_stemmer
     stopwords = english_stopwords
     latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
-    latin_terms: list[str] = []
+
+    def __init__(self, options: dict[str, str]) -> None:
+        super().__init__(options)
+        self.latin_terms: set[str] = set()
 
     def init(self, options: dict[str, str]) -> None:
         if JIEBA:
@@ -238,12 +241,13 @@ def init(self, options: dict[str, str]) -> None:
         self.stemmer = snowballstemmer.stemmer('english')
 
     def split(self, input: str) -> list[str]:
-        chinese: list[str] = []
         if JIEBA:
-            chinese = list(jieba.cut_for_search(input))
+            chinese: list[str] = list(jieba.cut_for_search(input))
+        else:
+            chinese = []
 
         latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
-        self.latin_terms.extend(latin1)
+        self.latin_terms.update(latin1)
         return chinese + latin1
 
     def word_filter(self, stemmed_word: str) -> bool:
@@ -255,7 +259,7 @@ def stem(self, word: str) -> str:
         # avoids some issues with acronyms
         stemmed = self.stemmer.stemWord(word.lower())
         should_not_be_stemmed = (
-            word in self.latin_terms and len(word) >= 3 > len(stemmed)
+            len(word) >= 3 > len(stemmed) and word in self.latin_terms
         )  # fmt: skip
         if should_not_be_stemmed:
             return word.lower()