diff --git a/CHANGES.rst b/CHANGES.rst index 766b5cc209d..ab93335db79 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -146,6 +146,9 @@ Bugs fixed and ensure deterministic resolution of global toctree in parallel builds by choosing the lexicographically greatest parent document. Patch by A. Rafey Khan +* #12995: Significantly improve performance when building the search index + for Chinese languages. + Patch by Adam Turner. Testing diff --git a/sphinx/search/zh.py b/sphinx/search/zh.py index b05d709a77b..4905eb84474 100644 --- a/sphinx/search/zh.py +++ b/sphinx/search/zh.py @@ -227,7 +227,10 @@ class SearchChinese(SearchLanguage): js_stemmer_code = js_porter_stemmer stopwords = english_stopwords latin1_letters = re.compile(r'[a-zA-Z0-9_]+') - latin_terms: list[str] = [] + + def __init__(self, options: dict[str, str]) -> None: + super().__init__(options) + self.latin_terms: set[str] = set() def init(self, options: dict[str, str]) -> None: if JIEBA: @@ -238,12 +241,13 @@ def init(self, options: dict[str, str]) -> None: self.stemmer = snowballstemmer.stemmer('english') def split(self, input: str) -> list[str]: - chinese: list[str] = [] if JIEBA: - chinese = list(jieba.cut_for_search(input)) + chinese: list[str] = list(jieba.cut_for_search(input)) + else: + chinese = [] latin1 = [term.strip() for term in self.latin1_letters.findall(input)] - self.latin_terms.extend(latin1) + self.latin_terms.update(latin1) return chinese + latin1 def word_filter(self, stemmed_word: str) -> bool: @@ -255,7 +259,7 @@ def stem(self, word: str) -> str: # avoids some issues with acronyms stemmed = self.stemmer.stemWord(word.lower()) should_not_be_stemmed = ( - word in self.latin_terms and len(word) >= 3 > len(stemmed) + len(word) >= 3 > len(stemmed) and word in self.latin_terms ) # fmt: skip if should_not_be_stemmed: return word.lower()