Skip to content

Commit

Permalink
Convert latin_terms to a set (#12995)
Browse files: browse the repository at this point in the history
  • Loading branch information
AA-Turner authored Oct 10, 2024
1 parent dcd276d commit 705d5dd
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 5 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,9 @@ Bugs fixed
and ensure deterministic resolution of global toctree in parallel builds
by choosing the lexicographically greatest parent document.
Patch by A. Rafey Khan
* #12995: Significantly improve performance when building the search index
for Chinese languages.
Patch by Adam Turner.


Testing
Expand Down
14 changes: 9 additions & 5 deletions sphinx/search/zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,10 @@ class SearchChinese(SearchLanguage):
js_stemmer_code = js_porter_stemmer
stopwords = english_stopwords
latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
latin_terms: list[str] = []

def __init__(self, options: dict[str, str]) -> None:
super().__init__(options)
self.latin_terms: set[str] = set()

def init(self, options: dict[str, str]) -> None:
if JIEBA:
Expand All @@ -238,12 +241,13 @@ def init(self, options: dict[str, str]) -> None:
self.stemmer = snowballstemmer.stemmer('english')

def split(self, input: str) -> list[str]:
chinese: list[str] = []
if JIEBA:
chinese = list(jieba.cut_for_search(input))
chinese: list[str] = list(jieba.cut_for_search(input))
else:
chinese = []

latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
self.latin_terms.extend(latin1)
self.latin_terms.update(latin1)
return chinese + latin1

def word_filter(self, stemmed_word: str) -> bool:
Expand All @@ -255,7 +259,7 @@ def stem(self, word: str) -> str:
# avoids some issues with acronyms
stemmed = self.stemmer.stemWord(word.lower())
should_not_be_stemmed = (
word in self.latin_terms and len(word) >= 3 > len(stemmed)
len(word) >= 3 > len(stemmed) and word in self.latin_terms
) # fmt: skip
if should_not_be_stemmed:
return word.lower()
Expand Down

0 comments on commit 705d5dd

Please sign in to comment.