Skip to content

Commit

Permalink
Don't count HTML markup in auto summaries
Browse files Browse the repository at this point in the history
This commit also fixes a bug where a `</picture>` end tag was wrongly used to detect a end paragraph. This should be very rare, though.

Closes #12837
  • Loading branch information
bep committed Sep 10, 2024
1 parent 84ee00b commit 3d6baed
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 2 deletions.
2 changes: 1 addition & 1 deletion hugolib/page_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -593,7 +593,7 @@ func TestPageSummary(t *testing.T) {
// Source is not Asciidoctor- or RST-compatible so don't test them
if ext != "ad" && ext != "rst" {
checkPageContent(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>\n\n<p>Additional text.</p>\n\n<p>Further text.</p>\n"), ext)
checkPageSummary(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p>"), ext)
checkPageSummary(t, p, normalizeExpected(ext, "<p><a href=\"https://lipsum.com/\">Lorem ipsum</a> dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.</p><p>Additional text.</p>"), ext)
}
checkPageType(t, p, "page")
}
Expand Down
20 changes: 19 additions & 1 deletion resources/page/page_markup.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,16 @@ func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStart
return ptag
}

// Avoid counting words that are most likely HTML tokens.
var (
isProbablyHTMLTag = regexp.MustCompile(`^<\/?[A-Za-z]+>?$`)
isProablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z]+=["']`)
)

func isProbablyHTMLToken(s string) bool {
return s == ">" || isProbablyHTMLTag.MatchString(s) || isProablyHTMLAttribute.MatchString(s)
}

// ExtractSummaryFromHTML extracts a summary from the given HTML content.
func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) {
result.source = input
Expand All @@ -173,6 +183,14 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
var count int

countWord := func(word string) int {
word = strings.TrimSpace(word)
if len(word) == 0 {
return 0
}
if isProbablyHTMLToken(word) {
return 0
}

if isCJK {
word = tpl.StripHTML(word)
runeCount := utf8.RuneCountInString(word)
Expand All @@ -193,7 +211,7 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo

for j := result.WrapperStart.High; j < high; {
s := input[j:]
closingIndex := strings.Index(s, "</"+ptag.tagName)
closingIndex := strings.Index(s, "</"+ptag.tagName+">")

if closingIndex == -1 {
break
Expand Down
57 changes: 57 additions & 0 deletions resources/page/page_markup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,46 @@ func TestExtractSummaryFromHTML(t *testing.T) {
}
}

// See https://discourse.gohugo.io/t/automatic-summarys-summarylength-seems-broken-in-the-case-of-plainify/51466/4
// Also issue 12837
func TestExtractSummaryFromHTMLLotsOfHTMLInSummary(t *testing.T) {
c := qt.New(t)

input := `
<p>
<div>
<picture>
<img src="imgs/1.jpg" alt="1"/>
</picture>
<picture>
<img src="imgs/2.jpg" alt="2"/>
</picture>
<picture>
<img src="imgs/3.jpg" alt="3"/>
</picture>
<picture>
<img src="imgs/4.jpg" alt="4"/>
</picture>
<picture>
<img src="imgs/5.jpg" alt="5"/>
</picture>
</div>
</p>
<p>
This is a story about a cat.
</p>
<p>
The cat was white and fluffy.
</p>
<p>
And it liked milk.
</p>
`

summary := ExtractSummaryFromHTML(media.Builtin.MarkdownType, input, 10, false)
c.Assert(strings.HasSuffix(summary.Summary(), "<p>\nThis is a story about a cat.\n</p>\n<p>\nThe cat was white and fluffy.\n</p>"), qt.IsTrue)
}

func TestExtractSummaryFromHTMLWithDivider(t *testing.T) {
c := qt.New(t)

Expand Down Expand Up @@ -114,6 +154,23 @@ func TestExpandDivider(t *testing.T) {
}
}

func TestIsProbablyHTMLToken(t *testing.T) {
c := qt.New(t)

for i, test := range []struct {
input string
expect bool
}{
{"<p>", true},
{"<p", true},
{"width=\"32\"", true},
{"width='32'", true},
{"<p>Æøå", false},
} {
c.Assert(isProbablyHTMLToken(test.input), qt.Equals, test.expect, qt.Commentf("[%d] Test.expect %q", i, test.input))
}
}

func BenchmarkSummaryFromHTML(b *testing.B) {
b.StopTimer()
input := "<p>First paragraph</p><p>Second paragraph</p>"
Expand Down

0 comments on commit 3d6baed

Please sign in to comment.