Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Don't include HTML content in title search index #13356

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ Bugs fixed
Patch by Adam Turner.
* #13328: Fix parsing of PEP 695 functions with return annotations.
Patch by Bénédikt Tran. Initial work by Arash Badie-Modiri.
* #13355: Don't include escaped title content in the search index.
Patch by Will Lachance.

Testing
-------
Expand Down
3 changes: 2 additions & 1 deletion sphinx/builders/html/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -679,8 +679,9 @@ def write_doc(self, docname: str, doctree: nodes.document) -> None:
def write_doc_serialized(self, docname: str, doctree: nodes.document) -> None:
self.imgpath = relative_uri(self.get_target_uri(docname), self.imagedir)
self.post_process_images(doctree)
# get title as plain text
title_node = self.env.longtitles.get(docname)
title = self.render_partial(title_node)['title'] if title_node else ''
title = title_node.astext() if title_node else ''
Copy link
Contributor Author

@wlach wlach Feb 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Best I can tell the title is only used for indexing purposes here (corroborated by tests not failing, I hope).

self.index_page(docname, doctree, title)

def finish(self) -> None:
Expand Down
15 changes: 13 additions & 2 deletions sphinx/themes/basic/static/searchtools.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,15 @@ const _removeChildren = (element) => {
const _escapeRegExp = (string) =>
string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string

const _escapeHTML = (text) => {
return text
.replaceAll("&", "&")
.replaceAll("<", "&lt;")
.replaceAll(">", "&gt;")
.replaceAll('"', "&quot;")
.replaceAll("'", "&apos;");
}

const _displayItem = (item, searchTerms, highlightTerms) => {
const docBuilder = DOCUMENTATION_OPTIONS.BUILDER;
const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX;
Expand Down Expand Up @@ -340,7 +349,9 @@ const Search = {
const boost = titles[file] === title ? 1 : 0; // add a boost for document titles
normalResults.push([
docNames[file],
titles[file] !== title ? `${titles[file]} > ${title}` : title,
_escapeHTML(
titles[file] !== title ? `${titles[file]} > ${title}` : title
),
id !== null ? "#" + id : "",
null,
score + boost,
Expand All @@ -358,7 +369,7 @@ const Search = {
const score = Math.round(100 * queryLower.length / entry.length);
const result = [
docNames[file],
titles[file],
_escapeHTML(titles[file]),
id ? "#" + id : "",
null,
score,
Expand Down
2 changes: 1 addition & 1 deletion tests/js/fixtures/cpp/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions tests/js/searchtools.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ describe('Basic html theme search', function() {

hits = [[
"index",
"&lt;no title&gt;",
"<no title>",
"",
null,
5,
Expand Down Expand Up @@ -184,7 +184,7 @@ describe('Basic html theme search', function() {

expectedRanking = [
['index', 'Main Page', '#index-0'], /* index entry */
['index', 'Main Page > Result Scoring', '#result-scoring'], /* title */
['index', 'Main Page &gt; Result Scoring', '#result-scoring'], /* title */
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jayaddison This update to an existing test shows quoting HTML content

];

searchParameters = Search._parseQuery('scoring');
Expand All @@ -198,7 +198,7 @@ describe('Basic html theme search', function() {

expectedRanking = [
['relevance', 'Relevance', ''], /* main title */
['index', 'Main Page > Relevance', '#relevance'], /* subsection heading title */
['index', 'Main Page &gt; Relevance', '#relevance'], /* subsection heading title */
];

searchParameters = Search._parseQuery('relevance');
Expand Down
4 changes: 4 additions & 0 deletions tests/roots/test-search/escapedtitle.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
`escaped` title with < and > in it
==================================

this document has escaped content in the title but also the characters < and > in it
19 changes: 14 additions & 5 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,17 @@ def test_term_in_heading_and_section(app: SphinxTestApp) -> None:
# if search term is in the title of one doc and in the text of another
# both documents should be a hit in the search index as a title,
# respectively text hit
assert '"textinhead":2' in searchindex
assert '"textinhead":0' in searchindex
assert '"textinhead":3' in searchindex
assert '"textinhead":1' in searchindex


@pytest.mark.sphinx('html', testroot='search')
def test_escaped_title(app: SphinxTestApp) -> None:
app.build(force_all=True)
searchindex = load_searchindex(app.outdir / 'searchindex.js')
print(searchindex)
assert 'escapedtitle' in searchindex['docnames']
assert 'escaped title with < and > in it' in searchindex['titles']


@pytest.mark.sphinx('html', testroot='search')
Expand Down Expand Up @@ -398,15 +407,15 @@ def test_search_index_gen_zh(app: SphinxTestApp) -> None:
def test_nosearch(app: SphinxTestApp) -> None:
app.build()
index = load_searchindex(app.outdir / 'searchindex.js')
assert index['docnames'] == ['index', 'nosearch', 'tocitem']
assert index['docnames'] == ['escapedtitle', 'index', 'nosearch', 'tocitem']
# latex is in 'nosearch.rst', and nowhere else
assert 'latex' not in index['terms']
# cat is in 'index.rst' but is marked with the 'no-search' class
assert 'cat' not in index['terms']
# bat is indexed from 'index.rst' and 'tocitem.rst' (document IDs 0, 2), and
# not from 'nosearch.rst' (document ID 1)
assert 'bat' in index['terms']
assert index['terms']['bat'] == [0, 2]
assert index['terms']['bat'] == [1, 3]


@pytest.mark.sphinx(
Expand All @@ -418,7 +427,7 @@ def test_nosearch(app: SphinxTestApp) -> None:
def test_parallel(app: SphinxTestApp) -> None:
app.build()
index = load_searchindex(app.outdir / 'searchindex.js')
assert index['docnames'] == ['index', 'nosearch', 'tocitem']
assert index['docnames'] == ['escapedtitle', 'index', 'nosearch', 'tocitem']


@pytest.mark.sphinx('html', testroot='search')
Expand Down
Loading