Skip to content

Commit 2187840

Browse files
committed
Add checks for broken docs urls
1 parent 70ab07e commit 2187840

3 files changed

Lines changed: 303 additions & 0 deletions

File tree

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Builds the docs frontend and verifies that every /docs link in the markdown
# sources resolves to a page listed in the generated sitemap.xml.
name: check-doc-links

permissions:
  contents: read

# One active run per pull request (or per ref for pushes); a newer run cancels
# the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.id || github.ref }}
  cancel-in-progress: true

on:
  pull_request:
    branches: ["main"]
    paths:
      - "docs/**/*.md"
      - "docs/app/scripts/check_doc_links.py"
      - ".github/workflows/check_doc_links.yml"
  push:
    branches: ["main"]
    paths:
      - "docs/**/*.md"
      - "docs/app/scripts/check_doc_links.py"
      - ".github/workflows/check_doc_links.yml"

jobs:
  check-doc-links:
    timeout-minutes: 20
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: docs/app
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/setup_build_env
        with:
          # Quoted so YAML passes a string rather than a float (an unquoted
          # version such as 3.10 would be read as 3.1).
          python-version: "3.14"
          run-uv-sync: true
      - name: Build frontend to generate sitemap.xml
        run: uv run reflex export --frontend-only --no-zip
      - name: Validate /docs links against sitemap.xml
        run: uv run python scripts/check_doc_links.py
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""Validate /docs/* markdown links against the generated sitemap.xml.
2+
3+
For every .md file under the docs tree, find markdown links of the form
4+
`[text](/docs/...)` and verify:
5+
6+
1. The URL path contains no underscores (URLs use hyphens).
7+
2. After stripping the `/docs` prefix, the path exists in sitemap.xml.
8+
9+
Run after building the frontend so .web/public/sitemap.xml is present, e.g.:
10+
11+
cd docs/app
12+
uv run reflex export --frontend-only --no-zip
13+
uv run python scripts/check_doc_links.py
14+
"""
15+
16+
from __future__ import annotations
17+
18+
import argparse
19+
import re
20+
import sys
21+
import xml.etree.ElementTree as ET
22+
from pathlib import Path
23+
from urllib.parse import urlparse
24+
25+
# Matches an inline markdown link whose destination is `/docs` itself or
# starts with `/docs/`, `/docs#` or `/docs?`. Group 1 is the destination only:
# an optional link title, written as "..." or '...', is consumed but never
# captured, so it cannot leak into the URL that gets validated.
LINK_RE = re.compile(
    r"\]\(\s*(/docs(?=[/)#?\s])[^)]*?)(?:\s+(?:\"[^\"]*\"|'[^']*'))?\s*\)"
)
# Prefix map for the sitemap XML namespace.
SITEMAP_NS = {"sm": "https://www.sitemaps.org/schemas/sitemap/0.9"}
# Directory names whose contents are never scanned for markdown files.
SKIP_DIRS = {".web", "node_modules", "__pycache__", ".git", ".venv", "dist", "build"}
28+
29+
30+
def _normalize(path: str) -> str:
    """Reduce a URL path to the canonical form used for comparisons.

    Everything from the first `#` is dropped, then everything from the first
    `?`. A leading `/` is added when absent and trailing slashes are removed;
    a result that would be empty becomes `/`.
    """
    without_fragment, _, _ = path.partition("#")
    bare, _, _ = without_fragment.partition("?")
    rooted = bare if bare.startswith("/") else "/" + bare
    trimmed = rooted.rstrip("/")
    return trimmed if trimmed else "/"
35+
36+
37+
def _strip_docs_prefix(path: str) -> str:
    """Remove a leading `/docs` path segment, if there is one.

    `/docs` alone maps to `/`; `/docs/<rest>` maps to `/<rest>`. Any other
    path, including look-alikes such as `/docsfoo`, is returned unchanged.
    This lets paths compare equal whether or not they carry the prefix.
    """
    prefix = "/docs"
    if path == prefix:
        return "/"
    has_prefix = path.startswith(prefix + "/")
    return path[len(prefix):] if has_prefix else path
44+
45+
46+
def load_sitemap_paths(sitemap_path: Path) -> set[str]:
    """Return the set of normalized URL paths declared in sitemap.xml.

    Every `<url><loc>` entry is reduced to its URL path, normalized, and
    stripped of a leading `/docs` segment. Elements are matched by local name
    in any namespace, so sitemaps declaring either the `http://` or the
    `https://` form of the sitemaps.org namespace are both understood.

    Args:
        sitemap_path: Location of the sitemap.xml file to parse.

    Returns:
        The distinct comparison keys, one per non-empty `<loc>` entry.

    Raises:
        OSError: If the file cannot be read.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(sitemap_path).getroot()
    paths: set[str] = set()
    # "{*}" is the ElementTree wildcard for "any (or no) namespace".
    for loc in root.iterfind("{*}url/{*}loc"):
        url = (loc.text or "").strip()
        if not url:
            # A blank <loc> would otherwise normalize to "/" and make a bare
            # /docs link look valid.
            continue
        paths.add(_strip_docs_prefix(_normalize(urlparse(url).path)))
    return paths
56+
57+
58+
def iter_md_files(md_root: Path):
    """Yield .md files under md_root, skipping build/vendor directories.

    Directories named in SKIP_DIRS are pruned during the walk, so their
    (potentially very large) contents are never traversed. Directories and
    files are visited in sorted order so results are reproducible between
    runs and machines.

    Args:
        md_root: Directory to search recursively.

    Yields:
        The path of each file whose name ends in `.md`.
    """
    import os  # local import: only this function needs it

    for dirpath, dirnames, filenames in os.walk(md_root):
        # Assigning in place tells os.walk which subdirectories to descend into.
        dirnames[:] = sorted(d for d in dirnames if d not in SKIP_DIRS)
        for name in sorted(filenames):
            if name.endswith(".md"):
                yield Path(dirpath) / name
64+
65+
66+
def iter_md_links(md_root: Path):
    """Yield (file, line_no, raw_url) for every /docs/* markdown link.

    Files are read as UTF-8 and scanned one line at a time, so a link that
    spans several lines is not detected. A file that cannot be read or decoded
    is skipped with a warning on stderr rather than silently ignored or
    allowed to abort the whole run.

    Args:
        md_root: Directory whose markdown files are scanned.

    Yields:
        Tuples of the markdown file path, the 1-based line number, and the
        link destination exactly as written (fragment and query included).
    """
    for md_file in iter_md_files(md_root):
        try:
            text = md_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            print(
                f"warning: skipping unreadable file {md_file}: {exc}",
                file=sys.stderr,
            )
            continue
        for line_no, line in enumerate(text.splitlines(), start=1):
            for match in LINK_RE.finditer(line):
                yield md_file, line_no, match.group(1)
76+
77+
78+
def check(md_root: Path, sitemap_path: Path) -> list[str]:
    """Return a list of human-readable error strings.

    Each /docs link found under md_root is checked twice:

    1. Its URL path must not contain an underscore. Only the path is
       inspected; a fragment or query string may contain underscores.
    2. Its path, with the `/docs` prefix stripped, must be present in the
       sitemap.

    Args:
        md_root: Directory whose markdown files are scanned.
        sitemap_path: Location of the generated sitemap.xml.

    Returns:
        One message per problem, or an empty list when every link is valid.
        A missing sitemap produces a single explanatory message.
    """
    if not sitemap_path.is_file():
        return [
            f"sitemap.xml not found at {sitemap_path}. "
            "Build the frontend first (e.g. `uv run reflex export --frontend-only --no-zip`)."
        ]

    valid_paths = load_sitemap_paths(sitemap_path)
    errors: list[str] = []

    for md_file, line_no, raw in iter_md_links(md_root):
        location = f"{md_file}:{line_no}"
        # Fragment and query are dropped here so neither check looks at them.
        link_path = _normalize(raw)

        if "_" in link_path:
            errors.append(
                f"{location}: link contains an underscore (use hyphens): {raw!r}"
            )

        # Compare in /docs-stripped form so the check works whether the
        # sitemap entries include the /docs prefix or not.
        sitemap_key = _strip_docs_prefix(link_path)
        if sitemap_key not in valid_paths:
            errors.append(
                f"{location}: {raw!r} -> {sitemap_key!r} not found in sitemap"
            )

    return errors
106+
107+
108+
def main() -> int:
    """Run the link check from the command line.

    Defaults are resolved relative to this script's directory: the markdown
    root is two levels up and the sitemap is `../.web/public/sitemap.xml`.

    Returns:
        0 when every link resolves, 1 when at least one problem is reported.
    """
    script_dir = Path(__file__).resolve().parent
    default_md_root = script_dir.parent.parent
    default_sitemap = script_dir.parent / ".web" / "public" / "sitemap.xml"

    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--md-root",
        type=Path,
        default=default_md_root,
        help="Root directory containing .md docs (default: ../..).",
    )
    cli.add_argument(
        "--sitemap",
        type=Path,
        default=default_sitemap,
        help="Path to sitemap.xml (default: ../.web/public/sitemap.xml).",
    )
    opts = cli.parse_args()

    problems = check(opts.md_root.resolve(), opts.sitemap.resolve())
    if not problems:
        print("All /docs links resolve against sitemap.xml.")
        return 0

    # Problems go to stderr so stdout stays clean for the success message.
    print(f"Found {len(problems)} broken /docs link(s):", file=sys.stderr)
    for problem in problems:
        print(f" {problem}", file=sys.stderr)
    return 1
133+
134+
135+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())

docs/app/tests/test_doc_links.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
"""Unit tests for scripts/check_doc_links.py."""
2+
3+
import sys
4+
from pathlib import Path
5+
6+
import pytest
7+
8+
sys.path.append(str(Path(__file__).resolve().parent.parent / "scripts"))
9+
10+
from check_doc_links import LINK_RE, _normalize, check
11+
12+
SITEMAP_XML = """<?xml version='1.0' encoding='utf-8'?>
13+
<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">
14+
<url><loc>http://localhost:3000/getting-started/basics/</loc></url>
15+
<url><loc>http://localhost:3000/library/disclosure/</loc></url>
16+
</urlset>
17+
"""
18+
19+
SITEMAP_XML_WITH_DOCS_PREFIX = """<?xml version='1.0' encoding='utf-8'?>
20+
<urlset xmlns="https://www.sitemaps.org/schemas/sitemap/0.9">
21+
<url><loc>http://localhost:3000/docs/getting-started/basics/</loc></url>
22+
<url><loc>http://localhost:3000/docs/library/disclosure/</loc></url>
23+
</urlset>
24+
"""
25+
26+
27+
@pytest.fixture
def docs_tree(tmp_path: Path) -> tuple[Path, Path]:
    """Provide an empty docs directory and a sitemap written from SITEMAP_XML.

    Returns:
        `(md_root, sitemap)`, both located under `tmp_path`.
    """
    docs_dir = tmp_path / "docs"
    docs_dir.mkdir()
    sitemap_file = tmp_path / "sitemap.xml"
    sitemap_file.write_text(SITEMAP_XML)
    return docs_dir, sitemap_file
35+
36+
37+
def test_normalize_strips_fragment_query_and_trailing_slash():
    """`_normalize` drops fragments, queries and trailing slashes; `/` stays `/`."""
    expectations = {
        "/foo/bar/": "/foo/bar",
        "/foo/bar#section": "/foo/bar",
        "/foo/bar?x=1": "/foo/bar",
        "/": "/",
    }
    for given, expected in expectations.items():
        assert _normalize(given) == expected
42+
43+
44+
def test_link_re_matches_basic_link():
    """A plain inline link yields exactly its /docs destination."""
    line = "see [basics](/docs/getting-started/basics) here"
    found = LINK_RE.findall(line)
    assert found == ["/docs/getting-started/basics"]
47+
48+
49+
def test_link_re_does_not_match_docs_prefix_without_separator():
    """`/docsfoo` and `/docs-foo` must not be treated as /docs links."""
    for lookalike in ("[x](/docsfoo/bar)", "[x](/docs-foo/bar)"):
        assert LINK_RE.findall(lookalike) == []
53+
54+
55+
def test_link_re_keeps_fragment_and_query():
    """The captured destination still carries its fragment or query string."""
    with_fragment = LINK_RE.findall("[x](/docs/foo#anchor)")
    with_query = LINK_RE.findall("[x](/docs/foo?q=1)")
    assert with_fragment == ["/docs/foo#anchor"]
    assert with_query == ["/docs/foo?q=1"]
58+
59+
60+
def test_check_passes_for_valid_link(docs_tree):
    """A link to a page listed in the sitemap produces no errors."""
    docs_dir, sitemap_file = docs_tree
    page = docs_dir / "page.md"
    page.write_text("[ok](/docs/getting-started/basics)\n")
    assert check(docs_dir, sitemap_file) == []
64+
65+
66+
def test_check_flags_missing_link(docs_tree):
    """A link to a page absent from the sitemap is reported exactly once."""
    docs_dir, sitemap_file = docs_tree
    page = docs_dir / "page.md"
    page.write_text("[bad](/docs/no-such-page)\n")
    found = check(docs_dir, sitemap_file)
    assert len(found) == 1
    assert "not found in sitemap" in found[0]
72+
73+
74+
def test_check_flags_underscore_and_missing(docs_tree):
    """Underscore link is reported twice: once for the underscore, once for missing."""
    docs_dir, sitemap_file = docs_tree
    page = docs_dir / "page.md"
    page.write_text("[under](/docs/getting_started/basics)\n")
    found = check(docs_dir, sitemap_file)
    assert len(found) == 2
    for expected_text in ("underscore", "not found in sitemap"):
        assert any(expected_text in message for message in found)
82+
83+
84+
def test_check_ignores_fragment_for_sitemap_lookup(docs_tree):
    """A `#fragment` on an otherwise valid link does not cause an error."""
    docs_dir, sitemap_file = docs_tree
    page = docs_dir / "page.md"
    page.write_text("[anchor](/docs/getting-started/basics#section)\n")
    assert check(docs_dir, sitemap_file) == []
88+
89+
90+
def test_check_ignores_query_for_sitemap_lookup(docs_tree):
    """A `?query` on an otherwise valid link does not cause an error."""
    docs_dir, sitemap_file = docs_tree
    page = docs_dir / "page.md"
    page.write_text("[q](/docs/library/disclosure?x=1)\n")
    assert check(docs_dir, sitemap_file) == []
94+
95+
96+
def test_check_ignores_docs_prefix_lookalikes(docs_tree):
    """`/docsfoo` should not even be treated as a /docs link."""
    docs_dir, sitemap_file = docs_tree
    page = docs_dir / "page.md"
    page.write_text("[x](/docsfoo/bar)\n")
    assert check(docs_dir, sitemap_file) == []
101+
102+
103+
def test_check_skips_build_dirs(docs_tree):
    """Markdown below a skipped directory (here `node_modules`) is not scanned."""
    docs_dir, sitemap_file = docs_tree
    vendored = docs_dir / "node_modules" / "vendor"
    vendored.mkdir(parents=True)
    vendored.joinpath("README.md").write_text("[bad](/docs/no-such-page)\n")
    assert check(docs_dir, sitemap_file) == []
109+
110+
111+
def test_check_returns_helpful_message_when_sitemap_missing(tmp_path):
    """A missing sitemap yields a single explanatory error."""
    absent_sitemap = tmp_path / "missing.xml"
    found = check(tmp_path, absent_sitemap)
    assert len(found) == 1
    assert "sitemap.xml not found" in found[0]
115+
116+
117+
def test_check_works_when_sitemap_has_docs_prefix(tmp_path: Path):
    """Both deployment styles (with or without /docs prefix in sitemap) work."""
    docs_dir = tmp_path / "docs"
    docs_dir.mkdir()
    sitemap_file = tmp_path / "sitemap.xml"
    sitemap_file.write_text(SITEMAP_XML_WITH_DOCS_PREFIX)

    page_text = (
        "[ok](/docs/getting-started/basics)\n"
        "[bad](/docs/no-such-page)\n"
    )
    (docs_dir / "page.md").write_text(page_text)

    found = check(docs_dir, sitemap_file)
    assert len(found) == 1
    assert "no-such-page" in found[0]

0 commit comments

Comments
 (0)