Skip to content

Commit 68e96d8

Browse files
fix: prevent autolink duplication (#2151)
* Prevent autolink duplication
1 parent 97a086a commit 68e96d8

1 file changed

Lines changed: 21 additions & 17 deletions

File tree

lychee-lib/src/extract/markdown.rs

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,15 @@ fn md_extensions() -> Options {
2525

2626
/// Extract unparsed URL strings from a Markdown string.
2727
// TODO: Refactor the extractor to reduce the complexity and number of lines.
28-
#[allow(clippy::too_many_lines)]
28+
#[expect(clippy::too_many_lines)]
2929
pub(crate) fn extract_markdown(
3030
input: &str,
3131
include_verbatim: bool,
3232
include_wikilinks: bool,
3333
) -> Vec<RawUri> {
34-
// In some cases it is undesirable to extract links from within code blocks,
35-
// which is why we keep track of entries and exits while traversing the input.
3634
let mut inside_code_block = false;
37-
let mut inside_link_block = false;
38-
let mut inside_wikilink_block = false;
35+
let mut inside_link_label = false; // true while traversing the label `X` of a link such as `[X]()`
36+
let mut inside_extracted_link = false; // true inside `<X>` or `[[X]]`, whose link is extracted at the start tag; skips the inner `Text(X)` to prevent double extraction
3937

4038
// HTML blocks come in chunks from pulldown_cmark, so we need to accumulate them
4139
let mut inside_html_block = false;
@@ -53,14 +51,11 @@ pub(crate) fn extract_markdown(
5351
dest_url,
5452
..
5553
}) => {
56-
// Note: Explicitly listing all link types below to make it easier to
57-
// change the behavior for a specific link type in the future.
58-
#[allow(clippy::match_same_arms)]
5954
match link_type {
6055
// Inline link like `[foo](bar)`
6156
// This is the most common link type
6257
LinkType::Inline => {
63-
inside_link_block = true;
58+
inside_link_label = true;
6459
Some(raw_uri(&dest_url, span_provider.span(span.start)))
6560
}
6661
// Reference without destination in the document, but resolved by the `broken_link_callback`
@@ -75,7 +70,7 @@ pub(crate) fn extract_markdown(
7570
LinkType::Shortcut |
7671
// Shortcut without destination in the document, but resolved by the `broken_link_callback`
7772
LinkType::ShortcutUnknown => {
78-
inside_link_block = true;
73+
inside_link_label = true;
7974
// For reference links, create RawUri directly to handle relative file paths
8075
// that linkify doesn't recognize as URLs
8176
Some(raw_uri(&dest_url, span_provider.span(span.start)))
@@ -84,6 +79,7 @@ pub(crate) fn extract_markdown(
8479
LinkType::Autolink |
8580
// Email address in autolink like `<john@example.org>`
8681
LinkType::Email => {
82+
inside_extracted_link = true;
8783
let span_provider = get_email_span_provider(&span_provider, &span, link_type);
8884
Some(extract_raw_uri_from_plaintext(&dest_url, &span_provider))
8985
}
@@ -93,7 +89,7 @@ pub(crate) fn extract_markdown(
9389
if !include_wikilinks {
9490
return None;
9591
}
96-
inside_wikilink_block = true;
92+
inside_extracted_link = true;
9793
// Ignore gitlab toc notation: https://docs.gitlab.com/user/markdown/#table-of-contents
9894
if ["_TOC_".to_string(), "TOC".to_string()].contains(&dest_url.to_string()) {
9995
return None;
@@ -129,8 +125,8 @@ pub(crate) fn extract_markdown(
129125

130126
// A text node.
131127
Event::Text(txt) => {
132-
if inside_wikilink_block
133-
|| (inside_link_block && !include_verbatim)
128+
if inside_extracted_link
129+
|| (inside_link_label && !include_verbatim)
134130
|| (inside_code_block && !include_verbatim) {
135131
None
136132
} else {
@@ -205,13 +201,12 @@ pub(crate) fn extract_markdown(
205201
}
206202

207203
Event::End(TagEnd::Link) => {
208-
inside_link_block = false;
209-
inside_wikilink_block = false;
204+
inside_link_label = false;
205+
inside_extracted_link = false;
210206
None
211207
}
212208

213-
// Skip footnote references and definitions explicitly - they're not links to check
214-
#[allow(clippy::match_same_arms)]
209+
#[expect(clippy::match_same_arms, reason = "Skip footnote references and definitions explicitly - they're not links to check")]
215210
Event::FootnoteReference(_) | Event::Start(Tag::FootnoteDefinition(_)) | Event::End(TagEnd::FootnoteDefinition) => None,
216211

217212
// Silently skip over other events
@@ -575,6 +570,15 @@ $$
575570
assert!(uris.is_empty());
576571
}
577572

573+
/// Don't extract the text of an autolink as a separate link: the text is the link itself, which has already been extracted.
574+
/// Prevents a regression of <https://github.com/lycheeverse/lychee/issues/2150>
575+
#[test]
576+
fn test_autolink() {
577+
let markdown = "<http://example>";
578+
assert_eq!(extract_markdown(markdown, false, false).len(), 1);
579+
assert_eq!(extract_markdown(markdown, true, false).len(), 1);
580+
}
581+
578582
#[test]
579583
fn test_link_text_not_checked() {
580584
// Test that link text is not extracted as a separate link by default

0 commit comments

Comments
 (0)