Skip to content

Commit f2ae4e4

Browse files
authored
Adjust md_in_html "markdown" blocks to process content consistently (#1503)
Ensure `md_in_html` processes content inside a "markdown" block the same way content is processed outside of a "markdown" block. - Flatten the HTML content into placeholders so that the parser will treat the "markdown" block content in the same way it does when `md_in_html` is not enabled. The placeholders are expanded once the parser reaches them in a linear fashion. This allows extensions to deal with HTML content and consume it the same way they do when the content is not nested under a "markdown" block. - Instead of content being processed in dummy tags, content is now processed under the real parent, allowing extensions to have better context to make better decisions. Additionally, fix some issues with tags and inline code. Also, fix some issues with one-liner block tags, e.g. `<tag><tag>...` Resolves #1502 Resolves #1075 Resolves #1074
1 parent 4260e7b commit f2ae4e4

File tree

6 files changed

+422
-48
lines changed

6 files changed

+422
-48
lines changed

.spell-dict

+1-1
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ plugins
178178
configs
179179
pre
180180
formatters
181-
181+
unflattened
182182
dedented
183183
Setext
184184
unindented

docs/changelog.md

+5
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1717
### Fixed
1818

1919
* Backslash Unescape IDs set via `attr_list` on `toc` (#1493).
20+
* `md_in_html` will process content inside "markdown" blocks in a similar way
21+
to how it is parsed outside of "markdown" blocks, giving a more consistent
22+
expectation to external extensions (#1503).
23+
* `md_in_html` handles tags within inline code blocks better (#1075).
24+
* `md_in_html` fixes handling of one-liner block HTML (#1074).
2025

2126
## [3.7] -- 2024-08-16
2227

markdown/extensions/md_in_html.py

+97-45
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def reset(self):
6161
self.mdstack: list[str] = [] # When markdown=1, stack contains a list of tags
6262
self.treebuilder = etree.TreeBuilder()
6363
self.mdstate: list[Literal['block', 'span', 'off', None]] = []
64+
self.mdstarted: list[bool] = []
6465
super().reset()
6566

6667
def close(self):
@@ -111,7 +112,10 @@ def handle_starttag(self, tag, attrs):
111112
self.handle_empty_tag(data, True)
112113
return
113114

114-
if tag in self.block_level_tags and (self.at_line_start() or self.intail):
115+
if (
116+
tag in self.block_level_tags and
117+
(self.at_line_start() or self.intail or self.mdstarted and self.mdstarted[-1])
118+
):
115119
# Valueless attribute (ex: `<tag checked>`) results in `[('checked', None)]`.
116120
# Convert to `{'checked': 'checked'}`.
117121
attrs = {key: value if value is not None else key for key, value in attrs}
@@ -126,8 +130,10 @@ def handle_starttag(self, tag, attrs):
126130
self.handle_endtag('p')
127131
self.mdstate.append(state)
128132
self.mdstack.append(tag)
133+
self.mdstarted.append(True)
129134
attrs['markdown'] = state
130135
self.treebuilder.start(tag, attrs)
136+
131137
else:
132138
# Span level tag
133139
if self.inraw:
@@ -151,6 +157,7 @@ def handle_endtag(self, tag):
151157
while self.mdstack:
152158
item = self.mdstack.pop()
153159
self.mdstate.pop()
160+
self.mdstarted.pop()
154161
self.treebuilder.end(item)
155162
if item == tag:
156163
break
@@ -163,6 +170,45 @@ def handle_endtag(self, tag):
163170
# If we only have one newline before block element, add another
164171
if not item.endswith('\n\n') and item.endswith('\n'):
165172
self.cleandoc.append('\n')
173+
174+
# Flatten the HTML structure of "markdown" blocks such that when they
175+
# get parsed, content will be parsed similarly inside the blocks as it
176+
# does outside the block. Having real HTML elements in the tree before
177+
# the adjacent content is processed can cause unpredictable
178+
# issues for extensions.
179+
current = element
180+
last = []
181+
while current is not None:
182+
for child in list(current):
183+
current.remove(child)
184+
text = current.text if current.text is not None else ''
185+
tail = child.tail if child.tail is not None else ''
186+
child.tail = None
187+
state = child.attrib.get('markdown', 'off')
188+
189+
# Add a newline to tail if it is not just a trailing newline
190+
if tail != '\n':
191+
tail = '\n' + tail.rstrip('\n')
192+
193+
# Ensure there is an empty new line between blocks
194+
if not text.endswith('\n\n'):
195+
text = text.rstrip('\n') + '\n\n'
196+
197+
# Process the block nested under the span appropriately
198+
if state in ('span', 'block'):
199+
current.text = f'{text}{self.md.htmlStash.store(child)}{tail}'
200+
last.append(child)
201+
else:
202+
# Non-Markdown HTML will not be recursively parsed for Markdown,
203+
# so we can just remove markers and leave them unflattened.
204+
# Additionally, we don't need to append to our list for further
205+
# processing.
206+
child.attrib.pop('markdown')
207+
[c.attrib.pop('markdown', None) for c in child.iter()]
208+
current.text = f'{text}{self.md.htmlStash.store(child)}{tail}'
209+
# Target the child elements that have been expanded.
210+
current = last.pop(0) if last else None
211+
166212
self.cleandoc.append(self.md.htmlStash.store(element))
167213
self.cleandoc.append('\n\n')
168214
self.state = []
@@ -208,6 +254,7 @@ def handle_data(self, data):
208254
if self.inraw or not self.mdstack:
209255
super().handle_data(data)
210256
else:
257+
self.mdstarted[-1] = False
211258
self.treebuilder.data(data)
212259

213260
def handle_empty_tag(self, data, is_block):
@@ -216,8 +263,10 @@ def handle_empty_tag(self, data, is_block):
216263
else:
217264
if self.at_line_start() and is_block:
218265
self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n')
219-
else:
266+
elif self.mdstate and self.mdstate[-1] == "off":
220267
self.handle_data(self.md.htmlStash.store(data))
268+
else:
269+
self.handle_data(data)
221270

222271
def parse_pi(self, i: int) -> int:
223272
if self.at_line_start() or self.intail or self.mdstack:
@@ -270,53 +319,56 @@ def parse_element_content(self, element: etree.Element) -> None:
270319
md_attr = element.attrib.pop('markdown', 'off')
271320

272321
if md_attr == 'block':
273-
# Parse content as block level
274-
# The order in which the different parts are parsed (text, children, tails) is important here as the
275-
# order of elements needs to be preserved. We can't be inserting items at a later point in the current
276-
# iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
277-
# example). Therefore, the order of operations is children, tails, text.
278-
279-
# Recursively parse existing children from raw HTML
280-
for child in list(element):
281-
self.parse_element_content(child)
282-
283-
# Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing.
284-
# Save the position of each item to be inserted later in reverse.
285-
tails = []
286-
for pos, child in enumerate(element):
287-
if child.tail:
288-
block = child.tail.rstrip('\n')
289-
child.tail = ''
290-
# Use a dummy placeholder element.
291-
dummy = etree.Element('div')
292-
self.parser.parseBlocks(dummy, block.split('\n\n'))
293-
children = list(dummy)
294-
children.reverse()
295-
tails.append((pos + 1, children))
296-
297-
# Insert the elements created from the tails in reverse.
298-
tails.reverse()
299-
for pos, tail in tails:
300-
for item in tail:
301-
element.insert(pos, item)
302-
303-
# Parse Markdown text content. Do this last to avoid raw HTML parsing.
322+
# Parse the block elements content as Markdown
304323
if element.text:
305324
block = element.text.rstrip('\n')
306325
element.text = ''
307-
# Use a dummy placeholder element as the content needs to get inserted before existing children.
308-
dummy = etree.Element('div')
309-
self.parser.parseBlocks(dummy, block.split('\n\n'))
310-
children = list(dummy)
311-
children.reverse()
312-
for child in children:
313-
element.insert(0, child)
326+
self.parser.parseBlocks(element, block.split('\n\n'))
314327

315328
elif md_attr == 'span':
316-
# Span level parsing will be handled by inline processors.
317-
# Walk children here to remove any `markdown` attributes.
318-
for child in list(element):
319-
self.parse_element_content(child)
329+
# Span elements need to be recursively processed for block elements and raw HTML
330+
# as their content is not normally accessed by block processors, so expand stashed
331+
# HTML under the span. Span content itself will not be parsed here, but will await
332+
# the inline parser.
333+
block = element.text if element.text is not None else ''
334+
element.text = ''
335+
child = None
336+
start = 0
337+
338+
# Search the content for HTML placeholders and process the elements
339+
for m in util.HTML_PLACEHOLDER_RE.finditer(block):
340+
index = int(m.group(1))
341+
el = self.parser.md.htmlStash.rawHtmlBlocks[index]
342+
end = m.start()
343+
344+
if isinstance(el, etree.Element):
345+
# Replace the placeholder with the element and process it.
346+
# Content after the placeholder should be attached to the tail.
347+
if child is None:
348+
element.text += block[start:end]
349+
else:
350+
child.tail += block[start:end]
351+
element.append(el)
352+
self.parse_element_content(el)
353+
child = el
354+
if child.tail is None:
355+
child.tail = ''
356+
self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
357+
self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
358+
359+
else:
360+
# Not an element object, so insert content back into the element
361+
if child is None:
362+
element.text += block[start:end]
363+
else:
364+
child.tail += block[start:end]
365+
start = end
366+
367+
# Insert anything left after last element
368+
if child is None:
369+
element.text += block[start:]
370+
else:
371+
child.tail += block[start:]
320372

321373
else:
322374
# Disable inline parsing for everything else
@@ -336,8 +388,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
336388
if isinstance(element, etree.Element):
337389
# We have a matched element. Process it.
338390
blocks.pop(0)
339-
self.parse_element_content(element)
340391
parent.append(element)
392+
self.parse_element_content(element)
341393
# Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
342394
self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
343395
self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')

markdown/inlinepatterns.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,13 @@ def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlinePro
158158
AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
159159
""" Match an automatic email link (`<me@example.com>`). """
160160

161-
HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'
161+
HTML_RE = (
162+
r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|' # Tag
163+
r'!--(?:(?!<!--|-->).)*--|' # Comment
164+
r'[?](?:(?!<[?]|[?]>).)*[?]|' # Processing instruction
165+
r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]' # `CDATA`
166+
')>)'
167+
)
162168
""" Match an HTML tag (`<...>`). """
163169

164170
ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'

0 commit comments

Comments
 (0)