Adjust md_in_html "markdown" blocks to process content consistently (#1503)

facelessuser · web-flow · commit f2ae4e4a453b · 2025-01-28T13:03:58.000-07:00
Ensure `md_in_html` processes content inside a "markdown" block the same way content is processed outside of a "markdown" block. - Flatten the HTML content into placeholders so that the parser will treat the "markdown" block content in the same way it does when `md_in_html` is not enabled. The placeholders are expanded once the parser reaches them in a linear fashion. This allows extensions to deal with HTML content and consume it the same way it deals with them with them when the content is not nested under a "markdown" block. - Instead of content being processed in dummy tags, content is now processed under the real parent allowing extensions to have better context to make better decisions. Additionally, fix some issues with tags and inline code. Also, fix some issues with one-liner block tags, e.g. `<tag><tag>...` Resolves #1502 Resolves #1075 Resolves #1074
diff --git a/.spell-dict b/.spell-dict
@@ -178,7 +178,7 @@ plugins
 configs
 pre
 formatters
-
+unflattened
 dedented
 Setext
 unindented
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -17,6 +17,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 
 * Backslash Unescape IDs set via `attr_list` on `toc` (#1493).
+* `md_in_html` will process content inside "markdown" blocks a similar way
+  as they are parsed outside of "markdown" blocks giving a more consistent
+  expectation to external extensions (#1503).
+* `md_in_html` handle tags within inline code blocks better (#1075).
+* `md_in_html` fix handling of one-liner block HTML handling (#1074)
 
 ## [3.7] -- 2024-08-16
 
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
@@ -61,6 +61,7 @@ def reset(self):
         self.mdstack: list[str] = []  # When markdown=1, stack contains a list of tags
         self.treebuilder = etree.TreeBuilder()
         self.mdstate: list[Literal['block', 'span', 'off', None]] = []
+        self.mdstarted: list[bool] = []
         super().reset()
 
     def close(self):
@@ -111,7 +112,10 @@ def handle_starttag(self, tag, attrs):
             self.handle_empty_tag(data, True)
             return
 
-        if tag in self.block_level_tags and (self.at_line_start() or self.intail):
+        if (
+            tag in self.block_level_tags and
+            (self.at_line_start() or self.intail or self.mdstarted and self.mdstarted[-1])
+        ):
             # Valueless attribute (ex: `<tag checked>`) results in `[('checked', None)]`.
             # Convert to `{'checked': 'checked'}`.
             attrs = {key: value if value is not None else key for key, value in attrs}
@@ -126,8 +130,10 @@ def handle_starttag(self, tag, attrs):
                     self.handle_endtag('p')
                 self.mdstate.append(state)
                 self.mdstack.append(tag)
+                self.mdstarted.append(True)
                 attrs['markdown'] = state
                 self.treebuilder.start(tag, attrs)
+
         else:
             # Span level tag
             if self.inraw:
@@ -151,6 +157,7 @@ def handle_endtag(self, tag):
                 while self.mdstack:
                     item = self.mdstack.pop()
                     self.mdstate.pop()
+                    self.mdstarted.pop()
                     self.treebuilder.end(item)
                     if item == tag:
                         break
@@ -163,6 +170,45 @@ def handle_endtag(self, tag):
                     # If we only have one newline before block element, add another
                     if not item.endswith('\n\n') and item.endswith('\n'):
                         self.cleandoc.append('\n')
+
+                    # Flatten the HTML structure of "markdown" blocks such that when they
+                    # get parsed, content will be parsed similar inside the blocks as it
+                    # does outside the block. Having real HTML elements in the tree before
+                    # the content adjacent content is processed can cause unpredictable
+                    # issues for extensions.
+                    current = element
+                    last = []
+                    while current is not None:
+                        for child in list(current):
+                            current.remove(child)
+                            text = current.text if current.text is not None else ''
+                            tail = child.tail if child.tail is not None else ''
+                            child.tail = None
+                            state = child.attrib.get('markdown', 'off')
+
+                            # Add a newline to tail if it is not just a trailing newline
+                            if tail != '\n':
+                                tail = '\n' + tail.rstrip('\n')
+
+                            # Ensure there is an empty new line between blocks
+                            if not text.endswith('\n\n'):
+                                text = text.rstrip('\n') + '\n\n'
+
+                            # Process the block nested under the span appropriately
+                            if state in ('span', 'block'):
+                                current.text = f'{text}{self.md.htmlStash.store(child)}{tail}'
+                                last.append(child)
+                            else:
+                                # Non-Markdown HTML will not be recursively parsed for Markdown,
+                                # so we can just remove markers and leave them unflattened.
+                                # Additionally, we don't need to append to our list for further
+                                # processing.
+                                child.attrib.pop('markdown')
+                                [c.attrib.pop('markdown', None) for c in child.iter()]
+                                current.text = f'{text}{self.md.htmlStash.store(child)}{tail}'
+                        # Target the child elements that have been expanded.
+                        current = last.pop(0) if last else None
+
                     self.cleandoc.append(self.md.htmlStash.store(element))
                     self.cleandoc.append('\n\n')
                     self.state = []
@@ -208,6 +254,7 @@ def handle_data(self, data):
         if self.inraw or not self.mdstack:
             super().handle_data(data)
         else:
+            self.mdstarted[-1] = False
             self.treebuilder.data(data)
 
     def handle_empty_tag(self, data, is_block):
@@ -216,8 +263,10 @@ def handle_empty_tag(self, data, is_block):
         else:
             if self.at_line_start() and is_block:
                 self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n')
-            else:
+            elif self.mdstate and self.mdstate[-1] == "off":
                 self.handle_data(self.md.htmlStash.store(data))
+            else:
+                self.handle_data(data)
 
     def parse_pi(self, i: int) -> int:
         if self.at_line_start() or self.intail or self.mdstack:
@@ -270,53 +319,56 @@ def parse_element_content(self, element: etree.Element) -> None:
         md_attr = element.attrib.pop('markdown', 'off')
 
         if md_attr == 'block':
-            # Parse content as block level
-            # The order in which the different parts are parsed (text, children, tails) is important here as the
-            # order of elements needs to be preserved. We can't be inserting items at a later point in the current
-            # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
-            # example). Therefore, the order of operations is children, tails, text.
-
-            # Recursively parse existing children from raw HTML
-            for child in list(element):
-                self.parse_element_content(child)
-
-            # Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing.
-            # Save the position of each item to be inserted later in reverse.
-            tails = []
-            for pos, child in enumerate(element):
-                if child.tail:
-                    block = child.tail.rstrip('\n')
-                    child.tail = ''
-                    # Use a dummy placeholder element.
-                    dummy = etree.Element('div')
-                    self.parser.parseBlocks(dummy, block.split('\n\n'))
-                    children = list(dummy)
-                    children.reverse()
-                    tails.append((pos + 1, children))
-
-            # Insert the elements created from the tails in reverse.
-            tails.reverse()
-            for pos, tail in tails:
-                for item in tail:
-                    element.insert(pos, item)
-
-            # Parse Markdown text content. Do this last to avoid raw HTML parsing.
+            # Parse the block elements content as Markdown
             if element.text:
                 block = element.text.rstrip('\n')
                 element.text = ''
-                # Use a dummy placeholder element as the content needs to get inserted before existing children.
-                dummy = etree.Element('div')
-                self.parser.parseBlocks(dummy, block.split('\n\n'))
-                children = list(dummy)
-                children.reverse()
-                for child in children:
-                    element.insert(0, child)
+                self.parser.parseBlocks(element, block.split('\n\n'))
 
         elif md_attr == 'span':
-            # Span level parsing will be handled by inline processors.
-            # Walk children here to remove any `markdown` attributes.
-            for child in list(element):
-                self.parse_element_content(child)
+            # Span elements need to be recursively processed for block elements and raw HTML
+            # as their content is not normally accessed by block processors, so expand stashed
+            # HTML under the span. Span content itself will not be parsed here, but will await
+            # the inline parser.
+            block = element.text if element.text is not None else ''
+            element.text = ''
+            child = None
+            start = 0
+
+            # Search the content for HTML placeholders and process the elements
+            for m in util.HTML_PLACEHOLDER_RE.finditer(block):
+                index = int(m.group(1))
+                el = self.parser.md.htmlStash.rawHtmlBlocks[index]
+                end = m.start()
+
+                if isinstance(el, etree.Element):
+                    # Replace the placeholder with the element and process it.
+                    # Content after the placeholder should be attached to the tail.
+                    if child is None:
+                        element.text += block[start:end]
+                    else:
+                        child.tail += block[start:end]
+                    element.append(el)
+                    self.parse_element_content(el)
+                    child = el
+                    if child.tail is None:
+                        child.tail = ''
+                    self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
+                    self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
+
+                else:
+                    # Not an element object, so insert content back into the element
+                    if child is None:
+                        element.text += block[start:end]
+                    else:
+                        child.tail += block[start:end]
+                start = end
+
+            # Insert anything left after last element
+            if child is None:
+                element.text += block[start:]
+            else:
+                child.tail += block[start:]
 
         else:
             # Disable inline parsing for everything else
@@ -336,8 +388,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
             if isinstance(element, etree.Element):
                 # We have a matched element. Process it.
                 blocks.pop(0)
-                self.parse_element_content(element)
                 parent.append(element)
+                self.parse_element_content(element)
                 # Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
                 self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
                 self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py
@@ -158,7 +158,13 @@ def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlinePro
 AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
 """ Match an automatic email link (`<me@example.com>`). """
 
-HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'
+HTML_RE = (
+    r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|'          # Tag
+    r'!--(?:(?!<!--|-->).)*--|'                   # Comment
+    r'[?](?:(?!<[?]|[?]>).)*[?]|'                 # Processing instruction
+    r'!\[CDATA\[(?:(?!<!\[CDATA\[|\]\]>).)*\]\]'  # `CDATA`
+    ')>)'
+)
 """ Match an HTML tag (`<...>`). """
 
 ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py
diff --git a/tox.ini b/tox.ini