@@ -61,6 +61,7 @@ def reset(self):
61
61
self .mdstack : list [str ] = [] # When markdown=1, stack contains a list of tags
62
62
self .treebuilder = etree .TreeBuilder ()
63
63
self .mdstate : list [Literal ['block' , 'span' , 'off' , None ]] = []
64
+ self .mdstarted : list [bool ] = []
64
65
super ().reset ()
65
66
66
67
def close (self ):
@@ -111,7 +112,10 @@ def handle_starttag(self, tag, attrs):
111
112
self .handle_empty_tag (data , True )
112
113
return
113
114
114
- if tag in self .block_level_tags and (self .at_line_start () or self .intail ):
115
+ if (
116
+ tag in self .block_level_tags and
117
+ (self .at_line_start () or self .intail or self .mdstarted and self .mdstarted [- 1 ])
118
+ ):
115
119
# Valueless attribute (ex: `<tag checked>`) results in `[('checked', None)]`.
116
120
# Convert to `{'checked': 'checked'}`.
117
121
attrs = {key : value if value is not None else key for key , value in attrs }
@@ -126,8 +130,10 @@ def handle_starttag(self, tag, attrs):
126
130
self .handle_endtag ('p' )
127
131
self .mdstate .append (state )
128
132
self .mdstack .append (tag )
133
+ self .mdstarted .append (True )
129
134
attrs ['markdown' ] = state
130
135
self .treebuilder .start (tag , attrs )
136
+
131
137
else :
132
138
# Span level tag
133
139
if self .inraw :
@@ -151,6 +157,7 @@ def handle_endtag(self, tag):
151
157
while self .mdstack :
152
158
item = self .mdstack .pop ()
153
159
self .mdstate .pop ()
160
+ self .mdstarted .pop ()
154
161
self .treebuilder .end (item )
155
162
if item == tag :
156
163
break
@@ -163,6 +170,45 @@ def handle_endtag(self, tag):
163
170
# If we only have one newline before block element, add another
164
171
if not item .endswith ('\n \n ' ) and item .endswith ('\n ' ):
165
172
self .cleandoc .append ('\n ' )
173
+
174
+ # Flatten the HTML structure of "markdown" blocks such that when they
175
+ # get parsed, content will be parsed similarly inside the blocks as it
176
+ # does outside the block. Having real HTML elements in the tree before
177
+ # the adjacent content is processed can cause unpredictable
178
+ # issues for extensions.
179
+ current = element
180
+ last = []
181
+ while current is not None :
182
+ for child in list (current ):
183
+ current .remove (child )
184
+ text = current .text if current .text is not None else ''
185
+ tail = child .tail if child .tail is not None else ''
186
+ child .tail = None
187
+ state = child .attrib .get ('markdown' , 'off' )
188
+
189
+ # Add a newline to tail if it is not just a trailing newline
190
+ if tail != '\n ' :
191
+ tail = '\n ' + tail .rstrip ('\n ' )
192
+
193
+ # Ensure there is an empty new line between blocks
194
+ if not text .endswith ('\n \n ' ):
195
+ text = text .rstrip ('\n ' ) + '\n \n '
196
+
197
+ # Process the block nested under the span appropriately
198
+ if state in ('span' , 'block' ):
199
+ current .text = f'{ text } { self .md .htmlStash .store (child )} { tail } '
200
+ last .append (child )
201
+ else :
202
+ # Non-Markdown HTML will not be recursively parsed for Markdown,
203
+ # so we can just remove markers and leave them unflattened.
204
+ # Additionally, we don't need to append to our list for further
205
+ # processing.
206
+ child .attrib .pop ('markdown' )
207
+ [c .attrib .pop ('markdown' , None ) for c in child .iter ()]
208
+ current .text = f'{ text } { self .md .htmlStash .store (child )} { tail } '
209
+ # Target the child elements that have been expanded.
210
+ current = last .pop (0 ) if last else None
211
+
166
212
self .cleandoc .append (self .md .htmlStash .store (element ))
167
213
self .cleandoc .append ('\n \n ' )
168
214
self .state = []
@@ -208,6 +254,7 @@ def handle_data(self, data):
208
254
if self .inraw or not self .mdstack :
209
255
super ().handle_data (data )
210
256
else :
257
+ self .mdstarted [- 1 ] = False
211
258
self .treebuilder .data (data )
212
259
213
260
def handle_empty_tag (self , data , is_block ):
@@ -216,8 +263,10 @@ def handle_empty_tag(self, data, is_block):
216
263
else :
217
264
if self .at_line_start () and is_block :
218
265
self .handle_data ('\n ' + self .md .htmlStash .store (data ) + '\n \n ' )
219
- else :
266
+ elif self . mdstate and self . mdstate [ - 1 ] == "off" :
220
267
self .handle_data (self .md .htmlStash .store (data ))
268
+ else :
269
+ self .handle_data (data )
221
270
222
271
def parse_pi (self , i : int ) -> int :
223
272
if self .at_line_start () or self .intail or self .mdstack :
@@ -270,53 +319,56 @@ def parse_element_content(self, element: etree.Element) -> None:
270
319
md_attr = element .attrib .pop ('markdown' , 'off' )
271
320
272
321
if md_attr == 'block' :
273
- # Parse content as block level
274
- # The order in which the different parts are parsed (text, children, tails) is important here as the
275
- # order of elements needs to be preserved. We can't be inserting items at a later point in the current
276
- # iteration as we don't want to do raw processing on elements created from parsing Markdown text (for
277
- # example). Therefore, the order of operations is children, tails, text.
278
-
279
- # Recursively parse existing children from raw HTML
280
- for child in list (element ):
281
- self .parse_element_content (child )
282
-
283
- # Parse Markdown text in tail of children. Do this separate to avoid raw HTML parsing.
284
- # Save the position of each item to be inserted later in reverse.
285
- tails = []
286
- for pos , child in enumerate (element ):
287
- if child .tail :
288
- block = child .tail .rstrip ('\n ' )
289
- child .tail = ''
290
- # Use a dummy placeholder element.
291
- dummy = etree .Element ('div' )
292
- self .parser .parseBlocks (dummy , block .split ('\n \n ' ))
293
- children = list (dummy )
294
- children .reverse ()
295
- tails .append ((pos + 1 , children ))
296
-
297
- # Insert the elements created from the tails in reverse.
298
- tails .reverse ()
299
- for pos , tail in tails :
300
- for item in tail :
301
- element .insert (pos , item )
302
-
303
- # Parse Markdown text content. Do this last to avoid raw HTML parsing.
322
+ # Parse the block element's content as Markdown
304
323
if element .text :
305
324
block = element .text .rstrip ('\n ' )
306
325
element .text = ''
307
- # Use a dummy placeholder element as the content needs to get inserted before existing children.
308
- dummy = etree .Element ('div' )
309
- self .parser .parseBlocks (dummy , block .split ('\n \n ' ))
310
- children = list (dummy )
311
- children .reverse ()
312
- for child in children :
313
- element .insert (0 , child )
326
+ self .parser .parseBlocks (element , block .split ('\n \n ' ))
314
327
315
328
elif md_attr == 'span' :
316
- # Span level parsing will be handled by inline processors.
317
- # Walk children here to remove any `markdown` attributes.
318
- for child in list (element ):
319
- self .parse_element_content (child )
329
+ # Span elements need to be recursively processed for block elements and raw HTML
330
+ # as their content is not normally accessed by block processors, so expand stashed
331
+ # HTML under the span. Span content itself will not be parsed here, but will await
332
+ # the inline parser.
333
+ block = element .text if element .text is not None else ''
334
+ element .text = ''
335
+ child = None
336
+ start = 0
337
+
338
+ # Search the content for HTML placeholders and process the elements
339
+ for m in util .HTML_PLACEHOLDER_RE .finditer (block ):
340
+ index = int (m .group (1 ))
341
+ el = self .parser .md .htmlStash .rawHtmlBlocks [index ]
342
+ end = m .start ()
343
+
344
+ if isinstance (el , etree .Element ):
345
+ # Replace the placeholder with the element and process it.
346
+ # Content after the placeholder should be attached to the tail.
347
+ if child is None :
348
+ element .text += block [start :end ]
349
+ else :
350
+ child .tail += block [start :end ]
351
+ element .append (el )
352
+ self .parse_element_content (el )
353
+ child = el
354
+ if child .tail is None :
355
+ child .tail = ''
356
+ self .parser .md .htmlStash .rawHtmlBlocks .pop (index )
357
+ self .parser .md .htmlStash .rawHtmlBlocks .insert (index , '' )
358
+
359
+ else :
360
+ # Not an element object, so insert content back into the element
361
+ if child is None :
362
+ element .text += block [start :end ]
363
+ else :
364
+ child .tail += block [start :end ]
365
+ start = end
366
+
367
+ # Insert anything left after last element
368
+ if child is None :
369
+ element .text += block [start :]
370
+ else :
371
+ child .tail += block [start :]
320
372
321
373
else :
322
374
# Disable inline parsing for everything else
@@ -336,8 +388,8 @@ def run(self, parent: etree.Element, blocks: list[str]) -> bool:
336
388
if isinstance (element , etree .Element ):
337
389
# We have a matched element. Process it.
338
390
blocks .pop (0 )
339
- self .parse_element_content (element )
340
391
parent .append (element )
392
+ self .parse_element_content (element )
341
393
# Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
342
394
self .parser .md .htmlStash .rawHtmlBlocks .pop (index )
343
395
self .parser .md .htmlStash .rawHtmlBlocks .insert (index , '' )
0 commit comments