Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 63 additions & 2 deletions docling/backend/mspowerpoint_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,67 @@ def _get_effective_list_marker(self, shape, paragraph) -> dict:
"level": lvl,
}

def _get_shape_position(self, shape, attr: str) -> Optional[int]:
"""Return a shape position attribute as an integer for ordering."""
try:
value = getattr(shape, attr)
except (AttributeError, ValueError, TypeError):
return None

if value is None:
return None

return int(value)

def _iter_shapes_by_position(self, shapes):
"""Iterate shapes in visual top-to-bottom, left-to-right order.

PowerPoint stores shapes in creation/z-order, which can differ from the way
content is visually read on a slide. Sorting by position keeps split text
boxes, such as a subheading followed by its bullet textbox, adjacent in the
extracted document.
"""
row_tolerance = 45720 # 0.05 inch in EMUs
fallback_position = 2**63 - 1
shape_infos = []

for index, shape in enumerate(shapes):
top = self._get_shape_position(shape, "top")
left = self._get_shape_position(shape, "left")
shape_infos.append(
(
index,
shape,
top if top is not None else fallback_position,
left if left is not None else fallback_position,
)
)

shape_infos.sort(key=lambda shape_info: (shape_info[2], shape_info[0]))

rows = []
current_row = []
current_row_top = None

for shape_info in shape_infos:
top = shape_info[2]
if current_row_top is None or top - current_row_top <= row_tolerance:
current_row.append(shape_info)
current_row_top = top if current_row_top is None else current_row_top
else:
rows.append(current_row)
current_row = [shape_info]
current_row_top = top

if current_row:
rows.append(current_row)

for row in rows:
for _, shape, _, _ in sorted(
row, key=lambda shape_info: (shape_info[3], shape_info[0])
):
yield shape

def _handle_text_elements(
self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
):
Expand Down Expand Up @@ -743,13 +804,13 @@ def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):

def handle_groups(shape, parent_slide, slide_ind, doc, slide_size):
if _safe_shape_type(shape) == MSO_SHAPE_TYPE.GROUP:
for groupedshape in shape.shapes:
for groupedshape in self._iter_shapes_by_position(shape.shapes):
handle_shapes(
groupedshape, parent_slide, slide_ind, doc, slide_size
)

# Loop through each shape in the slide
for shape in slide.shapes:
for shape in self._iter_shapes_by_position(slide.shapes):
handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)

# Handle notes slide
Expand Down
179 changes: 177 additions & 2 deletions tests/test_backend_pptx.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from pathlib import Path

import pytest
from pptx import Presentation
from pptx.oxml.xmlchemy import OxmlElement
from pptx.util import Inches, Pt

from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.document_converter import DocumentConverter
from docling.datamodel.document import ConversionResult, DoclingDocument, InputDocument

from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
Expand All @@ -22,11 +25,24 @@ def get_pptx_paths():


def get_converter():
from docling.document_converter import DocumentConverter

converter = DocumentConverter(allowed_formats=[InputFormat.PPTX])

return converter


def convert_with_pptx_backend(pptx_path: Path) -> DoclingDocument:
in_doc = InputDocument(
path_or_stream=pptx_path,
format=InputFormat.PPTX,
backend=MsPowerpointDocumentBackend,
)

assert in_doc.valid
return in_doc._backend.convert()


def test_e2e_pptx_conversions():
pptx_paths = get_pptx_paths()
converter = get_converter()
Expand Down Expand Up @@ -122,3 +138,162 @@ def test_pptx_page_range():
assert "Second slide title" in pred_md
assert "Test Table Slide" not in pred_md
assert "List item4" not in pred_md


def test_pptx_issue_2663_keeps_bullets_with_subheadings():
pptx_path = Path("./tests/data/pptx/powerpoint_issue_2663.pptx")

doc = convert_with_pptx_backend(pptx_path)
pred_md = doc.export_to_markdown()

key_benefits = pred_md.index("Key Benefits:")
benefit_bullet = pred_md.index("Open-source software is cost-effective")
last_benefit_bullet = pred_md.index("Offers flexibility")
considerations = pred_md.index("Considerations When Using Open-Source Software:")
first_consideration_bullet = pred_md.index("Open-source projects often rely")
last_consideration_bullet = pred_md.index("advanced technical expertise")

assert key_benefits < benefit_bullet < last_benefit_bullet < considerations
assert considerations < first_consideration_bullet < last_consideration_bullet


def test_pptx_shapes_are_sorted_by_visual_position():
class FakeShape:
def __init__(self, name, top=None, left=None):
self.name = name
self.top = top
self.left = left

class BadPositionShape:
@property
def top(self):
raise ValueError("bad position")

backend = object.__new__(MsPowerpointDocumentBackend)

same_row_right = FakeShape("same-row-right", top=100, left=300)
lower_left = FakeShape("lower-left", top=200000, left=100)
same_row_left = FakeShape("same-row-left", top=1000, left=100)
unpositioned = FakeShape("unpositioned")

ordered_shapes = backend._iter_shapes_by_position(
[lower_left, same_row_right, unpositioned, same_row_left]
)

assert [shape.name for shape in ordered_shapes] == [
"same-row-left",
"same-row-right",
"lower-left",
"unpositioned",
]
assert backend._get_shape_position(BadPositionShape(), "top") is None


def test_pptx_split_list_textboxes_follow_visual_order(tmp_path):
"""Visually ordered subheadings should keep their own following bullets."""

def add_textbox(slide, left, top, width, height, text, font_size=24):
textbox = slide.shapes.add_textbox(
Inches(left), Inches(top), Inches(width), Inches(height)
)
text_frame = textbox.text_frame
text_frame.clear()
paragraph = text_frame.paragraphs[0]
paragraph.text = text
paragraph.font.size = Pt(font_size)
return textbox

def mark_as_bullet(paragraph):
paragraph_properties = paragraph._p.get_or_add_pPr()
bullet = OxmlElement("a:buChar")
bullet.set("char", "\u2022")
paragraph_properties.insert(0, bullet)

def add_bullet_textbox(slide, left, top, width, height, items):
textbox = slide.shapes.add_textbox(
Inches(left), Inches(top), Inches(width), Inches(height)
)
text_frame = textbox.text_frame
text_frame.clear()

for index, item in enumerate(items):
paragraph = (
text_frame.paragraphs[0] if index == 0 else text_frame.add_paragraph()
)
paragraph.text = item
paragraph.font.size = Pt(18)
mark_as_bullet(paragraph)

return textbox

presentation = Presentation()
slide = presentation.slides.add_slide(presentation.slide_layouts[6])

add_textbox(slide, 3.0, 0.4, 4.0, 0.5, "Open-Source Software", 32)
add_textbox(slide, 4.6, 1.4, 2.5, 0.4, "Introduction", 20)
add_textbox(slide, 0.9, 1.5, 3.0, 0.4, "Key Benefits:", 22)
add_bullet_textbox(
slide,
1.2,
2.1,
8.0,
1.6,
[
"Cost effective",
"Transparent community",
],
)
# Add this textbox before its subheading to mimic PPTX creation/z-order that
# does not match the visual reading order.
add_bullet_textbox(
slide,
1.2,
5.2,
8.0,
1.2,
[
"Community support can vary",
"Maintenance requires expertise",
],
)
add_textbox(slide, 0.9, 4.6, 6.0, 0.4, "Considerations:", 22)

pptx_path = tmp_path / "split_list_textboxes.pptx"
presentation.save(pptx_path)

doc = convert_with_pptx_backend(pptx_path)
pred_md = doc.export_to_markdown()

assert pred_md.index("Key Benefits:") < pred_md.index("Cost effective")
assert pred_md.index("Transparent community") < pred_md.index("Considerations:")
assert pred_md.index("Considerations:") < pred_md.index("Community support")
assert pred_md.index("Community support") < pred_md.index(
"Maintenance requires expertise"
)


def test_pptx_grouped_shapes_follow_visual_order(tmp_path):
presentation = Presentation()
slide = presentation.slides.add_slide(presentation.slide_layouts[6])

lower_textbox = slide.shapes.add_textbox(
Inches(1.0), Inches(2.0), Inches(4.0), Inches(0.5)
)
lower_textbox.text = "Lower grouped textbox"

upper_textbox = slide.shapes.add_textbox(
Inches(1.0), Inches(1.0), Inches(4.0), Inches(0.5)
)
upper_textbox.text = "Upper grouped textbox"

slide.shapes.add_group_shape([lower_textbox, upper_textbox])

pptx_path = tmp_path / "grouped_textboxes.pptx"
presentation.save(pptx_path)

doc = convert_with_pptx_backend(pptx_path)
pred_md = doc.export_to_markdown()

assert pred_md.index("Upper grouped textbox") < pred_md.index(
"Lower grouped textbox"
)
Loading