Skip to content

Commit 49535f8

Browse files
committed
Version 0.2.6
1 parent 8030443 commit 49535f8

File tree

7 files changed

+27
-11
lines changed

7 files changed

+27
-11
lines changed

CHANGES.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
# Change Log
22

3+
## Changes in version 0.2.6
4+
5+
### Fixes:
6+
7+
* [Forum](https://forum.mupdf.com/t/bug-pymupdf4llm-list-index-out-of-range-in-document-layout-py-2/216) - List index out of range when a detected table region contains no text lines.
8+
9+
### Other Changes:
10+
11+
12+
------
13+
314
## Changes in version 0.2.5
415

516
### Fixes:

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
77
readme = f.read()
88

9-
version = "0.2.4" # must always equal the pymupdf4llm version
9+
version = "0.2.6" # must always equal the pymupdf4llm version
1010

1111
classifiers = [
1212
"Development Status :: 5 - Production/Stable",

pymupdf4llm/pymupdf4llm/helpers/document_layout.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,7 @@ def fallback_text_to_md(textlines, ignore_code: bool = False, clip=None):
548548
for tl in textlines:
549549
ltext = "|" + "|".join([s["text"].strip() for s in tl["spans"]]) + "|\n"
550550
output += ltext
551-
output += "**----- End of picture text -----**<br>\n"
551+
output += "\n**----- End of picture text -----**<br>\n"
552552
return output + "\n\n"
553553

554554

@@ -631,7 +631,7 @@ def to_markdown(
631631
continue
632632

633633
# pictures and formulas: either write image file or embed
634-
if btype in ("picture", "formula", "fallback"):
634+
if btype in ("picture", "formula", "table-fallback"):
635635
if isinstance(box.image, str):
636636
output += GRAPHICS_TEXT % box.image + "\n\n"
637637
elif isinstance(box.image, bytes):
@@ -650,7 +650,7 @@ def to_markdown(
650650
ignore_code=ignore_code or page.full_ocred,
651651
clip=clip,
652652
)
653-
elif btype == "fallback":
653+
elif btype == "table-fallback":
654654
output += fallback_text_to_md(
655655
box.textlines,
656656
ignore_code=ignore_code or page.full_ocred,
@@ -741,7 +741,7 @@ def to_text(
741741
continue
742742
if btype == "page-footer" and footer is False:
743743
continue
744-
if btype in ("picture", "formula", "fallback"):
744+
if btype in ("picture", "formula", "table-fallback"):
745745
output += f"==> picture [{clip.width} x {clip.height}] <==\n\n"
746746
if box.textlines:
747747
if btype == "picture":
@@ -750,7 +750,7 @@ def to_text(
750750
ignore_code=ignore_code or page.full_ocred,
751751
clip=clip,
752752
)
753-
elif btype == "fallback":
753+
elif btype == "table-fallback":
754754
output += fallback_text_to_text(
755755
box.textlines,
756756
ignore_code=ignore_code or page.full_ocred,
@@ -1018,7 +1018,7 @@ def parse_document(
10181018

10191019
except Exception as e:
10201020
# print(f"table detection error '{e}' on page {page.number+1}")
1021-
layoutbox.boxclass = "fallback"
1021+
layoutbox.boxclass = "table-fallback"
10221022
# table structure not detected: treat like an image
10231023
if document.embed_images or document.write_images:
10241024
pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def sanitize_spans(line):
101101
):
102102
continue # no joining
103103
# We need to join bbox and text of two consecutive spans
104-
# On occasion, spans may also be duplicated.
104+
# Sometimes, spans may also be duplicated.
105105
if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]:
106106
s0["text"] += s1["text"]
107107
s0["bbox"] |= s1["bbox"] # join boundary boxes
@@ -131,7 +131,8 @@ def sanitize_spans(line):
131131
continue
132132
for sno, s in enumerate(line["spans"]): # the numbered spans
133133
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
134-
if is_white(s["text"]): # ignore white text
134+
if is_white(s["text"]):
135+
# ignore white text if not a Type3 font
135136
continue
136137
# Ignore invisible text. Type 3 font text is never invisible.
137138
if (

pymupdf4llm/pymupdf4llm/helpers/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,10 @@ def clean_tables(page, blocks):
258258
l for b in blocks if b["type"] == 0 for l in b["lines"] if l["bbox"] in bbox
259259
]
260260
y_vals0 = sorted(set(round(l["bbox"][3]) for l in lines))
261+
if not y_vals0:
262+
# no text lines in the table bbox
263+
page.layout_information[i][4] = "table-fallback"
264+
continue
261265
y_vals = [y_vals0[0]]
262266
for y in y_vals0[1:]:
263267
if y - y_vals[-1] > 3:
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Generated file - do not edit.
22
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
3-
VERSION = '0.2.5'
3+
VERSION = '0.2.6'

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
"Topic :: Utilities",
1212
]
1313

14-
version = "0.2.5"
14+
version = "0.2.6"
1515
pymupdf_version = "1.26.6"
1616
pymupdf_version_tuple = tuple(int(x) for x in pymupdf_version.split("."))
1717
requires = [f"pymupdf>={pymupdf_version}", "tabulate"]

0 commit comments

Comments
 (0)