Skip to content

Commit c26b2c1

Browse files
committed
Updates for v0.0.22
1 parent 57cddeb commit c26b2c1

File tree

6 files changed

+118
-40
lines changed

6 files changed

+118
-40
lines changed

CHANGES.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,17 @@
11
# Change Log
22

3+
## Changes in version 0.0.22
4+
5+
### Fixes:
6+
7+
* [255](https://github.com/pymupdf/RAG/issues/255) - Single-row/column tables are skipped
8+
* [258](https://github.com/pymupdf/RAG/issues/258) - Pymupdf4llm to_markdown crashes on some documents
9+
10+
### Other Changes:
11+
12+
* Added class `TocHeaders` as an alternative way for identifying headers.
13+
14+
315
## Changes in version 0.0.21
416

517
### Fixes:

pdf4llm/setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@
1313
"Programming Language :: Python :: 3",
1414
"Topic :: Utilities",
1515
]
16-
requires = ["pymupdf4llm==0.0.21"]
16+
requires = ["pymupdf4llm==0.0.22"]
1717

1818
setuptools.setup(
1919
name="pdf4llm",
20-
version="0.0.21",
20+
version="0.0.22",
2121
author="Artifex",
2222
author_email="[email protected]",
2323
description="PyMuPDF Utilities for LLM/RAG",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.21"
3+
__version__ = "0.0.22"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ def sanitize_spans(line):
7171
line.sort(key=lambda s: s["bbox"].x0)
7272
# join spans, delete duplicates
7373
for i in range(len(line) - 1, 0, -1): # iterate back to front
74-
s0 = line[i - 1]
75-
s1 = line[i]
74+
s0 = line[i - 1] # preceding span
75+
s1 = line[i] # this span
7676
# "delta" depends on the font size. Spans will be joined if
7777
# no more than 10% of the font size separates them and important
7878
# attributes are the same.
@@ -107,13 +107,12 @@ def sanitize_spans(line):
107107
continue
108108
for sno, s in enumerate(line["spans"]): # the numbered spans
109109
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
110-
mpoint = (sbbox.tl + sbbox.br) / 2 # middle point
111-
if mpoint not in clip:
112-
continue
113110
if is_white(s["text"]): # ignore white text
114111
continue
115112
if s["alpha"] == 0: # ignore invisible text
116113
continue
114+
if abs(sbbox & clip) < abs(sbbox) * 0.8: # if not in clip
115+
continue
117116
if s["flags"] & 1 == 1: # if a superscript, modify bbox
118117
# with that of the preceding or following span
119118
i = 1 if sno == 0 else sno - 1

pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py

Lines changed: 98 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -144,11 +144,12 @@ def __init__(
144144
[f for f in fontsizes.keys() if f > self.body_limit],
145145
reverse=True,
146146
)[:max_levels]
147-
self.body_limit = min(self.body_limit, sizes[-1] - 1 if sizes else body_limit)
148147

149148
# make the header tag dictionary
150149
for i, size in enumerate(sizes, start=1):
151150
self.header_id[size] = "#" * i + " "
151+
if self.header_id.keys():
152+
self.body_limit = min(self.header_id.keys()) - 1
152153

153154
def get_header_id(self, span: dict, page=None) -> str:
154155
"""Return appropriate markdown header prefix.
@@ -163,6 +164,54 @@ def get_header_id(self, span: dict, page=None) -> str:
163164
return hdr_id
164165

165166

167+
class TocHeaders:
    """Identify header text via the document's Table Of Contents (TOC).

    This is an alternative to IdentifyHeaders. Instead of running through the
    full document to identify font sizes, it uses the document's TOC to
    identify headers on pages.
    Like IdentifyHeaders, this is no guarantee to find headers, but there is
    a good chance for properly built documents. In such cases, this method
    can be much faster and more accurate, because the hierarchy level of the
    TOC items can be used directly to identify the header level.
    Examples where this approach works very well are the Adobe PDF documents.
    """

    def __init__(self, doc: "pymupdf.Document | str"):
        """Read and store the TOC of the document.

        Args:
            doc: an open pymupdf.Document, or a value that is passed on to
                pymupdf.open() (e.g. a file name). A document opened here is
                closed again before returning; a document passed in by the
                caller is left open.
        """
        opened_here = not isinstance(doc, pymupdf.Document)
        mydoc = pymupdf.open(doc) if opened_here else doc
        try:
            # Must read from "mydoc": "doc" may be a plain file name.
            self.TOC = mydoc.get_toc()
        finally:
            if opened_here:
                # if opened here, close it now (also when get_toc() fails)
                mydoc.close()

    def get_header_id(self, span: dict, page=None) -> str:
        """Return appropriate markdown header prefix.

        Given a text span from a "dict"/"rawdict" extraction, determine the
        markdown header prefix string of 0 to n concatenated '#' characters.

        Args:
            span: span dictionary; only its "text" item is used.
            page: the page containing the span. Without a page no TOC lookup
                is possible and the empty string is returned.

        Returns:
            "#" repeated by the level of the first matching TOC entry plus a
            trailing space, or "" if the span matches no TOC entry of the page.
        """
        if page is None:
            return ""
        text = span["text"].strip()
        if not text:
            # An empty string is a prefix of every title: never a header.
            return ""
        # TOC entries of this page (compared against page.number + 1)
        # that have an actual title
        my_toc = [t for t in self.TOC if t[1] and t[-1] == page.number + 1]
        # check if the span matches a TOC entry
        for t in my_toc:
            title = t[1].strip()  # title of TOC entry
            if not title:
                continue  # whitespace-only title would match any text
            lvl = t[0]  # level of TOC entry
            if text.startswith(title) or title.startswith(text):
                # found a match: return the header tag
                return "#" * lvl + " "
        return ""
213+
214+
166215
# store relevant parameters here
167216
@dataclass
168217
class Parameters:
@@ -216,19 +265,33 @@ def is_significant(box, paths):
216265
else:
217266
d = box.height * 0.025
218267
nbox = box + (d, d, -d, -d) # nbox covers 90% of box interior
219-
# paths contained in box:
268+
# paths contained in, but not equal to box:
220269
my_paths = [p for p in paths if p["rect"] in box and p["rect"] != box]
221270
for p in my_paths:
222271
rect = p["rect"]
223-
if not (rect & nbox).is_empty: # intersects interior: significant!
272+
if (
273+
not (rect & nbox).is_empty and not p["rect"].is_empty
274+
): # intersects interior: significant!
224275
return True
225276
# Remaining case: a horizontal or vertical line
226277
# horizontal line:
227-
if rect.y0 == rect.y1 and rect.x0 < nbox.x1 and rect.x1 > nbox.x0:
228-
return True
278+
if (
279+
1
280+
and rect.y0 == rect.y1
281+
and nbox.y0 <= rect.y0 <= nbox.y1
282+
and rect.x0 < nbox.x1
283+
and rect.x1 > nbox.x0
284+
):
285+
pass # return True
229286
# vertical line
230-
if rect.x0 == rect.x1 and rect.y0 < nbox.y1 and rect.y1 > nbox.y0:
231-
return True
287+
if (
288+
1
289+
and rect.x0 == rect.x1
290+
and nbox.x0 <= rect.x0 <= nbox.x1
291+
and rect.y0 < nbox.y1
292+
and rect.y1 > nbox.y0
293+
):
294+
pass # return True
232295
return False
233296

234297

@@ -654,8 +717,10 @@ def is_in_rects(rect, rect_list):
654717
def intersects_rects(rect, rect_list):
655718
"""Return 1-based index of the first list rect covering more than half of the enlarged rect, else 0."""
656719
delta = (-1, -1, 1, 1) # enlarge rect somewhat by this
720+
enlarged = rect + delta
721+
abs_enlarged = abs(enlarged) * 0.5
657722
for i, r in enumerate(rect_list, start=1):
658-
if (rect.tl + rect.br) / 2 in r + delta: # middle point is inside r
723+
if abs(enlarged & r) > abs_enlarged:
659724
return i
660725
return 0
661726

@@ -764,31 +829,32 @@ def get_bg_color(page):
764829
page. If they are unicolor and of the same color, we assume this to
765830
be the background color.
766831
"""
767-
pix = page.get_pixmap(clip=(0, 0, 10, 10))
768-
if not pix.is_unicolor:
832+
pix = page.get_pixmap(
833+
clip=(page.rect.x0, page.rect.y0, page.rect.x0 + 10, page.rect.y0 + 10)
834+
)
835+
if not pix.samples or not pix.is_unicolor:
769836
return None
770837
pixel_ul = pix.pixel(0, 0) # upper left color
771-
pix = page.get_pixmap(clip=(page.rect.width - 10, 0, page.rect.width, 10))
772-
if not pix.is_unicolor:
838+
pix = page.get_pixmap(
839+
clip=(page.rect.x1 - 10, page.rect.y0, page.rect.x1, page.rect.y0 + 10)
840+
)
841+
if not pix.samples or not pix.is_unicolor:
773842
return None
774843
pixel_ur = pix.pixel(0, 0) # upper right color
775844
if not pixel_ul == pixel_ur:
776845
return None
777-
pix = page.get_pixmap(clip=(0, page.rect.height - 10, 10, page.rect.height))
778-
if not pix.is_unicolor:
846+
pix = page.get_pixmap(
847+
clip=(page.rect.x0, page.rect.y1 - 10, page.rect.x0 + 10, page.rect.y1)
848+
)
849+
if not pix.samples or not pix.is_unicolor:
779850
return None
780851
pixel_ll = pix.pixel(0, 0) # lower left color
781852
if not pixel_ul == pixel_ll:
782853
return None
783854
pix = page.get_pixmap(
784-
clip=(
785-
page.rect.width - 10,
786-
page.rect.height - 10,
787-
page.rect.width,
788-
page.rect.height,
789-
)
855+
clip=(page.rect.x1 - 10, page.rect.y1 - 10, page.rect.x1, page.rect.y1)
790856
)
791-
if not pix.is_unicolor:
857+
if not pix.samples or not pix.is_unicolor:
792858
return None
793859
pixel_lr = pix.pixel(0, 0) # lower right color
794860
if not pixel_ul == pixel_lr:
@@ -881,7 +947,7 @@ def get_page_output(
881947
for i in img_info
882948
if i["bbox"].width >= image_size_limit * parms.clip.width
883949
and i["bbox"].height >= image_size_limit * parms.clip.height
884-
and i["bbox"] in parms.clip
950+
and i["bbox"].intersects(parms.clip)
885951
and i["bbox"].width > 3
886952
and i["bbox"].height > 3
887953
]
@@ -904,23 +970,23 @@ def get_page_output(
904970

905971
# Locate all tables on page
906972
parms.written_tables = [] # stores already written tables
973+
omitted_table_rects = []
907974
if table_strategy is None:
908975
parms.tabs = []
909976
else:
910977
parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
911-
del_this = []
912-
for i, t in enumerate(parms.tabs):
978+
# remove tables with too few rows or columns
979+
for i in range(len(parms.tabs.tables) - 1, -1, -1):
980+
t = parms.tabs.tables[i]
913981
if t.row_count < 2 or t.col_count < 2:
914-
# ignore tables with too few rows or columns
915-
del_this.append(i)
916-
for i in sorted(del_this, reverse=True):
917-
del parms.tabs.tables[i]
982+
omitted_table_rects.append(pymupdf.Rect(t.bbox))
983+
del parms.tabs.tables[i]
918984
parms.tabs.tables.sort(key=lambda t: (t.bbox[0], t.bbox[1]))
919985

920986
# Make a list of table boundary boxes.
921987
# Must include the header bbox (which may exist outside tab.bbox)
922988
tab_rects = {}
923-
for i, t in enumerate(parms.tabs):
989+
for i, t in enumerate(parms.tabs.tables):
924990
tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
925991
tab_dict = {
926992
"bbox": tuple(tab_rects[i]),
@@ -944,7 +1010,9 @@ def get_page_output(
9441010
and p["rect"].height < parms.clip.height
9451011
and (p["rect"].width > 3 or p["rect"].height > 3)
9461012
and not (p["fill"] == parms.bg_color and p["fill"] != None)
947-
and not intersects_rects(p["rect"], parms.tab_rects0)
1013+
and not intersects_rects(
1014+
p["rect"], parms.tab_rects0 + omitted_table_rects
1015+
)
9481016
and not intersects_rects(p["rect"], parms.annot_rects)
9491017
]
9501018
else:
@@ -977,7 +1045,6 @@ def get_page_output(
9771045
parms.vg_clusters0 = refine_boxes(vg_clusters0)
9781046

9791047
parms.vg_clusters = dict((i, r) for i, r in enumerate(parms.vg_clusters0))
980-
9811048
# identify text bboxes on page, avoiding tables, images and graphics
9821049
text_rects = column_boxes(
9831050
parms.page,

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
setuptools.setup(
1919
name="pymupdf4llm",
20-
version="0.0.21",
20+
version="0.0.22",
2121
author="Artifex",
2222
author_email="[email protected]",
2323
description="PyMuPDF Utilities for LLM/RAG",

0 commit comments

Comments
 (0)