Updates for v0.0.22

JorjMcKie · JorjMcKie · commit c26b2c198fbd · 2025-04-28T05:09:24.000-04:00
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,17 @@
 # Change Log
 
+## Changes in version 0.0.22
+
+### Fixes:
+
+* [255](https://github.com/pymupdf/RAG/issues/255) - Single-row/column tables are skipped
+* [258](https://github.com/pymupdf/RAG/issues/258) - Pymupdf4llm to_markdown crashes on some documents
+
+### Other Changes:
+
+* Added class `TocHeaders` as an alternative way of identifying headers, based on the document's table of contents (TOC).
+
+
 ## Changes in version 0.0.21
 
 ### Fixes:
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
@@ -13,11 +13,11 @@
     "Programming Language :: Python :: 3",
     "Topic :: Utilities",
 ]
-requires = ["pymupdf4llm==0.0.21"]
+requires = ["pymupdf4llm==0.0.22"]
 
 setuptools.setup(
     name="pdf4llm",
-    version="0.0.21",
+    version="0.0.22",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
 from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
 
-__version__ = "0.0.21"
+__version__ = "0.0.22"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -71,8 +71,8 @@ def sanitize_spans(line):
         line.sort(key=lambda s: s["bbox"].x0)
         # join spans, delete duplicates
         for i in range(len(line) - 1, 0, -1):  # iterate back to front
-            s0 = line[i - 1]
-            s1 = line[i]
+            s0 = line[i - 1]  # preceding span
+            s1 = line[i]  # this span
             # "delta" depends on the font size. Spans  will be joined if
             # no more than 10% of the font size separates them and important
             # attributes are the same.
@@ -107,13 +107,12 @@ def sanitize_spans(line):
                 continue
             for sno, s in enumerate(line["spans"]):  # the numered spans
                 sbbox = pymupdf.Rect(s["bbox"])  # span bbox as a Rect
-                mpoint = (sbbox.tl + sbbox.br) / 2  # middle point
-                if mpoint not in clip:
-                    continue
                 if is_white(s["text"]):  # ignore white text
                     continue
                 if s["alpha"] == 0:  # ignore invisible text
                     continue
+                if abs(sbbox & clip) < abs(sbbox) * 0.8:  # if not in clip
+                    continue
                 if s["flags"] & 1 == 1:  # if a superscript, modify bbox
                     # with that of the preceding or following span
                     i = 1 if sno == 0 else sno - 1
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -144,11 +144,12 @@ def __init__(
             [f for f in fontsizes.keys() if f > self.body_limit],
             reverse=True,
         )[:max_levels]
-        self.body_limit = min(self.body_limit, sizes[-1] - 1 if sizes else body_limit)
 
         # make the header tag dictionary
         for i, size in enumerate(sizes, start=1):
             self.header_id[size] = "#" * i + " "
+        if self.header_id.keys():
+            self.body_limit = min(self.header_id.keys()) - 1
 
     def get_header_id(self, span: dict, page=None) -> str:
         """Return appropriate markdown header prefix.
@@ -163,6 +164,54 @@ def get_header_id(self, span: dict, page=None) -> str:
         return hdr_id
 
 
+class TocHeaders:
+    """Compute data for identifying header text.
+
+    This is an alternative to IdentifyHeaders. Instead of running through the
+    full document to identify font sizes, it uses the document's Table Of
+    Contents (TOC) to identify headers on pages.
+    Like IdentifyHeaders, this is no guarantee to find headers, but it
+    is a good chance for appropriately built documents. In such cases, this
+    method can be very much faster and more accurate, because we can use the
+    hierarchy level of TOC items directly to identify the header level.
+    Examples where this approach works very well are the Adobe PDF documents.
+    """
+
+    def __init__(self, doc: str):
+        """Read and store the TOC of `doc` (an open Document or a file name)."""
+        if isinstance(doc, pymupdf.Document):
+            mydoc = doc
+        else:
+            mydoc = pymupdf.open(doc)
+        # always query the opened document: `doc` may be just a file name
+        self.TOC = mydoc.get_toc()
+        if mydoc is not doc:
+            # if opened here, close it now
+            mydoc.close()
+
+    def get_header_id(self, span: dict, page=None) -> str:
+        """Return appropriate markdown header prefix.
+
+        Given a text span from a "dict"/"rawdict" extraction, determine the
+        markdown header prefix string of 0 to n concatenated '#' characters.
+        """
+        if page is None:
+            return ""
+        # check if this page has TOC entries with an actual title
+        my_toc = [t for t in self.TOC if t[1] and t[-1] == page.number + 1]
+        if not my_toc:
+            return ""
+        # check if the span matches a TOC entry of this page
+        text = span["text"].strip()
+        for t in my_toc:
+            title = t[1].strip()  # title of TOC entry
+            lvl = t[0]  # level of TOC entry
+            # an empty span is a prefix of every title: never treat it as match
+            if text and (text.startswith(title) or title.startswith(text)):
+                return "#" * lvl + " "
+        return ""
+
+
 # store relevant parameters here
 @dataclass
 class Parameters:
@@ -216,19 +265,33 @@ def is_significant(box, paths):
     else:
         d = box.height * 0.025
     nbox = box + (d, d, -d, -d)  # nbox covers 90% of box interior
-    # paths contained in box:
+    # paths contained in, but not equal to box:
     my_paths = [p for p in paths if p["rect"] in box and p["rect"] != box]
     for p in my_paths:
         rect = p["rect"]
-        if not (rect & nbox).is_empty:  # intersects interior: significant!
+        if (
+            not (rect & nbox).is_empty and not p["rect"].is_empty
+        ):  # intersects interior: significant!
             return True
         # Remaining case: a horizontal or vertical line
         # horizontal line:
-        if rect.y0 == rect.y1 and rect.x0 < nbox.x1 and rect.x1 > nbox.x0:
-            return True
+        if (
+            1
+            and rect.y0 == rect.y1
+            and nbox.y0 <= rect.y0 <= nbox.y1
+            and rect.x0 < nbox.x1
+            and rect.x1 > nbox.x0
+        ):
+            pass  # return True
         # vertical line
-        if rect.x0 == rect.x1 and rect.y0 < nbox.y1 and rect.y1 > nbox.y0:
-            return True
+        if (
+            1
+            and rect.x0 == rect.x1
+            and nbox.x0 <= rect.x0 <= nbox.x1
+            and rect.y0 < nbox.y1
+            and rect.y1 > nbox.y0
+        ):
+            pass  # return True
     return False
 
 
@@ -654,8 +717,10 @@ def is_in_rects(rect, rect_list):
     def intersects_rects(rect, rect_list):
         """Check if middle of rect is contained in a rect of the list."""
         delta = (-1, -1, 1, 1)  # enlarge rect_list members somewhat by this
+        enlarged = rect + delta
+        abs_enlarged = abs(enlarged) * 0.5
         for i, r in enumerate(rect_list, start=1):
-            if (rect.tl + rect.br) / 2 in r + delta:  # middle point is inside r
+            if abs(enlarged & r) > abs_enlarged:
                 return i
         return 0
 
@@ -764,31 +829,32 @@ def get_bg_color(page):
         page. If they are unicolor and of the same color, we assume this to
         be the background color.
         """
-        pix = page.get_pixmap(clip=(0, 0, 10, 10))
-        if not pix.is_unicolor:
+        pix = page.get_pixmap(
+            clip=(page.rect.x0, page.rect.y0, page.rect.x0 + 10, page.rect.y0 + 10)
+        )
+        if not pix.samples or not pix.is_unicolor:
             return None
         pixel_ul = pix.pixel(0, 0)  # upper left color
-        pix = page.get_pixmap(clip=(page.rect.width - 10, 0, page.rect.width, 10))
-        if not pix.is_unicolor:
+        pix = page.get_pixmap(
+            clip=(page.rect.x1 - 10, page.rect.y0, page.rect.x1, page.rect.y0 + 10)
+        )
+        if not pix.samples or not pix.is_unicolor:
             return None
         pixel_ur = pix.pixel(0, 0)  # upper right color
         if not pixel_ul == pixel_ur:
             return None
-        pix = page.get_pixmap(clip=(0, page.rect.height - 10, 10, page.rect.height))
-        if not pix.is_unicolor:
+        pix = page.get_pixmap(
+            clip=(page.rect.x0, page.rect.y1 - 10, page.rect.x0 + 10, page.rect.y1)
+        )
+        if not pix.samples or not pix.is_unicolor:
             return None
         pixel_ll = pix.pixel(0, 0)  # lower left color
         if not pixel_ul == pixel_ll:
             return None
         pix = page.get_pixmap(
-            clip=(
-                page.rect.width - 10,
-                page.rect.height - 10,
-                page.rect.width,
-                page.rect.height,
-            )
+            clip=(page.rect.x1 - 10, page.rect.y1 - 10, page.rect.x1, page.rect.y1)
         )
-        if not pix.is_unicolor:
+        if not pix.samples or not pix.is_unicolor:
             return None
         pixel_lr = pix.pixel(0, 0)  # lower right color
         if not pixel_ul == pixel_lr:
@@ -881,7 +947,7 @@ def get_page_output(
             for i in img_info
             if i["bbox"].width >= image_size_limit * parms.clip.width
             and i["bbox"].height >= image_size_limit * parms.clip.height
-            and i["bbox"] in parms.clip
+            and i["bbox"].intersects(parms.clip)
             and i["bbox"].width > 3
             and i["bbox"].height > 3
         ]
@@ -904,23 +970,23 @@ def get_page_output(
 
         # Locate all tables on page
         parms.written_tables = []  # stores already written tables
+        omitted_table_rects = []
         if table_strategy is None:
             parms.tabs = []
         else:
             parms.tabs = page.find_tables(clip=parms.clip, strategy=table_strategy)
-            del_this = []
-            for i, t in enumerate(parms.tabs):
+            # remove tables with too few rows or columns
+            for i in range(len(parms.tabs.tables) - 1, -1, -1):
+                t = parms.tabs.tables[i]
                 if t.row_count < 2 or t.col_count < 2:
-                    # ignore tables with too few rows or columns
-                    del_this.append(i)
-            for i in sorted(del_this, reverse=True):
-                del parms.tabs.tables[i]
+                    omitted_table_rects.append(pymupdf.Rect(t.bbox))
+                    del parms.tabs.tables[i]
             parms.tabs.tables.sort(key=lambda t: (t.bbox[0], t.bbox[1]))
 
         # Make a list of table boundary boxes.
         # Must include the header bbox (which may exist outside tab.bbox)
         tab_rects = {}
-        for i, t in enumerate(parms.tabs):
+        for i, t in enumerate(parms.tabs.tables):
             tab_rects[i] = pymupdf.Rect(t.bbox) | pymupdf.Rect(t.header.bbox)
             tab_dict = {
                 "bbox": tuple(tab_rects[i]),
@@ -944,7 +1010,9 @@ def get_page_output(
                 and p["rect"].height < parms.clip.height
                 and (p["rect"].width > 3 or p["rect"].height > 3)
                 and not (p["fill"] == parms.bg_color and p["fill"] != None)
-                and not intersects_rects(p["rect"], parms.tab_rects0)
+                and not intersects_rects(
+                    p["rect"], parms.tab_rects0 + omitted_table_rects
+                )
                 and not intersects_rects(p["rect"], parms.annot_rects)
             ]
         else:
@@ -977,7 +1045,6 @@ def get_page_output(
         parms.vg_clusters0 = refine_boxes(vg_clusters0)
 
         parms.vg_clusters = dict((i, r) for i, r in enumerate(parms.vg_clusters0))
-
         # identify text bboxes on page, avoiding tables, images and graphics
         text_rects = column_boxes(
             parms.page,
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
@@ -17,7 +17,7 @@
 
 setuptools.setup(
     name="pymupdf4llm",
-    version="0.0.21",
+    version="0.0.22",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",