Merge pull request #337 from pymupdf/v0.2.4

JorjMcKie · web-flow · commit 681673fbd936 · 2025-11-25T08:58:04.000-04:00
Version 0.2.4
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,15 @@
 # Change Log
 
+## Changes in version 0.2.4
+
+### Fixes:
+
+* [335](https://github.com/pymupdf/RAG/issues/335) - KeyError "has_ocr_text"
+
+### Other Changes:
+
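+* New parameter `use_ocr` (default `True`) for `to_markdown()`, `to_json()`, `to_text()` and `parse_document()`: set it to `False` to skip OCR.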
+------
 ## Changes in version 0.2.3
 
 ### Fixes:
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
@@ -6,7 +6,7 @@
 with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
     readme = f.read()
 
-version = "0.2.3"  # must always equal the pymupdf4llm version
+version = "0.2.4"  # must always equal the pymupdf4llm version
 
 classifiers = [
     "Development Status :: 5 - Production/Stable",
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -37,6 +37,7 @@ def parse_document(
         embed_images=False,
         show_progress=False,
         force_text=True,
+        use_ocr=True,
     ):
         return document_layout.parse_document(
             doc,
@@ -50,6 +51,7 @@ def parse_document(
             embed_images=embed_images,
             show_progress=show_progress,
             force_text=force_text,
+            use_ocr=use_ocr,
         )
 
     def to_markdown(
@@ -72,6 +74,7 @@ def to_markdown(
         page_height=None,
         ignore_code=False,
         show_progress=False,
+        use_ocr=True,
         # unsupported options for pymupdf layout:
         **kwargs,
     ):
@@ -89,6 +92,7 @@ def to_markdown(
             embed_images=embed_images,
             show_progress=show_progress,
             force_text=force_text,
+            use_ocr=use_ocr,
         )
         return parsed_doc.to_markdown(
             header=header,
@@ -99,6 +103,7 @@ def to_markdown(
             show_progress=show_progress,
             page_separators=page_separators,
             page_chunks=page_chunks,
+            use_ocr=use_ocr,
         )
 
     def to_json(
@@ -112,6 +117,7 @@ def to_json(
         embed_images=False,
         show_progress=False,
         force_text=True,
+        use_ocr=True,
         # unsupported options for pymupdf layout:
         **kwargs,
     ):
@@ -125,6 +131,7 @@ def to_json(
             write_images=write_images,
             show_progress=show_progress,
             force_text=force_text,
+            use_ocr=use_ocr,
         )
         return parsed_doc.to_json()
 
@@ -138,6 +145,7 @@ def to_text(
         show_progress=False,
         force_text=True,
         ocr_dpi=400,
+        use_ocr=True,
         # unsupported options for pymupdf layout:
         **kwargs,
     ):
@@ -149,6 +157,7 @@ def to_text(
             write_images=False,
             show_progress=show_progress,
             force_text=force_text,
+            use_ocr=use_ocr,
         )
         return parsed_doc.to_text(
             header=header,
diff --git a/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py b/pymupdf4llm/pymupdf4llm/helpers/check_ocr.py
@@ -1,7 +1,7 @@
 import cv2
 import numpy as np
 import pymupdf  # PyMuPDF
-from pymupdf4llm.helpers.utils import WHITE_CHARS
+from pymupdf4llm.helpers.utils import WHITE_CHARS, analyze_page
 
 FLAGS = (
     0
@@ -108,11 +108,6 @@
 """
 
 
-def is_white(text):
-    """Identify white text."""
-    return WHITE_CHARS.issuperset(text)
-
-
 def get_span_ocr(page, bbox, dpi=300):
     """Return OCR'd span text using Tesseract.
 
@@ -197,96 +192,6 @@ def get_page_image(page, dpi=150, covered=None):
     return gray, matrix, pix
 
 
-def analyze_page(page, blocks=None) -> dict:
-    """Analyze the page for the OCR decision.
-
-    Args:
-        blocks: output of page.get_text("dict") if already available
-    Returns:
-        A dict with analysis results. The area-related float values are
-        computed as fractions of the total covered area.
-
-        "covered": pymupdf.Rect, page area covered by content
-        "img_joins": float, fraction of area of the joined images
-        "img_area": float, fraction of sum of image area sizes
-        "txt_joins": float, fraction of area of the joined text spans
-        "txt_area": float, fraction of sum of text span bbox area sizes
-        "vec_joins": float, fraction of area of the joined vector characters
-        "vec_area": float, fraction of sum of vector character area sizes
-        "chars_total": int, count of visible characters
-        "chars_bad": int, count of Replacement Unicode characters
-        "ocr_spans": int, count of text spans with 'GlyphLessFont'
-
-    """
-    chars_total = 0
-    chars_bad = 0
-    if blocks is None:
-        blocks = page.get_text(
-            "dict",
-            flags=FLAGS,
-            clip=pymupdf.INFINITE_RECT(),
-        )["blocks"]
-    img_rect = pymupdf.EMPTY_RECT()
-    txt_rect = +img_rect
-    vec_rect = +img_rect
-    img_area = 0
-    txt_area = 0
-    vec_area = 0
-    ocr_spans = 0
-    for b in blocks:
-        bbox = page.rect & b["bbox"]
-        area = bbox.width * bbox.height
-        if not area:
-            continue
-        if b["type"] == 1:  # Image block
-            img_rect |= bbox
-            img_area += area
-        elif b["type"] == 0:  # Text block
-            for l in b["lines"]:
-                for s in l["spans"]:
-                    if is_white(s["text"]):
-                        continue
-                    sr = page.rect & s["bbox"]
-                    if sr.is_empty or sr.is_infinite:
-                        continue
-                    if s["font"] == "GlyphLessFont":
-                        ocr_spans += 1
-                    elif s["alpha"] == 0:
-                        continue  # skip invisible text
-                    chars_total += len(s["text"].strip())
-                    chars_bad += len([c for c in s["text"] if c == chr(0xFFFD)])
-                    txt_rect |= sr
-                    txt_area += sr.width * sr.height
-        elif (
-            1
-            and b["type"] == 3  # vector block
-            and b["stroked"]  # has been stroked
-            and bbox.width <= 20  # width limit for typical characters
-            and bbox.height <= 20  # height limit for typical characters
-            and not b["isrect"]  # contains curves
-        ):
-            # potential character-like vector block
-            vec_rect |= bbox
-            vec_area += area
-
-    # the rectangle on page covered by some content
-    covered = img_rect | txt_rect | vec_rect
-    cover_area = abs(covered)
-    analysis = {
-        "covered": covered,
-        "img_joins": (abs(img_rect) / cover_area) if cover_area else 0,
-        "img_area": img_area / cover_area if cover_area else 0,
-        "txt_joins": (abs(txt_rect) / cover_area) if cover_area else 0,
-        "txt_area": txt_area / cover_area if cover_area else 0,
-        "vec_area": vec_area / cover_area if cover_area else 0,
-        "vec_joins": (abs(vec_rect) / cover_area) if cover_area else 0,
-        "chars_total": chars_total,
-        "chars_bad": chars_bad,
-        "ocr_spans": ocr_spans,
-    }
-    return analysis
-
-
 def should_ocr_page(
     page,
     dpi=150,
diff --git a/pymupdf4llm/pymupdf4llm/helpers/document_layout.py b/pymupdf4llm/pymupdf4llm/helpers/document_layout.py
@@ -18,7 +18,11 @@
     from pymupdf4llm.helpers.progress import ProgressBar
 try:
     import cv2
-    from pymupdf4llm.helpers import check_ocr
+
+    if hasattr(cv2, "Canny"):
+        from pymupdf4llm.helpers import check_ocr
+    else:
+        cv2 = None
 except ImportError:
     cv2 = None
 
@@ -777,6 +781,7 @@ def parse_document(
     embed_images=False,
     write_images=False,
     force_text=False,
+    use_ocr=True,
 ) -> ParsedDocument:
     if isinstance(doc, pymupdf.Document):
         mydoc = doc
@@ -803,14 +808,17 @@ def parse_document(
         raise ValueError("Cannot both embed and write images.")
     document.embed_images = embed_images
     document.write_images = write_images
-    try:
-        reason = "OpenCV not installed"
-        assert cv2 is not None
-        reason = "Tesseract language data not found"
-        assert pymupdf.get_tessdata()
-        document.use_ocr = True
-    except Exception as e:
-        print(f"{reason}. OCR disabled.", file=INFO_MESSAGES)
+    if use_ocr:
+        try:
+            reason = "OpenCV not installed"
+            assert cv2 is not None
+            reason = "Tesseract language data not found"
+            assert pymupdf.get_tessdata()
+            document.use_ocr = True
+        except Exception as e:
+            print(f"OCR disabled: {reason}.")
+            document.use_ocr = False
+    else:
         document.use_ocr = False
     if pages is None:
         page_filter = range(mydoc.page_count)
@@ -848,6 +856,8 @@ def parse_document(
             )
         else:
             decision = {"should_ocr": False}
+            page_analysis = utils.analyze_page(page, blocks)
+            decision["has_ocr_text"] = page_analysis["ocr_spans"] > 0
 
         if decision["has_ocr_text"]:  # prevent MD styling if already OCR'd
             page_full_ocred = True
diff --git a/pymupdf4llm/pymupdf4llm/helpers/utils.py b/pymupdf4llm/pymupdf4llm/helpers/utils.py
@@ -46,6 +46,112 @@
     + list(map(chr, range(0x25A0, 0x2600)))
 )
 
+FLAGS = (
+    0
+    | pymupdf.TEXT_COLLECT_STYLES
+    | pymupdf.TEXT_COLLECT_VECTORS
+    | pymupdf.TEXT_PRESERVE_IMAGES
+    | pymupdf.TEXT_ACCURATE_BBOXES
+    # | pymupdf.TEXT_MEDIABOX_CLIP
+)
+
+REPLACEMENT_CHARACTER = chr(0xFFFD)
+
+
+def is_white(text):
+    """Return True if every character of 'text' is in WHITE_CHARS (also for empty text)."""
+    return all(char in WHITE_CHARS for char in text)
+
+
+def analyze_page(page, blocks=None) -> dict:
+    """Analyze the page for the OCR decision.
+
+    Args:
+        page: the page to analyze; all bboxes are clipped to page.rect
+        blocks: the "blocks" list of page.get_text("dict") if already available
+    Returns:
+        A dict with analysis results. The area-related float values are
+        computed as fractions of the total covered area (0 if nothing is covered).
+
+        "covered": pymupdf.Rect, page area covered by content
+        "img_joins": float, fraction of area of the joined images
+        "img_area": float, fraction of sum of image area sizes
+        "txt_joins": float, fraction of area of the joined text spans
+        "txt_area": float, fraction of sum of text span bbox area sizes
+        "vec_joins": float, fraction of area of the joined vector characters
+        "vec_area": float, fraction of sum of vector character area sizes
+        "chars_total": int, count of visible characters
+        "chars_bad": int, count of Replacement Unicode characters
+        "ocr_spans": int, count of text spans with 'GlyphLessFont'
+
+    """
+    chars_total = 0
+    chars_bad = 0
+    if blocks is None:  # caller did not extract the page content yet
+        blocks = page.get_text(
+            "dict",
+            flags=FLAGS,
+            clip=pymupdf.INFINITE_RECT(),
+        )["blocks"]
+    img_rect = pymupdf.EMPTY_RECT()
+    txt_rect = +img_rect  # unary plus: separate copy, joined independently
+    vec_rect = +img_rect
+    img_area = 0
+    txt_area = 0
+    vec_area = 0
+    ocr_spans = 0
+    for b in blocks:
+        bbox = page.rect & b["bbox"]  # only the part inside the page counts
+        area = bbox.width * bbox.height
+        if not area:
+            continue
+        if b["type"] == 1:  # Image block
+            img_rect |= bbox
+            img_area += area
+        elif b["type"] == 0:  # Text block
+            for line in b["lines"]:
+                for span in line["spans"]:
+                    if is_white(span["text"]):
+                        continue
+                    sr = page.rect & span["bbox"]
+                    if sr.is_empty or sr.is_infinite:
+                        continue
+                    if span["font"] == "GlyphLessFont":
+                        ocr_spans += 1  # counted even when alpha is 0
+                    elif span["alpha"] == 0:
+                        continue  # skip invisible text
+                    chars_total += len(span["text"].strip())
+                    chars_bad += span["text"].count(REPLACEMENT_CHARACTER)
+                    txt_rect |= sr
+                    txt_area += sr.width * sr.height
+        elif (
+            b["type"] == 3  # vector block
+            and b["stroked"]  # has been stroked
+            and bbox.width <= 20  # width limit for typical characters
+            and bbox.height <= 20  # height limit for typical characters
+            and not b["isrect"]  # contains curves
+        ):
+            # potential character-like vector block
+            vec_rect |= bbox
+            vec_area += area
+
+    # the rectangle on page covered by some content
+    covered = img_rect | txt_rect | vec_rect
+    cover_area = abs(covered)
+    analysis = {
+        "covered": covered,
+        "img_joins": (abs(img_rect) / cover_area) if cover_area else 0,
+        "img_area": img_area / cover_area if cover_area else 0,
+        "txt_joins": (abs(txt_rect) / cover_area) if cover_area else 0,
+        "txt_area": txt_area / cover_area if cover_area else 0,
+        "vec_area": vec_area / cover_area if cover_area else 0,
+        "vec_joins": (abs(vec_rect) / cover_area) if cover_area else 0,
+        "chars_total": chars_total,
+        "chars_bad": chars_bad,
+        "ocr_spans": ocr_spans,
+    }
+    return analysis
+
 
 def table_cleaner(page, blocks, tbbox):
     """Clean the table bbox 'tbbox'.
diff --git a/pymupdf4llm/pymupdf4llm/versions_file.py b/pymupdf4llm/pymupdf4llm/versions_file.py
@@ -1,3 +1,3 @@
 # Generated file - do not edit.
 MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
-VERSION = '0.2.3'
+VERSION = '0.2.4'
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py
@@ -14,7 +14,7 @@
     "Topic :: Utilities",
 ]
 
-version = "0.2.3"
+version = "0.2.4"
 requires = ["pymupdf>=1.26.6", "tabulate"]
 
 text = requires[0].split("=")[1]

Original file line number	Diff line number	Diff line change
`@@ -14,7 +14,7 @@`
`14`	`14`	`"Topic :: Utilities",`
`15`	`15`	`]`
`16`	`16`
`17`		`-version = "0.2.3"`
	`17`	`+version = "0.2.4"`
`18`	`18`	`requires = ["pymupdf>=1.26.6", "tabulate"]`
`19`	`19`
`20`	`20`	`text = requires[0].split("=")[1]`