Merge pull request #236 from pymupdf/Version-0.0.19

JorjMcKie · web-flow · commit 47b68a36d218 · 2025-03-29T09:16:08.000-04:00
Version 0.0.19
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,6 +1,50 @@
 # Change Log
 
 
+## Changes in version 0.0.19
+
+### Fixes:
+The following list also includes fixes that were already made in version 0.0.18.
+
+* [158](https://github.com/pymupdf/RAG/issues/158) - Very long titles when converting to markdown.
+* [155](https://github.com/pymupdf/RAG/issues/155) - Inconsistent image extraction from image-only PDFs
+* [161](https://github.com/pymupdf/RAG/issues/161) - force_text param ignored.
+* [162](https://github.com/pymupdf/RAG/issues/162) - to_markdown isn't outputting all the pages but get_text is.
+* [173](https://github.com/pymupdf/RAG/issues/173) - First column of table is repeated before the actual table.
+* [187](https://github.com/pymupdf/RAG/issues/187) - Unsolicited Text Particles
+* [188](https://github.com/pymupdf/RAG/issues/188) - Takes lot of time to convert into markdown.
+* [191](https://github.com/pymupdf/RAG/issues/191) - Extraction of text stops in the middle while working fine with PyMuPDF.
+* [212](https://github.com/pymupdf/RAG/issues/212) - In pymupdf4llm, if a page has multiple images, only 1 image per-page is extracted.
+* [213](https://github.com/pymupdf/RAG/issues/213) - Many ���� after converting when using pymupdf4llm
+* [215](https://github.com/pymupdf/RAG/issues/215) - Spending too much time on identifying text bboxes
+* [218](https://github.com/pymupdf/RAG/issues/218) - IndexError in get_raw_lines when processing PDFs with formulas
+* [225](https://github.com/pymupdf/RAG/issues/225) - Text with background missing from output.
+* [229](https://github.com/pymupdf/RAG/issues/229) - Duplicated Table Content on pymuPDF4LLM.
+
+
+### Other Changes:
+
+* Added **_new parameter_** `filename`: (str), optional. Overwrites or sets the filename for saved images. Useful when the document is opened from memory.
+
+* Added **_new parameter_** `use_glyphs`: (bool), optional. Requests that the glyph number of a character be used (if possible) when the font has no back-translation to the original Unicode value. The default is `False`, which causes &#xfffd; symbols to be rendered in these cases.
+
+* Added **_strike-out support_**: We now detect and render ~~struck-out text.~~
+
+* Improved **_background color_** detection: We have introduced a simple background color detection mechanism: If a page shows an identical color in all four corners, we assume this to be the background color. Text and vector graphics with this color will be ignored as invisible.
+
+* Improved **_invisible text detection_**: Text with an alpha value of 0 is now ignored.
+
+* Improved **_fake-bold_** detection: Text mimicking bold appearance is now treated like standard bold text in most cases.
+
+* Header handling changes:
+    - Detection now happens based on the **_largest font size_** of the line.
+    - Uniformly rendered: All spans of a header line will now be rendered with the same appearance.
+
+* Changed handling of parameter `graphics_limit`: We previously ignored a page completely if the vector graphics count exceeded the limit. We now only ignore vector graphics if their count **_outside table boundary boxes_** is too large. This should only suppress vector graphics on the page, while keeping images, text and table content extractable.
+
+* Changed the `margins` default to 0. The previous default `(0, 50, 0, 50)` ignored 50 points at the top and bottom of pages. This has turned out to cause confusion in too many cases.
+
+
 ## Changes in version 0.0.17
 
 ### Fixes:
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
@@ -13,11 +13,11 @@
     "Programming Language :: Python :: 3",
     "Topic :: Utilities",
 ]
-requires = ["pymupdf4llm>=0.0.18"]
+requires = ["pymupdf4llm>=0.0.19"]
 
 setuptools.setup(
     name="pdf4llm",
-    version="0.0.18",
+    version="0.0.19",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",
@@ -32,4 +32,10 @@
     package_data={
         "pdf4llm": ["LICENSE"],
     },
+    project_urls={
+        "Documentation": "https://pymupdf.readthedocs.io/",
+        "Source": "https://github.com/pymupdf/RAG/tree/main/pdf4llm/pdf4llm",
+        "Tracker": "https://github.com/pymupdf/RAG/issues",
+        "Changelog": "https://github.com/pymupdf/RAG/blob/main/CHANGES.md",
+    },
 )
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
 from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
 
-__version__ = "0.0.18"
+__version__ = "0.0.19"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -74,11 +74,15 @@ def sanitize_spans(line):
             s0 = line[i - 1]
             s1 = line[i]
             # "delta" depends on the font size. Spans  will be joined if
-            # no more than 10% of the font size separates them.
+            # no more than 10% of the font size separates them and important
+            # attributes are the same.
             delta = s1["size"] * 0.1
-            if s0["bbox"].x1 + delta < s1["bbox"].x0:
-                continue  # all good: no joining neded
-
+            if s0["bbox"].x1 + delta < s1["bbox"].x0 or (
+                s0["flags"],
+                s0["char_flags"],
+                s0["size"],
+            ) != (s1["flags"], s1["char_flags"], s1["size"]):
+                continue  # no joining
             # We need to join bbox and text of two consecutive spans
             # On occasion, spans may also be duplicated.
             if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]:
@@ -108,11 +112,14 @@ def sanitize_spans(line):
                     continue
                 if is_white(s["text"]):  # ignore white text
                     continue
+                if s["alpha"] == 0:  # ignore invisible text
+                    continue
                 if s["flags"] & 1 == 1:  # if a superscript, modify bbox
                     # with that of the preceding or following span
                     i = 1 if sno == 0 else sno - 1
-                    neighbor = line["spans"][i]
-                    sbbox.y1 = neighbor["bbox"][3]
+                    if len(line["spans"]) > i:
+                        neighbor = line["spans"][i]
+                        sbbox.y1 = neighbor["bbox"][3]
                     s["text"] = f"[{s['text']}]"
                 s["bbox"] = sbbox  # update with the Rect version
                 # include line/block numbers to facilitate separator insertion
@@ -132,10 +139,7 @@ def sanitize_spans(line):
         sbbox = s["bbox"]  # this bbox
         sbbox0 = line[-1]["bbox"]  # previous bbox
         # if any of top or bottom coordinates are close enough, join...
-        if (
-            abs(sbbox.y1 - sbbox0.y1) <= y_delta
-            or abs(sbbox.y0 - sbbox0.y0) <= y_delta
-        ):
+        if abs(sbbox.y1 - sbbox0.y1) <= y_delta or abs(sbbox.y0 - sbbox0.y0) <= y_delta:
             line.append(s)  # append to this line
             lrect |= sbbox  # extend line rectangle
             continue
@@ -156,9 +160,7 @@ def sanitize_spans(line):
     return nlines
 
 
-def get_text_lines(
-    page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False
-):
+def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False):
     """Extract text by line keeping natural reading sequence.
 
     Notes:
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py