Skip to content

Commit 681673f

Browse files
authored
Merge pull request #337 from pymupdf/v0.2.4
Version 0.2.4
2 parents cf0eecd + eb8edfd commit 681673f

File tree

8 files changed

+148
-108
lines changed

8 files changed

+148
-108
lines changed

CHANGES.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
# Change Log
22

3+
## Changes in version 0.2.4
4+
5+
### Fixes:
6+
7+
* [335](https://github.com/pymupdf/RAG/issues/335) - KeyError "has_ocr_text"
8+
9+
### Other Changes:
10+
11+
12+
------
313
## Changes in version 0.2.3
414

515
### Fixes:

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
77
readme = f.read()
88

9-
version = "0.2.3" # must always equal the pymupdf4llm version
9+
version = "0.2.4" # must always equal the pymupdf4llm version
1010

1111
classifiers = [
1212
"Development Status :: 5 - Production/Stable",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def parse_document(
3737
embed_images=False,
3838
show_progress=False,
3939
force_text=True,
40+
use_ocr=True,
4041
):
4142
return document_layout.parse_document(
4243
doc,
@@ -50,6 +51,7 @@ def parse_document(
5051
embed_images=embed_images,
5152
show_progress=show_progress,
5253
force_text=force_text,
54+
use_ocr=use_ocr,
5355
)
5456

5557
def to_markdown(
@@ -72,6 +74,7 @@ def to_markdown(
7274
page_height=None,
7375
ignore_code=False,
7476
show_progress=False,
77+
use_ocr=True,
7578
# unsupported options for pymupdf layout:
7679
**kwargs,
7780
):
@@ -89,6 +92,7 @@ def to_markdown(
8992
embed_images=embed_images,
9093
show_progress=show_progress,
9194
force_text=force_text,
95+
use_ocr=use_ocr,
9296
)
9397
return parsed_doc.to_markdown(
9498
header=header,
@@ -99,6 +103,7 @@ def to_markdown(
99103
show_progress=show_progress,
100104
page_separators=page_separators,
101105
page_chunks=page_chunks,
106+
use_ocr=use_ocr,
102107
)
103108

104109
def to_json(
@@ -112,6 +117,7 @@ def to_json(
112117
embed_images=False,
113118
show_progress=False,
114119
force_text=True,
120+
use_ocr=True,
115121
# unsupported options for pymupdf layout:
116122
**kwargs,
117123
):
@@ -125,6 +131,7 @@ def to_json(
125131
write_images=write_images,
126132
show_progress=show_progress,
127133
force_text=force_text,
134+
use_ocr=use_ocr,
128135
)
129136
return parsed_doc.to_json()
130137

@@ -138,6 +145,7 @@ def to_text(
138145
show_progress=False,
139146
force_text=True,
140147
ocr_dpi=400,
148+
use_ocr=True,
141149
# unsupported options for pymupdf layout:
142150
**kwargs,
143151
):
@@ -149,6 +157,7 @@ def to_text(
149157
write_images=False,
150158
show_progress=show_progress,
151159
force_text=force_text,
160+
use_ocr=use_ocr,
152161
)
153162
return parsed_doc.to_text(
154163
header=header,

pymupdf4llm/pymupdf4llm/helpers/check_ocr.py

Lines changed: 1 addition & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import cv2
22
import numpy as np
33
import pymupdf # PyMuPDF
4-
from pymupdf4llm.helpers.utils import WHITE_CHARS
4+
from pymupdf4llm.helpers.utils import WHITE_CHARS, analyze_page
55

66
FLAGS = (
77
0
@@ -108,11 +108,6 @@
108108
"""
109109

110110

111-
def is_white(text):
112-
"""Identify white text."""
113-
return WHITE_CHARS.issuperset(text)
114-
115-
116111
def get_span_ocr(page, bbox, dpi=300):
117112
"""Return OCR'd span text using Tesseract.
118113
@@ -197,96 +192,6 @@ def get_page_image(page, dpi=150, covered=None):
197192
return gray, matrix, pix
198193

199194

200-
def analyze_page(page, blocks=None) -> dict:
201-
"""Analyze the page for the OCR decision.
202-
203-
Args:
204-
blocks: output of page.get_text("dict") if already available
205-
Returns:
206-
A dict with analysis results. The area-related float values are
207-
computed as fractions of the total covered area.
208-
209-
"covered": pymupdf.Rect, page area covered by content
210-
"img_joins": float, fraction of area of the joined images
211-
"img_area": float, fraction of sum of image area sizes
212-
"txt_joins": float, fraction of area of the joined text spans
213-
"txt_area": float, fraction of sum of text span bbox area sizes
214-
"vec_joins": float, fraction of area of the joined vector characters
215-
"vec_area": float, fraction of sum of vector character area sizes
216-
"chars_total": int, count of visible characters
217-
"chars_bad": int, count of Replacement Unicode characters
218-
"ocr_spans": int, count of text spans with 'GlyphLessFont'
219-
220-
"""
221-
chars_total = 0
222-
chars_bad = 0
223-
if blocks is None:
224-
blocks = page.get_text(
225-
"dict",
226-
flags=FLAGS,
227-
clip=pymupdf.INFINITE_RECT(),
228-
)["blocks"]
229-
img_rect = pymupdf.EMPTY_RECT()
230-
txt_rect = +img_rect
231-
vec_rect = +img_rect
232-
img_area = 0
233-
txt_area = 0
234-
vec_area = 0
235-
ocr_spans = 0
236-
for b in blocks:
237-
bbox = page.rect & b["bbox"]
238-
area = bbox.width * bbox.height
239-
if not area:
240-
continue
241-
if b["type"] == 1: # Image block
242-
img_rect |= bbox
243-
img_area += area
244-
elif b["type"] == 0: # Text block
245-
for l in b["lines"]:
246-
for s in l["spans"]:
247-
if is_white(s["text"]):
248-
continue
249-
sr = page.rect & s["bbox"]
250-
if sr.is_empty or sr.is_infinite:
251-
continue
252-
if s["font"] == "GlyphLessFont":
253-
ocr_spans += 1
254-
elif s["alpha"] == 0:
255-
continue # skip invisible text
256-
chars_total += len(s["text"].strip())
257-
chars_bad += len([c for c in s["text"] if c == chr(0xFFFD)])
258-
txt_rect |= sr
259-
txt_area += sr.width * sr.height
260-
elif (
261-
1
262-
and b["type"] == 3 # vector block
263-
and b["stroked"] # has been stroked
264-
and bbox.width <= 20 # width limit for typical characters
265-
and bbox.height <= 20 # height limit for typical characters
266-
and not b["isrect"] # contains curves
267-
):
268-
# potential character-like vector block
269-
vec_rect |= bbox
270-
vec_area += area
271-
272-
# the rectangle on page covered by some content
273-
covered = img_rect | txt_rect | vec_rect
274-
cover_area = abs(covered)
275-
analysis = {
276-
"covered": covered,
277-
"img_joins": (abs(img_rect) / cover_area) if cover_area else 0,
278-
"img_area": img_area / cover_area if cover_area else 0,
279-
"txt_joins": (abs(txt_rect) / cover_area) if cover_area else 0,
280-
"txt_area": txt_area / cover_area if cover_area else 0,
281-
"vec_area": vec_area / cover_area if cover_area else 0,
282-
"vec_joins": (abs(vec_rect) / cover_area) if cover_area else 0,
283-
"chars_total": chars_total,
284-
"chars_bad": chars_bad,
285-
"ocr_spans": ocr_spans,
286-
}
287-
return analysis
288-
289-
290195
def should_ocr_page(
291196
page,
292197
dpi=150,

pymupdf4llm/pymupdf4llm/helpers/document_layout.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@
1818
from pymupdf4llm.helpers.progress import ProgressBar
1919
try:
2020
import cv2
21-
from pymupdf4llm.helpers import check_ocr
21+
22+
if hasattr(cv2, "Canny"):
23+
from pymupdf4llm.helpers import check_ocr
24+
else:
25+
cv2 = None
2226
except ImportError:
2327
cv2 = None
2428

@@ -777,6 +781,7 @@ def parse_document(
777781
embed_images=False,
778782
write_images=False,
779783
force_text=False,
784+
use_ocr=True,
780785
) -> ParsedDocument:
781786
if isinstance(doc, pymupdf.Document):
782787
mydoc = doc
@@ -803,14 +808,17 @@ def parse_document(
803808
raise ValueError("Cannot both embed and write images.")
804809
document.embed_images = embed_images
805810
document.write_images = write_images
806-
try:
807-
reason = "OpenCV not installed"
808-
assert cv2 is not None
809-
reason = "Tesseract language data not found"
810-
assert pymupdf.get_tessdata()
811-
document.use_ocr = True
812-
except Exception as e:
813-
print(f"{reason}. OCR disabled.", file=INFO_MESSAGES)
811+
if use_ocr:
812+
try:
813+
reason = "OpenCV not installed"
814+
assert cv2 is not None
815+
reason = "Tesseract language data not found"
816+
assert pymupdf.get_tessdata()
817+
document.use_ocr = True
818+
except Exception as e:
819+
print(f"OCR disabled: {reason}.")
820+
document.use_ocr = False
821+
else:
814822
document.use_ocr = False
815823
if pages is None:
816824
page_filter = range(mydoc.page_count)
@@ -848,6 +856,8 @@ def parse_document(
848856
)
849857
else:
850858
decision = {"should_ocr": False}
859+
page_analysis = utils.analyze_page(page, blocks)
860+
decision["has_ocr_text"] = page_analysis["ocr_spans"] > 0
851861

852862
if decision["has_ocr_text"]: # prevent MD styling if already OCR'd
853863
page_full_ocred = True

pymupdf4llm/pymupdf4llm/helpers/utils.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,112 @@
4646
+ list(map(chr, range(0x25A0, 0x2600)))
4747
)
4848

49+
# Text extraction flags that analyze_page() passes to page.get_text("dict")
# when the caller did not supply the blocks.
FLAGS = (
    pymupdf.TEXT_COLLECT_STYLES
    | pymupdf.TEXT_COLLECT_VECTORS
    | pymupdf.TEXT_PRESERVE_IMAGES
    | pymupdf.TEXT_ACCURATE_BBOXES
    # | pymupdf.TEXT_MEDIABOX_CLIP
)

# U+FFFD, the Unicode replacement character.
REPLACEMENT_CHARACTER = "\ufffd"
59+
60+
61+
def is_white(text):
    """Return True if every character of 'text' is in WHITE_CHARS.

    An empty 'text' is reported as white.
    """
    return all(ch in WHITE_CHARS for ch in text)
64+
65+
66+
def analyze_page(page, blocks=None) -> dict:
    """Analyze the page for the OCR decision.

    Args:
        page: the page to analyze. Its 'rect' is intersected with every
            block and span bbox; its text is only extracted here when
            'blocks' is None.
        blocks: the "blocks" list of page.get_text("dict") if already
            available. Spans must provide "text", "bbox", "font" and
            "alpha"; blocks of type 3 must provide "stroked" and "isrect".
    Returns:
        A dict with analysis results. The area-related float values are
        computed as fractions of the total covered area, and are 0 if
        that area is 0.

        "covered": pymupdf.Rect, page area covered by content
        "img_joins": float, fraction of area of the joined images
        "img_area": float, fraction of sum of image area sizes
        "txt_joins": float, fraction of area of the joined text spans
        "txt_area": float, fraction of sum of text span bbox area sizes
        "vec_joins": float, fraction of area of the joined vector characters
        "vec_area": float, fraction of sum of vector character area sizes
        "chars_total": int, count of characters (stripped span text) in
            the spans that were not skipped
        "chars_bad": int, count of Replacement Unicode characters
        "ocr_spans": int, count of text spans with 'GlyphLessFont'

    """
    chars_total = 0
    chars_bad = 0
    if blocks is None:
        blocks = page.get_text(
            "dict",
            flags=FLAGS,
            clip=pymupdf.INFINITE_RECT(),
        )["blocks"]
    img_rect = pymupdf.EMPTY_RECT()
    # unary '+' gives each join its own rectangle object to start from
    txt_rect = +img_rect
    vec_rect = +img_rect
    img_area = 0
    txt_area = 0
    vec_area = 0
    ocr_spans = 0
    for b in blocks:
        # only the part of the block inside the page counts
        bbox = page.rect & b["bbox"]
        area = bbox.width * bbox.height
        if not area:
            continue
        if b["type"] == 1:  # Image block
            img_rect |= bbox
            img_area += area
        elif b["type"] == 0:  # Text block
            for line in b["lines"]:
                for s in line["spans"]:
                    if is_white(s["text"]):
                        continue
                    sr = page.rect & s["bbox"]
                    if sr.is_empty or sr.is_infinite:
                        continue
                    if s["font"] == "GlyphLessFont":
                        # counted (and kept) regardless of its alpha value
                        ocr_spans += 1
                    elif s["alpha"] == 0:
                        continue  # skip invisible text
                    chars_total += len(s["text"].strip())
                    chars_bad += s["text"].count(REPLACEMENT_CHARACTER)
                    txt_rect |= sr
                    txt_area += sr.width * sr.height
        elif (
            1
            and b["type"] == 3  # vector block
            and b["stroked"]  # has been stroked
            and bbox.width <= 20  # width limit for typical characters
            and bbox.height <= 20  # height limit for typical characters
            and not b["isrect"]  # contains curves
        ):
            # potential character-like vector block
            vec_rect |= bbox
            vec_area += area

    # the rectangle on page covered by some content
    covered = img_rect | txt_rect | vec_rect
    cover_area = abs(covered)
    analysis = {
        "covered": covered,
        "img_joins": (abs(img_rect) / cover_area) if cover_area else 0,
        "img_area": img_area / cover_area if cover_area else 0,
        "txt_joins": (abs(txt_rect) / cover_area) if cover_area else 0,
        "txt_area": txt_area / cover_area if cover_area else 0,
        "vec_area": vec_area / cover_area if cover_area else 0,
        "vec_joins": (abs(vec_rect) / cover_area) if cover_area else 0,
        "chars_total": chars_total,
        "chars_bad": chars_bad,
        "ocr_spans": ocr_spans,
    }
    return analysis
154+
49155

50156
def table_cleaner(page, blocks, tbbox):
51157
"""Clean the table bbox 'tbbox'.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Generated file - do not edit.
22
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
3-
VERSION = '0.2.3'
3+
VERSION = '0.2.4'

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"Topic :: Utilities",
1515
]
1616

17-
version = "0.2.3"
17+
version = "0.2.4"
1818
requires = ["pymupdf>=1.26.6", "tabulate"]
1919

2020
text = requires[0].split("=")[1]

0 commit comments

Comments
 (0)