-
Notifications
You must be signed in to change notification settings - Fork 2k
/
Copy pathpdf_image_extractor.py
48 lines (47 loc) · 1.74 KB
/
pdf_image_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
import fitz # PyMuPDF
import io
from PIL import Image
# Output directory for the extracted images
output_dir = "extracted_images"
# Desired output image format (used both as the file extension and,
# upper-cased, as the format name handed to PIL when saving)
output_format = "png"
# Minimum width and height (in pixels) an image must have to be saved
min_width = 100
min_height = 100
# Create the output directory if it does not exist. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() before os.makedirs().
os.makedirs(output_dir, exist_ok=True)
# File path you want to extract images from
file = "1710.05006.pdf"
# Open the file. The context manager closes the document once extraction is
# finished, even if an error is raised part-way through.
with fitz.open(file) as pdf_file:
    # Iterate over PDF pages; page_index is zero-based
    for page_index, page in enumerate(pdf_file):
        # Get image list
        image_list = page.get_images(full=True)
        # Print the number of images found on this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
        else:
            print(f"[!] No images found on page {page_index}")
        # Iterate over the images on the page (image_index is one-based)
        for image_index, img in enumerate(image_list, start=1):
            # Get the XREF of the image
            xref = img[0]
            # Extract the image bytes
            image_bytes = pdf_file.extract_image(xref)["image"]
            # Load it to PIL; the context manager releases the image afterwards
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Skip images below the minimum dimensions
                if image.width < min_width or image.height < min_height:
                    print(f"[-] Skipping image {image_index} on page {page_index} due to its small size.")
                    continue
                # File names use a one-based page number
                output_path = os.path.join(
                    output_dir,
                    f"image{page_index + 1}_{image_index}.{output_format}",
                )
                # Pass the path rather than an open() handle so that PIL opens
                # and closes the output file itself (no leaked file handles).
                image.save(output_path, format=output_format.upper())