Skip to content

Commit 26d1615

Browse files
authored
ENH: Add decode_as_image() to ContentStreams (#2615)
Closes #2613
1 parent 4b086ef commit 26d1615

File tree

3 files changed

+68
-0
lines changed

3 files changed

+68
-0
lines changed

docs/user/extract-images.md

+22
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,25 @@ for image_file_object in page.images:
1919
fp.write(image_file_object.data)
2020
count += 1
2121
```
22+
23+
# Other images
24+
25+
Some other objects can contain images, such as stamp annotations.
26+
27+
For example, this document contains such stamps:
28+
[test_stamp.pdf](https://github.com/user-attachments/files/15751424/test_stamp.pdf)
29+
30+
You can extract the image from the annotation with the following code:
31+
32+
```python
33+
from pypdf import PdfReader

reader = PdfReader("test_stamp.pdf")

# Resolve the first annotation of the first page, then follow its
# /AP -> /N -> /Resources -> /XObject entries to reach the image stream.
annotation = reader.pages[0]["/Annots"][0].get_object()
xobjects = annotation["/AP"]["/N"]["/Resources"]["/XObject"]
im = xobjects["/Im4"].decode_as_image()

im.show()
43+
```

pypdf/generic/_data_structures.py

+25
Original file line numberDiff line numberDiff line change
@@ -955,6 +955,31 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
955955
retval._data = FlateDecode.encode(b_(self._data), level)
956956
return retval
957957

958+
def decode_as_image(self) -> Any:
    """
    Try to decode the stream object as an image.

    If the stream's ``/Subtype`` is not ``/Image`` a warning is logged,
    but decoding is still attempted.

    Returns:
        A PIL image if proper decoding has been found, or ``None`` when
        the decoder reports no file extension for the stream.

    Raises:
        Exception: Any error raised while decoding (for example because
            the object is not a valid image) is propagated to the caller.
            It is recommended to catch exceptions to prevent
            stops in your program.
    """
    # Imported locally rather than at module level
    # (presumably to avoid a circular import with the filters module - confirm).
    from ..filters import _xobj_to_image

    if self.get("/Subtype", "") != "/Image":
        # The attribute may be missing altogether or set to None; in both
        # cases fall back to the object's repr so the warning never reads
        # "None does not seem to be an Image".
        reference = getattr(self, "indirect_reference", None)
        if reference is None:
            msg = f"{self!r} object does not seem to be an Image"  # pragma: no cover
        else:
            msg = f"{reference} does not seem to be an Image"  # pragma: no cover
        logger_warning(msg, __name__)
    # Only the extension and the decoded image are needed here; the raw
    # byte stream returned by the decoder is deliberately ignored.
    extension, _, img = _xobj_to_image(self)
    if extension is None:
        return None  # pragma: no cover
    return img
982+
958983

959984
class DecodedStreamObject(StreamObject):
960985
pass

tests/test_images.py

+21
Original file line numberDiff line numberDiff line change
@@ -441,3 +441,24 @@ def test_inline_image_extraction():
441441
name = "iss2598d.png"
442442
img = Image.open(BytesIO(get_data_from_url(url, name=name)))
443443
assert image_similarity(reader.pages[0].images[0].image, img) == 1
444+
445+
446+
@pytest.mark.enable_socket()
def test_extract_image_from_object(caplog):
    """
    Check decode_as_image() on an image XObject and on a non-image stream.

    The image XObject must decode to a PIL image. The page content stream is
    not an image: decoding it must raise and log a warning, both without and
    with an indirect reference set on the stream.
    """
    url = "https://github.com/py-pdf/pypdf/files/15176076/B2.pdf"
    name = "iss2613.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    image = reader.pages[0]["/Resources"]["/Pattern"]["/P1"]["/Resources"]["/XObject"][
        "/X1"
    ].decode_as_image()
    assert isinstance(image, Image.Image)

    # Fetch the content stream once, outside pytest.raises, so that only
    # decode_as_image() is expected to fail and the same object is reused below.
    co = reader.pages[0].get_contents()
    with pytest.raises(Exception):
        co.decode_as_image()
    assert "does not seem to be an Image" in caplog.text

    caplog.clear()
    # Set the reference on the very object that is decoded next; re-fetching
    # the contents here would discard it and leave this branch untested.
    co.indirect_reference = "for_test"
    with pytest.raises(Exception):
        co.decode_as_image()
    assert "for_test does not seem to be an Image" in caplog.text

0 commit comments

Comments
 (0)