Skip to content

Commit 9afda0a

Browse files
authored
ENH: Add parameter to select images to be removed (#2214)
Closes #2208
1 parent 9b23ac3 commit 9afda0a

File tree

4 files changed: +116 lines added, -39 lines removed

pypdf/__init__.py

+2
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@
1414
from ._reader import DocumentInformation, PdfFileReader, PdfReader
1515
from ._version import __version__
1616
from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
17+
from .constants import ImageType
1718
from .pagerange import PageRange, parse_filename_page_ranges
1819
from .papersizes import PaperSize
1920

@@ -31,6 +32,7 @@
3132
__all__ = [
3233
"__version__",
3334
"_debug_versions",
35+
"ImageType",
3436
"mult",
3537
"PageRange",
3638
"PaperSize",

pypdf/_writer.py

+73-38
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,7 @@
7777
FieldFlag,
7878
FileSpecificationDictionaryEntries,
7979
GoToActionArguments,
80+
ImageType,
8081
InteractiveFormDictEntries,
8182
PageLabelStyle,
8283
TypFitArguments,
@@ -132,12 +133,16 @@
132133

133134

134135
class ObjectDeletionFlag(enum.IntFlag):
    """
    Bit flags selecting which kinds of objects are removed from a page.

    Members can be combined with ``|``. ``IMAGES`` is a composite flag
    covering every image-related member, so code that passes ``IMAGES``
    removes all three image kinds.
    """

    NONE = 0  # nothing selected; neutral element when building a mask with |
    TEXT = enum.auto()
    LINKS = enum.auto()
    ATTACHMENTS = enum.auto()
    OBJECTS_3D = enum.auto()
    ALL_ANNOTATIONS = enum.auto()
    XOBJECT_IMAGES = enum.auto()  # images referenced from the /XObject resources
    INLINE_IMAGES = enum.auto()  # images embedded directly in the content stream
    DRAWING_IMAGES = enum.auto()  # content drawn with path/painting operators
    # Composite alias: must be defined after the three members it combines.
    IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
141146

142147

143148
def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
@@ -2193,33 +2198,42 @@ def remove_objects_from_page(
21932198
if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS:
21942199
return self._remove_annots_from_page(page, None)
21952200

2196-
if to_delete & ObjectDeletionFlag.IMAGES:
2201+
jump_operators = []
2202+
if to_delete & ObjectDeletionFlag.DRAWING_IMAGES:
21972203
jump_operators = (
21982204
[b"w", b"J", b"j", b"M", b"d", b"i"]
21992205
+ [b"W", b"W*"]
22002206
+ [b"b", b"b*", b"B", b"B*", b"S", b"s", b"f", b"f*", b"F", b"n"]
22012207
+ [b"m", b"l", b"c", b"v", b"y", b"h", b"re"]
22022208
+ [b"sh"]
22032209
)
2204-
else: # del text
2210+
if to_delete & ObjectDeletionFlag.TEXT:
22052211
jump_operators = [b"Tj", b"TJ", b"'", b'"']
22062212

22072213
def clean(content: ContentStream, images: List[str], forms: List[str]) -> None:
2208-
nonlocal to_delete
2214+
nonlocal jump_operators, to_delete
22092215
i = 0
22102216
while i < len(content.operations):
22112217
operands, operator = content.operations[i]
2212-
if operator in jump_operators:
2218+
if (
2219+
(
2220+
operator == b"INLINE IMAGE"
2221+
and (
2222+
cast(ObjectDeletionFlag, to_delete)
2223+
& ObjectDeletionFlag.INLINE_IMAGES
2224+
)
2225+
)
2226+
or (operator in jump_operators)
2227+
or (
2228+
operator == b"Do"
2229+
and (
2230+
cast(ObjectDeletionFlag, to_delete)
2231+
& ObjectDeletionFlag.XOBJECT_IMAGES
2232+
)
2233+
and (operands[0] in images)
2234+
)
2235+
):
22132236
del content.operations[i]
2214-
elif operator == b"Do":
2215-
if (
2216-
to_delete & ObjectDeletionFlag.IMAGES
2217-
and operands[0] in images
2218-
or to_delete & ObjectDeletionFlag.TEXT
2219-
and operands[0] in forms
2220-
):
2221-
del content.operations[i]
2222-
i += 1
22232237
else:
22242238
i += 1
22252239
content.get_data() # this ensures ._data is rebuilt from the .operations
@@ -2242,23 +2256,25 @@ def clean_forms(
22422256
try:
22432257
content: Any = None
22442258
if (
2245-
to_delete & ObjectDeletionFlag.IMAGES
2259+
to_delete
2260+
& ObjectDeletionFlag.XOBJECT_IMAGES
22462261
and o["/Subtype"] == "/Image"
22472262
):
2248-
content = NullObject()
2263+
content = NullObject() # to delete the image keeping the entry
22492264
images.append(k)
22502265
if o["/Subtype"] == "/Form":
22512266
forms.append(k)
22522267
if isinstance(o, ContentStream):
22532268
content = o
22542269
else:
22552270
content = ContentStream(o, self)
2256-
content.update(o.items())
2257-
for k1 in ["/Length", "/Filter", "/DecodeParms"]:
2258-
try:
2259-
del content[k1]
2260-
except KeyError:
2261-
pass
2271+
content.update(
2272+
{
2273+
k1: v1
2274+
for k1, v1 in o.items()
2275+
if k1 not in ["/Length", "/Filter", "/DecodeParms"]
2276+
}
2277+
)
22622278
clean_forms(content, stack + [elt]) # clean sub forms
22632279
if content is not None:
22642280
if isinstance(v, IndirectObject):
@@ -2269,6 +2285,8 @@ def clean_forms(
22692285
d[k] = self._add_object(content) # pragma: no cover
22702286
except (TypeError, KeyError):
22712287
pass
2288+
for im in images:
2289+
del d[im] # for clean-up
22722290
if isinstance(elt, StreamObject): # for /Form
22732291
if not isinstance(elt, ContentStream): # pragma: no cover
22742292
e = ContentStream(elt, self)
@@ -2277,40 +2295,57 @@ def clean_forms(
22772295
clean(elt, images, forms) # clean the content
22782296
return images, forms
22792297

2298+
if not isinstance(page, PageObject):
2299+
page = PageObject(self, page.indirect_reference) # pragma: no cover
22802300
if "/Contents" in page:
2281-
content = page["/Contents"].get_object()
2301+
content = cast(ContentStream, page.get_contents())
22822302

2283-
if not isinstance(content, ContentStream):
2284-
content = ContentStream(content, page)
22852303
images, forms = clean_forms(page, [])
22862304

22872305
clean(content, images, forms)
2288-
if isinstance(page["/Contents"], ArrayObject):
2289-
for o in page["/Contents"]:
2290-
self._objects[o.idnum - 1] = NullObject()
2291-
try:
2292-
self._objects[
2293-
cast(IndirectObject, page["/Contents"].indirect_reference).idnum - 1
2294-
] = NullObject()
2295-
except AttributeError:
2296-
pass
2297-
page[NameObject("/Contents")] = self._add_object(content)
2306+
page.replace_contents(content)
22982307

2299-
def remove_images(
    self,
    to_delete: ImageType = ImageType.ALL,
    ignore_byte_string_object: Optional[bool] = None,
) -> None:
    """
    Remove images from this output.

    Args:
        to_delete: The types of images to be deleted
            (default: all image types).
        ignore_byte_string_object: deprecated
    """
    if isinstance(to_delete, bool):
        # Backward compatibility: the first positional parameter used to be
        # ``ignore_byte_string_object``, so ``remove_images(True)`` must
        # still work and be reported as a use of the deprecated argument.
        ignore_byte_string_object = to_delete
        to_delete = ImageType.ALL
    if ignore_byte_string_object is not None:
        warnings.warn(
            "The 'ignore_byte_string_object' argument of remove_images is "
            "deprecated and will be removed in pypdf 4.0.0.",
            category=DeprecationWarning,
            # Attribute the warning to the caller rather than to this
            # module, so it is visible under the default warning filters.
            stacklevel=2,
        )

    # Translate the public ImageType selection into the matching
    # ObjectDeletionFlag members understood by remove_objects_from_page.
    flags = ObjectDeletionFlag.NONE
    for image_type, deletion_flag in (
        (ImageType.XOBJECT_IMAGES, ObjectDeletionFlag.XOBJECT_IMAGES),
        (ImageType.INLINE_IMAGES, ObjectDeletionFlag.INLINE_IMAGES),
        (ImageType.DRAWING_IMAGES, ObjectDeletionFlag.DRAWING_IMAGES),
    ):
        if to_delete & image_type:
            flags |= deletion_flag

    for page in self.pages:
        self.remove_objects_from_page(page, flags)
23142349

23152350
def removeImages(self, ignoreByteStringObject: bool = False) -> None: # deprecated
23162351
"""
@@ -2319,7 +2354,7 @@ def removeImages(self, ignoreByteStringObject: bool = False) -> None: # depreca
23192354
.. deprecated:: 1.28.0
23202355
"""
23212356
deprecation_with_replacement("removeImages", "remove_images", "3.0.0")
2322-
return self.remove_images(ignoreByteStringObject)
2357+
return self.remove_images()
23232358

23242359
def remove_text(self, ignore_byte_string_object: Optional[bool] = None) -> None:
23252360
"""

pypdf/constants.py

+10-1
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
PDF Reference, sixth edition, Version 1.7, 2006.
99
"""
1010

11-
from enum import IntFlag
11+
from enum import IntFlag, auto
1212
from typing import Dict, Tuple
1313

1414

@@ -585,3 +585,12 @@ class AnnotationFlag(IntFlag):
585585
TypArguments,
586586
TypFitArguments,
587587
)
588+
589+
590+
class ImageType(IntFlag):
    """
    Bit flags naming the kinds of images that can be selected.

    Members can be combined with ``|``; ``ALL`` selects every kind.
    """

    NONE = 0  # no image type selected
    XOBJECT_IMAGES = auto()
    INLINE_IMAGES = auto()
    DRAWING_IMAGES = auto()
    # Composite of the three members above; must be defined after them.
    ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
    IMAGES = ALL  # for consistency with ObjectDeletionFlag

tests/test_writer.py

+31
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
import pytest
1010

1111
from pypdf import (
12+
ImageType,
1213
ObjectDeletionFlag,
1314
PageObject,
1415
PdfMerger,
@@ -1862,6 +1863,36 @@ def test_object_contains_indirect_reference_to_self():
18621863
writer.append(reader)
18631864

18641865

1866+
def test_remove_image_per_type():
    """Check that remove_images() only removes the requested image types."""
    # Inline images: no BI/ID/EI markers may remain in the page content.
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "reportlab-inline-image.pdf")
    writer.remove_images(ImageType.INLINE_IMAGES)

    inline_data = writer.pages[0].get_contents().get_data()
    assert all(x not in inline_data for x in (b"BI", b"ID", b"EI"))

    # Legacy positional bool must still be accepted, with a deprecation
    # warning. pytest.warns asserts the warning itself, independent of
    # whether the test configuration promotes warnings to errors.
    with pytest.warns(DeprecationWarning, match="ignore_byte_string_object"):
        writer.remove_images(True)

    # Drawing operators are removed while text operators are kept.
    writer = PdfWriter(clone_from=RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf")
    writer.remove_images(ImageType.DRAWING_IMAGES)
    page_data = writer.pages[1].get_contents().get_data()
    assert all(x not in page_data for x in (b" re\n", b"W*", b"f*"))
    assert all(x in page_data for x in (b" TJ\n", b"rg", b"Tm"))
    # The form XObject of page 9 must have been cleaned as well.
    form_data = writer.pages[9]["/Resources"]["/XObject"]["/Meta84"].get_data()
    assert all(x not in form_data for x in (b" re\n", b"W*", b"f*"))

    # XObject images: the Do calls and the resource entries are both gone.
    writer.remove_images(ImageType.XOBJECT_IMAGES)
    assert b"Do\n" not in writer.pages[0].get_contents().get_data()
    assert len(writer.pages[0]["/Resources"]["/XObject"]) == 0
1894+
1895+
18651896
@pytest.mark.enable_socket()
18661897
def test_add_outlines_on_empty_dict():
18671898
"""Cf #2233"""

0 commit comments

Comments
 (0)