77
77
FieldFlag ,
78
78
FileSpecificationDictionaryEntries ,
79
79
GoToActionArguments ,
80
+ ImageType ,
80
81
InteractiveFormDictEntries ,
81
82
PageLabelStyle ,
82
83
TypFitArguments ,
132
133
133
134
134
135
class ObjectDeletionFlag (enum .IntFlag ):
136
+ NONE = 0
135
137
TEXT = enum .auto ()
136
- IMAGES = enum .auto ()
137
138
LINKS = enum .auto ()
138
139
ATTACHMENTS = enum .auto ()
139
140
OBJECTS_3D = enum .auto ()
140
141
ALL_ANNOTATIONS = enum .auto ()
142
+ XOBJECT_IMAGES = enum .auto ()
143
+ INLINE_IMAGES = enum .auto ()
144
+ DRAWING_IMAGES = enum .auto ()
145
+ IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES
141
146
142
147
143
148
def _rolling_checksum (stream : BytesIO , blocksize : int = 65536 ) -> str :
@@ -2193,33 +2198,42 @@ def remove_objects_from_page(
2193
2198
if to_delete & ObjectDeletionFlag .ALL_ANNOTATIONS :
2194
2199
return self ._remove_annots_from_page (page , None )
2195
2200
2196
- if to_delete & ObjectDeletionFlag .IMAGES :
2201
+ jump_operators = []
2202
+ if to_delete & ObjectDeletionFlag .DRAWING_IMAGES :
2197
2203
jump_operators = (
2198
2204
[b"w" , b"J" , b"j" , b"M" , b"d" , b"i" ]
2199
2205
+ [b"W" , b"W*" ]
2200
2206
+ [b"b" , b"b*" , b"B" , b"B*" , b"S" , b"s" , b"f" , b"f*" , b"F" , b"n" ]
2201
2207
+ [b"m" , b"l" , b"c" , b"v" , b"y" , b"h" , b"re" ]
2202
2208
+ [b"sh" ]
2203
2209
)
2204
- else : # del text
2210
+ if to_delete & ObjectDeletionFlag . TEXT :
2205
2211
jump_operators = [b"Tj" , b"TJ" , b"'" , b'"' ]
2206
2212
2207
2213
def clean (content : ContentStream , images : List [str ], forms : List [str ]) -> None :
2208
- nonlocal to_delete
2214
+ nonlocal jump_operators , to_delete
2209
2215
i = 0
2210
2216
while i < len (content .operations ):
2211
2217
operands , operator = content .operations [i ]
2212
- if operator in jump_operators :
2218
+ if (
2219
+ (
2220
+ operator == b"INLINE IMAGE"
2221
+ and (
2222
+ cast (ObjectDeletionFlag , to_delete )
2223
+ & ObjectDeletionFlag .INLINE_IMAGES
2224
+ )
2225
+ )
2226
+ or (operator in jump_operators )
2227
+ or (
2228
+ operator == b"Do"
2229
+ and (
2230
+ cast (ObjectDeletionFlag , to_delete )
2231
+ & ObjectDeletionFlag .XOBJECT_IMAGES
2232
+ )
2233
+ and (operands [0 ] in images )
2234
+ )
2235
+ ):
2213
2236
del content .operations [i ]
2214
- elif operator == b"Do" :
2215
- if (
2216
- to_delete & ObjectDeletionFlag .IMAGES
2217
- and operands [0 ] in images
2218
- or to_delete & ObjectDeletionFlag .TEXT
2219
- and operands [0 ] in forms
2220
- ):
2221
- del content .operations [i ]
2222
- i += 1
2223
2237
else :
2224
2238
i += 1
2225
2239
content .get_data () # this ensures ._data is rebuilt from the .operations
@@ -2242,23 +2256,25 @@ def clean_forms(
2242
2256
try :
2243
2257
content : Any = None
2244
2258
if (
2245
- to_delete & ObjectDeletionFlag .IMAGES
2259
+ to_delete
2260
+ & ObjectDeletionFlag .XOBJECT_IMAGES
2246
2261
and o ["/Subtype" ] == "/Image"
2247
2262
):
2248
- content = NullObject ()
2263
+ content = NullObject () # to delete the image keeping the entry
2249
2264
images .append (k )
2250
2265
if o ["/Subtype" ] == "/Form" :
2251
2266
forms .append (k )
2252
2267
if isinstance (o , ContentStream ):
2253
2268
content = o
2254
2269
else :
2255
2270
content = ContentStream (o , self )
2256
- content .update (o .items ())
2257
- for k1 in ["/Length" , "/Filter" , "/DecodeParms" ]:
2258
- try :
2259
- del content [k1 ]
2260
- except KeyError :
2261
- pass
2271
+ content .update (
2272
+ {
2273
+ k1 : v1
2274
+ for k1 , v1 in o .items ()
2275
+ if k1 not in ["/Length" , "/Filter" , "/DecodeParms" ]
2276
+ }
2277
+ )
2262
2278
clean_forms (content , stack + [elt ]) # clean sub forms
2263
2279
if content is not None :
2264
2280
if isinstance (v , IndirectObject ):
@@ -2269,6 +2285,8 @@ def clean_forms(
2269
2285
d [k ] = self ._add_object (content ) # pragma: no cover
2270
2286
except (TypeError , KeyError ):
2271
2287
pass
2288
+ for im in images :
2289
+ del d [im ] # for clean-up
2272
2290
if isinstance (elt , StreamObject ): # for /Form
2273
2291
if not isinstance (elt , ContentStream ): # pragma: no cover
2274
2292
e = ContentStream (elt , self )
@@ -2277,40 +2295,57 @@ def clean_forms(
2277
2295
clean (elt , images , forms ) # clean the content
2278
2296
return images , forms
2279
2297
2298
+ if not isinstance (page , PageObject ):
2299
+ page = PageObject (self , page .indirect_reference ) # pragma: no cover
2280
2300
if "/Contents" in page :
2281
- content = page [ "/Contents" ]. get_object ( )
2301
+ content = cast ( ContentStream , page . get_contents () )
2282
2302
2283
- if not isinstance (content , ContentStream ):
2284
- content = ContentStream (content , page )
2285
2303
images , forms = clean_forms (page , [])
2286
2304
2287
2305
clean (content , images , forms )
2288
- if isinstance (page ["/Contents" ], ArrayObject ):
2289
- for o in page ["/Contents" ]:
2290
- self ._objects [o .idnum - 1 ] = NullObject ()
2291
- try :
2292
- self ._objects [
2293
- cast (IndirectObject , page ["/Contents" ].indirect_reference ).idnum - 1
2294
- ] = NullObject ()
2295
- except AttributeError :
2296
- pass
2297
- page [NameObject ("/Contents" )] = self ._add_object (content )
2306
+ page .replace_contents (content )
2298
2307
2299
- def remove_images (self , ignore_byte_string_object : Optional [bool ] = None ) -> None :
2308
+ def remove_images (
2309
+ self ,
2310
+ to_delete : ImageType = ImageType .ALL ,
2311
+ ignore_byte_string_object : Optional [bool ] = None ,
2312
+ ) -> None :
2300
2313
"""
2301
2314
Remove images from this output.
2302
2315
2303
2316
Args:
2317
+ to_delete : The type of images to be deleted
2318
+ (default = all images types)
2304
2319
ignore_byte_string_object: deprecated
2305
2320
"""
2321
+ if isinstance (to_delete , bool ):
2322
+ ignore_byte_string_object = to_delete
2323
+ to_delete = ImageType .ALL
2306
2324
if ignore_byte_string_object is not None :
2307
2325
warnings .warn (
2308
2326
"The 'ignore_byte_string_object' argument of remove_images is "
2309
2327
"deprecated and will be removed in pypdf 4.0.0." ,
2310
2328
category = DeprecationWarning ,
2311
2329
)
2330
+ i = (
2331
+ (
2332
+ ObjectDeletionFlag .XOBJECT_IMAGES
2333
+ if to_delete & ImageType .XOBJECT_IMAGES
2334
+ else ObjectDeletionFlag .NONE
2335
+ )
2336
+ | (
2337
+ ObjectDeletionFlag .INLINE_IMAGES
2338
+ if to_delete & ImageType .INLINE_IMAGES
2339
+ else ObjectDeletionFlag .NONE
2340
+ )
2341
+ | (
2342
+ ObjectDeletionFlag .DRAWING_IMAGES
2343
+ if to_delete & ImageType .DRAWING_IMAGES
2344
+ else ObjectDeletionFlag .NONE
2345
+ )
2346
+ )
2312
2347
for page in self .pages :
2313
- self .remove_objects_from_page (page , ObjectDeletionFlag . IMAGES )
2348
+ self .remove_objects_from_page (page , i )
2314
2349
2315
2350
def removeImages (self , ignoreByteStringObject : bool = False ) -> None : # deprecated
2316
2351
"""
@@ -2319,7 +2354,7 @@ def removeImages(self, ignoreByteStringObject: bool = False) -> None: # depreca
2319
2354
.. deprecated:: 1.28.0
2320
2355
"""
2321
2356
deprecation_with_replacement ("removeImages" , "remove_images" , "3.0.0" )
2322
- return self .remove_images (ignoreByteStringObject )
2357
+ return self .remove_images ()
2323
2358
2324
2359
def remove_text (self , ignore_byte_string_object : Optional [bool ] = None ) -> None :
2325
2360
"""
0 commit comments