|
1 |
| -"""Module for uncompressing PDF content streams.""" |
2 |
| - |
3 |
| -from pathlib import Path |
4 |
| -from typing import Optional |
5 |
| -import zlib |
6 |
| - |
7 |
| -from pypdf import PdfReader, PdfWriter |
8 |
| -from pypdf.generic import IndirectObject, PdfObject |
9 |
| - |
10 |
| - |
11 |
def main(pdf: Path, output: Path) -> None:
    """Copy ``pdf`` to ``output``, decompressing the pages' content streams.

    Each page's ``/Contents`` entry (a single stream or an array of stream
    references) is handed to ``decompress_content_stream`` before the page is
    added to the writer. After writing, the sizes of both files are printed.

    Args:
        pdf: Path of the PDF file to read.
        output: Path the rewritten PDF is written to.
    """
    reader = PdfReader(pdf)
    writer = PdfWriter()

    for page in reader.pages:
        if "/Contents" in page:
            contents: Optional[PdfObject] = page["/Contents"]
            if isinstance(contents, IndirectObject):
                contents = contents.get_object()
            if isinstance(contents, list):
                # Array form: every entry is expected to be a stream reference.
                for content in contents:
                    if isinstance(content, IndirectObject):
                        decompress_content_stream(content)
            elif hasattr(contents, "get_data"):
                # Single-stream form. Any reference has already been resolved
                # above, so checking for IndirectObject here would never match
                # and the stream would be skipped; test for a stream-like
                # object instead (this also filters out None).
                decompress_content_stream(contents)
        writer.add_page(page)

    with open(output, "wb") as fp:
        writer.write(fp)

    orig_size = pdf.stat().st_size
    uncomp_size = output.stat().st_size

    print(f"Original Size : {orig_size:,}")
    print(
        f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)"
    )
39 |
| - |
40 |
| - |
41 |
def decompress_content_stream(content: IndirectObject) -> None:
    """Decompress a content stream in place if its only filter is FlateDecode.

    The stream's data is replaced by the zlib-decompressed bytes and its
    ``/Filter`` entry is removed. Streams with any other filter (or none) are
    left untouched. If decompression fails, a message is printed and the
    stream is left as it was.

    Args:
        content: Reference to the content stream; an already-resolved stream
            object is accepted as well.
    """
    # Resolve once and work on the stream dictionary itself, so that the item
    # deletion below does not depend on the reference wrapper supporting it.
    stream = content.get_object()
    if stream is None:
        return

    # /Filter may be written as a single name or as an array of names; a
    # one-element array is equivalent to the bare name.
    if stream.get("/Filter") not in ("/FlateDecode", ["/FlateDecode"]):
        return

    try:
        # NOTE(review): assumes get_data() yields the still-compressed bytes
        # and set_data() stores its argument as-is - confirm against the pypdf
        # version in use.
        uncompressed_data = zlib.decompress(stream.get_data())
    except zlib.error as error:
        print(
            f"Some content stream with /FlateDecode failed to be decompressed: {error}"
        )
        return

    stream.set_data(uncompressed_data)
    del stream["/Filter"]
| 1 | +"""Module for uncompressing PDF content streams.""" |
| 2 | + |
| 3 | +import zlib |
| 4 | +from pathlib import Path |
| 5 | +from typing import Optional |
| 6 | + |
| 7 | +from pypdf import PdfReader, PdfWriter |
| 8 | +from pypdf.generic import IndirectObject, PdfObject |
| 9 | + |
| 10 | + |
def main(pdf: Path, output: Path) -> None:
    """Copy ``pdf`` to ``output``, decompressing the pages' content streams.

    Each page's ``/Contents`` entry (a single stream or an array of stream
    references) is handed to ``decompress_content_stream`` before the page is
    added to the writer. After writing, the sizes of both files are printed.

    Args:
        pdf: Path of the PDF file to read.
        output: Path the rewritten PDF is written to.
    """
    reader = PdfReader(pdf)
    writer = PdfWriter()

    for page in reader.pages:
        if "/Contents" in page:
            contents: Optional[PdfObject] = page["/Contents"]
            if isinstance(contents, IndirectObject):
                contents = contents.get_object()
            if isinstance(contents, list):
                # Array form: every entry is expected to be a stream reference.
                for content in contents:
                    if isinstance(content, IndirectObject):
                        decompress_content_stream(content)
            elif hasattr(contents, "get_data"):
                # Single-stream form. Any reference has already been resolved
                # above, so checking for IndirectObject here would never match
                # and the stream would be skipped; test for a stream-like
                # object instead (this also filters out None).
                decompress_content_stream(contents)
        writer.add_page(page)

    with open(output, "wb") as fp:
        writer.write(fp)

    orig_size = pdf.stat().st_size
    uncomp_size = output.stat().st_size

    print(f"Original Size : {orig_size:,}")
    print(
        f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)"
    )
| 39 | + |
| 40 | + |
def decompress_content_stream(content: IndirectObject) -> None:
    """Decompress a content stream in place if its only filter is FlateDecode.

    The stream's data is replaced by the zlib-decompressed bytes and its
    ``/Filter`` entry is removed. Streams with any other filter (or none) are
    left untouched. If decompression fails, a message is printed and the
    stream is left as it was.

    Args:
        content: Reference to the content stream; an already-resolved stream
            object is accepted as well.
    """
    # Resolve once and work on the stream dictionary itself, so that the item
    # deletion below does not depend on the reference wrapper supporting it.
    stream = content.get_object()
    if stream is None:
        return

    # /Filter may be written as a single name or as an array of names; a
    # one-element array is equivalent to the bare name.
    if stream.get("/Filter") not in ("/FlateDecode", ["/FlateDecode"]):
        return

    try:
        # NOTE(review): assumes get_data() yields the still-compressed bytes
        # and set_data() stores its argument as-is - confirm against the pypdf
        # version in use.
        uncompressed_data = zlib.decompress(stream.get_data())
    except zlib.error as error:
        print(
            f"Some content stream with /FlateDecode failed to be decompressed: {error}"
        )
        return

    stream.set_data(uncompressed_data)
    del stream["/Filter"]
0 commit comments