Skip to content

Commit bff4b8e

Browse files
Kaos599 and Lucas-C authored
Implement uncompress functionality for PDF files (#75)
Co-authored-by: Lucas Cimon <[email protected]>
1 parent 32f68fd commit bff4b8e

7 files changed

+125
-8
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ $ pdfly --help
3434
│ 2-up Create a booklet-style PDF from a single input. │
3535
│ cat Concatenate pages from PDF files into a single PDF file. │
3636
│ compress Compress a PDF. │
37+
│ uncompress Uncompress a PDF. │
3738
│ extract-images Extract images from PDF without resampling or altering. │
3839
│ extract-text Extract text from a PDF file. │
3940
│ meta Show metadata of a PDF file │

pdfly/cli.py

+25
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import pdfly.metadata
1717
import pdfly.pagemeta
1818
import pdfly.rm
19+
import pdfly.uncompress
1920
import pdfly.up2
2021
import pdfly.update_offsets
2122
import pdfly.x2pdf
@@ -205,6 +206,30 @@ def compress(
205206
pdfly.compress.main(pdf, output)
206207

207208

209+
@entry_point.command(name="uncompress", help=pdfly.uncompress.__doc__)  # type: ignore[misc]
def uncompress(
    # Input PDF: declared as an existing, readable file (not a directory);
    # the path is resolved before it reaches the callback.
    pdf: Annotated[
        Path,
        typer.Argument(
            exists=True,
            readable=True,
            writable=False,
            file_okay=True,
            dir_okay=False,
            resolve_path=True,
        ),
    ],
    # Output PDF: only required to be writable; it does not have to exist yet.
    output: Annotated[
        Path,
        typer.Argument(writable=True, exists=False),
    ],
) -> None:
    # Thin CLI wrapper: all of the work happens in pdfly.uncompress.main.
    pdfly.uncompress.main(pdf, output)
231+
232+
208233
@entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__) # type: ignore[misc]
209234
def update_offsets(
210235
file_in: Annotated[

pdfly/uncompress.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
"""Module for uncompressing PDF content streams."""
2+
3+
import zlib
from pathlib import Path
from typing import Any, Optional

from pypdf import PdfReader, PdfWriter
from pypdf.generic import IndirectObject, PdfObject
9+
10+
11+
def main(pdf: Path, output: Path) -> None:
    """Write a copy of *pdf* to *output* with its page content streams inflated.

    Each page of the input is visited.  The stream(s) referenced by the page's
    ``/Contents`` entry -- either a single stream or an array of streams -- are
    passed to :func:`decompress_content_stream`, then the page is added to a
    new writer which is saved to *output*.  The size of both files is printed
    once the output has been written.

    Args:
        pdf: Path of the PDF file to read.
        output: Path the resulting PDF is written to.
    """
    reader = PdfReader(pdf)
    writer = PdfWriter()

    for page in reader.pages:
        if "/Contents" in page:
            contents: Optional[PdfObject] = page["/Contents"]
            if isinstance(contents, IndirectObject):
                contents = contents.get_object()
            if isinstance(contents, list):
                # Array of streams: only indirect entries are processed.
                for content in contents:
                    if isinstance(content, IndirectObject):
                        decompress_content_stream(content)
            elif contents is not None:
                # Single stream.  The reference was resolved above, so testing
                # for IndirectObject here would never match and the stream
                # would be skipped; hand over the resolved object instead.
                decompress_content_stream(contents)
        writer.add_page(page)

    with open(output, "wb") as fp:
        writer.write(fp)

    orig_size = pdf.stat().st_size
    uncomp_size = output.stat().st_size

    print(f"Original Size : {orig_size:,}")
    print(
        f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)"
    )


def decompress_content_stream(content: PdfObject) -> None:
    """Inflate a content stream in place if its filter is ``/FlateDecode``.

    *content* may be the stream itself or an indirect reference to it; it is
    resolved with ``get_object()`` first so that the data and the ``/Filter``
    entry are changed on the stream rather than on the reference.  Targets
    that are missing, are not dictionary-like, or use another filter are left
    untouched.  A zlib failure is reported on stdout and the stream is left
    as it was.

    Args:
        content: The content stream, or a reference to it.
    """
    stream: Any = content.get_object()
    if not isinstance(stream, dict) or stream.get("/Filter") != "/FlateDecode":
        return

    try:
        # NOTE(review): this assumes get_data() hands back the raw, still
        # deflated bytes.  If the pypdf version in use already applies the
        # stream's filters in get_data(), zlib will reject the data and the
        # stream stays compressed -- confirm against the pypdf documentation.
        compressed_data = stream.get_data()
        uncompressed_data = zlib.decompress(compressed_data)
    except zlib.error as error:
        print(
            f"Some content stream with /FlateDecode failed to be decompressed: {error}"
        )
        return

    stream.set_data(uncompressed_data)
    del stream["/Filter"]

tests/conftest.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
import os
44
from pathlib import Path
55

6-
from fpdf import FPDF
76
import pytest
7+
from fpdf import FPDF
8+
89
from pdfly.cli import entry_point
910

1011
try:
@@ -35,7 +36,7 @@ def run_cli(args):
3536
return error.code
3637

3738

38-
@pytest.fixture
39+
@pytest.fixture()
3940
def two_pages_pdf_filepath(tmp_path):
4041
"A PDF with 2 pages, and a different image on each page"
4142
# Note: prior to v2.7.9, fpdf2 produced incorrect /Resources dicts for each page (cf. fpdf2 PR #1133),
@@ -50,7 +51,7 @@ def two_pages_pdf_filepath(tmp_path):
5051
return pdf_filepath
5152

5253

53-
@pytest.fixture
54+
@pytest.fixture()
5455
def pdf_file_100(tmp_path):
5556
"""A PDF with 100 pages; each has only the page index on it."""
5657
pdf = FPDF()
@@ -65,7 +66,7 @@ def pdf_file_100(tmp_path):
6566
return pdf_filepath
6667

6768

68-
@pytest.fixture
69+
@pytest.fixture()
6970
def pdf_file_abc(tmp_path):
7071
"""A PDF with 100 pages; each has only the page index on it."""
7172
pdf = FPDF()

tests/test_extract_images.py

-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import pytest
2-
31
from .conftest import RESOURCES_ROOT, chdir, run_cli
42

53

tests/test_uncompress.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
"""Tests for the `uncompress` command."""
2+
3+
from pathlib import Path
4+
5+
import pytest
6+
from pypdf import PdfReader
7+
from typer.testing import CliRunner
8+
9+
from pdfly.cli import entry_point
10+
11+
runner = CliRunner()
12+
13+
14+
@pytest.mark.parametrize(
    # Sorted so the parametrization order does not depend on directory listing order.
    "input_pdf_filepath",
    sorted(Path("sample-files").glob("*.pdf")),
)
def test_uncompress_all_sample_files(
    input_pdf_filepath: Path, tmp_path: Path
) -> None:
    """Run ``uncompress`` on a sample PDF and check no page content stream keeps a filter."""
    output_pdf_filepath = tmp_path / "uncompressed_output.pdf"

    result = runner.invoke(
        entry_point,
        ["uncompress", str(input_pdf_filepath), str(output_pdf_filepath)],
    )

    assert (
        result.exit_code == 0
    ), f"Error in uncompressing {input_pdf_filepath}: {result.output}"
    assert (
        output_pdf_filepath.exists()
    ), f"Output PDF {output_pdf_filepath} does not exist."

    reader = PdfReader(str(output_pdf_filepath))
    for page in reader.pages:
        contents = page.get("/Contents")
        if not contents:
            continue
        contents = contents.get_object()
        # /Contents may be one stream or an array of streams.  Testing
        # `"/Filter" not in <array>` would only compare against the array's
        # elements and always pass, so every stream is inspected on its own.
        streams = contents if isinstance(contents, list) else [contents]
        for stream in streams:
            assert (
                "/Filter" not in stream.get_object()
            ), "Content stream is still compressed"

tests/test_update_offsets.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44
Here should only be end-to-end tests.
55
"""
66

7+
import re
78
from pathlib import Path
89

910
import pytest
10-
import re
1111

12-
from .conftest import RESOURCES_ROOT, chdir, run_cli
12+
from .conftest import RESOURCES_ROOT, run_cli
1313

1414

1515
def test_update_offsets(capsys, tmp_path: Path) -> None:

0 commit comments

Comments
 (0)