Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement uncompress functionality for PDF files #75

Merged
merged 7 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ $ pdfly --help
│ 2-up Create a booklet-style PDF from a single input. │
│ cat Concatenate pages from PDF files into a single PDF file. │
│ compress Compress a PDF. │
│ uncompress Uncompresses a PDF. │
│ extract-images Extract images from PDF without resampling or altering. │
│ extract-text Extract text from a PDF file. │
│ meta Show metadata of a PDF file │
Expand Down
25 changes: 25 additions & 0 deletions pdfly/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import pdfly.metadata
import pdfly.pagemeta
import pdfly.rm
import pdfly.uncompress
import pdfly.up2
import pdfly.update_offsets
import pdfly.x2pdf
Expand Down Expand Up @@ -205,6 +206,30 @@ def compress(
pdfly.compress.main(pdf, output)


@entry_point.command(name="uncompress", help=pdfly.uncompress.__doc__) # type: ignore[misc]
def uncompress(
pdf: Annotated[
Path,
typer.Argument(
exists=True,
file_okay=True,
dir_okay=False,
writable=False,
readable=True,
resolve_path=True,
Comment on lines +214 to +219
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
exists=True,
file_okay=True,
dir_okay=False,
writable=False,
readable=True,
resolve_path=True,
dir_okay=False,
exists=True,
resolve_path=True,

Removing the parameters provided that are identical to the default values: https://github.com/fastapi/typer/blob/master/typer/params.py#L301

),
],
output: Annotated[
Path,
typer.Argument(
exists=False,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
exists=False,

It can be really useful to overwrite an existing file, so I'm not sure this check is needed.
We already removed it from other subcommands.

writable=True,
),
],
) -> None:
pdfly.uncompress.main(pdf, output)


@entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__) # type: ignore[misc]
def update_offsets(
file_in: Annotated[
Expand Down
52 changes: 52 additions & 0 deletions pdfly/uncompress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Module for uncompressing PDF content streams."""

import zlib
from pathlib import Path
from typing import Optional, Union

from pypdf import PdfReader, PdfWriter
from pypdf.generic import IndirectObject, PdfObject, StreamObject


def main(pdf: Path, output: Path) -> None:
    """Copy *pdf* to *output*, expanding FlateDecode page content streams.

    Each page's ``/Contents`` entry is handed to
    :func:`decompress_content_stream`, whether it is a single stream or an
    array of stream references. The page is then added to a fresh writer,
    the writer is saved to *output*, and both file sizes are printed.

    Args:
        pdf: Path of the PDF file to read.
        output: Path the resulting PDF is written to.
    """
    reader = PdfReader(pdf)
    writer = PdfWriter()

    for page in reader.pages:
        if "/Contents" in page:
            contents: Optional[PdfObject] = page["/Contents"]
            if isinstance(contents, IndirectObject):
                contents = contents.get_object()
            if isinstance(contents, list):
                # An array of content streams: handle each referenced stream.
                for content in contents:
                    if isinstance(content, IndirectObject):
                        decompress_content_stream(content)
            elif isinstance(contents, StreamObject):
                # ``contents`` was resolved above, so it can no longer be an
                # IndirectObject at this point. Checking for the stream itself
                # keeps pages with a single content stream from being skipped.
                decompress_content_stream(contents)
        writer.add_page(page)

    with open(output, "wb") as fp:
        writer.write(fp)

    orig_size = pdf.stat().st_size
    uncomp_size = output.stat().st_size

    print(f"Original Size  : {orig_size:,}")
    print(
        f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)"
    )


def decompress_content_stream(
    content: Union[IndirectObject, StreamObject],
) -> None:
    """Decompress a content stream if it uses FlateDecode.

    Streams whose ``/Filter`` is not exactly ``/FlateDecode`` are left
    untouched. A ``zlib.error`` raised while inflating is reported on stdout
    and otherwise ignored, so one bad stream does not abort the whole run.

    Args:
        content: The content stream, or an indirect reference to it.
    """
    if content.get("/Filter") != "/FlateDecode":
        return
    try:
        # NOTE(review): this assumes get_data() returns the still-compressed
        # bytes. If pypdf already decodes the stream here, zlib.decompress()
        # will fail and land in the handler below -- confirm against pypdf.
        compressed_data = content.get_data()
        uncompressed_data = zlib.decompress(compressed_data)
        content.set_data(uncompressed_data)
        # The payload is now plain, so the filter entry must go with it.
        del content["/Filter"]
    except zlib.error as error:
        print(
            f"Some content stream with /FlateDecode failed to be decompressed: {error}"
        )
9 changes: 5 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
import os
from pathlib import Path

from fpdf import FPDF
import pytest
from fpdf import FPDF

from pdfly.cli import entry_point

try:
Expand Down Expand Up @@ -35,7 +36,7 @@ def run_cli(args):
return error.code


@pytest.fixture
@pytest.fixture()
def two_pages_pdf_filepath(tmp_path):
"A PDF with 2 pages, and a different image on each page"
# Note: prior to v2.7.9, fpdf2 produced incorrect /Resources dicts for each page (cf. fpdf2 PR #1133),
Expand All @@ -50,7 +51,7 @@ def two_pages_pdf_filepath(tmp_path):
return pdf_filepath


@pytest.fixture
@pytest.fixture()
def pdf_file_100(tmp_path):
"""A PDF with 100 pages; each has only the page index on it."""
pdf = FPDF()
Expand All @@ -65,7 +66,7 @@ def pdf_file_100(tmp_path):
return pdf_filepath


@pytest.fixture
@pytest.fixture()
def pdf_file_abc(tmp_path):
"""A PDF with 100 pages; each has only the page index on it."""
pdf = FPDF()
Expand Down
2 changes: 0 additions & 2 deletions tests/test_extract_images.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import pytest

from .conftest import RESOURCES_ROOT, chdir, run_cli


Expand Down
40 changes: 40 additions & 0 deletions tests/test_uncompress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""Tests for the `uncompress` command."""

from pathlib import Path

import pytest
from pypdf import PdfReader
from typer.testing import CliRunner

from pdfly.cli import entry_point

runner = CliRunner()


@pytest.mark.parametrize(
    "input_pdf_filepath", Path("sample-files").glob("*.pdf")
)
def test_uncompress_all_sample_files(
    input_pdf_filepath: Path, tmp_path: Path
) -> None:
    """Run ``uncompress`` on one sample PDF and inspect the written file.

    The command must exit with code 0 and create the output file, and no
    page's ``/Contents`` object in that output may contain ``/Filter``.
    """
    output_pdf_filepath = tmp_path / "uncompressed_output.pdf"
    cli_args = [
        "uncompress",
        str(input_pdf_filepath),
        str(output_pdf_filepath),
    ]

    result = runner.invoke(entry_point, cli_args)

    assert (
        result.exit_code == 0
    ), f"Error in uncompressing {input_pdf_filepath}: {result.output}"
    assert (
        output_pdf_filepath.exists()
    ), f"Output PDF {output_pdf_filepath} does not exist."

    for page in PdfReader(str(output_pdf_filepath)).pages:
        contents = page.get("/Contents")
        if not contents:
            continue
        # NOTE(review): when /Contents is an array, this membership test looks
        # at the array's items, not at each stream's dictionary -- confirm
        # that this is the intended check.
        assert (
            "/Filter" not in contents
        ), "Content stream is still compressed"
4 changes: 2 additions & 2 deletions tests/test_update_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
Here should only be end-to-end tests.
"""

import re
from pathlib import Path

import pytest
import re

from .conftest import RESOURCES_ROOT, chdir, run_cli
from .conftest import RESOURCES_ROOT, run_cli


def test_update_offsets(capsys, tmp_path: Path) -> None:
Expand Down
Loading