From 0b4b4b7b75d8f89e3a21162ffdc77b8204587d8f Mon Sep 17 00:00:00 2001 From: Harsh Date: Fri, 15 Nov 2024 09:47:35 +0000 Subject: [PATCH 1/7] Add requested changes 1 --- pdfly/cli.py | 25 ++++++++++++++++++++++ pdfly/uncompress.py | 52 +++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 3 files changed, 78 insertions(+) create mode 100644 pdfly/uncompress.py diff --git a/pdfly/cli.py b/pdfly/cli.py index d736d33..7ceeb40 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -19,6 +19,7 @@ import pdfly.up2 import pdfly.update_offsets import pdfly.x2pdf +import pdfly.uncompress def version_callback(value: bool) -> None: @@ -205,6 +206,30 @@ def compress( pdfly.compress.main(pdf, output) +@entry_point.command(name="uncompress", help=pdfly.uncompress.__doc__) # type: ignore[misc] +def uncompress( + pdf: Annotated[ + Path, + typer.Argument( + exists=True, + file_okay=True, + dir_okay=False, + writable=False, + readable=True, + resolve_path=True, + ), + ], + output: Annotated[ + Path, + typer.Argument( + exists=False, + writable=True, + ), + ], +) -> None: + pdfly.uncompress.main(pdf, output) + + @entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__) # type: ignore[misc] def update_offsets( file_in: Annotated[ diff --git a/pdfly/uncompress.py b/pdfly/uncompress.py new file mode 100644 index 0000000..a834384 --- /dev/null +++ b/pdfly/uncompress.py @@ -0,0 +1,52 @@ +"""Module for uncompressing PDF content streams.""" + +from pathlib import Path +from typing import Optional +import zlib + +from pypdf import PdfReader, PdfWriter +from pypdf.generic import IndirectObject, PdfObject + + +def main(pdf: Path, output: Path) -> None: + reader = PdfReader(pdf) + writer = PdfWriter() + + for page in reader.pages: + if "/Contents" in page: + contents: Optional[PdfObject] = page["/Contents"] + if isinstance(contents, IndirectObject): + contents = contents.get_object() + if contents is not None: + if isinstance(contents, list): + for content in contents: + if isinstance(content, IndirectObject): + decompress_content_stream(content) + elif isinstance(contents, IndirectObject): + decompress_content_stream(contents) + writer.add_page(page) + + with open(output, "wb") as fp: + writer.write(fp) + + orig_size = pdf.stat().st_size + uncomp_size = output.stat().st_size + + print(f"Original Size : {orig_size:,}") + print( + f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)" + ) + + +def decompress_content_stream(content: IndirectObject) -> None: + """Decompress a content stream if it uses FlateDecode.""" + if content.get("/Filter") == "/FlateDecode": + try: + compressed_data = content.get_data() + uncompressed_data = zlib.decompress(compressed_data) + content.set_data(uncompressed_data) + del content["/Filter"] + except zlib.error as error: + print( + f"Some content stream with /FlateDecode failed to be decompressed: {error}" + ) diff --git a/pyproject.toml b/pyproject.toml index 7394a52..6adc60c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -138,6 +138,7 @@ ignore = [ "SLF001", # Private member accessed "INP001", # File `docs/conf.py` is part of an implicit namespace package. Add an `__init__.py`. "FA100", # Missing `from __future__ import annotations`, but uses `typing.Optional` + "I001" #Imports not at the top of the file. ] [tool.ruff.mccabe] From 85a9e26f9075f354a19dc04c95fe78f5f11fab3c Mon Sep 17 00:00:00 2001 From: Harsh Date: Fri, 15 Nov 2024 10:31:31 +0000 Subject: [PATCH 2/7] Added uncompress comand to README & Unit tests for uncompress.py --- README.md | 1 + tests/test_uncompress.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 tests/test_uncompress.py diff --git a/README.md b/README.md index 0667e65..a749d60 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ $ pdfly --help │ 2-up Create a booklet-style PDF from a single input. │ │ cat Concatenate pages from PDF files into a single PDF file. │ │ compress Compress a PDF. │ +| uncompress Uncompresses a PDF. │ │ extract-images Extract images from PDF without resampling or altering. │ │ extract-text Extract text from a PDF file. │ │ meta Show metadata of a PDF file │ diff --git a/tests/test_uncompress.py b/tests/test_uncompress.py new file mode 100644 index 0000000..1aec7de --- /dev/null +++ b/tests/test_uncompress.py @@ -0,0 +1,37 @@ +"""Tests for the `uncompress` command.""" + +import pytest +from pathlib import Path +from pdfly.cli import entry_point +from typer.testing import CliRunner + +runner = CliRunner() + +@pytest.mark.parametrize( + "input_pdf_filepath", Path("sample-files").glob("*.pdf") +) +def test_uncompress_all_sample_files(input_pdf_filepath: Path, tmp_path: Path) -> None: + + output_pdf_filepath = tmp_path / "uncompressed_output.pdf" + + result = runner.invoke( + entry_point, + ["uncompress", str(input_pdf_filepath), str(output_pdf_filepath)], + ) + + assert ( + result.exit_code == 0 + ), f"Error in uncompressing {input_pdf_filepath}: {result.output}" + assert ( + output_pdf_filepath.exists() + ), f"Output PDF {output_pdf_filepath} does not exist." + + from pypdf import PdfReader + + reader = PdfReader(str(output_pdf_filepath)) + for page in reader.pages: + contents = page.get("/Contents") + if contents: + assert ( + "/Filter" not in contents + ), "Content stream is still compressed" \ No newline at end of file From e263a3075653820c08bc805261e8c0d65986219d Mon Sep 17 00:00:00 2001 From: Harsh Date: Fri, 15 Nov 2024 10:33:04 +0000 Subject: [PATCH 3/7] Fixed Black linting --- tests/test_uncompress.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_uncompress.py b/tests/test_uncompress.py index 1aec7de..32c0373 100644 --- a/tests/test_uncompress.py +++ b/tests/test_uncompress.py @@ -7,11 +7,14 @@ runner = CliRunner() + @pytest.mark.parametrize( "input_pdf_filepath", Path("sample-files").glob("*.pdf") ) -def test_uncompress_all_sample_files(input_pdf_filepath: Path, tmp_path: Path) -> None: - +def test_uncompress_all_sample_files( + input_pdf_filepath: Path, tmp_path: Path +) -> None: + output_pdf_filepath = tmp_path / "uncompressed_output.pdf" result = runner.invoke( @@ -34,4 +37,4 @@ def test_uncompress_all_sample_files(input_pdf_filepath: Path, tmp_path: Path) - if contents: assert ( "/Filter" not in contents - ), "Content stream is still compressed" \ No newline at end of file + ), "Content stream is still compressed" From 4110a439b0aecabf351b122ea1c6a2334f03ac48 Mon Sep 17 00:00:00 2001 From: Harsh Date: Fri, 15 Nov 2024 10:36:52 +0000 Subject: [PATCH 4/7] fixed black 2 --- tests/test_uncompress.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/test_uncompress.py b/tests/test_uncompress.py index 32c0373..965467e 100644 --- a/tests/test_uncompress.py +++ b/tests/test_uncompress.py @@ -8,12 +8,8 @@ runner = CliRunner() -@pytest.mark.parametrize( - "input_pdf_filepath", Path("sample-files").glob("*.pdf") -) -def test_uncompress_all_sample_files( - input_pdf_filepath: Path, tmp_path: Path -) -> None: +@pytest.mark.parametrize("input_pdf_filepath", Path("sample-files").glob("*.pdf")) +def test_uncompress_all_sample_files(input_pdf_filepath: Path, tmp_path: Path) -> None: output_pdf_filepath = tmp_path / "uncompressed_output.pdf" @@ -35,6 +31,4 @@ def test_uncompress_all_sample_files( for page in reader.pages: contents = page.get("/Contents") if contents: - assert ( - "/Filter" not in contents - ), "Content stream is still compressed" + assert "/Filter" not in contents, "Content stream is still compressed" From dfa139a27b7f18cb88b5fb0b57f8635cd2ba3f81 Mon Sep 17 00:00:00 2001 From: Harsh Date: Fri, 15 Nov 2024 10:38:14 +0000 Subject: [PATCH 5/7] fixed black 2 --- tests/test_uncompress.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_uncompress.py b/tests/test_uncompress.py index 965467e..32c0373 100644 --- a/tests/test_uncompress.py +++ b/tests/test_uncompress.py @@ -8,8 +8,12 @@ runner = CliRunner() -@pytest.mark.parametrize("input_pdf_filepath", Path("sample-files").glob("*.pdf")) -def test_uncompress_all_sample_files(input_pdf_filepath: Path, tmp_path: Path) -> None: +@pytest.mark.parametrize( + "input_pdf_filepath", Path("sample-files").glob("*.pdf") +) +def test_uncompress_all_sample_files( + input_pdf_filepath: Path, tmp_path: Path +) -> None: output_pdf_filepath = tmp_path / "uncompressed_output.pdf" @@ -31,4 +35,6 @@ def test_uncompress_all_sample_files(input_pdf_filepath: Path, tmp_path: Path) - for page in reader.pages: contents = page.get("/Contents") if contents: - assert "/Filter" not in contents, "Content stream is still compressed" + assert ( + "/Filter" not in contents + ), "Content stream is still compressed" From 1806128a8bc78bed9e1c635e2297f3e2fc47144f Mon Sep 17 00:00:00 2001 From: Lucas Cimon <925560+Lucas-C@users.noreply.github.com> Date: Fri, 15 Nov 2024 14:54:57 +0100 Subject: [PATCH 6/7] Pleasing black --- tests/test_uncompress.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_uncompress.py b/tests/test_uncompress.py index 32c0373..d3239ad 100644 --- a/tests/test_uncompress.py +++ b/tests/test_uncompress.py @@ -14,7 +14,6 @@ def test_uncompress_all_sample_files( input_pdf_filepath: Path, tmp_path: Path ) -> None: - output_pdf_filepath = tmp_path / "uncompressed_output.pdf" result = runner.invoke( From 32c4ec0b0b4d3ec58f1c58098a078da68de421c7 Mon Sep 17 00:00:00 2001 From: Lucas Cimon <925560+Lucas-C@users.noreply.github.com> Date: Fri, 15 Nov 2024 15:01:59 +0100 Subject: [PATCH 7/7] Pleasing ruff --- pdfly/cli.py | 2 +- pdfly/uncompress.py | 104 +++++++++++++++++------------------ pyproject.toml | 1 - tests/conftest.py | 9 +-- tests/test_extract_images.py | 2 - tests/test_uncompress.py | 9 +-- tests/test_update_offsets.py | 4 +- 7 files changed, 65 insertions(+), 66 deletions(-) diff --git a/pdfly/cli.py b/pdfly/cli.py index 7ceeb40..9c5fa31 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -16,10 +16,10 @@ import pdfly.metadata import pdfly.pagemeta import pdfly.rm +import pdfly.uncompress import pdfly.up2 import pdfly.update_offsets import pdfly.x2pdf -import pdfly.uncompress def version_callback(value: bool) -> None: diff --git a/pdfly/uncompress.py b/pdfly/uncompress.py index a834384..a543473 100644 --- a/pdfly/uncompress.py +++ b/pdfly/uncompress.py @@ -1,52 +1,52 @@ -"""Module for uncompressing PDF content streams.""" - -from pathlib import Path -from typing import Optional -import zlib - -from pypdf import PdfReader, PdfWriter -from pypdf.generic import IndirectObject, PdfObject - - -def main(pdf: Path, output: Path) -> None: - reader = PdfReader(pdf) - writer = PdfWriter() - - for page in reader.pages: - if "/Contents" in page: - contents: Optional[PdfObject] = page["/Contents"] - if isinstance(contents, IndirectObject): - contents = contents.get_object() - if contents is not None: - if isinstance(contents, list): - for content in contents: - if isinstance(content, IndirectObject): - decompress_content_stream(content) - elif isinstance(contents, IndirectObject): - decompress_content_stream(contents) - writer.add_page(page) - - with open(output, "wb") as fp: - writer.write(fp) - - orig_size = pdf.stat().st_size - uncomp_size = output.stat().st_size - - print(f"Original Size : {orig_size:,}") - print( - f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)" - ) - - -def decompress_content_stream(content: IndirectObject) -> None: - """Decompress a content stream if it uses FlateDecode.""" - if content.get("/Filter") == "/FlateDecode": - try: - compressed_data = content.get_data() - uncompressed_data = zlib.decompress(compressed_data) - content.set_data(uncompressed_data) - del content["/Filter"] - except zlib.error as error: - print( - f"Some content stream with /FlateDecode failed to be decompressed: {error}" - ) +"""Module for uncompressing PDF content streams.""" + +import zlib +from pathlib import Path +from typing import Optional + +from pypdf import PdfReader, PdfWriter +from pypdf.generic import IndirectObject, PdfObject + + +def main(pdf: Path, output: Path) -> None: + reader = PdfReader(pdf) + writer = PdfWriter() + + for page in reader.pages: + if "/Contents" in page: + contents: Optional[PdfObject] = page["/Contents"] + if isinstance(contents, IndirectObject): + contents = contents.get_object() + if contents is not None: + if isinstance(contents, list): + for content in contents: + if isinstance(content, IndirectObject): + decompress_content_stream(content) + elif isinstance(contents, IndirectObject): + decompress_content_stream(contents) + writer.add_page(page) + + with open(output, "wb") as fp: + writer.write(fp) + + orig_size = pdf.stat().st_size + uncomp_size = output.stat().st_size + + print(f"Original Size : {orig_size:,}") + print( + f"Uncompressed Size: {uncomp_size:,} ({(uncomp_size / orig_size) * 100:.1f}% of original)" + ) + + +def decompress_content_stream(content: IndirectObject) -> None: + """Decompress a content stream if it uses FlateDecode.""" + if content.get("/Filter") == "/FlateDecode": + try: + compressed_data = content.get_data() + uncompressed_data = zlib.decompress(compressed_data) + content.set_data(uncompressed_data) + del content["/Filter"] + except zlib.error as error: + print( + f"Some content stream with /FlateDecode failed to be decompressed: {error}" + ) diff --git a/pyproject.toml b/pyproject.toml index 6adc60c..7394a52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -138,7 +138,6 @@ ignore = [ "SLF001", # Private member accessed "INP001", # File `docs/conf.py` is part of an implicit namespace package. Add an `__init__.py`. "FA100", # Missing `from __future__ import annotations`, but uses `typing.Optional` - "I001" #Imports not at the top of the file. ] [tool.ruff.mccabe] diff --git a/tests/conftest.py b/tests/conftest.py index 9ab40d4..181d60c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,8 +3,9 @@ import os from pathlib import Path -from fpdf import FPDF import pytest +from fpdf import FPDF + from pdfly.cli import entry_point try: @@ -35,7 +36,7 @@ def run_cli(args): return error.code -@pytest.fixture +@pytest.fixture() def two_pages_pdf_filepath(tmp_path): "A PDF with 2 pages, and a different image on each page" # Note: prior to v2.7.9, fpdf2 produced incorrect /Resources dicts for each page (cf. fpdf2 PR #1133), @@ -50,7 +51,7 @@ def two_pages_pdf_filepath(tmp_path): return pdf_filepath -@pytest.fixture +@pytest.fixture() def pdf_file_100(tmp_path): """A PDF with 100 pages; each has only the page index on it.""" pdf = FPDF() @@ -65,7 +66,7 @@ def pdf_file_100(tmp_path): return pdf_filepath -@pytest.fixture +@pytest.fixture() def pdf_file_abc(tmp_path): """A PDF with 100 pages; each has only the page index on it.""" pdf = FPDF() diff --git a/tests/test_extract_images.py b/tests/test_extract_images.py index 3a3025e..4a0df2b 100644 --- a/tests/test_extract_images.py +++ b/tests/test_extract_images.py @@ -1,5 +1,3 @@ -import pytest - from .conftest import RESOURCES_ROOT, chdir, run_cli diff --git a/tests/test_uncompress.py b/tests/test_uncompress.py index d3239ad..dc19723 100644 --- a/tests/test_uncompress.py +++ b/tests/test_uncompress.py @@ -1,10 +1,13 @@ """Tests for the `uncompress` command.""" -import pytest from pathlib import Path -from pdfly.cli import entry_point + +import pytest +from pypdf import PdfReader from typer.testing import CliRunner +from pdfly.cli import entry_point + runner = CliRunner() @@ -28,8 +31,6 @@ def test_uncompress_all_sample_files( output_pdf_filepath.exists() ), f"Output PDF {output_pdf_filepath} does not exist." - from pypdf import PdfReader - reader = PdfReader(str(output_pdf_filepath)) for page in reader.pages: contents = page.get("/Contents") diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py index bd5d506..c239577 100644 --- a/tests/test_update_offsets.py +++ b/tests/test_update_offsets.py @@ -4,12 +4,12 @@ Here should only be end-to-end tests. """ +import re from pathlib import Path import pytest -import re -from .conftest import RESOURCES_ROOT, chdir, run_cli +from .conftest import RESOURCES_ROOT, run_cli def test_update_offsets(capsys, tmp_path: Path) -> None: