diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d9b410..512b165 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # CHANGELOG +## Version 0.5.0, not released yet + +### New Features (ENH) +- New `extract-annotated-pages` to filter out only the user annotated pages ([PR #98](https://github.com/py-pdf/pdfly/pull/98)) + + ## Version 0.4.0, 2024-12-08 ### New Features (ENH) diff --git a/README.md b/README.md index c32b269..3eef4b5 100644 --- a/README.md +++ b/README.md @@ -33,23 +33,24 @@ $ pdfly --help pdfly is a pure-python cli application for manipulating PDF files. -╭─ Options ───────────────────────────────────────────────────────────────────╮ -│ --version │ -│ --help Show this message and exit. │ -╰─────────────────────────────────────────────────────────────────────────────╯ -╭─ Commands ──────────────────────────────────────────────────────────────────╮ -│ 2-up Create a booklet-style PDF from a single input. │ -│ cat Concatenate pages from PDF files into a single PDF file. │ -│ compress Compress a PDF. │ -| uncompress Uncompresses a PDF. │ -│ extract-images Extract images from PDF without resampling or altering. │ -│ extract-text Extract text from a PDF file. │ -│ meta Show metadata of a PDF file │ -│ pagemeta Give details about a single page. │ -│ rm Remove pages from PDF files. │ -│ update-offsets Updates offsets and lengths in a simple PDF file. │ -│ x2pdf Convert one or more files to PDF. Each file is a page. │ -╰─────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────╮ +│ --version │ +│ --help Show this message and exit. │ +╰──────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────────────╮ +│ 2-up Create a booklet-style PDF from a single input. │ +│ cat Concatenate pages from PDF files into a single PDF file. 
│ +│ compress Compress a PDF. │ +│ uncompress Uncompresses a PDF. │ +│ extract-annotated-pages Extract only the annotated pages from a PDF. │ +│ extract-images Extract images from PDF without resampling or altering. │ +│ extract-text Extract text from a PDF file. │ +│ meta Show metadata of a PDF file │ +│ pagemeta Give details about a single page. │ +│ rm Remove pages from PDF files. │ +│ update-offsets Updates offsets and lengths in a simple PDF file. │ +│ x2pdf Convert one or more files to PDF. Each file is a page. │ +╰──────────────────────────────────────────────────────────────────────────────────────╯ ``` You can see the help of every subcommand by typing `--help`: @@ -63,13 +64,13 @@ $ pdfly 2-up --help Pairs of two pages will be put on one page (left and right) usage: python 2-up.py input_file output_file -╭─ Arguments ─────────────────────────────────────────────────────────────────╮ -│ * pdf PATH [default: None] [required] │ -│ * out PATH [default: None] [required] │ -╰─────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ───────────────────────────────────────────────────────────────────╮ -│ --help Show this message and exit. │ -╰─────────────────────────────────────────────────────────────────────────────╯ +╭─ Arguments ──────────────────────────────────────────────────────────────────────────╮ +│ * pdf PATH [default: None] [required] │ +│ * out PATH [default: None] [required] │ +╰──────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────╮ +│ --help Show this message and exit.
│
+╰──────────────────────────────────────────────────────────────────────────────────────╯
 ```
 
 ## Contributors ✨
diff --git a/pdfly/cli.py b/pdfly/cli.py
index 4559eed..279b694 100644
--- a/pdfly/cli.py
+++ b/pdfly/cli.py
@@ -13,6 +13,7 @@
 import pdfly.booklet
 import pdfly.cat
 import pdfly.compress
+import pdfly.extract_annotated_pages
 import pdfly.extract_images
 import pdfly.metadata
 import pdfly.pagemeta
@@ -285,7 +286,7 @@ def update_offsets(
     ],
     file_out: Annotated[
         Path, typer.Option("-o", "--output")  # noqa
-    ] = None,  # type: ignore
+    ] = None,  # type: ignore[assignment]
     encoding: str = typer.Option(
         "ISO-8859-1",
         help="Encoding used to read and write the files, e.g. UTF-8.",
@@ -321,3 +322,28 @@ def x2pdf(
     exit_code = pdfly.x2pdf.main(x, output)
     if exit_code:
         raise typer.Exit(code=exit_code)
+
+
+@entry_point.command(name="extract-annotated-pages", help=pdfly.extract_annotated_pages.__doc__)  # type: ignore[misc]
+def extract_annotated_pages(
+    input_pdf: Annotated[
+        Path,
+        typer.Argument(
+            dir_okay=False,
+            exists=True,
+            resolve_path=True,
+            help="Input PDF file.",
+        ),
+    ],
+    output_pdf: Annotated[
+        Optional[Path],
+        typer.Option(
+            "--output",
+            "-o",
+            writable=True,
+            help="Output PDF file. Defaults to '<input name>_annotated.pdf' next to the input file.",
+        ),
+    ] = None,
+) -> None:
+    """Forward both paths to pdfly.extract_annotated_pages.main (CLI help comes from `help=`)."""
+    pdfly.extract_annotated_pages.main(input_pdf, output_pdf)
diff --git a/pdfly/extract_annotated_pages.py b/pdfly/extract_annotated_pages.py
new file mode 100644
index 0000000..91e7afb
--- /dev/null
+++ b/pdfly/extract_annotated_pages.py
@@ -0,0 +1,39 @@
+"""
+Extract only the annotated pages from a PDF.
+
+Q: Why does this help?
+A: https://github.com/py-pdf/pdfly/issues/97
+"""
+
+from pathlib import Path
+from typing import Optional
+
+from pypdf import PdfReader, PdfWriter
+from pypdf.annotations import AnnotationDictionary
+from pypdf.generic import ArrayObject  # noqa: TCH002
+
+
+# Check if an annotation is manipulable.
+def is_manipulable(annot: AnnotationDictionary) -> bool:
+    """Return True unless the annotation's "/Subtype" is "/Link"."""
+    return annot.get("/Subtype") not in ["/Link"]
+
+
+def main(input_pdf: Path, output_pdf: Optional[Path]) -> None:
+    """Copy pages having a non-link annotation into output_pdf."""
+    if not output_pdf:
+        output_pdf = input_pdf.with_name(input_pdf.stem + "_annotated.pdf")
+    reader = PdfReader(input_pdf)
+    writer = PdfWriter()
+    output_pages = 0
+    # Pages with no /Annots, or with only /Link annotations, are skipped
+    for page in reader.pages:
+        if "/Annots" not in page:
+            continue
+        page_annots: ArrayObject = page["/Annots"]  # type: ignore[assignment]
+        if not any(is_manipulable(annot) for annot in page_annots):
+            continue
+        writer.add_page(page)
+        output_pages += 1
+    writer.write(output_pdf)
+    print(f"Extracted {output_pages} pages with annotations to {output_pdf}")
diff --git a/resources/input8.pdf b/resources/input8.pdf
index f11b56d..513b7e7 100644
Binary files a/resources/input8.pdf and b/resources/input8.pdf differ
diff --git a/tests/test_extract_annotated_pages.py b/tests/test_extract_annotated_pages.py
new file mode 100644
index 0000000..98b45cd
--- /dev/null
+++ b/tests/test_extract_annotated_pages.py
@@ -0,0 +1,17 @@
+from .conftest import RESOURCES_ROOT, chdir, run_cli
+
+
+def test_extract_annotated_pages_input8(capsys, tmp_path):
+    with chdir(tmp_path):
+        run_cli(
+            [
+                "extract-annotated-pages",
+                str(RESOURCES_ROOT / "input8.pdf"),
+                # Default output lands next to the input; keep it in tmp_path.
+                "--output",
+                "annotated.pdf",
+            ]
+        )
+    captured = capsys.readouterr()
+    assert not captured.err
+    assert "Extracted 1 pages with annotations" in captured.out