ENH: With extract-annotated-pages command (#98)

wolfram77 · Lucas-C · web-flow · commit 407e6cd1abdd · 2025-02-17T16:47:55.000+01:00
Co-authored-by: Lucas Cimon <925560+Lucas-C@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # CHANGELOG
 
+## Version 0.5.0, not released yet
+
+### New Features (ENH)
+- New `extract-annotated-pages` command to extract only the user-annotated pages of a PDF ([PR #98](https://github.com/py-pdf/pdfly/pull/98))
+
+
 ## Version 0.4.0, 2024-12-08
 
 ### New Features (ENH)
diff --git a/README.md b/README.md
@@ -33,23 +33,24 @@ $ pdfly --help
 
  pdfly is a pure-python cli application for manipulating PDF files.
 
-╭─ Options ───────────────────────────────────────────────────────────────────╮
-│ --version                                                                   │
-│ --help             Show this message and exit.                              │
-╰─────────────────────────────────────────────────────────────────────────────╯
-╭─ Commands ──────────────────────────────────────────────────────────────────╮
-│ 2-up             Create a booklet-style PDF from a single input.            │
-│ cat              Concatenate pages from PDF files into a single PDF file.   │
-│ compress         Compress a PDF.                                            │
-| uncompress       Uncompresses a PDF.                                        │
-│ extract-images   Extract images from PDF without resampling or altering.    │
-│ extract-text     Extract text from a PDF file.                              │
-│ meta             Show metadata of a PDF file                                │
-│ pagemeta         Give details about a single page.                          │
-│ rm               Remove pages from PDF files.                               │
-│ update-offsets   Updates offsets and lengths in a simple PDF file.          │
-│ x2pdf            Convert one or more files to PDF. Each file is a page.     │
-╰─────────────────────────────────────────────────────────────────────────────╯
+╭─ Options ────────────────────────────────────────────────────────────────────────────╮
+│ --version                                                                            │
+│ --help                    Show this message and exit.                                │
+╰──────────────────────────────────────────────────────────────────────────────────────╯
+╭─ Commands ───────────────────────────────────────────────────────────────────────────╮
+│ 2-up                      Create a booklet-style PDF from a single input.            │
+│ cat                       Concatenate pages from PDF files into a single PDF file.   │
+│ compress                  Compress a PDF.                                            │
+│ uncompress                Uncompresses a PDF.                                        │
+│ extract-annotated-pages   Extract only the annotated pages from a PDF.               │
+│ extract-images            Extract images from PDF without resampling or altering.    │
+│ extract-text              Extract text from a PDF file.                              │
+│ meta                      Show metadata of a PDF file                                │
+│ pagemeta                  Give details about a single page.                          │
+│ rm                        Remove pages from PDF files.                               │
+│ update-offsets            Updates offsets and lengths in a simple PDF file.          │
+│ x2pdf                     Convert one or more files to PDF. Each file is a page.     │
+╰──────────────────────────────────────────────────────────────────────────────────────╯
 ```
 
 You can see the help of every subcommand by typing `--help`:
@@ -63,13 +64,13 @@ $ pdfly 2-up --help
  Pairs of two pages will be put on one page (left and right)
  usage: python 2-up.py input_file output_file
 
-╭─ Arguments ─────────────────────────────────────────────────────────────────╮
-│ *    pdf      PATH  [default: None] [required]                              │
-│ *    out      PATH  [default: None] [required]                              │
-╰─────────────────────────────────────────────────────────────────────────────╯
-╭─ Options ───────────────────────────────────────────────────────────────────╮
-│ --help          Show this message and exit.                                 │
-╰─────────────────────────────────────────────────────────────────────────────╯
+╭─ Arguments ──────────────────────────────────────────────────────────────────────────╮
+│ *    pdf      PATH  [default: None] [required]                                       │
+│ *    out      PATH  [default: None] [required]                                       │
+╰──────────────────────────────────────────────────────────────────────────────────────╯
+╭─ Options ────────────────────────────────────────────────────────────────────────────╮
+│ --help          Show this message and exit.                                          │
+╰──────────────────────────────────────────────────────────────────────────────────────╯
 ```
 
 ## Contributors ✨
diff --git a/pdfly/cli.py b/pdfly/cli.py
@@ -13,6 +13,7 @@
 import pdfly.booklet
 import pdfly.cat
 import pdfly.compress
+import pdfly.extract_annotated_pages
 import pdfly.extract_images
 import pdfly.metadata
 import pdfly.pagemeta
@@ -285,7 +286,7 @@ def update_offsets(
     ],
     file_out: Annotated[
         Path, typer.Option("-o", "--output")  # noqa
-    ] = None,  # type: ignore
+    ] = None,  # type: ignore[assignment]
     encoding: str = typer.Option(
         "ISO-8859-1",
         help="Encoding used to read and write the files, e.g. UTF-8.",
@@ -321,3 +322,27 @@ def x2pdf(
     exit_code = pdfly.x2pdf.main(x, output)
     if exit_code:
         raise typer.Exit(code=exit_code)
+
+
+# CLI wrapper for `pdfly extract-annotated-pages`; the command help is the
+# module docstring of pdfly.extract_annotated_pages and all of the work is
+# delegated to that module's main().
+@entry_point.command(name="extract-annotated-pages", help=pdfly.extract_annotated_pages.__doc__)  # type: ignore[misc]
+def extract_annotated_pages(
+    input_pdf: Annotated[
+        Path,
+        typer.Argument(
+            dir_okay=False,
+            exists=True,
+            resolve_path=True,
+            help="Input PDF file.",
+        ),
+    ],
+    output_pdf: Annotated[
+        Optional[Path],
+        typer.Option(
+            "--output",
+            "-o",
+            writable=True,
+            # Keep this text in sync with the default computed in
+            # pdfly.extract_annotated_pages.main().
+            help=(
+                "Output PDF file. Defaults to '<input stem>_annotated.pdf' "
+                "in the input file's directory."
+            ),
+        ),
+    ] = None,
+) -> None:
+    # None is forwarded as-is: main() picks the default output path.
+    pdfly.extract_annotated_pages.main(input_pdf, output_pdf)
diff --git a/pdfly/extract_annotated_pages.py b/pdfly/extract_annotated_pages.py
@@ -0,0 +1,39 @@
+"""
+Extract only the annotated pages from a PDF.
+
+Q: Why does this help?
+A: https://github.com/py-pdf/pdfly/issues/97
+"""
+
+from pathlib import Path
+from typing import Optional
+
+from pypdf import PdfReader, PdfWriter
+from pypdf.annotations import AnnotationDictionary
+from pypdf.generic import ArrayObject  # noqa: TCH002
+
+
+def is_manipulable(annot: AnnotationDictionary) -> bool:
+    """Tell whether an annotation counts as manipulable.
+
+    Every annotation qualifies except those whose ``/Subtype`` is
+    ``/Link``; an annotation without a ``/Subtype`` entry qualifies too.
+    """
+    subtype = annot.get("/Subtype")
+    return subtype not in ("/Link",)
+
+
+def main(input_pdf: Path, output_pdf: Optional[Path]) -> None:
+    """Copy the annotated pages of ``input_pdf`` into a new PDF.
+
+    A page is kept when it has an ``/Annots`` entry containing at least one
+    annotation accepted by ``is_manipulable``. When ``output_pdf`` is not
+    given, the result is written next to the input file as
+    ``<input stem>_annotated.pdf``. A one-line summary is printed at the end.
+    """
+    if not output_pdf:
+        default_name = input_pdf.stem + "_annotated.pdf"
+        output_pdf = input_pdf.with_name(default_name)
+    reader = PdfReader(input_pdf)
+    writer = PdfWriter()
+    output_pages = 0
+    for page in reader.pages:
+        # Pages without an /Annots array cannot qualify.
+        if "/Annots" in page:
+            page_annots: ArrayObject = page["/Annots"]  # type: ignore[assignment]
+            if any(map(is_manipulable, page_annots)):
+                writer.add_page(page)
+                output_pages += 1
+    # The output is written even when no page was kept.
+    writer.write(output_pdf)
+    print(f"Extracted {output_pages} pages with annotations to {output_pdf}")
diff --git a/resources/input8.pdf b/resources/input8.pdf
diff --git a/tests/test_extract_annotated_pages.py b/tests/test_extract_annotated_pages.py
@@ -0,0 +1,14 @@
+from .conftest import RESOURCES_ROOT, chdir, run_cli
+
+
+def test_extract_annotated_pages_input8(capsys, tmp_path):
+    """Run the command on input8.pdf and check the default-named output."""
+    # The default output is created next to the input file, so the input is
+    # copied into tmp_path first; otherwise the result would be written into
+    # the resources directory.
+    input_pdf = tmp_path / "input8.pdf"
+    input_pdf.write_bytes((RESOURCES_ROOT / "input8.pdf").read_bytes())
+    with chdir(tmp_path):
+        run_cli(
+            [
+                "extract-annotated-pages",
+                str(input_pdf),
+            ]
+        )
+    captured = capsys.readouterr()
+    assert not captured.err
+    assert "Extracted 1 pages with annotations" in captured.out
+    assert (tmp_path / "input8_annotated.pdf").exists()