Skip to content

Commit 407e6cd

Browse files
wolfram77Lucas-C
andauthored
ENH: With extract-annotated-pages command (#98)
Co-authored-by: Lucas Cimon <[email protected]>
1 parent 3f254f4 commit 407e6cd

File tree

6 files changed

+110
-25
lines changed

6 files changed

+110
-25
lines changed

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# CHANGELOG
22

3+
## Version 0.5.0, not released yet
4+
5+
### New Features (ENH)
6+
- New `extract-annotated-pages` command to extract only the user-annotated pages ([PR #98](https://github.com/py-pdf/pdfly/pull/98))
7+
8+
39
## Version 0.4.0, 2024-12-08
410

511
### New Features (ENH)

README.md

+25-24
Original file line numberDiff line numberDiff line change
@@ -33,23 +33,24 @@ $ pdfly --help
3333

3434
pdfly is a pure-python cli application for manipulating PDF files.
3535

36-
╭─ Options ───────────────────────────────────────────────────────────────────╮
37-
│ --version │
38-
│ --help Show this message and exit. │
39-
╰─────────────────────────────────────────────────────────────────────────────╯
40-
╭─ Commands ──────────────────────────────────────────────────────────────────╮
41-
│ 2-up Create a booklet-style PDF from a single input. │
42-
│ cat Concatenate pages from PDF files into a single PDF file. │
43-
│ compress Compress a PDF. │
44-
| uncompress Uncompresses a PDF. │
45-
│ extract-images Extract images from PDF without resampling or altering. │
46-
│ extract-text Extract text from a PDF file. │
47-
│ meta Show metadata of a PDF file │
48-
│ pagemeta Give details about a single page. │
49-
│ rm Remove pages from PDF files. │
50-
│ update-offsets Updates offsets and lengths in a simple PDF file. │
51-
│ x2pdf Convert one or more files to PDF. Each file is a page. │
52-
╰─────────────────────────────────────────────────────────────────────────────╯
36+
╭─ Options ────────────────────────────────────────────────────────────────────────────╮
37+
│ --version │
38+
│ --help Show this message and exit. │
39+
╰──────────────────────────────────────────────────────────────────────────────────────╯
40+
╭─ Commands ───────────────────────────────────────────────────────────────────────────╮
41+
│ 2-up Create a booklet-style PDF from a single input. │
42+
│ cat Concatenate pages from PDF files into a single PDF file. │
43+
│ compress Compress a PDF. │
44+
| uncompress Uncompresses a PDF. │
45+
│ extract-annotated-pages Extract only the annotated pages from a PDF. │
46+
│ extract-images Extract images from PDF without resampling or altering. │
47+
│ extract-text Extract text from a PDF file. │
48+
│ meta Show metadata of a PDF file │
49+
│ pagemeta Give details about a single page. │
50+
│ rm Remove pages from PDF files. │
51+
│ update-offsets Updates offsets and lengths in a simple PDF file. │
52+
│ x2pdf Convert one or more files to PDF. Each file is a page. │
53+
╰──────────────────────────────────────────────────────────────────────────────────────╯
5354
```
5455

5556
You can see the help of every subcommand by typing `--help`:
@@ -63,13 +64,13 @@ $ pdfly 2-up --help
6364
Pairs of two pages will be put on one page (left and right)
6465
usage: python 2-up.py input_file output_file
6566

66-
╭─ Arguments ─────────────────────────────────────────────────────────────────╮
67-
│ * pdf PATH [default: None] [required] │
68-
│ * out PATH [default: None] [required] │
69-
╰─────────────────────────────────────────────────────────────────────────────╯
70-
╭─ Options ───────────────────────────────────────────────────────────────────╮
71-
│ --help Show this message and exit. │
72-
╰─────────────────────────────────────────────────────────────────────────────╯
67+
╭─ Arguments ──────────────────────────────────────────────────────────────────────────
68+
│ * pdf PATH [default: None] [required]
69+
│ * out PATH [default: None] [required]
70+
╰──────────────────────────────────────────────────────────────────────────────────────
71+
╭─ Options ────────────────────────────────────────────────────────────────────────────
72+
│ --help Show this message and exit.
73+
╰──────────────────────────────────────────────────────────────────────────────────────
7374
```
7475

7576
## Contributors ✨

pdfly/cli.py

+26-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import pdfly.booklet
1414
import pdfly.cat
1515
import pdfly.compress
16+
import pdfly.extract_annotated_pages
1617
import pdfly.extract_images
1718
import pdfly.metadata
1819
import pdfly.pagemeta
@@ -285,7 +286,7 @@ def update_offsets(
285286
],
286287
file_out: Annotated[
287288
Path, typer.Option("-o", "--output") # noqa
288-
] = None, # type: ignore
289+
] = None, # type: ignore[assignment]
289290
encoding: str = typer.Option(
290291
"ISO-8859-1",
291292
help="Encoding used to read and write the files, e.g. UTF-8.",
@@ -321,3 +322,27 @@ def x2pdf(
321322
exit_code = pdfly.x2pdf.main(x, output)
322323
if exit_code:
323324
raise typer.Exit(code=exit_code)
325+
326+
327+
@entry_point.command(name="extract-annotated-pages", help=pdfly.extract_annotated_pages.__doc__)  # type: ignore[misc]
def extract_annotated_pages(
    input_pdf: Annotated[
        Path,
        typer.Argument(
            dir_okay=False,
            exists=True,
            resolve_path=True,
            help="Input PDF file.",
        ),
    ],
    output_pdf: Annotated[
        Optional[Path],
        typer.Option(
            "--output",
            "-o",
            writable=True,
            # Keep this in sync with the default computed in
            # pdfly.extract_annotated_pages.main: the input file's stem with
            # "_annotated.pdf" appended, in the input file's directory.
            help=(
                "Output PDF file. Defaults to '<input>_annotated.pdf' next to "
                "the input file, where <input> is the input file name "
                "without its extension."
            ),
        ),
    ] = None,
) -> None:
    # CLI wrapper: the command help shown to users is the module docstring
    # passed to the decorator; all the work is delegated to the module's main().
    pdfly.extract_annotated_pages.main(input_pdf, output_pdf)

pdfly/extract_annotated_pages.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
"""
2+
Extract only the annotated pages from a PDF.
3+
4+
Q: Why does this help?
5+
A: https://github.com/py-pdf/pdfly/issues/97
6+
"""
7+
8+
from pathlib import Path
9+
from typing import Optional
10+
11+
from pypdf import PdfReader, PdfWriter
12+
from pypdf.annotations import AnnotationDictionary
13+
from pypdf.generic import ArrayObject # noqa: TCH002
14+
15+
16+
# Check if an annotation is manipulable.
def is_manipulable(annot: AnnotationDictionary) -> bool:
    """Tell whether *annot* counts as a manipulable annotation.

    Every annotation is treated as manipulable except those whose
    ``/Subtype`` entry is ``/Link``. An annotation without a ``/Subtype``
    entry is therefore reported as manipulable too.
    """
    subtype = annot.get("/Subtype")
    if subtype in ["/Link"]:
        return False
    return True
19+
20+
21+
# Main function.
def main(input_pdf: Path, output_pdf: Optional[Path]) -> None:
    """Copy the annotated pages of ``input_pdf`` into a new PDF file.

    A page is kept when it has an ``/Annots`` entry holding at least one
    annotation accepted by ``is_manipulable``. Kept pages are appended to the
    output in the order they are read, the result is written to
    ``output_pdf`` and a one-line summary is printed.

    Args:
        input_pdf: Path of the PDF file to read.
        output_pdf: Path of the PDF file to write. When ``None``, the output
            goes next to the input as ``<input stem>_annotated.pdf``.
    """
    if output_pdf is None:
        output_pdf = input_pdf.with_name(f"{input_pdf.stem}_annotated.pdf")
    # Named reader/writer so the builtin ``input`` is not shadowed.
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    output_pages = 0
    # Copy only the pages with annotations
    for page in reader.pages:
        if "/Annots" not in page:
            continue
        page_annots: ArrayObject = page["/Annots"]  # type: ignore[assignment]
        if not any(is_manipulable(annot) for annot in page_annots):
            continue
        writer.add_page(page)
        output_pages += 1
    # Save the output PDF
    writer.write(output_pdf)
    print(f"Extracted {output_pages} pages with annotations to {output_pdf}")

resources/input8.pdf

1.91 KB
Binary file not shown.

tests/test_extract_annotated_pages.py

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from .conftest import RESOURCES_ROOT, chdir, run_cli
2+
3+
4+
def test_extract_annotated_pages_input8(capsys, tmp_path):
    """Run the command on ``input8.pdf`` and check the reported page count."""
    # Without an explicit output, the command derives the output path from the
    # input path and would write into the resources directory regardless of
    # the working directory. Point it at tmp_path to keep the repository clean.
    output_pdf = tmp_path / "input8_annotated.pdf"
    with chdir(tmp_path):
        run_cli(
            [
                "extract-annotated-pages",
                str(RESOURCES_ROOT / "input8.pdf"),
                "-o",
                str(output_pdf),
            ]
        )
    captured = capsys.readouterr()
    assert not captured.err
    assert "Extracted 1 pages with annotations" in captured.out
    assert output_pdf.is_file()

0 commit comments

Comments
 (0)