Skip to content

Commit 12b28ba

Browse files
committed
🐛 with extract-annotated-pages command
1 parent 7518a6c commit 12b28ba

File tree

6 files changed

+104
-24
lines changed

6 files changed

+104
-24
lines changed

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# CHANGELOG
22

3+
## Version 0.5.0, not released yet
4+
5+
### New Features (ENH)
6+
- New `extract-annotated-pages` command to extract only the user-annotated pages ([PR #98](https://github.com/py-pdf/pdfly/pull/98))
7+
8+
39
## Version 0.4.0, 2024-12-08
410

511
### New Features (ENH)

README.md

+25-24
Original file line numberDiff line numberDiff line change
@@ -26,23 +26,24 @@ $ pdfly --help
2626

2727
pdfly is a pure-python cli application for manipulating PDF files.
2828

29-
╭─ Options ───────────────────────────────────────────────────────────────────╮
30-
│ --version │
31-
│ --help Show this message and exit. │
32-
╰─────────────────────────────────────────────────────────────────────────────╯
33-
╭─ Commands ──────────────────────────────────────────────────────────────────╮
34-
│ 2-up Create a booklet-style PDF from a single input. │
35-
│ cat Concatenate pages from PDF files into a single PDF file. │
36-
│ compress Compress a PDF. │
37-
| uncompress Uncompresses a PDF. │
38-
│ extract-images Extract images from PDF without resampling or altering. │
39-
│ extract-text Extract text from a PDF file. │
40-
│ meta Show metadata of a PDF file │
41-
│ pagemeta Give details about a single page. │
42-
│ rm Remove pages from PDF files. │
43-
│ update-offsets Updates offsets and lengths in a simple PDF file. │
44-
│ x2pdf Convert one or more files to PDF. Each file is a page. │
45-
╰─────────────────────────────────────────────────────────────────────────────╯
29+
╭─ Options ────────────────────────────────────────────────────────────────────────────╮
30+
│ --version │
31+
│ --help Show this message and exit. │
32+
╰──────────────────────────────────────────────────────────────────────────────────────╯
33+
╭─ Commands ───────────────────────────────────────────────────────────────────────────╮
34+
│ 2-up Create a booklet-style PDF from a single input. │
35+
│ cat Concatenate pages from PDF files into a single PDF file. │
36+
│ compress Compress a PDF. │
37+
│ uncompress Uncompresses a PDF. │
38+
│ extract-annotated-pages Extract only the annotated pages from a PDF. │
39+
│ extract-images Extract images from PDF without resampling or altering. │
40+
│ extract-text Extract text from a PDF file. │
41+
│ meta Show metadata of a PDF file │
42+
│ pagemeta Give details about a single page. │
43+
│ rm Remove pages from PDF files. │
44+
│ update-offsets Updates offsets and lengths in a simple PDF file. │
45+
│ x2pdf Convert one or more files to PDF. Each file is a page. │
46+
╰──────────────────────────────────────────────────────────────────────────────────────╯
4647
```
4748

4849
You can see the help of every subcommand by typing:
@@ -56,13 +57,13 @@ $ pdfly 2-up --help
5657
Pairs of two pages will be put on one page (left and right)
5758
usage: python 2-up.py input_file output_file
5859

59-
╭─ Arguments ─────────────────────────────────────────────────────────────────╮
60-
│ * pdf PATH [default: None] [required] │
61-
│ * out PATH [default: None] [required] │
62-
╰─────────────────────────────────────────────────────────────────────────────╯
63-
╭─ Options ───────────────────────────────────────────────────────────────────╮
64-
│ --help Show this message and exit. │
65-
╰─────────────────────────────────────────────────────────────────────────────╯
60+
╭─ Arguments ──────────────────────────────────────────────────────────────────────────╮
61+
│ * pdf PATH [default: None] [required] │
62+
│ * out PATH [default: None] [required] │
63+
╰──────────────────────────────────────────────────────────────────────────────────────╯
64+
╭─ Options ────────────────────────────────────────────────────────────────────────────╮
65+
│ --help Show this message and exit. │
66+
╰──────────────────────────────────────────────────────────────────────────────────────╯
6667
```
6768

6869
## Contributors ✨

pdfly/cli.py

+25
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import pdfly.booklet
1414
import pdfly.cat
1515
import pdfly.compress
16+
import pdfly.extract_annotated_pages
1617
import pdfly.extract_images
1718
import pdfly.metadata
1819
import pdfly.pagemeta
@@ -319,3 +320,27 @@ def x2pdf(
319320
exit_code = pdfly.x2pdf.main(x, output)
320321
if exit_code:
321322
raise typer.Exit(code=exit_code)
323+
324+
325+
@entry_point.command(  # type: ignore[misc]
    name="extract-annotated-pages",
    help=pdfly.extract_annotated_pages.__doc__,
)
def extract_annotated_pages(
    input_pdf: Annotated[
        Path,
        typer.Argument(
            help="Input PDF file.",
            exists=True,
            dir_okay=False,
            resolve_path=True,
        ),
    ],
    output_pdf: Annotated[
        Optional[Path],
        typer.Option(
            "--output",
            "-o",
            help="Output PDF file. Defaults to 'input_pdf_annotated'.",
            writable=True,
        ),
    ] = None,
) -> None:
    # CLI wrapper only: the help text comes from the implementation module's
    # docstring, and both paths are forwarded unchanged. When --output is
    # omitted, None is passed on and the implementation picks the output name.
    pdfly.extract_annotated_pages.main(input_pdf, output_pdf)

pdfly/extract_annotated_pages.py

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
"""
2+
Extract only the annotated pages from a PDF.
3+
4+
Q: Why does this help?
5+
A: https://github.com/py-pdf/pdfly/issues/97
6+
"""
7+
8+
from pathlib import Path
from typing import Optional

from pypdf import PdfReader, PdfWriter
10+
11+
12+
# Check if an annotation is manipulable.
13+
def is_manipulable(annot) -> bool:
    """Return True when *annot* is an annotation a user can manipulate.

    Annotations whose ``/Subtype`` is ``/Link`` are not counted; every other
    subtype (including a missing one) is.

    Args:
        annot: An annotation dictionary, or a reference to one that exposes
            a ``get_object()`` method.

    Returns:
        False for link annotations and for references that resolve to
        nothing, True otherwise.
    """
    # Entries of a page's /Annots array may be references rather than the
    # dictionaries themselves; resolve them before looking at the subtype.
    resolve = getattr(annot, "get_object", None)
    if callable(resolve):
        annot = resolve()
    if annot is None:
        # A dangling reference carries no annotation to extract.
        return False
    return annot.get("/Subtype") not in ["/Link"]
15+
16+
17+
# Main function.
18+
def main(input_pdf: Path, output_pdf: Optional[Path]) -> None:
    """Copy the annotated pages of *input_pdf* into a new PDF.

    A page is kept when it has an ``/Annots`` entry containing at least one
    annotation accepted by ``is_manipulable``.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to write. When None, the output is
            written next to the input as ``<stem>_annotated<suffix>``.
    """
    if output_pdf is None:
        # Equivalent to Path.with_stem(), which only exists on Python 3.9+.
        output_pdf = input_pdf.with_name(
            f"{input_pdf.stem}_annotated{input_pdf.suffix}"
        )
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    output_pages = 0
    # Copy only the pages with annotations
    for page in reader.pages:
        if "/Annots" not in page:
            continue
        if not any(is_manipulable(annot) for annot in page["/Annots"]):
            continue
        writer.add_page(page)
        output_pages += 1
    # Save the output PDF
    writer.write(output_pdf)
    print(f"Extracted {output_pages} pages with annotations to {output_pdf}")

resources/input8.pdf

1.91 KB
Binary file not shown.

tests/test_extract_annotated_pages.py

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from .conftest import RESOURCES_ROOT, chdir, run_cli
2+
3+
4+
def test_extract_annotated_pages_input8(capsys, tmp_path):
    """Run the command on input8.pdf and check the reported page count."""
    # Without --output the result is written next to the (resolved) input
    # file, i.e. into the resources directory; keep it inside tmp_path.
    output_pdf = tmp_path / "input8_annotated.pdf"
    with chdir(tmp_path):
        run_cli(
            [
                "extract-annotated-pages",
                str(RESOURCES_ROOT / "input8.pdf"),
                "--output",
                str(output_pdf),
            ]
        )
    captured = capsys.readouterr()
    assert not captured.err
    assert "Extracted 1 pages with annotations" in captured.out
    assert output_pdf.exists()

0 commit comments

Comments
 (0)