Skip to content

Commit ad0c33d

Browse files
authored
Add rm functionality (#59)
1 parent b9b77ec commit ad0c33d

File tree

4 files changed

+318
-3
lines changed

4 files changed

+318
-3
lines changed

pdfly/cat.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,11 @@
5353

5454

5555
def main(
56-
filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool
56+
filename: Path,
57+
fn_pgrgs: List[str],
58+
output: Path,
59+
verbose: bool,
60+
inverted_page_selection: bool = False,
5761
) -> None:
5862
filename_page_ranges = parse_filepaths_and_pagerange_args(
5963
filename, fn_pgrgs
@@ -87,8 +91,15 @@ def main(
8791
f"WARNING: Page range {page_range} is out of bounds",
8892
file=sys.stderr,
8993
)
90-
for page_num in range(*page_range.indices(len(reader.pages))):
91-
writer.add_page(reader.pages[page_num])
94+
if inverted_page_selection:
95+
all_page_nums = set(range(len(reader.pages)))
96+
page_nums = set(range(*page_range.indices(len(reader.pages))))
97+
inverted_page_nums = all_page_nums - page_nums
98+
for page_num in inverted_page_nums:
99+
writer.add_page(reader.pages[page_num])
100+
else:
101+
for page_num in range(*page_range.indices(len(reader.pages))):
102+
writer.add_page(reader.pages[page_num])
92103
writer.write(output_fh)
93104
except Exception:
94105
print(traceback.format_exc(), file=sys.stderr)

pdfly/cli.py

+25
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import pdfly.extract_images
1616
import pdfly.metadata
1717
import pdfly.pagemeta
18+
import pdfly.rm
1819
import pdfly.up2
1920
import pdfly.x2pdf
2021

@@ -104,6 +105,30 @@ def cat(
104105
pdfly.cat.main(filename, fn_pgrgs, output, verbose)
105106

106107

108+
@entry_point.command(name="rm", help=pdfly.rm.__doc__)
def rm(
    filename: Annotated[
        Path,
        typer.Argument(
            # The input must be an existing, readable file (not a directory).
            exists=True,
            readable=True,
            file_okay=True,
            dir_okay=False,
            writable=False,
            resolve_path=True,
        ),
    ],
    output: Path = typer.Option(..., "-o", "--output"),  # noqa
    fn_pgrgs: List[str] = typer.Argument(  # noqa
        ...,
        help="filenames and/or page ranges",
    ),
    verbose: bool = typer.Option(
        False,
        help="show page ranges as they are being read",
    ),
) -> None:
    # CLI entry point of the `rm` sub-command. The user-facing help text is
    # the pdfly.rm module docstring (passed via `help=` above); all parsed
    # arguments are handed to pdfly.rm.main unchanged.
    pdfly.rm.main(
        filename=filename,
        fn_pgrgs=fn_pgrgs,
        output=output,
        verbose=verbose,
    )
130+
131+
107132
@entry_point.command(name="meta", help=pdfly.metadata.__doc__) # type: ignore[misc]
108133
def metadata(
109134
pdf: Annotated[

pdfly/rm.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""
2+
Remove pages from PDF files.
3+
4+
Page ranges refer to the previously-named file.
5+
A file not followed by a page range means all the pages of the file.
6+
7+
PAGE RANGES are like Python slices.
8+
9+
Remember, page indices start with zero.
10+
11+
Page range expression examples:
12+
13+
: all pages. -1 last page.
14+
22 just the 23rd page. :-1 all but the last page.
15+
0:3 the first three pages. -2 second-to-last page.
16+
:3 the first three pages. -2: last two pages.
17+
5: from the sixth page onward. -3:-1 third & second to last.
18+
19+
The third, "stride" or "step" number is also recognized.
20+
21+
::2 0 2 4 ... to the end. 3:0:-1 3 2 1 but not 0.
22+
1:10:2 1 3 5 7 9 2::-1 2 1 0.
23+
::-1 all pages in reverse order.
24+
25+
Examples
26+
pdfly rm -o output.pdf document.pdf 2:5
27+
28+
Remove pages 2 to 4 from document.pdf, producing output.pdf.
29+
30+
pdfly rm -o output.pdf document.pdf :-1
31+
32+
Remove all pages except the last one from document.pdf, producing output.pdf.
33+
34+
pdfly rm -o output.pdf report.pdf :6 7:
35+
36+
Apply each page range to report.pdf separately: output.pdf receives
37+
pages 6 onward, followed by pages 0 to 6.
38+
39+
"""
40+
41+
from pathlib import Path
42+
from typing import List
43+
44+
from pdfly.cat import main as cat_main
45+
46+
47+
def main(
    filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool
) -> None:
    """
    Run the `rm` command by delegating to the `cat` implementation.

    Every argument is forwarded unchanged; the only difference from a plain
    `cat` call is that `inverted_page_selection` is switched on.
    """
    cat_main(
        filename=filename,
        fn_pgrgs=fn_pgrgs,
        output=output,
        verbose=verbose,
        inverted_page_selection=True,
    )

tests/test_rm.py

+229
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
"""Tests for the `rm` command."""
2+
3+
from pathlib import Path
4+
from typing import List
5+
6+
import pytest
7+
from _pytest.capture import CaptureFixture
8+
from pypdf import PdfReader
9+
10+
from .conftest import RESOURCES_ROOT, chdir, run_cli
11+
from .test_cat import extract_embedded_images
12+
13+
14+
def test_rm_incorrect_number_of_args(
    capsys: CaptureFixture, tmp_path: Path
) -> None:
    """`rm` called with only an input file exits with code 2."""
    cli_args = ["rm", str(RESOURCES_ROOT / "box.pdf")]
    with chdir(tmp_path):
        status = run_cli(cli_args)
    assert status == 2
    stderr_text = capsys.readouterr().err
    assert "Missing argument" in stderr_text
22+
23+
24+
def test_rm_subset_ok(capsys: CaptureFixture, tmp_path: Path) -> None:
    """Removing the range 13:15 yields an output with two fewer pages."""
    source_pdf = RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"
    cli_args = ["rm", str(source_pdf), "13:15", "--output", "./out.pdf"]
    with chdir(tmp_path):
        status = run_cli(cli_args)
    captured = capsys.readouterr()
    assert status == 0, captured
    assert not captured.err
    # Compare page counts of the untouched input and the produced output.
    original_page_count = len(PdfReader(source_pdf).pages)
    result_page_count = len(PdfReader(tmp_path / "out.pdf").pages)
    assert result_page_count == original_page_count - 2
43+
44+
45+
@pytest.mark.parametrize(
    "page_range",
    ["a", "-", "1-", "1-1-1", "1:1:1:1"],
)
def test_rm_subset_invalid_args(
    capsys: CaptureFixture, tmp_path: Path, page_range: str
) -> None:
    """A malformed page range makes `rm` exit with code 2 and an error."""
    cli_args = [
        "rm",
        str(RESOURCES_ROOT / "jpeg.pdf"),
        page_range,
        "--output",
        "./out.pdf",
    ]
    with chdir(tmp_path):
        status = run_cli(cli_args)
    captured = capsys.readouterr()
    assert status == 2, captured
    assert "Invalid file path or page range provided" in captured.err
65+
66+
67+
def test_rm_subset_warn_on_missing_pages(
    capsys: CaptureFixture, tmp_path: Path
) -> None:
    """Asking to remove page 2 of jpeg.pdf succeeds but prints a warning."""
    cli_args = [
        "rm",
        str(RESOURCES_ROOT / "jpeg.pdf"),
        "2",
        "--output",
        "./out.pdf",
    ]
    with chdir(tmp_path):
        status = run_cli(cli_args)
    captured = capsys.readouterr()
    assert status == 0, captured
    assert "WARN" in captured.err
83+
84+
85+
def test_rm_subset_ensure_reduced_size(
    tmp_path: Path, two_pages_pdf_filepath: Path
) -> None:
    """Removing either single page leaves exactly one embedded image."""
    # (page index to remove, name of the file that receives the result)
    scenarios = (("0", "page1.pdf"), ("1", "page2.pdf"))
    for removed_page, output_name in scenarios:
        result_pdf = tmp_path / output_name
        status = run_cli(
            [
                "rm",
                str(two_pages_pdf_filepath),
                removed_page,
                "--output",
                str(result_pdf),
            ]
        )
        assert status == 0
        # The resulting PDF should only contain ONE image:
        remaining_images = extract_embedded_images(result_pdf)
        assert len(remaining_images) == 1
115+
116+
117+
def test_rm_combine_files(
    pdf_file_100: Path,
    pdf_file_abc: Path,
    tmp_path: Path,
    capsys: CaptureFixture,
) -> None:
    """
    Check `rm` with several (file, page range) pairs in one invocation.

    Each page range is inverted on its own, and the surviving pages of each
    pair are appended to the output in the order the pairs were given.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    with chdir(tmp_path):
        output_pdf_path = tmp_path / "out.pdf"

        # Run pdfly rm command
        exit_code = run_cli(
            [
                "rm",
                str(pdf_file_100),
                "1:10:2",
                str(pdf_file_abc),
                "::2",
                str(pdf_file_abc),
                "1::2",
                "--output",
                str(output_pdf_path),
            ]
        )
        captured = capsys.readouterr()

        # Check if the command was successful
        assert exit_code == 0, captured.out

        # Extract the text of every page of the produced PDF
        reader = PdfReader(output_pdf_path)
        extracted_pages = [page.extract_text() for page in reader.pages]

        # Pages surviving "1:10:2": the even pages below 10, then 10..99
        kept_from_100 = [
            str(el) for el in [*range(0, 10, 2), *range(10, 100)]
        ]
        # Removing "::2" keeps b, d, ..., z; removing "1::2" keeps a, c, ..., y
        kept_from_abc = list(alphabet[1::2]) + list(alphabet[::2])

        # Compare the extracted text
        assert extracted_pages == kept_from_100 + kept_from_abc
180+
181+
182+
@pytest.mark.parametrize(
    ("page_range", "expected"),
    [
        ("22", [str(el) for el in range(100) if el != 22]),
        ("0:3", [str(el) for el in range(3, 100)]),
        (":3", [str(el) for el in range(3, 100)]),
        (":", []),
        ("5:", ["0", "1", "2", "3", "4"]),
        ("::2", [str(el) for el in range(1, 100, 2)]),
        (
            "1:10:2",
            [str(el) for el in [*range(0, 10, 2), *range(10, 100)]],
        ),
        ("::1", []),
        ("::-1", []),
    ],
)
def test_rm_commands(
    pdf_file_100: Path,
    capsys: CaptureFixture,
    tmp_path: Path,
    page_range: str,
    expected: List[str],
) -> None:
    """
    Check which pages survive `rm` for a single page range.

    `expected` lists the text of the pages that must remain, in output order,
    after `page_range` has been removed from the 100-page fixture.
    """
    with chdir(tmp_path):
        output_pdf_path = tmp_path / "out.pdf"

        # Run pdfly rm command
        exit_code = run_cli(
            [
                "rm",
                str(pdf_file_100),
                page_range,
                "--output",
                str(output_pdf_path),
            ]
        )

        # Check if the command was successful
        assert exit_code == 0

        # Extract the text of every page of the produced PDF
        reader = PdfReader(output_pdf_path)
        extracted_pages = [page.extract_text() for page in reader.pages]

        # Compare the extracted text
        assert extracted_pages == expected

0 commit comments

Comments
 (0)