Skip to content

Commit 8eb34ac

Browse files
committed
Add rm functionality
1 parent 9e3523e commit 8eb34ac

File tree

3 files changed

+255
-3
lines changed

3 files changed

+255
-3
lines changed

pdfly/cat.py

+16-3
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
In case you don't want chapter 10 before chapter 2.
3838
3939
"""
40+
4041
# Copyright (c) 2014, Steve Witham <[email protected]>.
4142
# All rights reserved. This software is available under a BSD license;
4243
# see https://github.com/py-pdf/pypdf/LICENSE
@@ -52,7 +53,11 @@
5253

5354

5455
def main(
55-
filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool
56+
filename: Path,
57+
fn_pgrgs: List[str],
58+
output: Path,
59+
verbose: bool,
60+
use_complements: bool = False,
5661
) -> None:
5762
filename_page_ranges = parse_filepaths_and_pagerange_args(
5863
filename, fn_pgrgs
@@ -73,8 +78,16 @@ def main(
7378
in_fs[filename] = open(filename, "rb")
7479

7580
reader = PdfReader(in_fs[filename])
76-
for page_num in range(*page_range.indices(len(reader.pages))):
77-
writer.add_page(reader.pages[page_num])
81+
if not use_complements:
82+
for page_num in range(*page_range.indices(len(reader.pages))):
83+
writer.add_page(reader.pages[page_num])
84+
else:
85+
all_page_nums = set(range(len(reader.pages)))
86+
page_nums = set(range(*page_range.indices(len(reader.pages))))
87+
compl_page_nums = all_page_nums - page_nums
88+
for page_num in compl_page_nums:
89+
writer.add_page(reader.pages[page_num])
90+
7891
writer.write(output_fh)
7992
except Exception:
8093
print(traceback.format_exc(), file=sys.stderr)

pdfly/cli.py

+24
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,30 @@ def cat(
103103
pdfly.cat.main(filename, fn_pgrgs, output, verbose)
104104

105105

106+
@entry_point.command(name="rm")  # type: ignore[misc]
def rm(
    filename: Annotated[
        Path,
        typer.Argument(
            exists=True,
            file_okay=True,
            dir_okay=False,
            writable=False,
            readable=True,
            resolve_path=True,
        ),
    ],
    output: Path = typer.Option(..., "-o", "--output"),  # noqa
    fn_pgrgs: List[str] = typer.Argument(  # noqa
        ..., help="filenames and/or page ranges"
    ),
    verbose: bool = typer.Option(
        False, help="show page ranges as they are being read"
    ),
) -> None:
    """
    Remove pages from PDF files.

    Takes the same filename / page range arguments as the "cat" command,
    but the selected pages are the ones left out: every page of each input
    file that is NOT covered by its page range is written to the output.

    Example:

        pdfly rm input.pdf 13:15 -o output.pdf
    """
    # The whole feature is delegated to the "cat" implementation, switched
    # into complement mode so the given ranges are dropped instead of kept.
    pdfly.cat.main(filename, fn_pgrgs, output, verbose, use_complements=True)
128+
129+
106130
@entry_point.command(name="meta") # type: ignore[misc]
107131
def metadata(
108132
pdf: Annotated[

tests/test_rm.py

+215
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
import pytest
2+
from pypdf import PdfReader
3+
4+
from .conftest import RESOURCES_ROOT, chdir, run_cli
5+
6+
7+
def test_rm_incorrect_number_of_args(capsys, tmp_path):
    """Calling `rm` with only an input file must fail as a usage error."""
    cli_args = ["rm", str(RESOURCES_ROOT / "box.pdf")]
    with chdir(tmp_path):
        status = run_cli(cli_args)
    assert status == 2
    stderr_text = capsys.readouterr().err
    assert "Missing argument" in stderr_text
13+
14+
15+
def test_rm_subset_ok(capsys, tmp_path):
    """Removing the range ``13:15`` must yield a PDF two pages shorter."""
    source_pdf = RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"
    with chdir(tmp_path):
        exit_code = run_cli(
            ["rm", str(source_pdf), "13:15", "--output", "./out.pdf"]
        )
    captured = capsys.readouterr()
    assert exit_code == 0, captured
    assert not captured.err
    # Compare page counts of the untouched input and the generated output.
    original_page_count = len(PdfReader(source_pdf).pages)
    remaining_page_count = len(PdfReader(tmp_path / "out.pdf").pages)
    assert remaining_page_count == original_page_count - 2
34+
35+
36+
@pytest.mark.parametrize(
    "page_range",
    ["a", "-", "1-", "1-1-1", "1:1:1:1"],
)
def test_rm_subset_invalid_args(capsys, tmp_path, page_range):
    """A malformed page range must be rejected with exit code 2."""
    cli_args = [
        "rm",
        str(RESOURCES_ROOT / "jpeg.pdf"),
        page_range,
        "--output",
        "./out.pdf",
    ]
    with chdir(tmp_path):
        exit_code = run_cli(cli_args)
    captured = capsys.readouterr()
    assert exit_code == 2, captured
    assert "Invalid file path or page range provided" in captured.err
54+
55+
56+
@pytest.mark.skip(reason="This check is not implemented yet")
def test_rm_subset_warn_on_missing_pages(capsys, tmp_path):
    """Page "2" of jpeg.pdf should succeed but print a WARN on stdout."""
    cli_args = [
        "rm",
        str(RESOURCES_ROOT / "jpeg.pdf"),
        "2",
        "--output",
        "./out.pdf",
    ]
    with chdir(tmp_path):
        exit_code = run_cli(cli_args)
    captured = capsys.readouterr()
    assert exit_code == 0, captured
    assert "WARN" in captured.out
71+
72+
73+
@pytest.mark.xfail()  # There is currently a bug there
def test_rm_subset_ensure_reduced_size(tmp_path, two_pages_pdf_filepath):
    """Removing either page must leave exactly one embedded image.

    NOTE(review): presumably the ``two_pages_pdf_filepath`` fixture is a
    two-page PDF with one image per page; confirm against conftest.
    """
    # Same scenario twice: drop page 0, then drop page 1, each into its
    # own output file.
    for removed_page, out_name in (("0", "page1.pdf"), ("1", "page2.pdf")):
        out_path = tmp_path / out_name
        exit_code = run_cli(
            [
                "rm",
                str(two_pages_pdf_filepath),
                removed_page,
                "--output",
                str(out_path),
            ]
        )
        assert exit_code == 0
        # The resulting PDF should only contain ONE image:
        assert len(extract_embedded_images(out_path)) == 1
102+
103+
104+
def extract_embedded_images(pdf_filepath):
    """Return the images of every page of *pdf_filepath* as one flat list."""
    return [
        image
        for page in PdfReader(pdf_filepath).pages
        for image in page.images
    ]
110+
111+
112+
def test_rm_combine_files(pdf_file_100, pdf_file_abc, tmp_path, capsys):
    """Several (file, page range) pairs are processed in the given order.

    NOTE(review): the expected values assume each page of ``pdf_file_100``
    holds its own index as text and ``pdf_file_abc`` holds one letter per
    page; confirm against the fixtures.
    """
    with chdir(tmp_path):
        output_pdf_path = tmp_path / "out.pdf"
        cli_args = [
            "rm",
            str(pdf_file_100),
            "1:10:2",
            str(pdf_file_abc),
            "::2",
            str(pdf_file_abc),
            "1::2",
            "--output",
            str(output_pdf_path),
        ]

        # Invoke the rm command and make sure it succeeded
        exit_code = run_cli(cli_args)
        captured = capsys.readouterr()
        assert exit_code == 0, captured.out

        # Read back the text of every page of the generated PDF
        extracted_pages = [
            page.extract_text() for page in PdfReader(output_pdf_path).pages
        ]

        # First file: pages 1, 3, 5, 7 and 9 are gone
        kept_numbers = [
            str(n) for n in (*range(0, 10, 2), *range(10, 100))
        ]
        # Second file: "::2" drops every even-indexed page
        kept_after_even_removal = [
            "b", "d", "f", "h", "j", "l", "n",
            "p", "r", "t", "v", "x", "z",
        ]  # fmt: skip
        # Third file: "1::2" drops every odd-indexed page
        kept_after_odd_removal = [
            "a", "c", "e", "g", "i", "k", "m",
            "o", "q", "s", "u", "w", "y",
        ]  # fmt: skip
        assert extracted_pages == (
            kept_numbers + kept_after_even_removal + kept_after_odd_removal
        )
171+
172+
173+
@pytest.mark.parametrize(
    ("page_range", "expected"),
    [
        ("22", [str(n) for n in range(100) if n != 22]),
        ("0:3", [str(n) for n in range(3, 100)]),
        (":3", [str(n) for n in range(3, 100)]),
        (":", []),
        ("5:", ["0", "1", "2", "3", "4"]),
        ("::2", [str(n) for n in range(1, 100, 2)]),
        (
            "1:10:2",
            [str(n) for n in (*range(0, 10, 2), *range(10, 100))],
        ),
        ("::1", []),
        ("::-1", []),
    ],
)
def test_rm_commands(pdf_file_100, capsys, tmp_path, page_range, expected):
    """Each page range removes exactly the pages it selects.

    NOTE(review): the expected values assume every page of
    ``pdf_file_100`` holds its own index as text; confirm against the
    fixture.
    """
    with chdir(tmp_path):
        output_pdf_path = tmp_path / "out.pdf"
        cli_args = [
            "rm",
            str(pdf_file_100),
            page_range,
            "--output",
            str(output_pdf_path),
        ]

        # Invoke the rm command and make sure it succeeded
        exit_code = run_cli(cli_args)
        assert exit_code == 0

        # Read back the text of every page of the generated PDF
        extracted_pages = [
            page.extract_text() for page in PdfReader(output_pdf_path).pages
        ]

        # Only the pages outside the removed range must remain, in order
        assert extracted_pages == expected

0 commit comments

Comments
 (0)