|
| 1 | +"""Tests for the `rm` command.""" |
| 2 | + |
| 3 | +from pathlib import Path |
| 4 | +from typing import List |
| 5 | + |
| 6 | +import pytest |
| 7 | +from _pytest.capture import CaptureFixture |
| 8 | +from pypdf import PdfReader |
| 9 | + |
| 10 | +from .conftest import RESOURCES_ROOT, chdir, run_cli |
| 11 | +from .test_cat import extract_embedded_images |
| 12 | + |
| 13 | + |
def test_rm_incorrect_number_of_args(
    capsys: CaptureFixture, tmp_path: Path
) -> None:
    """Invoke `rm` with only a file argument and expect a usage failure.

    The CLI must exit with code 2 and mention "Missing argument" on stderr.
    """
    cli_args = ["rm", str(RESOURCES_ROOT / "box.pdf")]
    with chdir(tmp_path):
        status = run_cli(cli_args)
        assert status == 2
        stderr_text = capsys.readouterr().err
        assert "Missing argument" in stderr_text
| 22 | + |
| 23 | + |
def test_rm_subset_ok(capsys: CaptureFixture, tmp_path: Path) -> None:
    """Remove the page range ``13:15`` and check the output page count.

    The command must succeed without writing to stderr, and the output
    must contain exactly two pages fewer than the input.
    """
    source_pdf = RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"
    with chdir(tmp_path):
        exit_code = run_cli(
            ["rm", str(source_pdf), "13:15", "--output", "./out.pdf"]
        )
        captured = capsys.readouterr()
        assert exit_code == 0, captured
        assert not captured.err
        # Compare page counts of the untouched input and the produced file.
        input_page_count = len(PdfReader(source_pdf).pages)
        output_page_count = len(PdfReader(tmp_path / "out.pdf").pages)
        assert output_page_count == input_page_count - 2
| 43 | + |
| 44 | + |
@pytest.mark.parametrize(
    "page_range",
    ["a", "-", "1-", "1-1-1", "1:1:1:1"],
)
def test_rm_subset_invalid_args(
    capsys: CaptureFixture, tmp_path: Path, page_range: str
) -> None:
    """Pass a malformed page range to `rm` and expect it to be rejected.

    Each parametrized value must make the CLI exit with code 2 and print
    "Invalid file path or page range provided" on stderr.
    """
    cli_args = [
        "rm",
        str(RESOURCES_ROOT / "jpeg.pdf"),
        page_range,
        "--output",
        "./out.pdf",
    ]
    with chdir(tmp_path):
        status = run_cli(cli_args)
        streams = capsys.readouterr()
        assert status == 2, streams
        assert "Invalid file path or page range provided" in streams.err
| 65 | + |
| 66 | + |
def test_rm_subset_warn_on_missing_pages(
    capsys: CaptureFixture, tmp_path: Path
) -> None:
    """Request removal of page index 2 from ``jpeg.pdf`` and expect a warning.

    The command must still exit with code 0 while printing a message
    containing "WARN" on stderr (presumably because the input has no such
    page -- the test name suggests so; confirm against the resource file).
    """
    cli_args = [
        "rm",
        str(RESOURCES_ROOT / "jpeg.pdf"),
        "2",
        "--output",
        "./out.pdf",
    ]
    with chdir(tmp_path):
        status = run_cli(cli_args)
        streams = capsys.readouterr()
        assert status == 0, streams
        assert "WARN" in streams.err
| 83 | + |
| 84 | + |
def test_rm_subset_ensure_reduced_size(
    tmp_path: Path, two_pages_pdf_filepath: Path
) -> None:
    """Remove each page of the input in turn and count the images left.

    For both page indices ("0" and "1") the command must succeed and the
    written PDF must embed exactly one image.
    """
    runs = (("0", "page1.pdf"), ("1", "page2.pdf"))
    for removed_page, output_name in runs:
        destination = tmp_path / output_name
        exit_code = run_cli(
            [
                "rm",
                str(two_pages_pdf_filepath),
                removed_page,
                "--output",
                str(destination),
            ]
        )
        assert exit_code == 0
        # The PDF left after the removal should only contain ONE image:
        embedded_images = extract_embedded_images(destination)
        assert len(embedded_images) == 1
| 115 | + |
| 116 | + |
def test_rm_combine_files(
    pdf_file_100: Path,
    pdf_file_abc: Path,
    tmp_path: Path,
    capsys: CaptureFixture,
) -> None:
    """Run `rm` on several file/page-range pairs written to one output.

    Every input path is followed by the page range to remove from it. The
    test expects the text of the pages that remain from each pair to show
    up in the output PDF in the same order as the pairs were given.
    """
    with chdir(tmp_path):
        output_pdf_path = tmp_path / "out.pdf"

        # Run pdfly rm command
        exit_code = run_cli(
            [
                "rm",
                str(pdf_file_100),
                "1:10:2",
                str(pdf_file_abc),
                "::2",
                str(pdf_file_abc),
                "1::2",
                "--output",
                str(output_pdf_path),
            ]
        )
        captured = capsys.readouterr()

        # Check if the command was successful; report both captured streams
        # on failure, since error messages are written to stderr.
        assert exit_code == 0, captured

        # Extract the text of every page of the resulting PDF
        reader = PdfReader(output_pdf_path)
        extracted_pages = [page.extract_text() for page in reader.pages]

        # "1:10:2" removes pages 1, 3, 5, 7 and 9 of the numbered file.
        remaining_numbers = [
            str(el) for el in list(range(0, 10, 2)) + list(range(10, 100))
        ]
        # "::2" removes every second letter page starting with the first ...
        remaining_after_first_pass = list("bdfhjlnprtvxz")
        # ... and "1::2" removes every second letter page from the second on.
        remaining_after_second_pass = list("acegikmoqsuwy")

        # Compare the extracted text
        assert extracted_pages == (
            remaining_numbers
            + remaining_after_first_pass
            + remaining_after_second_pass
        )
| 180 | + |
| 181 | + |
@pytest.mark.parametrize(
    ("page_range", "expected"),
    [
        ("22", [str(el) for el in range(100) if el != 22]),
        ("0:3", [str(el) for el in range(3, 100)]),
        (":3", [str(el) for el in range(3, 100)]),
        (":", []),
        ("5:", ["0", "1", "2", "3", "4"]),
        ("::2", [str(el) for el in range(1, 100, 2)]),
        (
            "1:10:2",
            [str(el) for el in list(range(0, 10, 2)) + list(range(10, 100))],
        ),
        ("::1", []),
        ("::-1", []),
    ],
)
def test_rm_commands(
    pdf_file_100: Path,
    capsys: CaptureFixture,
    tmp_path: Path,
    page_range: str,
    expected: List[str],
) -> None:
    """Remove ``page_range`` from the numbered PDF and check what is left.

    Args:
        pdf_file_100: Input PDF fixture; the expected values treat the text
            of its pages as "0" through "99".
        capsys: Capture fixture, read to give diagnostics on failure.
        tmp_path: Temporary directory that receives the output file.
        page_range: Slice-like page selection passed to `rm`.
        expected: Text of the pages expected to remain, in order.
    """
    with chdir(tmp_path):
        output_pdf_path = tmp_path / "out.pdf"

        # Run pdfly rm command
        exit_code = run_cli(
            [
                "rm",
                str(pdf_file_100),
                page_range,
                "--output",
                str(output_pdf_path),
            ]
        )
        captured = capsys.readouterr()

        # Check if the command was successful; show the captured output on
        # failure, as the other tests in this module do.
        assert exit_code == 0, captured

        # Extract the text of every page of the resulting PDF
        reader = PdfReader(output_pdf_path)
        extracted_pages = [page.extract_text() for page in reader.pages]

        # Compare the extracted text
        assert extracted_pages == expected
0 commit comments