Add rm functionality (#59)

ebotiab · web-flow · commit ad0c33d2f0aa · 2024-10-22T13:45:57.000+02:00
diff --git a/pdfly/cat.py b/pdfly/cat.py
@@ -53,7 +53,11 @@
 
 
 def main(
-    filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool
+    filename: Path,
+    fn_pgrgs: List[str],
+    output: Path,
+    verbose: bool,
+    inverted_page_selection: bool = False,
 ) -> None:
     filename_page_ranges = parse_filepaths_and_pagerange_args(
         filename, fn_pgrgs
@@ -87,8 +91,15 @@ def main(
                     f"WARNING: Page range {page_range} is out of bounds",
                     file=sys.stderr,
                 )
-            for page_num in range(*page_range.indices(len(reader.pages))):
-                writer.add_page(reader.pages[page_num])
+            page_nums = list(range(*page_range.indices(len(reader.pages))))
+            if inverted_page_selection:
+                # Sets are unordered; filter in document order instead.
+                removed = set(page_nums)
+                page_nums = [
+                    n for n in range(len(reader.pages)) if n not in removed
+                ]
+            for page_num in page_nums:
+                writer.add_page(reader.pages[page_num])
         writer.write(output_fh)
     except Exception:
         print(traceback.format_exc(), file=sys.stderr)
diff --git a/pdfly/cli.py b/pdfly/cli.py
@@ -15,6 +15,7 @@
 import pdfly.extract_images
 import pdfly.metadata
 import pdfly.pagemeta
+import pdfly.rm
 import pdfly.up2
 import pdfly.x2pdf
 
@@ -104,6 +105,30 @@ def cat(
     pdfly.cat.main(filename, fn_pgrgs, output, verbose)
 
 
+@entry_point.command(name="rm", help=pdfly.rm.__doc__)
+def rm(
+    filename: Annotated[
+        Path,
+        typer.Argument(
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            writable=False,
+            readable=True,
+            resolve_path=True,
+        ),
+    ],
+    output: Path = typer.Option(..., "-o", "--output"),  # noqa
+    fn_pgrgs: List[str] = typer.Argument(  # noqa
+        ..., help="filenames and/or page ranges"
+    ),
+    verbose: bool = typer.Option(
+        False, help="show page ranges as they are being read"
+    ),
+) -> None:
+    pdfly.rm.main(filename, fn_pgrgs, output, verbose)
+
+
 @entry_point.command(name="meta", help=pdfly.metadata.__doc__)  # type: ignore[misc]
 def metadata(
     pdf: Annotated[
diff --git a/pdfly/rm.py b/pdfly/rm.py
@@ -0,0 +1,50 @@
+"""
+Remove pages from PDF files.
+
+Page ranges refer to the previously-named file.
+A file not followed by a page range means all the pages of the file.
+
+PAGE RANGES are like Python slices.
+
+        Remember, page indices start with zero.
+
+        Page range expression examples:
+
+            :     all pages.                   -1    last page.
+            22    just the 23rd page.          :-1   all but the last page.
+            0:3   the first three pages.       -2    second-to-last page.
+            :3    the first three pages.       -2:   last two pages.
+            5:    from the sixth page onward.  -3:-1 third & second to last.
+
+        The third, "stride" or "step" number is also recognized.
+
+            ::2       0 2 4 ... to the end.    3:0:-1    3 2 1 but not 0.
+            1:10:2    1 3 5 7 9                2::-1     2 1 0.
+            ::-1      all pages in reverse order.
+
+Examples
+    pdfly rm -o output.pdf document.pdf 2:5
+
+        Remove pages 2 to 4 from document.pdf, producing output.pdf.
+
+    pdfly rm -o output.pdf document.pdf :-1
+
+        Remove all pages except the last one from document.pdf, producing output.pdf.
+
+    pdfly rm -o output.pdf report.pdf ::2
+
+        Remove every other page (pages 0, 2, 4, ...) from report.pdf,
+        producing output.pdf.
+
+"""
+
+from pathlib import Path
+from typing import List
+
+from pdfly.cat import main as cat_main
+
+
+def main(
+    filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool
+) -> None:
+    cat_main(filename, fn_pgrgs, output, verbose, inverted_page_selection=True)
diff --git a/tests/test_rm.py b/tests/test_rm.py
@@ -0,0 +1,229 @@
+"""Tests for the `rm` command."""
+
+from pathlib import Path
+from typing import List
+
+import pytest
+from _pytest.capture import CaptureFixture
+from pypdf import PdfReader
+
+from .conftest import RESOURCES_ROOT, chdir, run_cli
+from .test_cat import extract_embedded_images
+
+
+def test_rm_incorrect_number_of_args(
+    capsys: CaptureFixture, tmp_path: Path
+) -> None:
+    with chdir(tmp_path):
+        exit_code = run_cli(["rm", str(RESOURCES_ROOT / "box.pdf")])
+    assert exit_code == 2
+    captured = capsys.readouterr()
+    assert "Missing argument" in captured.err
+
+
+def test_rm_subset_ok(capsys: CaptureFixture, tmp_path: Path) -> None:
+    with chdir(tmp_path):
+        exit_code = run_cli(
+            [
+                "rm",
+                str(RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"),
+                "13:15",
+                "--output",
+                "./out.pdf",
+            ]
+        )
+    captured = capsys.readouterr()
+    assert exit_code == 0, captured
+    assert not captured.err
+    inp_reader = PdfReader(
+        RESOURCES_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf"
+    )
+    out_reader = PdfReader(tmp_path / "out.pdf")
+    assert len(out_reader.pages) == len(inp_reader.pages) - 2
+
+
+@pytest.mark.parametrize(
+    "page_range",
+    ["a", "-", "1-", "1-1-1", "1:1:1:1"],
+)
+def test_rm_subset_invalid_args(
+    capsys: CaptureFixture, tmp_path: Path, page_range: str
+) -> None:
+    with chdir(tmp_path):
+        exit_code = run_cli(
+            [
+                "rm",
+                str(RESOURCES_ROOT / "jpeg.pdf"),
+                page_range,
+                "--output",
+                "./out.pdf",
+            ]
+        )
+    captured = capsys.readouterr()
+    assert exit_code == 2, captured
+    assert "Invalid file path or page range provided" in captured.err
+
+
+def test_rm_subset_warn_on_missing_pages(
+    capsys: CaptureFixture, tmp_path: Path
+) -> None:
+    with chdir(tmp_path):
+        exit_code = run_cli(
+            [
+                "rm",
+                str(RESOURCES_ROOT / "jpeg.pdf"),
+                "2",
+                "--output",
+                "./out.pdf",
+            ]
+        )
+    captured = capsys.readouterr()
+    assert exit_code == 0, captured
+    assert "WARN" in captured.err
+
+
+def test_rm_subset_ensure_reduced_size(
+    tmp_path: Path, two_pages_pdf_filepath: Path
+) -> None:
+    exit_code = run_cli(
+        [
+            "rm",
+            str(two_pages_pdf_filepath),
+            "0",
+            "--output",
+            str(tmp_path / "page1.pdf"),
+        ]
+    )
+    assert exit_code == 0
+    # The extracted PDF should only contain ONE image:
+    embedded_images = extract_embedded_images(tmp_path / "page1.pdf")
+    assert len(embedded_images) == 1
+
+    exit_code = run_cli(
+        [
+            "rm",
+            str(two_pages_pdf_filepath),
+            "1",
+            "--output",
+            str(tmp_path / "page2.pdf"),
+        ]
+    )
+    assert exit_code == 0
+    # The extracted PDF should only contain ONE image:
+    embedded_images = extract_embedded_images(tmp_path / "page2.pdf")
+    assert len(embedded_images) == 1
+
+
+def test_rm_combine_files(
+    pdf_file_100: Path,
+    pdf_file_abc: Path,
+    tmp_path: Path,
+    capsys: CaptureFixture,
+) -> None:
+    with chdir(tmp_path):
+        output_pdf_path = tmp_path / "out.pdf"
+
+        # Run pdfly rm command
+        exit_code = run_cli(
+            [
+                "rm",
+                str(pdf_file_100),
+                "1:10:2",
+                str(pdf_file_abc),
+                "::2",
+                str(pdf_file_abc),
+                "1::2",
+                "--output",
+                str(output_pdf_path),
+            ]
+        )
+        captured = capsys.readouterr()
+
+        # Check if the command was successful
+        assert exit_code == 0, captured.out
+
+        # Extract the text of each page of the output PDF
+        extracted_pages = []
+        reader = PdfReader(output_pdf_path)
+        extracted_pages = [page.extract_text() for page in reader.pages]
+
+        # Compare the extracted text
+        l1 = [str(el) for el in list(range(0, 10, 2)) + list(range(10, 100))]
+        assert extracted_pages == l1 + [
+            "b",
+            "d",
+            "f",
+            "h",
+            "j",
+            "l",
+            "n",
+            "p",
+            "r",
+            "t",
+            "v",
+            "x",
+            "z",
+            "a",
+            "c",
+            "e",
+            "g",
+            "i",
+            "k",
+            "m",
+            "o",
+            "q",
+            "s",
+            "u",
+            "w",
+            "y",
+        ]
+
+
+@pytest.mark.parametrize(
+    ("page_range", "expected"),
+    [
+        ("22", [str(el) for el in range(100) if el != 22]),
+        ("0:3", [str(el) for el in range(3, 100)]),
+        (":3", [str(el) for el in range(3, 100)]),
+        (":", []),
+        ("5:", ["0", "1", "2", "3", "4"]),
+        ("::2", [str(el) for el in list(range(100))[1::2]]),
+        (
+            "1:10:2",
+            [str(el) for el in list(range(0, 10, 2)) + list(range(10, 100))],
+        ),
+        ("::1", []),
+        ("::-1", []),
+    ],
+)
+def test_rm_commands(
+    pdf_file_100: Path,
+    capsys: CaptureFixture,
+    tmp_path: Path,
+    page_range: str,
+    expected: List[str],
+) -> None:
+    with chdir(tmp_path):
+        output_pdf_path = tmp_path / "out.pdf"
+
+        # Run pdfly rm command
+        exit_code = run_cli(
+            [
+                "rm",
+                str(pdf_file_100),
+                page_range,
+                "--output",
+                str(output_pdf_path),
+            ]
+        )
+
+        # Check if the command was successful
+        assert exit_code == 0
+
+        # Extract the text of each page of the output PDF
+        extracted_pages = []
+        reader = PdfReader(output_pdf_path)
+        extracted_pages = [page.extract_text() for page in reader.pages]
+
+        # Compare the extracted text
+        assert extracted_pages == expected