From b18c8e0db6a313cb300fb75a52501a97c3dab8bd Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Sun, 28 Aug 2022 11:38:08 +0200
Subject: [PATCH 01/22] ENH: Added command update-offsets to adjust offsets and
 lengths.

This command adjusts /Length-entries of stream objects and the xref-offsets
in simple PDF files (ASCII only, one xref section only).
---
 pdfly/cli.py            |  16 +++-
 pdfly/update_offsets.py | 163 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 178 insertions(+), 1 deletion(-)
 create mode 100644 pdfly/update_offsets.py

diff --git a/pdfly/cli.py b/pdfly/cli.py
index 2353d8d..5683ba6 100644
--- a/pdfly/cli.py
+++ b/pdfly/cli.py
@@ -17,6 +17,7 @@
 import pdfly.pagemeta
 import pdfly.rm
 import pdfly.up2
+import pdfly.update_offsets
 import pdfly.x2pdf
 
 
@@ -228,7 +229,20 @@ def compress(
     pdfly.compress.main(pdf, output)
 
 
-@entry_point.command(name="x2pdf", help=pdfly.x2pdf.__doc__)  # type: ignore[misc]
+@entry_point.command(name="update-offsets")  # type: ignore[misc]
+def update_offsets(
+    file_in: Path,
+    file_out: Path,
+    encoding: str = typer.Option(
+        "UTF-8",
+        help="Encoding used to read and write the files, e.g. ISO-8859-1.",
+    ),  # noqa
+    verbose: bool = typer.Option(False, help="Show progress while processing."),  # noqa
+) -> None:
+    pdfly.update_offsets.main(file_in, file_out, encoding, verbose)
+
+
+@entry_point.command(name="x2pdf", help=x2pdf.update_offsets.__doc__)  # type: ignore[misc]
 def x2pdf(
     x: List[Path],
     output: Annotated[
diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
new file mode 100644
index 0000000..8ca47b6
--- /dev/null
+++ b/pdfly/update_offsets.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+"""
+Updates offsets and lengths in a simple PDF file.
+
+The PDF specification requires that the xref section at the end
+of a PDF file has the correct offsets of the PDF's objects.
+It further requires that the dictionary of a stream object
+contains a /Length-entry giving the length of the encoded stream.
+
+When editing a PDF file using a text-editor (e.g. vim) it is
+elaborate to compute or adjust these offsets and lengths.
+
+This command tries to compute /Length-entries of the stream dictionaries
+and the offsets in the xref-section automatically.
+
+It expects that the PDF file has ASCII encoding only. It may
+use ISO-8859-1 or UTF-8 in its comments.
+Therefore it expects that there a no binary streams.
+It expects that there is one xref-section only.
+It expects that the /Length-entries have default values containing
+enough digits, e.g. /Length 000 when the stream consists of 576 bytes.
+
+EXAMPLE
+   update-offsets -v --encoding UTF-8 issue-297.pdf issue-297.out.pdf
+"""
+
+from collections.abc import Iterable
+from pathlib import Path
+import logging
+import re
+import sys
+
+
+def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]:
+    """Iterates over the lines of a pdf-files and updates offsets.
+
+    The input is expected to be a pdf without binary-sections.
+
+    :param lines_in: An Iterable over the lines including line-breaks.
+    :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
+    :return The output is a list of lines to be written
+            in the given encoding.
+    """
+    logger = logging.getLogger("update_lines")
+    re_obj = re.compile(r"^([0-9]+) ([0-9]+) obj *")
+    re_content = re.compile(r"^(.*)")
+    re_length = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL)
+
+    lines_out = []  # lines to be written
+    map_line_offset = {}  # map from line-number to offset
+    map_obj_offset = {}  # map from object-number to offset
+    line_no = 0  # current line-number (starting at 0)
+    offset_out = 0  # current offset in output-file
+    line_xref = None  # line-number of xref-line (in xref-section only)
+    line_startxref = None  # line-number of startxref-line
+    curr_obj = None  # number of current object
+    len_stream = None  # length of stream (in stream only)
+    offset_xref = None  # offset of xref-section
+    map_stream_len = {}  # map from object-number to /Length of stream
+    map_obj_length_line = {}  # map from object-number to /Length-line
+    map_obj_length_line_no = {}  # map from object-number to line_no
+    # of /Length-line
+    for line in lines_in:
+        line_no += 1
+        m_content = re_content.match(line)
+        if m_content is None:
+            raise RuntimeError(f"Line {line_no} without line-break.")
+        content = m_content.group(1)
+        map_line_offset[line_no] = offset_out
+        m_obj = re_obj.match(line)
+        if m_obj is not None:
+            curr_obj = m_obj.group(1)
+            logger.info(f"line {line_no}: object {curr_obj}")
+            map_obj_offset[curr_obj] = int(offset_out)
+        if content == "xref":
+            offset_xref = offset_out
+            line_xref = line_no
+        elif content == "startxref":
+            line_startxref = line_no
+            line_xref = None
+        elif content == "stream":
+            logger.info(f"line {line_no}: start stream")
+            len_stream = 0
+        elif content == "endstream":
+            logger.info(f"line {line_no}: end stream")
+            if curr_obj is None:
+                raise RuntimeError(
+                    f"Line {line_no}: " + "endstream without object-start."
+                )
+            if len_stream is None:
+                raise RuntimeError(f"Line {line_no}: endstream without stream.")
+            logger.info(f"line {line_no}: /Length {len_stream}")
+            map_stream_len[curr_obj] = len_stream
+        elif content == "endobj":
+            curr_obj = None
+        elif curr_obj is not None and len_stream is None:
+            mLength = re_length.match(line)
+            if mLength is not None:
+                logger.info(f"line {line_no}, /Length: {content}")
+                map_obj_length_line[curr_obj] = line
+                map_obj_length_line_no[curr_obj] = line_no
+        elif curr_obj is not None and len_stream is not None:
+            len_stream += len(line.encode(encoding))
+        elif line_xref is not None and line_no > line_xref + 2:
+            objNo = line_no - line_xref - 2
+            if objNo <= len(map_obj_offset) and str(objNo) in map_obj_offset:
+                eol = line[-2:]
+                xrefUpd = ("%010d" % map_obj_offset[str(objNo)]) + " 00000 n"
+                logger.info(f"{content} -> {xrefUpd}")
+                line = xrefUpd + eol
+        elif line_startxref is not None and line_no == line_startxref + 1:
+            if offset_xref is None:
+                raise RuntimeError("startxref without preceding xref-section")
+            line = "%d\n" % offset_xref
+        lines_out.append(line)
+
+        offset_out += len(line.encode(encoding))
+
+    # Some checks
+    if len(map_obj_offset) == 0:
+        raise RuntimeError("The command didn't find any PDF objects.")
+    if offset_xref is None:
+        raise RuntimeError("The command didn't find a xref-section")
+    if line_startxref is None:
+        raise RuntimeError("The command didn't find a startxref-section")
+
+    for curr_obj, stream_len in map_stream_len.items():
+        if not curr_obj in map_obj_length_line:
+            raise RuntimeError(
+                f"obj {curr_obj} with stream-len {len}"
+                + f" has no object-length-line: {map_obj_length_line}"
+            )
+        m_length = re_length.match(map_obj_length_line[curr_obj])
+        prev_length = m_length.group(2)
+        len_digits = len(prev_length)
+        len_format = "%%0%dd" % len_digits
+        updated_length = len_format % stream_len
+        if len(updated_length) > len_digits:
+            raise RuntimeError(
+                f"Not enough digits in /Length-entry {m_length.group(2)}"
+                + f" of object {curr_obj}:"
+                + f" too short to take /Length {updated_length}"
+            )
+        line = m_length.group(1) + updated_length + m_length.group(3)
+        lines_out[map_obj_length_line_no[curr_obj] - 1] = line
+
+    return lines_out
+
+
+def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
+    if verbose:
+        logging.basicConfig(level=logging.INFO)
+        print(f"Read {file_in}")
+
+    with open(file_in, "r") as f:
+        lines_out = update_lines(f, encoding)
+
+    with open(file_out, "wb") as f:
+        for line in lines_out:
+            f.write(line.encode(encoding))
+
+    if verbose:
+        print(f"Wrote {file_out}")

From 25f0ccd02c725b50f2b9dcf8ab3fd641f9f386a5 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Fri, 24 May 2024 22:56:47 +0200
Subject: [PATCH 02/22] BUG: Clear stream-length at new object.

---
 pdfly/update_offsets.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index 8ca47b6..c80a609 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -72,6 +72,8 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]:
             curr_obj = m_obj.group(1)
             logger.info(f"line {line_no}: object {curr_obj}")
             map_obj_offset[curr_obj] = int(offset_out)
+            len_stream = None
+
         if content == "xref":
             offset_xref = offset_out
             line_xref = line_no

From d8f66691b7a40b37103b6627e387bdb1643527f2 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Sun, 3 Nov 2024 21:39:20 +0100
Subject: [PATCH 03/22] DEV: Logging migrated from Python's built-in logging to
 rich's Console.

---
 pdfly/update_offsets.py | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index c80a609..25ccf63 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -26,22 +26,23 @@
 
 from collections.abc import Iterable
 from pathlib import Path
-import logging
+from rich.console import Console
 import re
 import sys
 
 
-def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]:
+def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]:
     """Iterates over the lines of a pdf-files and updates offsets.
 
     The input is expected to be a pdf without binary-sections.
 
     :param lines_in: An Iterable over the lines including line-breaks.
     :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
+    :param console: Console used to print messages.
+    :param verbose: True to activate logging of info-messages.
     :return The output is a list of lines to be written
             in the given encoding.
     """
-    logger = logging.getLogger("update_lines")
     re_obj = re.compile(r"^([0-9]+) ([0-9]+) obj *")
     re_content = re.compile(r"^(.*)")
     re_length = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL)
@@ -70,7 +71,8 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]:
         m_obj = re_obj.match(line)
         if m_obj is not None:
             curr_obj = m_obj.group(1)
-            logger.info(f"line {line_no}: object {curr_obj}")
+            if verbose:
+                console.print(f"line {line_no}: object {curr_obj}")
             map_obj_offset[curr_obj] = int(offset_out)
             len_stream = None
 
@@ -81,24 +83,28 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]:
             line_startxref = line_no
             line_xref = None
         elif content == "stream":
-            logger.info(f"line {line_no}: start stream")
+            if verbose:
+                console.print(f"line {line_no}: start stream")
             len_stream = 0
         elif content == "endstream":
-            logger.info(f"line {line_no}: end stream")
+            if verbose:
+                console.print(f"line {line_no}: end stream")
             if curr_obj is None:
                 raise RuntimeError(
                     f"Line {line_no}: " + "endstream without object-start."
                 )
             if len_stream is None:
                 raise RuntimeError(f"Line {line_no}: endstream without stream.")
-            logger.info(f"line {line_no}: /Length {len_stream}")
+            if verbose:
+                console.print(f"line {line_no}: /Length {len_stream}")
             map_stream_len[curr_obj] = len_stream
         elif content == "endobj":
             curr_obj = None
         elif curr_obj is not None and len_stream is None:
             mLength = re_length.match(line)
             if mLength is not None:
-                logger.info(f"line {line_no}, /Length: {content}")
+                if verbose:
+                    console.print(f"line {line_no}, /Length: {content}")
                 map_obj_length_line[curr_obj] = line
                 map_obj_length_line_no[curr_obj] = line_no
         elif curr_obj is not None and len_stream is not None:
@@ -108,7 +114,8 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]:
             if objNo <= len(map_obj_offset) and str(objNo) in map_obj_offset:
                 eol = line[-2:]
                 xrefUpd = ("%010d" % map_obj_offset[str(objNo)]) + " 00000 n"
-                logger.info(f"{content} -> {xrefUpd}")
+                if verbose:
+                    console.print(f"{content} -> {xrefUpd}")
                 line = xrefUpd + eol
         elif line_startxref is not None and line_no == line_startxref + 1:
             if offset_xref is None:
@@ -150,16 +157,14 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]:
 
 
 def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
-    if verbose:
-        logging.basicConfig(level=logging.INFO)
-        print(f"Read {file_in}")
+    console = Console()
+    console.print(f"Read {file_in}")
 
     with open(file_in, "r") as f:
-        lines_out = update_lines(f, encoding)
+        lines_out = update_lines(f, encoding, console, verbose)
 
     with open(file_out, "wb") as f:
         for line in lines_out:
             f.write(line.encode(encoding))
 
-    if verbose:
-        print(f"Wrote {file_out}")
+    console.print(f"Wrote {file_out}")

From 37b9b9d523f33fc950c22b8d35358d42f7d47ea3 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Sun, 3 Nov 2024 22:26:58 +0100
Subject: [PATCH 04/22] TST: Added test of update-offsets using hello.pdf.

---
 resources/hello-expected.pdf | Bin 0 -> 883 bytes
 resources/hello.pdf          | Bin 0 -> 886 bytes
 tests/test_update_offsets.py |  41 +++++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+)
 create mode 100644 resources/hello-expected.pdf
 create mode 100644 resources/hello.pdf
 create mode 100644 tests/test_update_offsets.py

diff --git a/resources/hello-expected.pdf b/resources/hello-expected.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..c92f5c19dcaedc0199f40d51fc707cb58e86ed7f
GIT binary patch
literal 883
zcmZuv(Qeu>6n*zsoEKEkL}LdMvZ>+$DBHAEMKs#Wwg;COmMX-RV-Vff?_QS_L}%m#
zpWJ(W?zyM4n9cj@dMG-5T0%j>K0b+?8$=7eF%1TUECqlle|a{DrrPVuZV>%2t|@Je
zh_W#r3sV=I8J|^{I-@<s%(OS?&OXI59x5dVF+HUGAk~$WS3SDP*<!ac4&8mZ?VT0l
z%Ue2AyQ@7EqHkr<;3tMm48L1muWU~|`12{X_NMlnm9$xRrYQBq9te~4%Z4$Mj|taX
zXh*?kjW11OcWy;uuq<;+AHyGoG*lapi8AqU<@P@X&$MHXXWcQEmg09)Z~PyqB;|m{
zlX6=juO&XyGXrl$C$~W?B}l%vw=<Xs3L>p2fVbO-NzT`HaA)@h-W)u<MXV^3_P*4}
zi@tp}E{rEIoGYjl`I--HU|#U4`^8k11sUa+9I77jCvltexQ{H4gJ&(;ZeFdzN`=}F
zyGI|)!?+62L^mcV8Bz7U@#RYQC$=iYX@Uo5)<R+=$ScQ%G)V>t)_B7diO37}3CqaE
z7nn*&!wW1{qj%UKBMC3!l8jnwG4FI)kyF_K59MEj5u&g4n#2uOh?dr}N!glVj@tRq
ON1BdAr*nHh7vdl3D&2Je

literal 0
HcmV?d00001

diff --git a/resources/hello.pdf b/resources/hello.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..b2fbc2d59958908746d17a06b9e7f03366bb0abf
GIT binary patch
literal 886
zcmcgq!EW0y487|s-X&;)A+_Y#X*U!(IBB+FD;A{2F53=jtavUQtB{o--Pey);>L%a
zCqt4&k<{bk8>|+~vARygU`#_ONZ6-mad(GkrAyOdLYB1v5aq8g2GLx5UE31TcjMa1
zHi#%1vzahW!I5z<!`vC|$!DRx!Eo^*mT{t#oW!(9`A(`UDX&KKlC#BrV;qKuYB%a7
z<HtKVbGvUm6rwLx(c%XZ28Lf9t`ph|4!*vGHr_OzqtY(w!4#Fw?2&AezS|^5`ab1+
zEA7a+SNPhrcJDSs2Gg>%bQyjWrXkyGMvw`It9JiL*yD~d_Nrqn9mKDuDgAE{E$q;G
zVs0nowZunSZ^2X1$(4x7h3I?Rok1qZiMS#Mo~{&G&d(0;U=Ie~96h{6tSFM^q0-2U
zv3)Tvv?m~3DyS*)Egw3^yx`ODlc{S963S1Rs1fpKai4R$&nVB0XD+&GPEui}Lhi@?
zlMmuyy9%aEw<a(dQTBuJ)kcpqTNmQe!J{)<A#o#!l=HweO(!X~c(W-Ykq4UpWAjc|
rH7S(^@L2sZxIy%}*%G-y3ennH7AZ?J^iey1sw1EjgTei`r4WAsU1QzF

literal 0
HcmV?d00001

diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py
new file mode 100644
index 0000000..197071f
--- /dev/null
+++ b/tests/test_update_offsets.py
@@ -0,0 +1,41 @@
+"""
+Every CLI command is called here with a typer CliRunner.
+
+Here should only be end-to-end tests.
+"""
+
+from pathlib import Path
+
+from .conftest import RESOURCES_ROOT, chdir, run_cli
+
+
+def test_update_offsets(capsys, tmp_path: Path) -> None:
+    # Arrange
+    input = str(RESOURCES_ROOT / "hello.pdf")
+    file_expected = str(RESOURCES_ROOT / "hello-expected.pdf")
+    output = tmp_path / "hello-out.pdf"
+    assert not output.exists()
+
+    # Act
+    exit_code = run_cli(
+        [
+            "update-offsets",
+            str(input),
+            str(output),
+        ]
+    )
+
+    # Assert
+    captured = capsys.readouterr()
+    assert exit_code == 0, captured
+    assert not captured.err
+    assert f"Wrote {output}" in captured.out
+    assert output.exists()
+    with open(file_expected, 'r', encoding='iso-8859-1') as file_exp:
+        lines_exp = file_exp.readlines()
+    with open(output, 'r', encoding='iso-8859-1') as file_act:
+        lines_act = file_act.readlines()
+    assert len(lines_exp) == len(lines_act), f"lines_exp=f{lines_exp}, lines_act=f{lines_act}"
+    for line_no, (line_exp, line_act) in enumerate(zip(lines_exp, lines_act), start = 1):
+        assert line_exp == line_act, f"Lines differ in line {line_no}"
+

From be39e9bdc4d040392e74691b43398e1de81f817b Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Sun, 3 Nov 2024 22:38:18 +0100
Subject: [PATCH 05/22] MAINT: Regex uppercase module constants.

---
 pdfly/update_offsets.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index 25ccf63..3eaa203 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -21,7 +21,7 @@
 enough digits, e.g. /Length 000 when the stream consists of 576 bytes.
 
 EXAMPLE
-   update-offsets -v --encoding UTF-8 issue-297.pdf issue-297.out.pdf
+   update-offsets --verbose --encoding UTF-8 issue-297.pdf issue-297.out.pdf
 """
 
 from collections.abc import Iterable
@@ -30,6 +30,9 @@
 import re
 import sys
 
+RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
+RE_CONTENT = re.compile(r"^(.*)")
+RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL)
 
 def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]:
     """Iterates over the lines of a pdf-files and updates offsets.
@@ -43,9 +46,6 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
     :return The output is a list of lines to be written
             in the given encoding.
     """
-    re_obj = re.compile(r"^([0-9]+) ([0-9]+) obj *")
-    re_content = re.compile(r"^(.*)")
-    re_length = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL)
 
     lines_out = []  # lines to be written
     map_line_offset = {}  # map from line-number to offset
@@ -63,12 +63,12 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
     # of /Length-line
     for line in lines_in:
         line_no += 1
-        m_content = re_content.match(line)
+        m_content = RE_CONTENT.match(line)
         if m_content is None:
             raise RuntimeError(f"Line {line_no} without line-break.")
         content = m_content.group(1)
         map_line_offset[line_no] = offset_out
-        m_obj = re_obj.match(line)
+        m_obj = RE_OBJ.match(line)
         if m_obj is not None:
             curr_obj = m_obj.group(1)
             if verbose:
@@ -101,7 +101,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
         elif content == "endobj":
             curr_obj = None
         elif curr_obj is not None and len_stream is None:
-            mLength = re_length.match(line)
+            mLength = RE_LENGTH.match(line)
             if mLength is not None:
                 if verbose:
                     console.print(f"line {line_no}, /Length: {content}")
@@ -139,7 +139,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
                 f"obj {curr_obj} with stream-len {len}"
                 + f" has no object-length-line: {map_obj_length_line}"
             )
-        m_length = re_length.match(map_obj_length_line[curr_obj])
+        m_length = RE_LENGTH.match(map_obj_length_line[curr_obj])
         prev_length = m_length.group(2)
         len_digits = len(prev_length)
         len_format = "%%0%dd" % len_digits

From 2465ffe26ae5cf1baf163781da120e5b938b8e77 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Sun, 3 Nov 2024 22:41:47 +0100
Subject: [PATCH 06/22] DOC: Add update-offsets command

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index b8bf9dc..0667e65 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ $ pdfly --help
 │ meta             Show metadata of a PDF file                                │
 │ pagemeta         Give details about a single page.                          │
 │ rm               Remove pages from PDF files.                               │
+│ update-offsets   Updates offsets and lengths in a simple PDF file.          │
 │ x2pdf            Convert one or more files to PDF. Each file is a page.     │
 ╰─────────────────────────────────────────────────────────────────────────────╯
 ```

From 8838ca5f6e06235c36b6d1cdd90751f2a42c9402 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Sun, 3 Nov 2024 22:49:46 +0100
Subject: [PATCH 07/22] MAINT: Added suggested help-attribute.

---
 pdfly/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pdfly/cli.py b/pdfly/cli.py
index 5683ba6..36e88f1 100644
--- a/pdfly/cli.py
+++ b/pdfly/cli.py
@@ -229,7 +229,7 @@ def compress(
     pdfly.compress.main(pdf, output)
 
 
-@entry_point.command(name="update-offsets")  # type: ignore[misc]
+@entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__)  # type: ignore[misc]
 def update_offsets(
     file_in: Path,
     file_out: Path,

From 3429c2fae6152f6cab92e3c63ba913ed19ebf585 Mon Sep 17 00:00:00 2001
From: "Cimon Lucas (LCM)" <lucas_cimon@connect-tech.sncf>
Date: Mon, 4 Nov 2024 19:33:24 +0100
Subject: [PATCH 08/22] Minor fixups & adding
 test_update_offsets_on_all_reference_files()

---
 pdfly/update_offsets.py      | 22 +++++++--------
 tests/test_update_offsets.py | 53 ++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 11 deletions(-)

diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index 3eaa203..639cbe7 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -65,7 +65,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
         line_no += 1
         m_content = RE_CONTENT.match(line)
         if m_content is None:
-            raise RuntimeError(f"Line {line_no} without line-break.")
+            raise RuntimeError(f"Invalid PDF file: line {line_no} without line-break.")
         content = m_content.group(1)
         map_line_offset[line_no] = offset_out
         m_obj = RE_OBJ.match(line)
@@ -91,10 +91,10 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
                 console.print(f"line {line_no}: end stream")
             if curr_obj is None:
                 raise RuntimeError(
-                    f"Line {line_no}: " + "endstream without object-start."
+                    f"Invalid PDF file: line {line_no}: endstream without object-start."
                 )
             if len_stream is None:
-                raise RuntimeError(f"Line {line_no}: endstream without stream.")
+                raise RuntimeError(f"Invalid PDF file: line {line_no}: endstream without stream.")
             if verbose:
                 console.print(f"line {line_no}: /Length {len_stream}")
             map_stream_len[curr_obj] = len_stream
@@ -119,7 +119,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
                 line = xrefUpd + eol
         elif line_startxref is not None and line_no == line_startxref + 1:
             if offset_xref is None:
-                raise RuntimeError("startxref without preceding xref-section")
+                raise NotImplementedError("Unsupported file: startxref without preceding xref-section (probable cross-reference stream)")
             line = "%d\n" % offset_xref
         lines_out.append(line)
 
@@ -127,16 +127,16 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
 
     # Some checks
     if len(map_obj_offset) == 0:
-        raise RuntimeError("The command didn't find any PDF objects.")
+        raise RuntimeError("Invalid PDF file: the command didn't find any PDF objects.")
     if offset_xref is None:
-        raise RuntimeError("The command didn't find a xref-section")
+        raise RuntimeError("Invalid PDF file: the command didn't find a xref-section")
     if line_startxref is None:
-        raise RuntimeError("The command didn't find a startxref-section")
+        raise RuntimeError("Invalid PDF file: the command didn't find a startxref-section")
 
     for curr_obj, stream_len in map_stream_len.items():
         if not curr_obj in map_obj_length_line:
             raise RuntimeError(
-                f"obj {curr_obj} with stream-len {len}"
+                f"obj {curr_obj} with stream-len {stream_len}"
                 + f" has no object-length-line: {map_obj_length_line}"
             )
         m_length = RE_LENGTH.match(map_obj_length_line[curr_obj])
@@ -146,7 +146,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
         updated_length = len_format % stream_len
         if len(updated_length) > len_digits:
             raise RuntimeError(
-                f"Not enough digits in /Length-entry {m_length.group(2)}"
+                f"Not enough digits in /Length-entry {prev_length}"
                 + f" of object {curr_obj}:"
                 + f" too short to take /Length {updated_length}"
             )
@@ -160,10 +160,10 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
     console = Console()
     console.print(f"Read {file_in}")
 
-    with open(file_in, "r") as f:
+    with open(file_in, "r", encoding=encoding) as f:
         lines_out = update_lines(f, encoding, console, verbose)
 
-    with open(file_out, "wb") as f:
+    with open(file_out, "wb", encoding=encoding) as f:
         for line in lines_out:
             f.write(line.encode(encoding))
 
diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py
index 197071f..7929604 100644
--- a/tests/test_update_offsets.py
+++ b/tests/test_update_offsets.py
@@ -6,6 +6,8 @@
 
 from pathlib import Path
 
+import pytest
+
 from .conftest import RESOURCES_ROOT, chdir, run_cli
 
 
@@ -39,3 +41,54 @@ def test_update_offsets(capsys, tmp_path: Path) -> None:
     for line_no, (line_exp, line_act) in enumerate(zip(lines_exp, lines_act), start = 1):
         assert line_exp == line_act, f"Lines differ in line {line_no}"
 
+
+
+@pytest.mark.parametrize(
+    "input_pdf_filepath",
+    [
+        "sample-files/002-trivial-libre-office-writer/002-trivial-libre-office-writer.pdf",
+        "sample-files/005-libreoffice-writer-password/libreoffice-writer-password.pdf",
+        "sample-files/007-imagemagick-images/imagemagick-ASCII85Decode.pdf",
+        "sample-files/007-imagemagick-images/imagemagick-CCITTFaxDecode.pdf",
+        "sample-files/007-imagemagick-images/imagemagick-images.pdf",
+        "sample-files/007-imagemagick-images/imagemagick-lzw.pdf",
+        "sample-files/008-reportlab-inline-image/inline-image.pdf",
+        "sample-files/009-pdflatex-geotopo/GeoTopo-komprimiert.pdf",
+        "sample-files/011-google-doc-document/google-doc-document.pdf",
+        "sample-files/012-libreoffice-form/libreoffice-form.pdf",
+        "sample-files/013-reportlab-overlay/reportlab-overlay.pdf",
+        "sample-files/015-arabic/habibi-oneline-cmap.pdf",
+        "sample-files/015-arabic/habibi-rotated.pdf",
+        "sample-files/015-arabic/habibi.pdf",
+        "sample-files/016-libre-office-link/libre-office-link.pdf",
+        "sample-files/017-unreadable-meta-data/unreadablemetadata.pdf",
+        "sample-files/018-base64-image/base64image.pdf",
+        "sample-files/019-grayscale-image/grayscale-image.pdf",
+        "sample-files/020-xmp/output_with_metadata_pymupdf.pdf",
+        "sample-files/021-pdfa/crazyones-pdfa.pdf",
+        "sample-files/022-pdfkit/pdfkit.pdf",
+        "sample-files/023-cmyk-image/cmyk-image.pdf",
+        "sample-files/024-annotations/annotated_pdf.pdf",
+        "sample-files/025-attachment/with-attachment.pdf",
+    ]
+)
+def test_update_offsets_on_all_reference_files(capsys, tmp_path: Path, input_pdf_filepath: Path) -> None:
+    # Arrange
+    output_pdf_filepath = tmp_path / "out.pdf"
+
+    # Act
+    exit_code = run_cli(
+        [
+            "update-offsets",
+            "--encoding", "iso-8859-1",
+            input_pdf_filepath,
+            str(output_pdf_filepath),
+        ]
+    )
+
+    # Assert
+    captured = capsys.readouterr()
+    assert exit_code == 0, captured
+    assert not captured.err
+    assert f"Wrote {output_pdf_filepath}" in captured.out
+    assert output_pdf_filepath.exists()

From 10ce5048cfaf274e450c4d3cf00eb07fda77d31c Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Mon, 4 Nov 2024 22:01:13 +0100
Subject: [PATCH 09/22] MAINT: Bugfix help-attribute of x2pdf

---
 pdfly/cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pdfly/cli.py b/pdfly/cli.py
index 36e88f1..3d7d434 100644
--- a/pdfly/cli.py
+++ b/pdfly/cli.py
@@ -242,7 +242,7 @@ def update_offsets(
     pdfly.update_offsets.main(file_in, file_out, encoding, verbose)
 
 
-@entry_point.command(name="x2pdf", help=x2pdf.update_offsets.__doc__)  # type: ignore[misc]
+@entry_point.command(name="x2pdf", help=pdfly.x2pdf.__doc__)  # type: ignore[misc]
 def x2pdf(
     x: List[Path],
     output: Annotated[

From 9b0138a8dd4897f57f1fce0a0612546c47bf295f Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Tue, 5 Nov 2024 00:14:57 +0100
Subject: [PATCH 10/22] ENH: Support of referenced lengths.

---
 pdfly/cli.py                 |   4 +-
 pdfly/update_offsets.py      |  89 +++++++++++++++++++++++++----------
 resources/hello-expected.pdf | Bin 883 -> 883 bytes
 3 files changed, 66 insertions(+), 27 deletions(-)

diff --git a/pdfly/cli.py b/pdfly/cli.py
index 3d7d434..63b1e15 100644
--- a/pdfly/cli.py
+++ b/pdfly/cli.py
@@ -234,8 +234,8 @@ def update_offsets(
     file_in: Path,
     file_out: Path,
     encoding: str = typer.Option(
-        "UTF-8",
-        help="Encoding used to read and write the files, e.g. ISO-8859-1.",
+        "ISO-8859-1",
+        help="Encoding used to read and write the files, e.g. UTF-8.",
     ),  # noqa
     verbose: bool = typer.Option(False, help="Show progress while processing."),  # noqa
 ) -> None:
diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index 639cbe7..9b1602c 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -15,13 +15,13 @@
 
 It expects that the PDF file has ASCII encoding only. It may
 use ISO-8859-1 or UTF-8 in its comments.
-Therefore it expects that there a no binary streams.
+The current implementation incorrectly replaces CR (0x0d) by LF (0x0a) in binary data.
 It expects that there is one xref-section only.
 It expects that the /Length-entries have default values containing
 enough digits, e.g. /Length 000 when the stream consists of 576 bytes.
 
 EXAMPLE
-   update-offsets --verbose --encoding UTF-8 issue-297.pdf issue-297.out.pdf
+   update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf
 """
 
 from collections.abc import Iterable
@@ -30,9 +30,12 @@
 import re
 import sys
 
+# Here, only simple regular expressions are used.
+# Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better.
 RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
 RE_CONTENT = re.compile(r"^(.*)")
-RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL)
+RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
+RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ /].*)", re.DOTALL)
 
 def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]:
     """Iterates over the lines of a pdf-files and updates offsets.
@@ -50,6 +53,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
     lines_out = []  # lines to be written
     map_line_offset = {}  # map from line-number to offset
     map_obj_offset = {}  # map from object-number to offset
+    map_obj_line = {}  # map from object-number to line-number
     line_no = 0  # current line-number (starting at 0)
     offset_out = 0  # current offset in output-file
     line_xref = None  # line-number of xref-line (in xref-section only)
@@ -59,7 +63,8 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
     offset_xref = None  # offset of xref-section
     map_stream_len = {}  # map from object-number to /Length of stream
     map_obj_length_line = {}  # map from object-number to /Length-line
-    map_obj_length_line_no = {}  # map from object-number to line_no
+    map_obj_length_ref = {}  # map from object-number to /Length-reference (e.g. "3")
+    map_obj_length_line_no = {}  # map from object-number to line_no of length
     # of /Length-line
     for line in lines_in:
         line_no += 1
@@ -71,9 +76,13 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
         m_obj = RE_OBJ.match(line)
         if m_obj is not None:
             curr_obj = m_obj.group(1)
+            curr_gen = m_obj.group(2)
             if verbose:
                 console.print(f"line {line_no}: object {curr_obj}")
+            if curr_gen != "0":
+                raise RuntimeError(f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported.")
             map_obj_offset[curr_obj] = int(offset_out)
+            map_obj_line[curr_obj] = line_no
             len_stream = None
 
         if content == "xref":
@@ -95,18 +104,28 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
                 )
             if len_stream is None:
                 raise RuntimeError(f"Invalid PDF file: line {line_no}: endstream without stream.")
+            if len_stream > 0:
+                len_stream = len_stream - 1 # ignore the last EOL
             if verbose:
-                console.print(f"line {line_no}: /Length {len_stream}")
+                console.print(f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}")
             map_stream_len[curr_obj] = len_stream
         elif content == "endobj":
             curr_obj = None
         elif curr_obj is not None and len_stream is None:
-            mLength = RE_LENGTH.match(line)
-            if mLength is not None:
+            m_length_ref = RE_LENGTH_REF.match(line)
+            if m_length_ref is not None:
+                len_obj = m_length_ref.group(2)
+                len_obj_gen = m_length_ref.group(3)
                 if verbose:
-                    console.print(f"line {line_no}, /Length: {content}")
-                map_obj_length_line[curr_obj] = line
-                map_obj_length_line_no[curr_obj] = line_no
+                    console.print(f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}")
+                map_obj_length_ref[curr_obj] = len_obj
+            else:
+                m_length = RE_LENGTH.match(line)
+                if m_length is not None:
+                    if verbose:
+                        console.print(f"line {line_no}, /Length: {content}")
+                    map_obj_length_line[curr_obj] = line
+                    map_obj_length_line_no[curr_obj] = line_no
         elif curr_obj is not None and len_stream is not None:
             len_stream += len(line.encode(encoding))
         elif line_xref is not None and line_no > line_xref + 2:
@@ -134,24 +153,44 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
         raise RuntimeError("Invalid PDF file: the command didn't find a startxref-section")
 
     for curr_obj, stream_len in map_stream_len.items():
-        if not curr_obj in map_obj_length_line:
+        if curr_obj in map_obj_length_line:
+            m_length = RE_LENGTH.match(map_obj_length_line[curr_obj])
+            prev_length = m_length.group(2)
+            len_digits = len(prev_length)
+            len_format = "%%0%dd" % len_digits
+            updated_length = len_format % stream_len
+            if len(updated_length) > len_digits:
+                raise RuntimeError(
+                    f"Not enough digits in /Length-entry {prev_length}"
+                    + f" of object {curr_obj}:"
+                    + f" too short to take /Length {updated_length}"
+                )
+            line = m_length.group(1) + updated_length + m_length.group(3)
+            lines_out[map_obj_length_line_no[curr_obj] - 1] = line
+        elif curr_obj in map_obj_length_ref:
+            len_obj = map_obj_length_ref[curr_obj]
+            if not len_obj in map_obj_line:
+                raise RuntimeError(f"obj {curr_obj} has unknown length-obj {len_obj}")
+            len_obj_line = map_obj_line[len_obj]
+            prev_length = lines_out[len_obj_line][:-1]
+            len_digits = len(prev_length)
+            len_format = "%%0%dd" % len_digits
+            updated_length = len_format % stream_len
+            if len(updated_length) > len_digits:
+                raise RuntimeError(
+                    f"Not enough digits in /Length-ref-entry {prev_length}"
+                    + f" of object {curr_obj} and len-object {len_obj}:"
+                    + f" too short to take /Length {updated_length}"
+                )
+            if prev_length != updated_length:
+                if verbose:
+                    console.print(f"line {line_no}, ref-len {len_obj} of {curr_obj}: {prev_length} -> {updated_length}")
+                lines_out[len_obj_line] = updated_length + '\n'
+        else:
             raise RuntimeError(
                 f"obj {curr_obj} with stream-len {stream_len}"
                 + f" has no object-length-line: {map_obj_length_line}"
             )
-        m_length = RE_LENGTH.match(map_obj_length_line[curr_obj])
-        prev_length = m_length.group(2)
-        len_digits = len(prev_length)
-        len_format = "%%0%dd" % len_digits
-        updated_length = len_format % stream_len
-        if len(updated_length) > len_digits:
-            raise RuntimeError(
-                f"Not enough digits in /Length-entry {prev_length}"
-                + f" of object {curr_obj}:"
-                + f" too short to take /Length {updated_length}"
-            )
-        line = m_length.group(1) + updated_length + m_length.group(3)
-        lines_out[map_obj_length_line_no[curr_obj] - 1] = line
 
     return lines_out
 
@@ -163,7 +202,7 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
     with open(file_in, "r", encoding=encoding) as f:
         lines_out = update_lines(f, encoding, console, verbose)
 
-    with open(file_out, "wb", encoding=encoding) as f:
+    with open(file_out, "wb") as f:
         for line in lines_out:
             f.write(line.encode(encoding))
 
diff --git a/resources/hello-expected.pdf b/resources/hello-expected.pdf
index c92f5c19dcaedc0199f40d51fc707cb58e86ed7f..75f7829fb686f024ac70cfbb30b80367845e0341 100644
GIT binary patch
delta 14
Wcmey&_L*(NL`Fu_&65}(G6DcD9|e;D

delta 14
Wcmey&_L*(NL`Ful&65}(G6DcDB?XiK


From e0a32ff9f4e3aaf8d8d1d762001685fbb95ed92f Mon Sep 17 00:00:00 2001
From: Sascha Rogmann <59577610+srogmann@users.noreply.github.com>
Date: Tue, 5 Nov 2024 21:35:28 +0100
Subject: [PATCH 11/22] TST: Renamed test PDF file.

Co-authored-by: Lucas Cimon <925560+Lucas-C@users.noreply.github.com>
---
 tests/test_update_offsets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py
index 7929604..9da2e09 100644
--- a/tests/test_update_offsets.py
+++ b/tests/test_update_offsets.py
@@ -13,7 +13,7 @@
 
 def test_update_offsets(capsys, tmp_path: Path) -> None:
     # Arrange
-    input = str(RESOURCES_ROOT / "hello.pdf")
+    input = str(RESOURCES_ROOT / "file-with-invalid-offsets.pdf")
     file_expected = str(RESOURCES_ROOT / "hello-expected.pdf")
     output = tmp_path / "hello-out.pdf"
     assert not output.exists()

From 4f003e586029a31b6c7f0eb7c5e15df639198655 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Tue, 5 Nov 2024 22:43:44 +0100
Subject: [PATCH 12/22] TST: Renamed test PDF file.

---
 ...llo-expected.pdf => file-with-fixed-offsets.pdf} | Bin
 .../{hello.pdf => file-with-invalid-offsets.pdf}    | Bin
 tests/test_update_offsets.py                        |   4 ++--
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename resources/{hello-expected.pdf => file-with-fixed-offsets.pdf} (100%)
 rename resources/{hello.pdf => file-with-invalid-offsets.pdf} (100%)

diff --git a/resources/hello-expected.pdf b/resources/file-with-fixed-offsets.pdf
similarity index 100%
rename from resources/hello-expected.pdf
rename to resources/file-with-fixed-offsets.pdf
diff --git a/resources/hello.pdf b/resources/file-with-invalid-offsets.pdf
similarity index 100%
rename from resources/hello.pdf
rename to resources/file-with-invalid-offsets.pdf
diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py
index 9da2e09..3f414c7 100644
--- a/tests/test_update_offsets.py
+++ b/tests/test_update_offsets.py
@@ -14,8 +14,8 @@
 def test_update_offsets(capsys, tmp_path: Path) -> None:
     # Arrange
     input = str(RESOURCES_ROOT / "file-with-invalid-offsets.pdf")
-    file_expected = str(RESOURCES_ROOT / "hello-expected.pdf")
-    output = tmp_path / "hello-out.pdf"
+    file_expected = str(RESOURCES_ROOT / "file-with-fixed-offsets.pdf")
+    output = tmp_path / "file-with-offsets-out.pdf"
     assert not output.exists()
 
     # Act

From 47b16d416837224e4e315eecc8f6729cee8be366 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Tue, 5 Nov 2024 22:57:42 +0100
Subject: [PATCH 13/22] TST: rich.console introduces line-breaks in output.

---
 tests/test_update_offsets.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py
index 3f414c7..b05b1b9 100644
--- a/tests/test_update_offsets.py
+++ b/tests/test_update_offsets.py
@@ -7,6 +7,7 @@
 from pathlib import Path
 
 import pytest
+import re
 
 from .conftest import RESOURCES_ROOT, chdir, run_cli
 
@@ -31,7 +32,7 @@ def test_update_offsets(capsys, tmp_path: Path) -> None:
     captured = capsys.readouterr()
     assert exit_code == 0, captured
     assert not captured.err
-    assert f"Wrote {output}" in captured.out
+    assert re.search(r"Wrote\s+" + re.escape(str(output)), captured.out)
     assert output.exists()
     with open(file_expected, 'r', encoding='iso-8859-1') as file_exp:
         lines_exp = file_exp.readlines()

From dd1be3b4acc16324d1e476d6fd177a2207f0b371 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Tue, 5 Nov 2024 23:10:15 +0100
Subject: [PATCH 14/22] MAINT: Changed /Length detection to support
 GeoTopo-komprimiert.pdf

---
 pdfly/update_offsets.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index 9b1602c..403895a 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -35,7 +35,7 @@
 RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
 RE_CONTENT = re.compile(r"^(.*)")
 RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
-RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ /].*)", re.DOTALL)
+RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ /\r\n].*)", re.DOTALL)
 
 def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]:
     """Iterates over the lines of a pdf-files and updates offsets.
@@ -207,3 +207,4 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
             f.write(line.encode(encoding))
 
     console.print(f"Wrote {file_out}")
+

From 91468979f31fe35f95c297625bacdebc833fe60d Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Tue, 5 Nov 2024 23:15:41 +0100
Subject: [PATCH 15/22] MAINT: Changed /Length detection to support
 output_with_metadata_pymupdf.pdf

---
 pdfly/update_offsets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index 403895a..31f4668 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -35,7 +35,7 @@
 RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
 RE_CONTENT = re.compile(r"^(.*)")
 RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
-RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ /\r\n].*)", re.DOTALL)
+RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\r\n].*)", re.DOTALL)
 
 def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]:
     """Iterates over the lines of a pdf-files and updates offsets.

From 6d72f5acc82d3d8c59cf529d5e878e457af944c4 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Tue, 5 Nov 2024 23:26:38 +0100
Subject: [PATCH 16/22] MAINT: Changed /Length detection (PDF ref 3.1
 white-space characters)

---
 pdfly/update_offsets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index 31f4668..b9ede63 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -35,7 +35,7 @@
 RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
 RE_CONTENT = re.compile(r"^(.*)")
 RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
-RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\r\n].*)", re.DOTALL)
+RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\t\f\r\n].*)", re.DOTALL)
 
 def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]:
     """Iterates over the lines of a pdf-files and updates offsets.

From 657955b7bf6b5d066ce5151f133a275fbfad5a03 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Tue, 5 Nov 2024 23:56:01 +0100
Subject: [PATCH 17/22] MAINT: Don't replace pseudo line-breaks in binary parts
 of a pdf file.

---
 pdfly/update_offsets.py | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index b9ede63..e70f238 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -35,7 +35,7 @@
 RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
 RE_CONTENT = re.compile(r"^(.*)")
 RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
-RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\t\f\r\n].*)", re.DOTALL)
+RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL)
 
 def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]:
     """Iterates over the lines of a pdf-files and updates offsets.
@@ -194,13 +194,48 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
 
     return lines_out
 
+def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
+    """Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
+    Encoding utf-8 can't be used to read random binary data.
+
+    :param file_path: file to be read line by line
+    :param encoding: encoding to be used (e.g. "iso-8859-1")
+    :return lines including line-breaks
+    """
+    chunks = []
+    with open(file_path, 'rb') as file:
+        buffer = bytearray()
+        while True:
+            chunk = file.read(4096)  # Read in chunks of 4096 bytes
+            if not chunk:
+                break  # End of file
+
+            buffer += chunk
+
+            # Split buffer into chunks based on LF, CR, or CRLF
+            while True:
+                match = re.search(b'(\x0D\x0A|\x0A|\x0D)', buffer)
+                if not match:
+                    break  # No more line breaks found, process the remaining buffer
+
+                start, end = match.start(), match.end()
+                chunk_str = buffer[:end].decode(encoding, errors='strict')
+                buffer = buffer[end:]
+
+                chunks.append(chunk_str)
+
+        # Handle the last chunk
+        if buffer:
+            chunks.append(buffer.decode(encoding, errors='strict'))
+
+    return chunks
 
 def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
     console = Console()
     console.print(f"Read {file_in}")
 
-    with open(file_in, "r", encoding=encoding) as f:
-        lines_out = update_lines(f, encoding, console, verbose)
+    lines_in = read_binary_file(file_in, encoding)
+    lines_out = update_lines(lines_in, encoding, console, verbose)
 
     with open(file_out, "wb") as f:
         for line in lines_out:

From 5c3b92c18d6e5e4d0ec13ee34e1911eff32293c9 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Wed, 6 Nov 2024 22:41:05 +0100
Subject: [PATCH 18/22] MAINT: EOL can be CR, LF or CRLF.

---
 pdfly/update_offsets.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index e70f238..c02e3d7 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -33,7 +33,7 @@
 # Here, only simple regular expressions are used.
 # Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better.
 RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
-RE_CONTENT = re.compile(r"^(.*)")
+RE_CONTENT = re.compile(r"^([^\r\n]*)", re.DOTALL)
 RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
 RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL)
 
@@ -66,8 +66,8 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
     map_obj_length_ref = {}  # map from object-number to /Length-reference (e.g. "3")
     map_obj_length_line_no = {}  # map from object-number to line_no of length
     # of /Length-line
-    for line in lines_in:
-        line_no += 1
+    for idx, line in enumerate(lines_in):
+        line_no = idx + 1
         m_content = RE_CONTENT.match(line)
         if m_content is None:
             raise RuntimeError(f"Invalid PDF file: line {line_no} without line-break.")
@@ -105,7 +105,8 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
             if len_stream is None:
                 raise RuntimeError(f"Invalid PDF file: line {line_no}: endstream without stream.")
             if len_stream > 0:
-                len_stream = len_stream - 1 # ignore the last EOL
+                # Ignore the last EOL
+                len_stream = len_stream - 2 if lines_in[idx - 1][-2:] == '\r\n' else len_stream - 1
             if verbose:
                 console.print(f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}")
             map_stream_len[curr_obj] = len_stream

From 68a352fb05b0ac23be17b2dbb7b625fd1ccbfbc8 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Wed, 6 Nov 2024 23:00:55 +0100
Subject: [PATCH 19/22] TST: Disabled some documents which are not supported.

---
 tests/test_update_offsets.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py
index b05b1b9..132b902 100644
--- a/tests/test_update_offsets.py
+++ b/tests/test_update_offsets.py
@@ -42,7 +42,7 @@ def test_update_offsets(capsys, tmp_path: Path) -> None:
     for line_no, (line_exp, line_act) in enumerate(zip(lines_exp, lines_act), start = 1):
         assert line_exp == line_act, f"Lines differ in line {line_no}"
 
-
+# The current implementation doesn't support valid PDF lines as "/Length 5470>> stream".
 
 @pytest.mark.parametrize(
     "input_pdf_filepath",
@@ -55,18 +55,18 @@ def test_update_offsets(capsys, tmp_path: Path) -> None:
         "sample-files/007-imagemagick-images/imagemagick-lzw.pdf",
         "sample-files/008-reportlab-inline-image/inline-image.pdf",
         "sample-files/009-pdflatex-geotopo/GeoTopo-komprimiert.pdf",
-        "sample-files/011-google-doc-document/google-doc-document.pdf",
+        # "sample-files/011-google-doc-document/google-doc-document.pdf", # stream token in line after /Length
         "sample-files/012-libreoffice-form/libreoffice-form.pdf",
         "sample-files/013-reportlab-overlay/reportlab-overlay.pdf",
         "sample-files/015-arabic/habibi-oneline-cmap.pdf",
         "sample-files/015-arabic/habibi-rotated.pdf",
         "sample-files/015-arabic/habibi.pdf",
         "sample-files/016-libre-office-link/libre-office-link.pdf",
-        "sample-files/017-unreadable-meta-data/unreadablemetadata.pdf",
+        # "sample-files/017-unreadable-meta-data/unreadablemetadata.pdf", # stream in line after object
         "sample-files/018-base64-image/base64image.pdf",
-        "sample-files/019-grayscale-image/grayscale-image.pdf",
+        # "sample-files/019-grayscale-image/grayscale-image.pdf", # stream in line after object
         "sample-files/020-xmp/output_with_metadata_pymupdf.pdf",
-        "sample-files/021-pdfa/crazyones-pdfa.pdf",
+        # "sample-files/021-pdfa/crazyones-pdfa.pdf", # stream in line is after dictionary
         "sample-files/022-pdfkit/pdfkit.pdf",
         "sample-files/023-cmyk-image/cmyk-image.pdf",
         "sample-files/024-annotations/annotated_pdf.pdf",

From 51ed725b27e12e8257dcf09cd75bc43d64ec6bc9 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Wed, 6 Nov 2024 23:11:12 +0100
Subject: [PATCH 20/22] MAINT: black (code formatting)

---
 pdfly/cli.py                 |  4 +-
 pdfly/update_offsets.py      | 76 ++++++++++++++++++++++++++----------
 tests/test_update_offsets.py | 23 +++++++----
 3 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/pdfly/cli.py b/pdfly/cli.py
index 63b1e15..2a7463c 100644
--- a/pdfly/cli.py
+++ b/pdfly/cli.py
@@ -237,7 +237,9 @@ def update_offsets(
         "ISO-8859-1",
         help="Encoding used to read and write the files, e.g. UTF-8.",
     ),  # noqa
-    verbose: bool = typer.Option(False, help="Show progress while processing."),  # noqa
+    verbose: bool = typer.Option(
+        False, help="Show progress while processing."
+    ),  # noqa
 ) -> None:
     pdfly.update_offsets.main(file_in, file_out, encoding, verbose)
 
diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index c02e3d7..d33954c 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -35,9 +35,14 @@
 RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
 RE_CONTENT = re.compile(r"^([^\r\n]*)", re.DOTALL)
 RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
-RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL)
+RE_LENGTH = re.compile(
+    r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL
+)
 
-def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]:
+
+def update_lines(
+    lines_in: Iterable[str], encoding: str, console: Console, verbose: bool
+) -> Iterable[str]:
     """Iterates over the lines of a pdf-files and updates offsets.
 
     The input is expected to be a pdf without binary-sections.
@@ -63,14 +68,18 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
     offset_xref = None  # offset of xref-section
     map_stream_len = {}  # map from object-number to /Length of stream
     map_obj_length_line = {}  # map from object-number to /Length-line
-    map_obj_length_ref = {}  # map from object-number to /Length-reference (e.g. "3")
+    map_obj_length_ref = (
+        {}
+    )  # map from object-number to /Length-reference (e.g. "3")
     map_obj_length_line_no = {}  # map from object-number to line_no of length
     # of /Length-line
     for idx, line in enumerate(lines_in):
         line_no = idx + 1
         m_content = RE_CONTENT.match(line)
         if m_content is None:
-            raise RuntimeError(f"Invalid PDF file: line {line_no} without line-break.")
+            raise RuntimeError(
+                f"Invalid PDF file: line {line_no} without line-break."
+            )
         content = m_content.group(1)
         map_line_offset[line_no] = offset_out
         m_obj = RE_OBJ.match(line)
@@ -80,7 +89,9 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
             if verbose:
                 console.print(f"line {line_no}: object {curr_obj}")
             if curr_gen != "0":
-                raise RuntimeError(f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported.")
+                raise RuntimeError(
+                    f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported."
+                )
             map_obj_offset[curr_obj] = int(offset_out)
             map_obj_line[curr_obj] = line_no
             len_stream = None
@@ -103,12 +114,20 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
                     f"Invalid PDF file: line {line_no}: endstream without object-start."
                 )
             if len_stream is None:
-                raise RuntimeError(f"Invalid PDF file: line {line_no}: endstream without stream.")
+                raise RuntimeError(
+                    f"Invalid PDF file: line {line_no}: endstream without stream."
+                )
             if len_stream > 0:
                 # Ignore the last EOL
-                len_stream = len_stream - 2 if lines_in[idx - 1][-2:] == '\r\n' else len_stream - 1
+                len_stream = (
+                    len_stream - 2
+                    if lines_in[idx - 1][-2:] == "\r\n"
+                    else len_stream - 1
+                )
             if verbose:
-                console.print(f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}")
+                console.print(
+                    f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}"
+                )
             map_stream_len[curr_obj] = len_stream
         elif content == "endobj":
             curr_obj = None
@@ -118,7 +137,9 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
                 len_obj = m_length_ref.group(2)
                 len_obj_gen = m_length_ref.group(3)
                 if verbose:
-                    console.print(f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}")
+                    console.print(
+                        f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}"
+                    )
                 map_obj_length_ref[curr_obj] = len_obj
             else:
                 m_length = RE_LENGTH.match(line)
@@ -139,7 +160,9 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
                 line = xrefUpd + eol
         elif line_startxref is not None and line_no == line_startxref + 1:
             if offset_xref is None:
-                raise NotImplementedError("Unsupported file: startxref without preceding xref-section (probable cross-reference stream)")
+                raise NotImplementedError(
+                    "Unsupported file: startxref without preceding xref-section (probable cross-reference stream)"
+                )
             line = "%d\n" % offset_xref
         lines_out.append(line)
 
@@ -147,11 +170,17 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
 
     # Some checks
     if len(map_obj_offset) == 0:
-        raise RuntimeError("Invalid PDF file: the command didn't find any PDF objects.")
+        raise RuntimeError(
+            "Invalid PDF file: the command didn't find any PDF objects."
+        )
     if offset_xref is None:
-        raise RuntimeError("Invalid PDF file: the command didn't find a xref-section")
+        raise RuntimeError(
+            "Invalid PDF file: the command didn't find a xref-section"
+        )
     if line_startxref is None:
-        raise RuntimeError("Invalid PDF file: the command didn't find a startxref-section")
+        raise RuntimeError(
+            "Invalid PDF file: the command didn't find a startxref-section"
+        )
 
     for curr_obj, stream_len in map_stream_len.items():
         if curr_obj in map_obj_length_line:
@@ -171,7 +200,9 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
         elif curr_obj in map_obj_length_ref:
             len_obj = map_obj_length_ref[curr_obj]
             if not len_obj in map_obj_line:
-                raise RuntimeError(f"obj {curr_obj} has unknown length-obj {len_obj}")
+                raise RuntimeError(
+                    f"obj {curr_obj} has unknown length-obj {len_obj}"
+                )
             len_obj_line = map_obj_line[len_obj]
             prev_length = lines_out[len_obj_line][:-1]
             len_digits = len(prev_length)
@@ -185,8 +216,10 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
                 )
             if prev_length != updated_length:
                 if verbose:
-                    console.print(f"line {line_no}, ref-len {len_obj} of {curr_obj}: {prev_length} -> {updated_length}")
-                lines_out[len_obj_line] = updated_length + '\n'
+                    console.print(
+                        f"line {line_no}, ref-len {len_obj} of {curr_obj}: {prev_length} -> {updated_length}"
+                    )
+                lines_out[len_obj_line] = updated_length + "\n"
         else:
             raise RuntimeError(
                 f"obj {curr_obj} with stream-len {stream_len}"
@@ -195,6 +228,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo
 
     return lines_out
 
+
 def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
     """Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
     Encoding utf-8 can't be used to read random binary data.
@@ -204,7 +238,7 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
     :return lines including line-breaks
     """
     chunks = []
-    with open(file_path, 'rb') as file:
+    with open(file_path, "rb") as file:
         buffer = bytearray()
         while True:
             chunk = file.read(4096)  # Read in chunks of 4096 bytes
@@ -215,22 +249,23 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
 
             # Split buffer into chunks based on LF, CR, or CRLF
             while True:
-                match = re.search(b'(\x0D\x0A|\x0A|\x0D)', buffer)
+                match = re.search(b"(\x0D\x0A|\x0A|\x0D)", buffer)
                 if not match:
                     break  # No more line breaks found, process the remaining buffer
 
                 start, end = match.start(), match.end()
-                chunk_str = buffer[:end].decode(encoding, errors='strict')
+                chunk_str = buffer[:end].decode(encoding, errors="strict")
                 buffer = buffer[end:]
 
                 chunks.append(chunk_str)
 
         # Handle the last chunk
         if buffer:
-            chunks.append(buffer.decode(encoding, errors='strict'))
+            chunks.append(buffer.decode(encoding, errors="strict"))
 
     return chunks
 
+
 def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
     console = Console()
     console.print(f"Read {file_in}")
@@ -243,4 +278,3 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
             f.write(line.encode(encoding))
 
     console.print(f"Wrote {file_out}")
-
diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py
index 132b902..94aff4b 100644
--- a/tests/test_update_offsets.py
+++ b/tests/test_update_offsets.py
@@ -34,16 +34,22 @@ def test_update_offsets(capsys, tmp_path: Path) -> None:
     assert not captured.err
     assert re.search(r"Wrote\s+" + re.escape(str(output)), captured.out)
     assert output.exists()
-    with open(file_expected, 'r', encoding='iso-8859-1') as file_exp:
+    with open(file_expected, "r", encoding="iso-8859-1") as file_exp:
         lines_exp = file_exp.readlines()
-    with open(output, 'r', encoding='iso-8859-1') as file_act:
+    with open(output, "r", encoding="iso-8859-1") as file_act:
         lines_act = file_act.readlines()
-    assert len(lines_exp) == len(lines_act), f"lines_exp=f{lines_exp}, lines_act=f{lines_act}"
-    for line_no, (line_exp, line_act) in enumerate(zip(lines_exp, lines_act), start = 1):
+    assert len(lines_exp) == len(
+        lines_act
+    ), f"lines_exp=f{lines_exp}, lines_act=f{lines_act}"
+    for line_no, (line_exp, line_act) in enumerate(
+        zip(lines_exp, lines_act), start=1
+    ):
         assert line_exp == line_act, f"Lines differ in line {line_no}"
 
+
 # The current implementation doesn't support valid PDF lines as "/Length 5470>> stream".
 
+
 @pytest.mark.parametrize(
     "input_pdf_filepath",
     [
@@ -71,9 +77,11 @@ def test_update_offsets(capsys, tmp_path: Path) -> None:
         "sample-files/023-cmyk-image/cmyk-image.pdf",
         "sample-files/024-annotations/annotated_pdf.pdf",
         "sample-files/025-attachment/with-attachment.pdf",
-    ]
+    ],
 )
-def test_update_offsets_on_all_reference_files(capsys, tmp_path: Path, input_pdf_filepath: Path) -> None:
+def test_update_offsets_on_all_reference_files(
+    capsys, tmp_path: Path, input_pdf_filepath: Path
+) -> None:
     # Arrange
     output_pdf_filepath = tmp_path / "out.pdf"
 
@@ -81,7 +89,8 @@ def test_update_offsets_on_all_reference_files(capsys, tmp_path: Path, input_pdf
     exit_code = run_cli(
         [
             "update-offsets",
-            "--encoding", "iso-8859-1",
+            "--encoding",
+            "iso-8859-1",
             input_pdf_filepath,
             str(output_pdf_filepath),
         ]

From c3a6c88680c51167eec189be956c4d8ed5096f54 Mon Sep 17 00:00:00 2001
From: rogmann <github@rogmann.org>
Date: Wed, 6 Nov 2024 23:17:46 +0100
Subject: [PATCH 21/22] DEV: Use the lower-case tests directory name in the Makefile.

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 1047d86..1f32208 100644
--- a/Makefile
+++ b/Makefile
@@ -15,10 +15,10 @@ upload:
 clean:
 	python setup.py clean --all
 	pyclean .
-	rm -rf Tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt
+	rm -rf tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt
 
 test:
-	pytest Tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30
+	pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30
 
 mutation-test:
 	mutmut run

From fc42eb4342022e110bb35c50fef3607e72ef21d0 Mon Sep 17 00:00:00 2001
From: "Cimon Lucas (LCM)" <lucas_cimon@connect-tech.sncf>
Date: Thu, 7 Nov 2024 16:57:10 +0100
Subject: [PATCH 22/22] Pleasing mypy & typing imports under Python 3.8

---
 pdfly/cli.py                 |  4 +--
 pdfly/update_offsets.py      | 57 +++++++++++++++++++++---------------
 tests/conftest.py            |  6 ++--
 tests/test_update_offsets.py |  4 +--
 4 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/pdfly/cli.py b/pdfly/cli.py
index 2a7463c..317c9e7 100644
--- a/pdfly/cli.py
+++ b/pdfly/cli.py
@@ -236,10 +236,10 @@ def update_offsets(
     encoding: str = typer.Option(
         "ISO-8859-1",
         help="Encoding used to read and write the files, e.g. UTF-8.",
-    ),  # noqa
+    ),
     verbose: bool = typer.Option(
         False, help="Show progress while processing."
-    ),  # noqa
+    ),
 ) -> None:
     pdfly.update_offsets.main(file_in, file_out, encoding, verbose)
 
diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py
index d33954c..ac4bb07 100644
--- a/pdfly/update_offsets.py
+++ b/pdfly/update_offsets.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 """
 Updates offsets and lengths in a simple PDF file.
 
@@ -20,15 +19,21 @@
 It expects that the /Length-entries have default values containing
 enough digits, e.g. /Length 000 when the stream consists of 576 bytes.
 
-EXAMPLE
+Example:
    update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf
+
 """
 
-from collections.abc import Iterable
-from pathlib import Path
-from rich.console import Console
 import re
 import sys
+from pathlib import Path
+
+if sys.version_info >= (3, 9):
+    List = list
+else:  # Support for Python 3.8
+    from typing import List
+
+from rich.console import Console
 
 # Here, only simple regular expressions are used.
 # Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better.
@@ -41,20 +46,20 @@
 
 
 def update_lines(
-    lines_in: Iterable[str], encoding: str, console: Console, verbose: bool
-) -> Iterable[str]:
-    """Iterates over the lines of a pdf-files and updates offsets.
+    lines_in: List[str], encoding: str, console: Console, verbose: bool
+) -> List[str]:
+    """
+    Iterates over the lines of a PDF file and updates offsets.
 
     The input is expected to be a pdf without binary-sections.
 
-    :param lines_in: An Iterable over the lines including line-breaks.
+    :param lines_in: A list of the lines including line-breaks.
     :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
     :param console: Console used to print messages.
     :param verbose: True to activate logging of info-messages.
     :return The output is a list of lines to be written
             in the given encoding.
     """
-
     lines_out = []  # lines to be written
     map_line_offset = {}  # map from line-number to offset
     map_obj_offset = {}  # map from object-number to offset
@@ -184,7 +189,12 @@ def update_lines(
 
     for curr_obj, stream_len in map_stream_len.items():
         if curr_obj in map_obj_length_line:
-            m_length = RE_LENGTH.match(map_obj_length_line[curr_obj])
+            line = map_obj_length_line[curr_obj]
+            m_length = RE_LENGTH.match(line)
+            if m_length is None:
+                raise RuntimeError(
+                    f"Invalid PDF file: line '{line}' does not contain a valid /Length."
+                )
             prev_length = m_length.group(2)
             len_digits = len(prev_length)
             len_format = "%%0%dd" % len_digits
@@ -192,14 +202,14 @@ def update_lines(
             if len(updated_length) > len_digits:
                 raise RuntimeError(
                     f"Not enough digits in /Length-entry {prev_length}"
-                    + f" of object {curr_obj}:"
-                    + f" too short to take /Length {updated_length}"
+                    f" of object {curr_obj}:"
+                    f" too short to take /Length {updated_length}"
                 )
             line = m_length.group(1) + updated_length + m_length.group(3)
             lines_out[map_obj_length_line_no[curr_obj] - 1] = line
         elif curr_obj in map_obj_length_ref:
             len_obj = map_obj_length_ref[curr_obj]
-            if not len_obj in map_obj_line:
+            if len_obj not in map_obj_line:
                 raise RuntimeError(
                     f"obj {curr_obj} has unknown length-obj {len_obj}"
                 )
@@ -211,8 +221,8 @@ def update_lines(
             if len(updated_length) > len_digits:
                 raise RuntimeError(
                     f"Not enough digits in /Length-ref-entry {prev_length}"
-                    + f" of object {curr_obj} and len-object {len_obj}:"
-                    + f" too short to take /Length {updated_length}"
+                    f" of object {curr_obj} and len-object {len_obj}:"
+                    f" too short to take /Length {updated_length}"
                 )
             if prev_length != updated_length:
                 if verbose:
@@ -223,22 +233,23 @@ def update_lines(
         else:
             raise RuntimeError(
                 f"obj {curr_obj} with stream-len {stream_len}"
-                + f" has no object-length-line: {map_obj_length_line}"
+                f" has no object-length-line: {map_obj_length_line}"
             )
 
     return lines_out
 
 
-def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
-    """Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
+def read_binary_file(file_path: Path, encoding: str) -> List[str]:
+    """
+    Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
     Encoding utf-8 can't be used to read random binary data.
 
     :param file_path: file to be read line by line
     :param encoding: encoding to be used (e.g. "iso-8859-1")
     :return lines including line-breaks
     """
-    chunks = []
-    with open(file_path, "rb") as file:
+    chunks: List[str] = []
+    with file_path.open("rb") as file:
         buffer = bytearray()
         while True:
             chunk = file.read(4096)  # Read in chunks of 4096 bytes
@@ -253,7 +264,7 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
                 if not match:
                     break  # No more line breaks found, process the remaining buffer
 
-                start, end = match.start(), match.end()
+                end = match.end()
                 chunk_str = buffer[:end].decode(encoding, errors="strict")
                 buffer = buffer[end:]
 
@@ -277,4 +288,4 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
         for line in lines_out:
             f.write(line.encode(encoding))
 
-    console.print(f"Wrote {file_out}")
+    console.print(f"Wrote {file_out}", soft_wrap=True)
diff --git a/tests/conftest.py b/tests/conftest.py
index 0e02931..9ab40d4 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,6 @@
 """Utilities and fixtures that are available automatically for all tests."""
 
-import io, os
+import os
 from pathlib import Path
 
 from fpdf import FPDF
@@ -58,7 +58,7 @@ def pdf_file_100(tmp_path):
     for i in range(100):
         pdf.add_page()
         pdf.set_font("helvetica", size=12)
-        pdf.cell(200, 10, txt=f"{i}", ln=True, align="C")
+        pdf.cell(200, 10, text=f"{i}", ln=True, align="C")
 
     pdf_filepath = tmp_path / "pdf_file_100.pdf"
     pdf.output(pdf_filepath)
@@ -73,7 +73,7 @@ def pdf_file_abc(tmp_path):
     for char in [chr(i) for i in range(ord("a"), ord("z") + 1)]:
         pdf.add_page()
         pdf.set_font("helvetica", size=12)
-        pdf.cell(200, 10, txt=f"{char}", ln=True, align="C")
+        pdf.cell(200, 10, text=f"{char}", ln=True, align="C")
 
     pdf_filepath = tmp_path / "abc.pdf"
     pdf.output(pdf_filepath)
diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py
index 94aff4b..bd5d506 100644
--- a/tests/test_update_offsets.py
+++ b/tests/test_update_offsets.py
@@ -34,9 +34,9 @@ def test_update_offsets(capsys, tmp_path: Path) -> None:
     assert not captured.err
     assert re.search(r"Wrote\s+" + re.escape(str(output)), captured.out)
     assert output.exists()
-    with open(file_expected, "r", encoding="iso-8859-1") as file_exp:
+    with open(file_expected, encoding="iso-8859-1") as file_exp:
         lines_exp = file_exp.readlines()
-    with open(output, "r", encoding="iso-8859-1") as file_act:
+    with open(output, encoding="iso-8859-1") as file_act:
         lines_act = file_act.readlines()
     assert len(lines_exp) == len(
         lines_act