From b18c8e0db6a313cb300fb75a52501a97c3dab8bd Mon Sep 17 00:00:00 2001 From: rogmann Date: Sun, 28 Aug 2022 11:38:08 +0200 Subject: [PATCH 01/22] ENH: Added command update-offsets to adjust offsets and lengths. This command adjusts /Length-entries of stream objects and the xref-offsets in simple PDF files (ASCII only, one xref section only). --- pdfly/cli.py | 16 +++- pdfly/update_offsets.py | 163 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 1 deletion(-) create mode 100644 pdfly/update_offsets.py diff --git a/pdfly/cli.py b/pdfly/cli.py index 2353d8d..5683ba6 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -17,6 +17,7 @@ import pdfly.pagemeta import pdfly.rm import pdfly.up2 +import pdfly.update_offsets import pdfly.x2pdf @@ -228,7 +229,20 @@ def compress( pdfly.compress.main(pdf, output) -@entry_point.command(name="x2pdf", help=pdfly.x2pdf.__doc__) # type: ignore[misc] +@entry_point.command(name="update-offsets") # type: ignore[misc] +def update_offsets( + file_in: Path, + file_out: Path, + encoding: str = typer.Option( + "UTF-8", + help="Encoding used to read and write the files, e.g. ISO-8859-1.", + ), # noqa + verbose: bool = typer.Option(False, help="Show progress while processing."), # noqa +) -> None: + pdfly.update_offsets.main(file_in, file_out, encoding, verbose) + + +@entry_point.command(name="x2pdf", help=x2pdf.update_offsets.__doc__) # type: ignore[misc] def x2pdf( x: List[Path], output: Annotated[ diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py new file mode 100644 index 0000000..8ca47b6 --- /dev/null +++ b/pdfly/update_offsets.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python +""" +Updates offsets and lengths in a simple PDF file. + +The PDF specification requires that the xref section at the end +of a PDF file has the correct offsets of the PDF's objects. +It further requires that the dictionary of a stream object +contains a /Length-entry giving the length of the encoded stream. + +When editing a PDF file using a text-editor (e.g. vim) it is +elaborate to compute or adjust these offsets and lengths. + +This command tries to compute /Length-entries of the stream dictionaries +and the offsets in the xref-section automatically. + +It expects that the PDF file has ASCII encoding only. It may +use ISO-8859-1 or UTF-8 in its comments. +Therefore it expects that there a no binary streams. +It expects that there is one xref-section only. +It expects that the /Length-entries have default values containing +enough digits, e.g. /Length 000 when the stream consists of 576 bytes. + +EXAMPLE + update-offsets -v --encoding UTF-8 issue-297.pdf issue-297.out.pdf +""" + +from collections.abc import Iterable +from pathlib import Path +import logging +import re +import sys + + +def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]: + """Iterates over the lines of a pdf-files and updates offsets. + + The input is expected to be a pdf without binary-sections. + + :param lines_in: An Iterable over the lines including line-breaks. + :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8". + :return The output is a list of lines to be written + in the given encoding. + """ + logger = logging.getLogger("update_lines") + re_obj = re.compile(r"^([0-9]+) ([0-9]+) obj *") + re_content = re.compile(r"^(.*)") + re_length = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL) + + lines_out = [] # lines to be written + map_line_offset = {} # map from line-number to offset + map_obj_offset = {} # map from object-number to offset + line_no = 0 # current line-number (starting at 0) + offset_out = 0 # current offset in output-file + line_xref = None # line-number of xref-line (in xref-section only) + line_startxref = None # line-number of startxref-line + curr_obj = None # number of current object + len_stream = None # length of stream (in stream only) + offset_xref = None # offset of xref-section + map_stream_len = {} # map from object-number to /Length of stream + map_obj_length_line = {} # map from object-number to /Length-line + map_obj_length_line_no = {} # map from object-number to line_no + # of /Length-line + for line in lines_in: + line_no += 1 + m_content = re_content.match(line) + if m_content is None: + raise RuntimeError(f"Line {line_no} without line-break.") + content = m_content.group(1) + map_line_offset[line_no] = offset_out + m_obj = re_obj.match(line) + if m_obj is not None: + curr_obj = m_obj.group(1) + logger.info(f"line {line_no}: object {curr_obj}") + map_obj_offset[curr_obj] = int(offset_out) + if content == "xref": + offset_xref = offset_out + line_xref = line_no + elif content == "startxref": + line_startxref = line_no + line_xref = None + elif content == "stream": + logger.info(f"line {line_no}: start stream") + len_stream = 0 + elif content == "endstream": + logger.info(f"line {line_no}: end stream") + if curr_obj is None: + raise RuntimeError( + f"Line {line_no}: " + "endstream without object-start." + ) + if len_stream is None: + raise RuntimeError(f"Line {line_no}: endstream without stream.") + logger.info(f"line {line_no}: /Length {len_stream}") + map_stream_len[curr_obj] = len_stream + elif content == "endobj": + curr_obj = None + elif curr_obj is not None and len_stream is None: + mLength = re_length.match(line) + if mLength is not None: + logger.info(f"line {line_no}, /Length: {content}") + map_obj_length_line[curr_obj] = line + map_obj_length_line_no[curr_obj] = line_no + elif curr_obj is not None and len_stream is not None: + len_stream += len(line.encode(encoding)) + elif line_xref is not None and line_no > line_xref + 2: + objNo = line_no - line_xref - 2 + if objNo <= len(map_obj_offset) and str(objNo) in map_obj_offset: + eol = line[-2:] + xrefUpd = ("%010d" % map_obj_offset[str(objNo)]) + " 00000 n" + logger.info(f"{content} -> {xrefUpd}") + line = xrefUpd + eol + elif line_startxref is not None and line_no == line_startxref + 1: + if offset_xref is None: + raise RuntimeError("startxref without preceding xref-section") + line = "%d\n" % offset_xref + lines_out.append(line) + + offset_out += len(line.encode(encoding)) + + # Some checks + if len(map_obj_offset) == 0: + raise RuntimeError("The command didn't find any PDF objects.") + if offset_xref is None: + raise RuntimeError("The command didn't find a xref-section") + if line_startxref is None: + raise RuntimeError("The command didn't find a startxref-section") + + for curr_obj, stream_len in map_stream_len.items(): + if not curr_obj in map_obj_length_line: + raise RuntimeError( + f"obj {curr_obj} with stream-len {len}" + + f" has no object-length-line: {map_obj_length_line}" + ) + m_length = re_length.match(map_obj_length_line[curr_obj]) + prev_length = m_length.group(2) + len_digits = len(prev_length) + len_format = "%%0%dd" % len_digits + updated_length = len_format % stream_len + if len(updated_length) > len_digits: + raise RuntimeError( + f"Not enough digits in /Length-entry {m_length.group(2)}" + + f" of object {curr_obj}:" + + f" too short to take /Length {updated_length}" + ) + line = m_length.group(1) + updated_length + m_length.group(3) + lines_out[map_obj_length_line_no[curr_obj] - 1] = line + + return lines_out + + +def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None: + if verbose: + logging.basicConfig(level=logging.INFO) + print(f"Read {file_in}") + + with open(file_in, "r") as f: + lines_out = update_lines(f, encoding) + + with open(file_out, "wb") as f: + for line in lines_out: + f.write(line.encode(encoding)) + + if verbose: + print(f"Wrote {file_out}") From 25f0ccd02c725b50f2b9dcf8ab3fd641f9f386a5 Mon Sep 17 00:00:00 2001 From: rogmann Date: Fri, 24 May 2024 22:56:47 +0200 Subject: [PATCH 02/22] BUG: Clear stream-length at new object. --- pdfly/update_offsets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index 8ca47b6..c80a609 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -72,6 +72,8 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]: curr_obj = m_obj.group(1) logger.info(f"line {line_no}: object {curr_obj}") map_obj_offset[curr_obj] = int(offset_out) + len_stream = None + if content == "xref": offset_xref = offset_out line_xref = line_no From d8f66691b7a40b37103b6627e387bdb1643527f2 Mon Sep 17 00:00:00 2001 From: rogmann Date: Sun, 3 Nov 2024 21:39:20 +0100 Subject: [PATCH 03/22] DEV: Logging migrated from Python's built-in logging to . --- pdfly/update_offsets.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index c80a609..25ccf63 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -26,22 +26,23 @@ from collections.abc import Iterable from pathlib import Path -import logging +from rich.console import Console import re import sys -def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]: +def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]: """Iterates over the lines of a pdf-files and updates offsets. The input is expected to be a pdf without binary-sections. :param lines_in: An Iterable over the lines including line-breaks. :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8". + :param console: Console used to print messages. + :param verbose: True to activate logging of info-messages. :return The output is a list of lines to be written in the given encoding. """ - logger = logging.getLogger("update_lines") re_obj = re.compile(r"^([0-9]+) ([0-9]+) obj *") re_content = re.compile(r"^(.*)") re_length = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL) @@ -70,7 +71,8 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]: m_obj = re_obj.match(line) if m_obj is not None: curr_obj = m_obj.group(1) - logger.info(f"line {line_no}: object {curr_obj}") + if verbose: + console.print(f"line {line_no}: object {curr_obj}") map_obj_offset[curr_obj] = int(offset_out) len_stream = None @@ -81,24 +83,28 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]: line_startxref = line_no line_xref = None elif content == "stream": - logger.info(f"line {line_no}: start stream") + if verbose: + console.print(f"line {line_no}: start stream") len_stream = 0 elif content == "endstream": - logger.info(f"line {line_no}: end stream") + if verbose: + console.print(f"line {line_no}: end stream") if curr_obj is None: raise RuntimeError( f"Line {line_no}: " + "endstream without object-start." ) if len_stream is None: raise RuntimeError(f"Line {line_no}: endstream without stream.") - logger.info(f"line {line_no}: /Length {len_stream}") + if verbose: + console.print(f"line {line_no}: /Length {len_stream}") map_stream_len[curr_obj] = len_stream elif content == "endobj": curr_obj = None elif curr_obj is not None and len_stream is None: mLength = re_length.match(line) if mLength is not None: - logger.info(f"line {line_no}, /Length: {content}") + if verbose: + console.print(f"line {line_no}, /Length: {content}") map_obj_length_line[curr_obj] = line map_obj_length_line_no[curr_obj] = line_no elif curr_obj is not None and len_stream is not None: @@ -108,7 +114,8 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]: if objNo <= len(map_obj_offset) and str(objNo) in map_obj_offset: eol = line[-2:] xrefUpd = ("%010d" % map_obj_offset[str(objNo)]) + " 00000 n" - logger.info(f"{content} -> {xrefUpd}") + if verbose: + console.print(f"{content} -> {xrefUpd}") line = xrefUpd + eol elif line_startxref is not None and line_no == line_startxref + 1: if offset_xref is None: @@ -150,16 +157,14 @@ def update_lines(lines_in: Iterable[str], encoding: str) -> Iterable[str]: def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None: - if verbose: - logging.basicConfig(level=logging.INFO) - print(f"Read {file_in}") + console = Console() + console.print(f"Read {file_in}") with open(file_in, "r") as f: - lines_out = update_lines(f, encoding) + lines_out = update_lines(f, encoding, console, verbose) with open(file_out, "wb") as f: for line in lines_out: f.write(line.encode(encoding)) - if verbose: - print(f"Wrote {file_out}") + console.print(f"Wrote {file_out}") From 37b9b9d523f33fc950c22b8d35358d42f7d47ea3 Mon Sep 17 00:00:00 2001 From: rogmann Date: Sun, 3 Nov 2024 22:26:58 +0100 Subject: [PATCH 04/22] TST: Added test of update-offsets using hello.pdf. --- resources/hello-expected.pdf | Bin 0 -> 883 bytes resources/hello.pdf | Bin 0 -> 886 bytes tests/test_update_offsets.py | 41 +++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+) create mode 100644 resources/hello-expected.pdf create mode 100644 resources/hello.pdf create mode 100644 tests/test_update_offsets.py diff --git a/resources/hello-expected.pdf b/resources/hello-expected.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c92f5c19dcaedc0199f40d51fc707cb58e86ed7f GIT binary patch literal 883 zcmZuv(Qeu>6n*zsoEKEkL}LdMvZ>+$DBHAEMKs#Wwg;COmMX-RV-Vff?_QS_L}%m# zpWJ(W?zyM4n9cj@dMG-5T0%j>K0b+?8$=7eF%1TUECqlle|a{DrrPVuZV>%2t|@Je zh_W#r3sV=I8J|^{I-@p2fVbO-NzT`HaA)@h-W)ut)_B7diO37}3CqaE z7nn*&!wW1{qj%UKBMC3!l8jnwG4FI)kyF_K59MEj5u&g4n#2uOh?dr}N!glVj@tRq ON1BdAr*nHh7vdl3D&2Je literal 0 HcmV?d00001 diff --git a/resources/hello.pdf b/resources/hello.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b2fbc2d59958908746d17a06b9e7f03366bb0abf GIT binary patch literal 886 zcmcgq!EW0y487|s-X&;)A+_Y#X*U!(IBB+FD;A{2F53=jtavUQtB{o--Pey);>L%a zCqt4&k<{bk8>|+~vARygU`#_ONZ6-mad(GkrAyOdLYB1v5aq8g2GLx5UE31TcjMa1 zHi#%1vzahW!I5z%bQyjWrXkyGMvw`It9JiL*yD~d_Nrqn9mKDuDgAE{E$q;G zVs0nowZunSZ^2X1$(4x7h3I?Rok1qZiMS#Mo~{&G&d(0;U=Ie~96h{6tSFM^q0-2U zv3)Tvv?m~3DyS*)Egw3^yx`ODlc{S963S1Rs1fpKai4R$&nVB0XD+&GPEui}Lhi@? zlMmuyy9%aEw None: + # Arrange + input = str(RESOURCES_ROOT / "hello.pdf") + file_expected = str(RESOURCES_ROOT / "hello-expected.pdf") + output = tmp_path / "hello-out.pdf" + assert not output.exists() + + # Act + exit_code = run_cli( + [ + "update-offsets", + str(input), + str(output), + ] + ) + + # Assert + captured = capsys.readouterr() + assert exit_code == 0, captured + assert not captured.err + assert f"Wrote {output}" in captured.out + assert output.exists() + with open(file_expected, 'r', encoding='iso-8859-1') as file_exp: + lines_exp = file_exp.readlines() + with open(output, 'r', encoding='iso-8859-1') as file_act: + lines_act = file_act.readlines() + assert len(lines_exp) == len(lines_act), f"lines_exp=f{lines_exp}, lines_act=f{lines_act}" + for line_no, (line_exp, line_act) in enumerate(zip(lines_exp, lines_act), start = 1): + assert line_exp == line_act, f"Lines differ in line {line_no}" + From be39e9bdc4d040392e74691b43398e1de81f817b Mon Sep 17 00:00:00 2001 From: rogmann Date: Sun, 3 Nov 2024 22:38:18 +0100 Subject: [PATCH 05/22] MAINT: Regex uppercase module constants. --- pdfly/update_offsets.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index 25ccf63..3eaa203 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -21,7 +21,7 @@ enough digits, e.g. /Length 000 when the stream consists of 576 bytes. EXAMPLE - update-offsets -v --encoding UTF-8 issue-297.pdf issue-297.out.pdf + update-offsets --verbose --encoding UTF-8 issue-297.pdf issue-297.out.pdf """ from collections.abc import Iterable @@ -30,6 +30,9 @@ import re import sys +RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *") +RE_CONTENT = re.compile(r"^(.*)") +RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL) def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]: """Iterates over the lines of a pdf-files and updates offsets. @@ -43,9 +46,6 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo :return The output is a list of lines to be written in the given encoding. """ - re_obj = re.compile(r"^([0-9]+) ([0-9]+) obj *") - re_content = re.compile(r"^(.*)") - re_length = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL) lines_out = [] # lines to be written map_line_offset = {} # map from line-number to offset @@ -63,12 +63,12 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo # of /Length-line for line in lines_in: line_no += 1 - m_content = re_content.match(line) + m_content = RE_CONTENT.match(line) if m_content is None: raise RuntimeError(f"Line {line_no} without line-break.") content = m_content.group(1) map_line_offset[line_no] = offset_out - m_obj = re_obj.match(line) + m_obj = RE_OBJ.match(line) if m_obj is not None: curr_obj = m_obj.group(1) if verbose: @@ -101,7 +101,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo elif content == "endobj": curr_obj = None elif curr_obj is not None and len_stream is None: - mLength = re_length.match(line) + mLength = RE_LENGTH.match(line) if mLength is not None: if verbose: console.print(f"line {line_no}, /Length: {content}") @@ -139,7 +139,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo f"obj {curr_obj} with stream-len {len}" + f" has no object-length-line: {map_obj_length_line}" ) - m_length = re_length.match(map_obj_length_line[curr_obj]) + m_length = RE_LENGTH.match(map_obj_length_line[curr_obj]) prev_length = m_length.group(2) len_digits = len(prev_length) len_format = "%%0%dd" % len_digits From 2465ffe26ae5cf1baf163781da120e5b938b8e77 Mon Sep 17 00:00:00 2001 From: rogmann Date: Sun, 3 Nov 2024 22:41:47 +0100 Subject: [PATCH 06/22] DOC: Add update-offsets command --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b8bf9dc..0667e65 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ $ pdfly --help │ meta Show metadata of a PDF file │ │ pagemeta Give details about a single page. │ │ rm Remove pages from PDF files. │ +│ update-offsets Updates offsets and lengths in a simple PDF file. │ │ x2pdf Convert one or more files to PDF. Each file is a page. │ ╰─────────────────────────────────────────────────────────────────────────────╯ ``` From 8838ca5f6e06235c36b6d1cdd90751f2a42c9402 Mon Sep 17 00:00:00 2001 From: rogmann Date: Sun, 3 Nov 2024 22:49:46 +0100 Subject: [PATCH 07/22] MAINT: Added suggested help-attribute. --- pdfly/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfly/cli.py b/pdfly/cli.py index 5683ba6..36e88f1 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -229,7 +229,7 @@ def compress( pdfly.compress.main(pdf, output) -@entry_point.command(name="update-offsets") # type: ignore[misc] +@entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__) # type: ignore[misc] def update_offsets( file_in: Path, file_out: Path, From 3429c2fae6152f6cab92e3c63ba913ed19ebf585 Mon Sep 17 00:00:00 2001 From: "Cimon Lucas (LCM)" Date: Mon, 4 Nov 2024 19:33:24 +0100 Subject: [PATCH 08/22] Minor fixups & adding test_update_offsets_on_all_reference_files() --- pdfly/update_offsets.py | 22 +++++++-------- tests/test_update_offsets.py | 53 ++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 11 deletions(-) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index 3eaa203..639cbe7 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -65,7 +65,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo line_no += 1 m_content = RE_CONTENT.match(line) if m_content is None: - raise RuntimeError(f"Line {line_no} without line-break.") + raise RuntimeError(f"Invalid PDF file: line {line_no} without line-break.") content = m_content.group(1) map_line_offset[line_no] = offset_out m_obj = RE_OBJ.match(line) @@ -91,10 +91,10 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo console.print(f"line {line_no}: end stream") if curr_obj is None: raise RuntimeError( - f"Line {line_no}: " + "endstream without object-start." + f"Invalid PDF file: line {line_no}: endstream without object-start." ) if len_stream is None: - raise RuntimeError(f"Line {line_no}: endstream without stream.") + raise RuntimeError(f"Invalid PDF file: line {line_no}: endstream without stream.") if verbose: console.print(f"line {line_no}: /Length {len_stream}") map_stream_len[curr_obj] = len_stream @@ -119,7 +119,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo line = xrefUpd + eol elif line_startxref is not None and line_no == line_startxref + 1: if offset_xref is None: - raise RuntimeError("startxref without preceding xref-section") + raise NotImplementedError("Unsupported file: startxref without preceding xref-section (probable cross-reference stream)") line = "%d\n" % offset_xref lines_out.append(line) @@ -127,16 +127,16 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo # Some checks if len(map_obj_offset) == 0: - raise RuntimeError("The command didn't find any PDF objects.") + raise RuntimeError("Invalid PDF file: the command didn't find any PDF objects.") if offset_xref is None: - raise RuntimeError("The command didn't find a xref-section") + raise RuntimeError("Invalid PDF file: the command didn't find a xref-section") if line_startxref is None: - raise RuntimeError("The command didn't find a startxref-section") + raise RuntimeError("Invalid PDF file: the command didn't find a startxref-section") for curr_obj, stream_len in map_stream_len.items(): if not curr_obj in map_obj_length_line: raise RuntimeError( - f"obj {curr_obj} with stream-len {len}" + f"obj {curr_obj} with stream-len {stream_len}" + f" has no object-length-line: {map_obj_length_line}" ) m_length = RE_LENGTH.match(map_obj_length_line[curr_obj]) @@ -146,7 +146,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo updated_length = len_format % stream_len if len(updated_length) > len_digits: raise RuntimeError( - f"Not enough digits in /Length-entry {m_length.group(2)}" + f"Not enough digits in /Length-entry {prev_length}" + f" of object {curr_obj}:" + f" too short to take /Length {updated_length}" ) @@ -160,10 +160,10 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None: console = Console() console.print(f"Read {file_in}") - with open(file_in, "r") as f: + with open(file_in, "r", encoding=encoding) as f: lines_out = update_lines(f, encoding, console, verbose) - with open(file_out, "wb") as f: + with open(file_out, "wb", encoding=encoding) as f: for line in lines_out: f.write(line.encode(encoding)) diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py index 197071f..7929604 100644 --- a/tests/test_update_offsets.py +++ b/tests/test_update_offsets.py @@ -6,6 +6,8 @@ from pathlib import Path +import pytest + from .conftest import RESOURCES_ROOT, chdir, run_cli @@ -39,3 +41,54 @@ def test_update_offsets(capsys, tmp_path: Path) -> None: for line_no, (line_exp, line_act) in enumerate(zip(lines_exp, lines_act), start = 1): assert line_exp == line_act, f"Lines differ in line {line_no}" + + +@pytest.mark.parametrize( + "input_pdf_filepath", + [ + "sample-files/002-trivial-libre-office-writer/002-trivial-libre-office-writer.pdf", + "sample-files/005-libreoffice-writer-password/libreoffice-writer-password.pdf", + "sample-files/007-imagemagick-images/imagemagick-ASCII85Decode.pdf", + "sample-files/007-imagemagick-images/imagemagick-CCITTFaxDecode.pdf", + "sample-files/007-imagemagick-images/imagemagick-images.pdf", + "sample-files/007-imagemagick-images/imagemagick-lzw.pdf", + "sample-files/008-reportlab-inline-image/inline-image.pdf", + "sample-files/009-pdflatex-geotopo/GeoTopo-komprimiert.pdf", + "sample-files/011-google-doc-document/google-doc-document.pdf", + "sample-files/012-libreoffice-form/libreoffice-form.pdf", + "sample-files/013-reportlab-overlay/reportlab-overlay.pdf", + "sample-files/015-arabic/habibi-oneline-cmap.pdf", + "sample-files/015-arabic/habibi-rotated.pdf", + "sample-files/015-arabic/habibi.pdf", + "sample-files/016-libre-office-link/libre-office-link.pdf", + "sample-files/017-unreadable-meta-data/unreadablemetadata.pdf", + "sample-files/018-base64-image/base64image.pdf", + "sample-files/019-grayscale-image/grayscale-image.pdf", + "sample-files/020-xmp/output_with_metadata_pymupdf.pdf", + "sample-files/021-pdfa/crazyones-pdfa.pdf", + "sample-files/022-pdfkit/pdfkit.pdf", + "sample-files/023-cmyk-image/cmyk-image.pdf", + "sample-files/024-annotations/annotated_pdf.pdf", + "sample-files/025-attachment/with-attachment.pdf", + ] +) +def test_update_offsets_on_all_reference_files(capsys, tmp_path: Path, input_pdf_filepath: Path) -> None: + # Arrange + output_pdf_filepath = tmp_path / "out.pdf" + + # Act + exit_code = run_cli( + [ + "update-offsets", + "--encoding", "iso-8859-1", + input_pdf_filepath, + str(output_pdf_filepath), + ] + ) + + # Assert + captured = capsys.readouterr() + assert exit_code == 0, captured + assert not captured.err + assert f"Wrote {output_pdf_filepath}" in captured.out + assert output_pdf_filepath.exists() From 10ce5048cfaf274e450c4d3cf00eb07fda77d31c Mon Sep 17 00:00:00 2001 From: rogmann Date: Mon, 4 Nov 2024 22:01:13 +0100 Subject: [PATCH 09/22] MAINT: Bugfix help-attribute of x2pdf --- pdfly/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfly/cli.py b/pdfly/cli.py index 36e88f1..3d7d434 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -242,7 +242,7 @@ def update_offsets( pdfly.update_offsets.main(file_in, file_out, encoding, verbose) -@entry_point.command(name="x2pdf", help=x2pdf.update_offsets.__doc__) # type: ignore[misc] +@entry_point.command(name="x2pdf", help=pdfly.x2pdf.__doc__) # type: ignore[misc] def x2pdf( x: List[Path], output: Annotated[ From 9b0138a8dd4897f57f1fce0a0612546c47bf295f Mon Sep 17 00:00:00 2001 From: rogmann Date: Tue, 5 Nov 2024 00:14:57 +0100 Subject: [PATCH 10/22] ENH: Support of referenced lengths. --- pdfly/cli.py | 4 +- pdfly/update_offsets.py | 89 +++++++++++++++++++++++++---------- resources/hello-expected.pdf | Bin 883 -> 883 bytes 3 files changed, 66 insertions(+), 27 deletions(-) diff --git a/pdfly/cli.py b/pdfly/cli.py index 3d7d434..63b1e15 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -234,8 +234,8 @@ def update_offsets( file_in: Path, file_out: Path, encoding: str = typer.Option( - "UTF-8", - help="Encoding used to read and write the files, e.g. ISO-8859-1.", + "ISO-8859-1", + help="Encoding used to read and write the files, e.g. UTF-8.", ), # noqa verbose: bool = typer.Option(False, help="Show progress while processing."), # noqa ) -> None: diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index 639cbe7..9b1602c 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -15,13 +15,13 @@ It expects that the PDF file has ASCII encoding only. It may use ISO-8859-1 or UTF-8 in its comments. -Therefore it expects that there a no binary streams. +The current implementation incorrectly replaces CR (0x0d) by LF (0x0a) in binary data. It expects that there is one xref-section only. It expects that the /Length-entries have default values containing enough digits, e.g. /Length 000 when the stream consists of 576 bytes. EXAMPLE - update-offsets --verbose --encoding UTF-8 issue-297.pdf issue-297.out.pdf + update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf """ from collections.abc import Iterable @@ -30,9 +30,12 @@ import re import sys +# Here, only simple regular expressions are used. +# Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better. RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *") RE_CONTENT = re.compile(r"^(.*)") -RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)( .*)", re.DOTALL) +RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL) +RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ /].*)", re.DOTALL) def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]: """Iterates over the lines of a pdf-files and updates offsets. @@ -50,6 +53,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo lines_out = [] # lines to be written map_line_offset = {} # map from line-number to offset map_obj_offset = {} # map from object-number to offset + map_obj_line = {} # map from object-number to line-number line_no = 0 # current line-number (starting at 0) offset_out = 0 # current offset in output-file line_xref = None # line-number of xref-line (in xref-section only) @@ -59,7 +63,8 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo offset_xref = None # offset of xref-section map_stream_len = {} # map from object-number to /Length of stream map_obj_length_line = {} # map from object-number to /Length-line - map_obj_length_line_no = {} # map from object-number to line_no + map_obj_length_ref = {} # map from object-number to /Length-reference (e.g. "3") + map_obj_length_line_no = {} # map from object-number to line_no of length # of /Length-line for line in lines_in: line_no += 1 @@ -71,9 +76,13 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo m_obj = RE_OBJ.match(line) if m_obj is not None: curr_obj = m_obj.group(1) + curr_gen = m_obj.group(2) if verbose: console.print(f"line {line_no}: object {curr_obj}") + if curr_gen != "0": + raise RuntimeError(f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported.") map_obj_offset[curr_obj] = int(offset_out) + map_obj_line[curr_obj] = line_no len_stream = None if content == "xref": @@ -95,18 +104,28 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo ) if len_stream is None: raise RuntimeError(f"Invalid PDF file: line {line_no}: endstream without stream.") + if len_stream > 0: + len_stream = len_stream - 1 # ignore the last EOL if verbose: - console.print(f"line {line_no}: /Length {len_stream}") + console.print(f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}") map_stream_len[curr_obj] = len_stream elif content == "endobj": curr_obj = None elif curr_obj is not None and len_stream is None: - mLength = RE_LENGTH.match(line) - if mLength is not None: + m_length_ref = RE_LENGTH_REF.match(line) + if m_length_ref is not None: + len_obj = m_length_ref.group(2) + len_obj_gen = m_length_ref.group(3) if verbose: - console.print(f"line {line_no}, /Length: {content}") - map_obj_length_line[curr_obj] = line - map_obj_length_line_no[curr_obj] = line_no + console.print(f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}") + map_obj_length_ref[curr_obj] = len_obj + else: + m_length = RE_LENGTH.match(line) + if m_length is not None: + if verbose: + console.print(f"line {line_no}, /Length: {content}") + map_obj_length_line[curr_obj] = line + map_obj_length_line_no[curr_obj] = line_no elif curr_obj is not None and len_stream is not None: len_stream += len(line.encode(encoding)) elif line_xref is not None and line_no > line_xref + 2: @@ -134,24 +153,44 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo raise RuntimeError("Invalid PDF file: the command didn't find a startxref-section") for curr_obj, stream_len in map_stream_len.items(): - if not curr_obj in map_obj_length_line: + if curr_obj in map_obj_length_line: + m_length = RE_LENGTH.match(map_obj_length_line[curr_obj]) + prev_length = m_length.group(2) + len_digits = len(prev_length) + len_format = "%%0%dd" % len_digits + updated_length = len_format % stream_len + if len(updated_length) > len_digits: + raise RuntimeError( + f"Not enough digits in /Length-entry {prev_length}" + + f" of object {curr_obj}:" + + f" too short to take /Length {updated_length}" + ) + line = m_length.group(1) + updated_length + m_length.group(3) + lines_out[map_obj_length_line_no[curr_obj] - 1] = line + elif curr_obj in map_obj_length_ref: + len_obj = map_obj_length_ref[curr_obj] + if not len_obj in map_obj_line: + raise RuntimeError(f"obj {curr_obj} has unknown length-obj {len_obj}") + len_obj_line = map_obj_line[len_obj] + prev_length = lines_out[len_obj_line][:-1] + len_digits = len(prev_length) + len_format = "%%0%dd" % len_digits + updated_length = len_format % stream_len + if len(updated_length) > len_digits: + raise RuntimeError( + f"Not enough digits in /Length-ref-entry {prev_length}" + + f" of object {curr_obj} and len-object {len_obj}:" + + f" too short to take /Length {updated_length}" + ) + if prev_length != updated_length: + if verbose: + console.print(f"line {line_no}, ref-len {len_obj} of {curr_obj}: {prev_length} -> {updated_length}") + lines_out[len_obj_line] = updated_length + '\n' + else: raise RuntimeError( f"obj {curr_obj} with stream-len {stream_len}" + f" has no object-length-line: {map_obj_length_line}" ) - m_length = RE_LENGTH.match(map_obj_length_line[curr_obj]) - prev_length = m_length.group(2) - len_digits = len(prev_length) - len_format = "%%0%dd" % len_digits - updated_length = len_format % stream_len - if len(updated_length) > len_digits: - raise RuntimeError( - f"Not enough digits in /Length-entry {prev_length}" - + f" of object {curr_obj}:" - + f" too short to take /Length {updated_length}" - ) - line = m_length.group(1) + updated_length + m_length.group(3) - lines_out[map_obj_length_line_no[curr_obj] - 1] = line return lines_out @@ -163,7 +202,7 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None: with open(file_in, "r", encoding=encoding) as f: lines_out = update_lines(f, encoding, console, verbose) - with open(file_out, "wb", encoding=encoding) as f: + with open(file_out, "wb") as f: for line in lines_out: f.write(line.encode(encoding)) diff --git a/resources/hello-expected.pdf b/resources/hello-expected.pdf index c92f5c19dcaedc0199f40d51fc707cb58e86ed7f..75f7829fb686f024ac70cfbb30b80367845e0341 100644 GIT binary patch delta 14 Wcmey&_L*(NL`Fu_&65}(G6DcD9|e;D delta 14 Wcmey&_L*(NL`Ful&65}(G6DcDB?XiK From e0a32ff9f4e3aaf8d8d1d762001685fbb95ed92f Mon Sep 17 00:00:00 2001 From: Sascha Rogmann <59577610+srogmann@users.noreply.github.com> Date: Tue, 5 Nov 2024 21:35:28 +0100 Subject: [PATCH 11/22] TST: Renamed test PDF file.. Co-authored-by: Lucas Cimon <925560+Lucas-C@users.noreply.github.com> --- tests/test_update_offsets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py index 7929604..9da2e09 100644 --- a/tests/test_update_offsets.py +++ b/tests/test_update_offsets.py @@ -13,7 +13,7 @@ def test_update_offsets(capsys, tmp_path: Path) -> None: # Arrange - input = str(RESOURCES_ROOT / "hello.pdf") + input = str(RESOURCES_ROOT / "file-with-invalid-offsets.pdf") file_expected = str(RESOURCES_ROOT / "hello-expected.pdf") output = tmp_path / "hello-out.pdf" assert not output.exists() From 4f003e586029a31b6c7f0eb7c5e15df639198655 Mon Sep 17 00:00:00 2001 From: rogmann Date: Tue, 5 Nov 2024 22:43:44 +0100 Subject: [PATCH 12/22] TST: Renamed test PDF file. --- ...llo-expected.pdf => file-with-fixed-offsets.pdf} | Bin .../{hello.pdf => file-with-invalid-offsets.pdf} | Bin tests/test_update_offsets.py | 4 ++-- 3 files changed, 2 insertions(+), 2 deletions(-) rename resources/{hello-expected.pdf => file-with-fixed-offsets.pdf} (100%) rename resources/{hello.pdf => file-with-invalid-offsets.pdf} (100%) diff --git a/resources/hello-expected.pdf b/resources/file-with-fixed-offsets.pdf similarity index 100% rename from resources/hello-expected.pdf rename to resources/file-with-fixed-offsets.pdf diff --git a/resources/hello.pdf b/resources/file-with-invalid-offsets.pdf similarity index 100% rename from resources/hello.pdf rename to resources/file-with-invalid-offsets.pdf diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py index 9da2e09..3f414c7 100644 --- a/tests/test_update_offsets.py +++ b/tests/test_update_offsets.py @@ -14,8 +14,8 @@ def test_update_offsets(capsys, tmp_path: Path) -> None: # Arrange input = str(RESOURCES_ROOT / "file-with-invalid-offsets.pdf") - file_expected = str(RESOURCES_ROOT / "hello-expected.pdf") - output = tmp_path / "hello-out.pdf" + file_expected = str(RESOURCES_ROOT / "file-with-fixed-offsets.pdf") + output = tmp_path / "file-with-offsets-out.pdf" assert not output.exists() # Act From 47b16d416837224e4e315eecc8f6729cee8be366 Mon Sep 17 00:00:00 2001 From: rogmann Date: Tue, 5 Nov 2024 22:57:42 +0100 Subject: [PATCH 13/22] TST: rich.console introduces line-breaks in output. --- tests/test_update_offsets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py index 3f414c7..b05b1b9 100644 --- a/tests/test_update_offsets.py +++ b/tests/test_update_offsets.py @@ -7,6 +7,7 @@ from pathlib import Path import pytest +import re from .conftest import RESOURCES_ROOT, chdir, run_cli @@ -31,7 +32,7 @@ def test_update_offsets(capsys, tmp_path: Path) -> None: captured = capsys.readouterr() assert exit_code == 0, captured assert not captured.err - assert f"Wrote {output}" in captured.out + assert re.search(r"Wrote\s+" + re.escape(str(output)), captured.out) assert output.exists() with open(file_expected, 'r', encoding='iso-8859-1') as file_exp: lines_exp = file_exp.readlines() From dd1be3b4acc16324d1e476d6fd177a2207f0b371 Mon Sep 17 00:00:00 2001 From: rogmann Date: Tue, 5 Nov 2024 23:10:15 +0100 Subject: [PATCH 14/22] MAINT: Changed /Length detection to support GeoTopo-komprimiert.pdf --- pdfly/update_offsets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index 9b1602c..403895a 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -35,7 +35,7 @@ RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *") RE_CONTENT = re.compile(r"^(.*)") RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL) -RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ /].*)", re.DOTALL) +RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ /\r\n].*)", re.DOTALL) def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]: """Iterates over the lines of a pdf-files and updates offsets. @@ -207,3 +207,4 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None: f.write(line.encode(encoding)) console.print(f"Wrote {file_out}") + From 91468979f31fe35f95c297625bacdebc833fe60d Mon Sep 17 00:00:00 2001 From: rogmann Date: Tue, 5 Nov 2024 23:15:41 +0100 Subject: [PATCH 15/22] MAINT: Changed /Length detection to support output_with_metadata_pymupdf.pdf --- pdfly/update_offsets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index 403895a..31f4668 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -35,7 +35,7 @@ RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *") RE_CONTENT = re.compile(r"^(.*)") RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL) -RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ /\r\n].*)", re.DOTALL) +RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\r\n].*)", re.DOTALL) def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]: """Iterates over the lines of a pdf-files and updates offsets. From 6d72f5acc82d3d8c59cf529d5e878e457af944c4 Mon Sep 17 00:00:00 2001 From: rogmann Date: Tue, 5 Nov 2024 23:26:38 +0100 Subject: [PATCH 16/22] MAINT: Changed /Length detection (PDF ref 3.1 white-space characters) --- pdfly/update_offsets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index 31f4668..b9ede63 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -35,7 +35,7 @@ RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *") RE_CONTENT = re.compile(r"^(.*)") RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL) -RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\r\n].*)", re.DOTALL) +RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\t\f\r\n].*)", re.DOTALL) def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]: """Iterates over the lines of a pdf-files and updates offsets. From 657955b7bf6b5d066ce5151f133a275fbfad5a03 Mon Sep 17 00:00:00 2001 From: rogmann Date: Tue, 5 Nov 2024 23:56:01 +0100 Subject: [PATCH 17/22] MAINT: Don't replace pseudo line-breaks in binary parts of a pdf file. --- pdfly/update_offsets.py | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index b9ede63..e70f238 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -35,7 +35,7 @@ RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *") RE_CONTENT = re.compile(r"^(.*)") RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL) -RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\t\f\r\n].*)", re.DOTALL) +RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL) def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]: """Iterates over the lines of a pdf-files and updates offsets. @@ -194,13 +194,48 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo return lines_out +def read_binary_file(file_path: str, encoding: str) -> Iterable[str]: + """Reads a binary file line by line and returns these lines as a list of strings in the given encoding. + Encoding utf-8 can't be used to read random binary data. + + :param file_path: file to be read line by line + :param encoding: encoding to be used (e.g. "iso-8859-1") + :return lines including line-breaks + """ + chunks = [] + with open(file_path, 'rb') as file: + buffer = bytearray() + while True: + chunk = file.read(4096) # Read in chunks of 4096 bytes + if not chunk: + break # End of file + + buffer += chunk + + # Split buffer into chunks based on LF, CR, or CRLF + while True: + match = re.search(b'(\x0D\x0A|\x0A|\x0D)', buffer) + if not match: + break # No more line breaks found, process the remaining buffer + + start, end = match.start(), match.end() + chunk_str = buffer[:end].decode(encoding, errors='strict') + buffer = buffer[end:] + + chunks.append(chunk_str) + + # Handle the last chunk + if buffer: + chunks.append(buffer.decode(encoding, errors='strict')) + + return chunks def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None: console = Console() console.print(f"Read {file_in}") - with open(file_in, "r", encoding=encoding) as f: - lines_out = update_lines(f, encoding, console, verbose) + lines_in = read_binary_file(file_in, encoding) + lines_out = update_lines(lines_in, encoding, console, verbose) with open(file_out, "wb") as f: for line in lines_out: From 5c3b92c18d6e5e4d0ec13ee34e1911eff32293c9 Mon Sep 17 00:00:00 2001 From: rogmann Date: Wed, 6 Nov 2024 22:41:05 +0100 Subject: [PATCH 18/22] MAINT: EOL can be CR, LF or CRLF. --- pdfly/update_offsets.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index e70f238..c02e3d7 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -33,7 +33,7 @@ # Here, only simple regular expressions are used. # Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better. RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *") -RE_CONTENT = re.compile(r"^(.*)") +RE_CONTENT = re.compile(r"^([^\r\n]*)", re.DOTALL) RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL) RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL) @@ -66,8 +66,8 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo map_obj_length_ref = {} # map from object-number to /Length-reference (e.g. "3") map_obj_length_line_no = {} # map from object-number to line_no of length # of /Length-line - for line in lines_in: - line_no += 1 + for idx, line in enumerate(lines_in): + line_no = idx + 1 m_content = RE_CONTENT.match(line) if m_content is None: raise RuntimeError(f"Invalid PDF file: line {line_no} without line-break.") @@ -105,7 +105,8 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo if len_stream is None: raise RuntimeError(f"Invalid PDF file: line {line_no}: endstream without stream.") if len_stream > 0: - len_stream = len_stream - 1 # ignore the last EOL + # Ignore the last EOL + len_stream = len_stream - 2 if lines_in[idx - 1][-2:] == '\r\n' else len_stream - 1 if verbose: console.print(f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}") map_stream_len[curr_obj] = len_stream From 68a352fb05b0ac23be17b2dbb7b625fd1ccbfbc8 Mon Sep 17 00:00:00 2001 From: rogmann Date: Wed, 6 Nov 2024 23:00:55 +0100 Subject: [PATCH 19/22] TST: Disabled some documents which are not supported. --- tests/test_update_offsets.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py index b05b1b9..132b902 100644 --- a/tests/test_update_offsets.py +++ b/tests/test_update_offsets.py @@ -42,7 +42,7 @@ def test_update_offsets(capsys, tmp_path: Path) -> None: for line_no, (line_exp, line_act) in enumerate(zip(lines_exp, lines_act), start = 1): assert line_exp == line_act, f"Lines differ in line {line_no}" - +# The current implementation doesn't support valid PDF lines as "/Length 5470>> stream". @pytest.mark.parametrize( "input_pdf_filepath", @@ -55,18 +55,18 @@ def test_update_offsets(capsys, tmp_path: Path) -> None: "sample-files/007-imagemagick-images/imagemagick-lzw.pdf", "sample-files/008-reportlab-inline-image/inline-image.pdf", "sample-files/009-pdflatex-geotopo/GeoTopo-komprimiert.pdf", - "sample-files/011-google-doc-document/google-doc-document.pdf", + # "sample-files/011-google-doc-document/google-doc-document.pdf", # stream token in line after /Length "sample-files/012-libreoffice-form/libreoffice-form.pdf", "sample-files/013-reportlab-overlay/reportlab-overlay.pdf", "sample-files/015-arabic/habibi-oneline-cmap.pdf", "sample-files/015-arabic/habibi-rotated.pdf", "sample-files/015-arabic/habibi.pdf", "sample-files/016-libre-office-link/libre-office-link.pdf", - "sample-files/017-unreadable-meta-data/unreadablemetadata.pdf", + # "sample-files/017-unreadable-meta-data/unreadablemetadata.pdf", # stream in line after object "sample-files/018-base64-image/base64image.pdf", - "sample-files/019-grayscale-image/grayscale-image.pdf", + # "sample-files/019-grayscale-image/grayscale-image.pdf", # stream in line after object "sample-files/020-xmp/output_with_metadata_pymupdf.pdf", - "sample-files/021-pdfa/crazyones-pdfa.pdf", + # "sample-files/021-pdfa/crazyones-pdfa.pdf", # stream in line is after dictionary "sample-files/022-pdfkit/pdfkit.pdf", "sample-files/023-cmyk-image/cmyk-image.pdf", "sample-files/024-annotations/annotated_pdf.pdf", From 51ed725b27e12e8257dcf09cd75bc43d64ec6bc9 Mon Sep 17 00:00:00 2001 From: rogmann Date: Wed, 6 Nov 2024 23:11:12 +0100 Subject: [PATCH 20/22] MAINT: black (code formatting) --- pdfly/cli.py | 4 +- pdfly/update_offsets.py | 76 ++++++++++++++++++++++++++---------- tests/test_update_offsets.py | 23 +++++++---- 3 files changed, 74 insertions(+), 29 deletions(-) diff --git a/pdfly/cli.py b/pdfly/cli.py index 63b1e15..2a7463c 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -237,7 +237,9 @@ def update_offsets( "ISO-8859-1", help="Encoding used to read and write the files, e.g. UTF-8.", ), # noqa - verbose: bool = typer.Option(False, help="Show progress while processing."), # noqa + verbose: bool = typer.Option( + False, help="Show progress while processing." + ), # noqa ) -> None: pdfly.update_offsets.main(file_in, file_out, encoding, verbose) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index c02e3d7..d33954c 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -35,9 +35,14 @@ RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *") RE_CONTENT = re.compile(r"^([^\r\n]*)", re.DOTALL) RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL) -RE_LENGTH = re.compile(r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL) +RE_LENGTH = re.compile( + r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL +) -def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbose: bool) -> Iterable[str]: + +def update_lines( + lines_in: Iterable[str], encoding: str, console: Console, verbose: bool +) -> Iterable[str]: """Iterates over the lines of a pdf-files and updates offsets. The input is expected to be a pdf without binary-sections. @@ -63,14 +68,18 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo offset_xref = None # offset of xref-section map_stream_len = {} # map from object-number to /Length of stream map_obj_length_line = {} # map from object-number to /Length-line - map_obj_length_ref = {} # map from object-number to /Length-reference (e.g. "3") + map_obj_length_ref = ( + {} + ) # map from object-number to /Length-reference (e.g. "3") map_obj_length_line_no = {} # map from object-number to line_no of length # of /Length-line for idx, line in enumerate(lines_in): line_no = idx + 1 m_content = RE_CONTENT.match(line) if m_content is None: - raise RuntimeError(f"Invalid PDF file: line {line_no} without line-break.") + raise RuntimeError( + f"Invalid PDF file: line {line_no} without line-break." + ) content = m_content.group(1) map_line_offset[line_no] = offset_out m_obj = RE_OBJ.match(line) @@ -80,7 +89,9 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo if verbose: console.print(f"line {line_no}: object {curr_obj}") if curr_gen != "0": - raise RuntimeError(f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported.") + raise RuntimeError( + f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported." + ) map_obj_offset[curr_obj] = int(offset_out) map_obj_line[curr_obj] = line_no len_stream = None @@ -103,12 +114,20 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo f"Invalid PDF file: line {line_no}: endstream without object-start." ) if len_stream is None: - raise RuntimeError(f"Invalid PDF file: line {line_no}: endstream without stream.") + raise RuntimeError( + f"Invalid PDF file: line {line_no}: endstream without stream." + ) if len_stream > 0: # Ignore the last EOL - len_stream = len_stream - 2 if lines_in[idx - 1][-2:] == '\r\n' else len_stream - 1 + len_stream = ( + len_stream - 2 + if lines_in[idx - 1][-2:] == "\r\n" + else len_stream - 1 + ) if verbose: - console.print(f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}") + console.print( + f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}" + ) map_stream_len[curr_obj] = len_stream elif content == "endobj": curr_obj = None @@ -118,7 +137,9 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo len_obj = m_length_ref.group(2) len_obj_gen = m_length_ref.group(3) if verbose: - console.print(f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}") + console.print( + f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}" + ) map_obj_length_ref[curr_obj] = len_obj else: m_length = RE_LENGTH.match(line) @@ -139,7 +160,9 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo line = xrefUpd + eol elif line_startxref is not None and line_no == line_startxref + 1: if offset_xref is None: - raise NotImplementedError("Unsupported file: startxref without preceding xref-section (probable cross-reference stream)") + raise NotImplementedError( + "Unsupported file: startxref without preceding xref-section (probable cross-reference stream)" + ) line = "%d\n" % offset_xref lines_out.append(line) @@ -147,11 +170,17 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo # Some checks if len(map_obj_offset) == 0: - raise RuntimeError("Invalid PDF file: the command didn't find any PDF objects.") + raise RuntimeError( + "Invalid PDF file: the command didn't find any PDF objects." + ) if offset_xref is None: - raise RuntimeError("Invalid PDF file: the command didn't find a xref-section") + raise RuntimeError( + "Invalid PDF file: the command didn't find a xref-section" + ) if line_startxref is None: - raise RuntimeError("Invalid PDF file: the command didn't find a startxref-section") + raise RuntimeError( + "Invalid PDF file: the command didn't find a startxref-section" + ) for curr_obj, stream_len in map_stream_len.items(): if curr_obj in map_obj_length_line: @@ -171,7 +200,9 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo elif curr_obj in map_obj_length_ref: len_obj = map_obj_length_ref[curr_obj] if not len_obj in map_obj_line: - raise RuntimeError(f"obj {curr_obj} has unknown length-obj {len_obj}") + raise RuntimeError( + f"obj {curr_obj} has unknown length-obj {len_obj}" + ) len_obj_line = map_obj_line[len_obj] prev_length = lines_out[len_obj_line][:-1] len_digits = len(prev_length) @@ -185,8 +216,10 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo ) if prev_length != updated_length: if verbose: - console.print(f"line {line_no}, ref-len {len_obj} of {curr_obj}: {prev_length} -> {updated_length}") - lines_out[len_obj_line] = updated_length + '\n' + console.print( + f"line {line_no}, ref-len {len_obj} of {curr_obj}: {prev_length} -> {updated_length}" + ) + lines_out[len_obj_line] = updated_length + "\n" else: raise RuntimeError( f"obj {curr_obj} with stream-len {stream_len}" @@ -195,6 +228,7 @@ def update_lines(lines_in: Iterable[str], encoding: str, console: Console, verbo return lines_out + def read_binary_file(file_path: str, encoding: str) -> Iterable[str]: """Reads a binary file line by line and returns these lines as a list of strings in the given encoding. Encoding utf-8 can't be used to read random binary data. @@ -204,7 +238,7 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]: :return lines including line-breaks """ chunks = [] - with open(file_path, 'rb') as file: + with open(file_path, "rb") as file: buffer = bytearray() while True: chunk = file.read(4096) # Read in chunks of 4096 bytes @@ -215,22 +249,23 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]: # Split buffer into chunks based on LF, CR, or CRLF while True: - match = re.search(b'(\x0D\x0A|\x0A|\x0D)', buffer) + match = re.search(b"(\x0D\x0A|\x0A|\x0D)", buffer) if not match: break # No more line breaks found, process the remaining buffer start, end = match.start(), match.end() - chunk_str = buffer[:end].decode(encoding, errors='strict') + chunk_str = buffer[:end].decode(encoding, errors="strict") buffer = buffer[end:] chunks.append(chunk_str) # Handle the last chunk if buffer: - chunks.append(buffer.decode(encoding, errors='strict')) + chunks.append(buffer.decode(encoding, errors="strict")) return chunks + def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None: console = Console() console.print(f"Read {file_in}") @@ -243,4 +278,3 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None: f.write(line.encode(encoding)) console.print(f"Wrote {file_out}") - diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py index 132b902..94aff4b 100644 --- a/tests/test_update_offsets.py +++ b/tests/test_update_offsets.py @@ -34,16 +34,22 @@ def test_update_offsets(capsys, tmp_path: Path) -> None: assert not captured.err assert re.search(r"Wrote\s+" + re.escape(str(output)), captured.out) assert output.exists() - with open(file_expected, 'r', encoding='iso-8859-1') as file_exp: + with open(file_expected, "r", encoding="iso-8859-1") as file_exp: lines_exp = file_exp.readlines() - with open(output, 'r', encoding='iso-8859-1') as file_act: + with open(output, "r", encoding="iso-8859-1") as file_act: lines_act = file_act.readlines() - assert len(lines_exp) == len(lines_act), f"lines_exp=f{lines_exp}, lines_act=f{lines_act}" - for line_no, (line_exp, line_act) in enumerate(zip(lines_exp, lines_act), start = 1): + assert len(lines_exp) == len( + lines_act + ), f"lines_exp=f{lines_exp}, lines_act=f{lines_act}" + for line_no, (line_exp, line_act) in enumerate( + zip(lines_exp, lines_act), start=1 + ): assert line_exp == line_act, f"Lines differ in line {line_no}" + # The current implementation doesn't support valid PDF lines as "/Length 5470>> stream". + @pytest.mark.parametrize( "input_pdf_filepath", [ @@ -71,9 +77,11 @@ def test_update_offsets(capsys, tmp_path: Path) -> None: "sample-files/023-cmyk-image/cmyk-image.pdf", "sample-files/024-annotations/annotated_pdf.pdf", "sample-files/025-attachment/with-attachment.pdf", - ] + ], ) -def test_update_offsets_on_all_reference_files(capsys, tmp_path: Path, input_pdf_filepath: Path) -> None: +def test_update_offsets_on_all_reference_files( + capsys, tmp_path: Path, input_pdf_filepath: Path +) -> None: # Arrange output_pdf_filepath = tmp_path / "out.pdf" @@ -81,7 +89,8 @@ def test_update_offsets_on_all_reference_files(capsys, tmp_path: Path, input_pdf exit_code = run_cli( [ "update-offsets", - "--encoding", "iso-8859-1", + "--encoding", + "iso-8859-1", input_pdf_filepath, str(output_pdf_filepath), ] From c3a6c88680c51167eec189be956c4d8ed5096f54 Mon Sep 17 00:00:00 2001 From: rogmann Date: Wed, 6 Nov 2024 23:17:46 +0100 Subject: [PATCH 21/22] DEV: directory tests is lower-case. --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1047d86..1f32208 100644 --- a/Makefile +++ b/Makefile @@ -15,10 +15,10 @@ upload: clean: python setup.py clean --all pyclean . - rm -rf Tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt + rm -rf tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt test: - pytest Tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30 + pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30 mutation-test: mutmut run From fc42eb4342022e110bb35c50fef3607e72ef21d0 Mon Sep 17 00:00:00 2001 From: "Cimon Lucas (LCM)" Date: Thu, 7 Nov 2024 16:57:10 +0100 Subject: [PATCH 22/22] Pleasing mypy & typing imports under Python 3.8 --- pdfly/cli.py | 4 +-- pdfly/update_offsets.py | 57 +++++++++++++++++++++--------------- tests/conftest.py | 6 ++-- tests/test_update_offsets.py | 4 +-- 4 files changed, 41 insertions(+), 30 deletions(-) diff --git a/pdfly/cli.py b/pdfly/cli.py index 2a7463c..317c9e7 100644 --- a/pdfly/cli.py +++ b/pdfly/cli.py @@ -236,10 +236,10 @@ def update_offsets( encoding: str = typer.Option( "ISO-8859-1", help="Encoding used to read and write the files, e.g. UTF-8.", - ), # noqa + ), verbose: bool = typer.Option( False, help="Show progress while processing." - ), # noqa + ), ) -> None: pdfly.update_offsets.main(file_in, file_out, encoding, verbose) diff --git a/pdfly/update_offsets.py b/pdfly/update_offsets.py index d33954c..ac4bb07 100644 --- a/pdfly/update_offsets.py +++ b/pdfly/update_offsets.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """ Updates offsets and lengths in a simple PDF file. @@ -20,15 +19,21 @@ It expects that the /Length-entries have default values containing enough digits, e.g. /Length 000 when the stream consists of 576 bytes. -EXAMPLE +Example: update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf + """ -from collections.abc import Iterable -from pathlib import Path -from rich.console import Console import re import sys +from pathlib import Path + +if sys.version_info >= (3, 9): + List = list +else: # Support for Python 3.8 + from typing import List + +from rich.console import Console # Here, only simple regular expressions are used. # Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better. @@ -41,20 +46,20 @@ def update_lines( - lines_in: Iterable[str], encoding: str, console: Console, verbose: bool -) -> Iterable[str]: - """Iterates over the lines of a pdf-files and updates offsets. + lines_in: List[str], encoding: str, console: Console, verbose: bool +) -> List[str]: + """ + Iterates over the lines of a pdf-files and updates offsets. The input is expected to be a pdf without binary-sections. - :param lines_in: An Iterable over the lines including line-breaks. + :param lines_in: A list over the lines including line-breaks. :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8". :param console: Console used to print messages. :param verbose: True to activate logging of info-messages. :return The output is a list of lines to be written in the given encoding. """ - lines_out = [] # lines to be written map_line_offset = {} # map from line-number to offset map_obj_offset = {} # map from object-number to offset @@ -184,7 +189,12 @@ def update_lines( for curr_obj, stream_len in map_stream_len.items(): if curr_obj in map_obj_length_line: - m_length = RE_LENGTH.match(map_obj_length_line[curr_obj]) + line = map_obj_length_line[curr_obj] + m_length = RE_LENGTH.match(line) + if m_length is None: + raise RuntimeError( + f"Invalid PDF file: line '{line}' does not contain a valid /Length." + ) prev_length = m_length.group(2) len_digits = len(prev_length) len_format = "%%0%dd" % len_digits @@ -192,14 +202,14 @@ def update_lines( if len(updated_length) > len_digits: raise RuntimeError( f"Not enough digits in /Length-entry {prev_length}" - + f" of object {curr_obj}:" - + f" too short to take /Length {updated_length}" + f" of object {curr_obj}:" + f" too short to take /Length {updated_length}" ) line = m_length.group(1) + updated_length + m_length.group(3) lines_out[map_obj_length_line_no[curr_obj] - 1] = line elif curr_obj in map_obj_length_ref: len_obj = map_obj_length_ref[curr_obj] - if not len_obj in map_obj_line: + if len_obj not in map_obj_line: raise RuntimeError( f"obj {curr_obj} has unknown length-obj {len_obj}" ) @@ -211,8 +221,8 @@ def update_lines( if len(updated_length) > len_digits: raise RuntimeError( f"Not enough digits in /Length-ref-entry {prev_length}" - + f" of object {curr_obj} and len-object {len_obj}:" - + f" too short to take /Length {updated_length}" + f" of object {curr_obj} and len-object {len_obj}:" + f" too short to take /Length {updated_length}" ) if prev_length != updated_length: if verbose: @@ -223,22 +233,23 @@ def update_lines( else: raise RuntimeError( f"obj {curr_obj} with stream-len {stream_len}" - + f" has no object-length-line: {map_obj_length_line}" + f" has no object-length-line: {map_obj_length_line}" ) return lines_out -def read_binary_file(file_path: str, encoding: str) -> Iterable[str]: - """Reads a binary file line by line and returns these lines as a list of strings in the given encoding. +def read_binary_file(file_path: Path, encoding: str) -> List[str]: + """ + Reads a binary file line by line and returns these lines as a list of strings in the given encoding. Encoding utf-8 can't be used to read random binary data. :param file_path: file to be read line by line :param encoding: encoding to be used (e.g. "iso-8859-1") :return lines including line-breaks """ - chunks = [] - with open(file_path, "rb") as file: + chunks: List[str] = [] + with file_path.open("rb") as file: buffer = bytearray() while True: chunk = file.read(4096) # Read in chunks of 4096 bytes @@ -253,7 +264,7 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]: if not match: break # No more line breaks found, process the remaining buffer - start, end = match.start(), match.end() + end = match.end() chunk_str = buffer[:end].decode(encoding, errors="strict") buffer = buffer[end:] @@ -277,4 +288,4 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None: for line in lines_out: f.write(line.encode(encoding)) - console.print(f"Wrote {file_out}") + console.print(f"Wrote {file_out}", soft_wrap=True) diff --git a/tests/conftest.py b/tests/conftest.py index 0e02931..9ab40d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,6 @@ """Utilities and fixtures that are available automatically for all tests.""" -import io, os +import os from pathlib import Path from fpdf import FPDF @@ -58,7 +58,7 @@ def pdf_file_100(tmp_path): for i in range(100): pdf.add_page() pdf.set_font("helvetica", size=12) - pdf.cell(200, 10, txt=f"{i}", ln=True, align="C") + pdf.cell(200, 10, text=f"{i}", ln=True, align="C") pdf_filepath = tmp_path / "pdf_file_100.pdf" pdf.output(pdf_filepath) @@ -73,7 +73,7 @@ def pdf_file_abc(tmp_path): for char in [chr(i) for i in range(ord("a"), ord("z") + 1)]: pdf.add_page() pdf.set_font("helvetica", size=12) - pdf.cell(200, 10, txt=f"{char}", ln=True, align="C") + pdf.cell(200, 10, text=f"{char}", ln=True, align="C") pdf_filepath = tmp_path / "abc.pdf" pdf.output(pdf_filepath) diff --git a/tests/test_update_offsets.py b/tests/test_update_offsets.py index 94aff4b..bd5d506 100644 --- a/tests/test_update_offsets.py +++ b/tests/test_update_offsets.py @@ -34,9 +34,9 @@ def test_update_offsets(capsys, tmp_path: Path) -> None: assert not captured.err assert re.search(r"Wrote\s+" + re.escape(str(output)), captured.out) assert output.exists() - with open(file_expected, "r", encoding="iso-8859-1") as file_exp: + with open(file_expected, encoding="iso-8859-1") as file_exp: lines_exp = file_exp.readlines() - with open(output, "r", encoding="iso-8859-1") as file_act: + with open(output, encoding="iso-8859-1") as file_act: lines_act = file_act.readlines() assert len(lines_exp) == len( lines_act