Skip to content

Commit da75816

Browse files
srogmannLucas-CCimon Lucas (LCM)
authored
ENH: Added command update-offsets to adjust offsets and lengths. (#15)
Co-authored-by: Lucas Cimon <[email protected]> Co-authored-by: Cimon Lucas (LCM) <[email protected]>
1 parent 010d5a4 commit da75816

8 files changed

+417
-5
lines changed

Makefile

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ upload:
1515
clean:
1616
python setup.py clean --all
1717
pyclean .
18-
rm -rf Tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt
18+
rm -rf tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt
1919

2020
test:
21-
pytest Tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30
21+
pytest tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30
2222

2323
mutation-test:
2424
mutmut run

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ $ pdfly --help
3939
│ meta Show metadata of a PDF file │
4040
│ pagemeta Give details about a single page. │
4141
│ rm Remove pages from PDF files. │
42+
│ update-offsets Updates offsets and lengths in a simple PDF file. │
4243
│ x2pdf Convert one or more files to PDF. Each file is a page. │
4344
╰─────────────────────────────────────────────────────────────────────────────╯
4445
```

pdfly/cli.py

+16
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import pdfly.pagemeta
1818
import pdfly.rm
1919
import pdfly.up2
20+
import pdfly.update_offsets
2021
import pdfly.x2pdf
2122

2223

@@ -228,6 +229,21 @@ def compress(
228229
pdfly.compress.main(pdf, output)
229230

230231

232+
@entry_point.command(name="update-offsets", help=pdfly.update_offsets.__doc__)  # type: ignore[misc]
def update_offsets(
    file_in: Path,
    file_out: Path,
    encoding: str = typer.Option(
        "ISO-8859-1",
        help="Encoding used to read and write the files, e.g. UTF-8.",
    ),
    verbose: bool = typer.Option(
        False, help="Show progress while processing."
    ),
) -> None:
    # CLI entry point for "update-offsets": the help text shown to the user
    # is the module docstring of pdfly.update_offsets (passed via help=).
    # All work is delegated to pdfly.update_offsets.main; arguments are
    # forwarded by keyword so the mapping to main()'s parameters is explicit.
    pdfly.update_offsets.main(
        file_in=file_in,
        file_out=file_out,
        encoding=encoding,
        verbose=verbose,
    )
245+
246+
231247
@entry_point.command(name="x2pdf", help=pdfly.x2pdf.__doc__) # type: ignore[misc]
232248
def x2pdf(
233249
x: List[Path],

pdfly/update_offsets.py

+291
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
"""
2+
Updates offsets and lengths in a simple PDF file.
3+
4+
The PDF specification requires that the xref section at the end
5+
of a PDF file has the correct offsets of the PDF's objects.
6+
It further requires that the dictionary of a stream object
7+
contains a /Length-entry giving the length of the encoded stream.
8+
9+
When editing a PDF file using a text-editor (e.g. vim) it is
10+
tedious to compute or adjust these offsets and lengths.
11+
12+
This command tries to compute /Length-entries of the stream dictionaries
13+
and the offsets in the xref-section automatically.
14+
15+
It expects that the PDF file has ASCII encoding only. It may
16+
use ISO-8859-1 or UTF-8 in its comments.
17+
The current implementation incorrectly replaces CR (0x0d) by LF (0x0a) in binary data.
18+
It expects that there is one xref-section only.
19+
It expects that the /Length-entries have default values containing
20+
enough digits, e.g. /Length 000 when the stream consists of 576 bytes.
21+
22+
Example:
23+
update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf
24+
25+
"""
26+
27+
import re
28+
import sys
29+
from pathlib import Path
30+
31+
if sys.version_info >= (3, 9):
32+
List = list
33+
else: # Support for Python 3.8
34+
from typing import List
35+
36+
from rich.console import Console
37+
38+
# Here, only simple regular expressions are used.
# Beyond a certain level of complexity, switching to a proper PDF dictionary
# parser would be better.
RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")
RE_CONTENT = re.compile(r"^([^\r\n]*)", re.DOTALL)
RE_LENGTH_REF = re.compile(r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)", re.DOTALL)
RE_LENGTH = re.compile(
    r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)", re.DOTALL
)


def update_lines(
    lines_in: List[str], encoding: str, console: "Console", verbose: bool
) -> List[str]:
    """
    Iterate over the lines of a PDF file and update offsets and lengths.

    The function walks through the lines once, tracking the byte offset each
    line will have in the output. While doing so it

    * records the offset of every ``N 0 obj`` line,
    * measures the length of every stream (lines between ``stream`` and
      ``endstream``, without the final EOL),
    * rewrites the in-use entries of the xref table,
    * rewrites the number following ``startxref``.

    After the pass, the ``/Length`` value of each stream is patched, either
    directly in the stream dictionary or, for ``/Length N 0 R``, in the line
    following the ``N 0 obj`` line. Values are zero-padded to the width of
    the existing placeholder so that no byte offset changes.

    Keywords are recognised purely line by line; a line consisting of e.g.
    ``endobj`` inside stream data is treated as that keyword.

    :param lines_in: A list over the lines including line-breaks.
    :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8". It is used
        to convert line lengths into byte counts.
    :param console: Console used to print messages (only used if verbose).
    :param verbose: True to activate logging of info-messages.
    :return: The list of lines to be written in the given encoding.
    :raises RuntimeError: If the input is not a PDF this command can handle
        (no objects, no xref/startxref, generation != 0, unbalanced
        stream markers, missing or too short /Length placeholders).
    :raises NotImplementedError: If startxref is found without a preceding
        xref section (probably a cross-reference stream).
    """
    lines_out: List[str] = []  # lines to be written
    map_obj_offset = {}  # object-number (str) -> byte offset in the output
    map_obj_line = {}  # object-number (str) -> 1-based line of "N 0 obj"
    map_stream_len = {}  # object-number -> computed length of its stream
    map_obj_length_line = {}  # object-number -> line holding a direct /Length
    map_obj_length_line_no = {}  # object-number -> 1-based line of that line
    map_obj_length_ref = {}  # object-number -> object-number of "/Length N 0 R"
    offset_out = 0  # current byte offset in the output file
    offset_xref = None  # byte offset of the xref keyword
    line_xref = None  # line of the xref keyword (while inside the table only)
    line_startxref = None  # line of the startxref keyword
    curr_obj = None  # number of the object currently open
    len_stream = None  # bytes collected for the current stream, if any

    for idx, line in enumerate(lines_in):
        line_no = idx + 1
        m_content = RE_CONTENT.match(line)
        # NOTE: RE_CONTENT also matches the empty string, so this guard is
        # purely defensive and is not expected to trigger.
        if m_content is None:
            raise RuntimeError(
                f"Invalid PDF file: line {line_no} without line-break."
            )
        content = m_content.group(1)

        m_obj = RE_OBJ.match(line)
        if m_obj is not None:
            curr_obj = m_obj.group(1)
            curr_gen = m_obj.group(2)
            if verbose:
                console.print(f"line {line_no}: object {curr_obj}")
            if curr_gen != "0":
                raise RuntimeError(
                    f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported."
                )
            map_obj_offset[curr_obj] = offset_out
            map_obj_line[curr_obj] = line_no
            len_stream = None

        if content == "xref":
            offset_xref = offset_out
            line_xref = line_no
        elif content == "startxref":
            line_startxref = line_no
            line_xref = None
        elif content == "stream":
            if verbose:
                console.print(f"line {line_no}: start stream")
            len_stream = 0
        elif content == "endstream":
            if verbose:
                console.print(f"line {line_no}: end stream")
            if curr_obj is None:
                raise RuntimeError(
                    f"Invalid PDF file: line {line_no}: endstream without object-start."
                )
            if len_stream is None:
                raise RuntimeError(
                    f"Invalid PDF file: line {line_no}: endstream without stream."
                )
            if len_stream > 0:
                # The EOL in front of "endstream" is not part of the stream.
                if lines_in[idx - 1][-2:] == "\r\n":
                    len_stream -= 2
                else:
                    len_stream -= 1
            if verbose:
                console.print(
                    f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}"
                )
            map_stream_len[curr_obj] = len_stream
        elif content == "endobj":
            curr_obj = None
        elif curr_obj is not None and len_stream is None:
            # Inside an object, in front of its stream: look for /Length.
            m_length_ref = RE_LENGTH_REF.match(line)
            if m_length_ref is not None:
                len_obj = m_length_ref.group(2)
                len_obj_gen = m_length_ref.group(3)
                if verbose:
                    console.print(
                        f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}"
                    )
                map_obj_length_ref[curr_obj] = len_obj
            elif RE_LENGTH.match(line) is not None:
                if verbose:
                    console.print(f"line {line_no}, /Length: {content}")
                map_obj_length_line[curr_obj] = line
                map_obj_length_line_no[curr_obj] = line_no
        elif curr_obj is not None and len_stream is not None:
            len_stream += len(line.encode(encoding))
        elif line_xref is not None and line_no > line_xref + 2:
            # line_xref + 1 is the subsection header, line_xref + 2 the entry
            # of object 0, so this line is the entry of object obj_no.
            if content.startswith("trailer"):
                # End of the table: never rewrite trailer lines, even if the
                # table has fewer entries than there are objects.
                line_xref = None
            else:
                obj_no = str(line_no - line_xref - 2)
                # Look the object up directly so that entries are also
                # updated when object numbers are not contiguous.
                if obj_no in map_obj_offset:
                    # An entry ends with a two-character EOL (" \n", " \r"
                    # or "\r\n"); keep it. Pad a shorter EOL with a space
                    # instead of cutting into the entry itself.
                    tail = line[len(line.rstrip(" \r\n")):]
                    if len(tail) >= 2:
                        eol = tail[-2:]
                    else:
                        eol = " " + (tail or "\n")
                    xref_upd = f"{map_obj_offset[obj_no]:010d} 00000 n"
                    if verbose:
                        console.print(f"{content} -> {xref_upd}")
                    line = xref_upd + eol
        elif line_startxref is not None and line_no == line_startxref + 1:
            if offset_xref is None:
                raise NotImplementedError(
                    "Unsupported file: startxref without preceding xref-section (probable cross-reference stream)"
                )
            line = "%d\n" % offset_xref

        lines_out.append(line)
        offset_out += len(line.encode(encoding))

    # Some checks
    if not map_obj_offset:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find any PDF objects."
        )
    if offset_xref is None:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find a xref-section"
        )
    if line_startxref is None:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find a startxref-section"
        )

    # Patch the /Length values. The replacement always has the same width
    # as the placeholder, so the offsets computed above stay valid.
    for stream_obj, stream_len in map_stream_len.items():
        if stream_obj in map_obj_length_line:
            line = map_obj_length_line[stream_obj]
            m_length = RE_LENGTH.match(line)
            if m_length is None:
                raise RuntimeError(
                    f"Invalid PDF file: line '{line}' does not contain a valid /Length."
                )
            prev_length = m_length.group(2)
            len_digits = len(prev_length)
            updated_length = f"{stream_len:0{len_digits}d}"
            if len(updated_length) > len_digits:
                raise RuntimeError(
                    f"Not enough digits in /Length-entry {prev_length}"
                    f" of object {stream_obj}:"
                    f" too short to take /Length {updated_length}"
                )
            lines_out[map_obj_length_line_no[stream_obj] - 1] = (
                m_length.group(1) + updated_length + m_length.group(3)
            )
        elif stream_obj in map_obj_length_ref:
            len_obj = map_obj_length_ref[stream_obj]
            if len_obj not in map_obj_line:
                raise RuntimeError(
                    f"obj {stream_obj} has unknown length-obj {len_obj}"
                )
            # map_obj_line holds the 1-based line of "N 0 obj"; used as a
            # 0-based index it addresses the following line, which is
            # expected to hold the length value.
            value_idx = map_obj_line[len_obj]
            if value_idx >= len(lines_out):
                raise RuntimeError(
                    f"Invalid PDF file: length-obj {len_obj} of obj {stream_obj}"
                    " has no line containing its value."
                )
            value_line = lines_out[value_idx]
            # Split the value from its EOL (LF, CR or CRLF) so that the EOL
            # is preserved and does not count as a digit.
            prev_length = value_line.rstrip("\r\n")
            eol = value_line[len(prev_length):]
            len_digits = len(prev_length)
            updated_length = f"{stream_len:0{len_digits}d}"
            if len(updated_length) > len_digits:
                raise RuntimeError(
                    f"Not enough digits in /Length-ref-entry {prev_length}"
                    f" of object {stream_obj} and len-object {len_obj}:"
                    f" too short to take /Length {updated_length}"
                )
            if prev_length != updated_length:
                if verbose:
                    console.print(
                        f"line {value_idx + 1}, ref-len {len_obj} of {stream_obj}: {prev_length} -> {updated_length}"
                    )
                lines_out[value_idx] = updated_length + eol
        else:
            raise RuntimeError(
                f"obj {stream_obj} with stream-len {stream_len}"
                f" has no object-length-line: {map_obj_length_line}"
            )

    return lines_out
240+
241+
242+
def read_binary_file(file_path: Path, encoding: str) -> List[str]:
    """
    Read a binary file and split it into lines, keeping the line-breaks.

    A line ends at LF, CR or CRLF; the line-break stays attached to its line,
    so concatenating the returned strings (and encoding them again)
    reproduces the file. Each line is decoded strictly with the given
    encoding. Encoding utf-8 can't be used to read random binary data.

    The file is read in chunks of 4096 bytes. A CR that is the very last byte
    of the data read so far is not treated as a line end yet: it may be the
    first half of a CRLF whose LF arrives with the next chunk.

    :param file_path: file to be read line by line
    :param encoding: encoding to be used (e.g. "iso-8859-1")
    :return: lines including line-breaks
    :raises UnicodeDecodeError: if a line can't be decoded with the encoding
    """
    eol_pattern = re.compile(b"\r\n|\n|\r")
    lines: List[str] = []
    buffer = bytearray()
    with file_path.open("rb") as file:
        while True:
            chunk = file.read(4096)  # Read in chunks of 4096 bytes
            if not chunk:
                break  # End of file
            buffer += chunk

            # Cut all complete lines off the front of the buffer.
            start = 0
            while True:
                match = eol_pattern.search(buffer, start)
                if match is None:
                    break  # No more line breaks in the buffered data
                end = match.end()
                if end == len(buffer) and buffer[end - 1] == 0x0D:
                    # Lone CR at the end of the buffer: wait for the next
                    # chunk to see whether an LF follows (CRLF).
                    break
                lines.append(
                    buffer[start:end].decode(encoding, errors="strict")
                )
                start = end
            del buffer[:start]

    # Whatever is left is the last line: either it has no line-break or it
    # ends with the CR that was held back above.
    if buffer:
        lines.append(buffer.decode(encoding, errors="strict"))

    return lines
278+
279+
280+
def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
    """
    Read a PDF file, update its offsets and lengths and write the result.

    :param file_in: PDF file to be read.
    :param file_out: file the updated PDF is written to.
    :param encoding: encoding used to decode the input and encode the output.
    :param verbose: True to print progress messages while processing.
    """
    console = Console()
    console.print(f"Read {file_in}")

    updated = update_lines(
        read_binary_file(file_in, encoding), encoding, console, verbose
    )

    # Written in binary mode so that line-breaks are emitted exactly as
    # they are stored in the lines.
    with open(file_out, "wb") as out_file:
        out_file.writelines(text.encode(encoding) for text in updated)

    console.print(f"Wrote {file_out}", soft_wrap=True)

resources/file-with-fixed-offsets.pdf

883 Bytes
Binary file not shown.
886 Bytes
Binary file not shown.

tests/conftest.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Utilities and fixtures that are available automatically for all tests."""
22

3-
import io, os
3+
import os
44
from pathlib import Path
55

66
from fpdf import FPDF
@@ -58,7 +58,7 @@ def pdf_file_100(tmp_path):
5858
for i in range(100):
5959
pdf.add_page()
6060
pdf.set_font("helvetica", size=12)
61-
pdf.cell(200, 10, txt=f"{i}", ln=True, align="C")
61+
pdf.cell(200, 10, text=f"{i}", ln=True, align="C")
6262

6363
pdf_filepath = tmp_path / "pdf_file_100.pdf"
6464
pdf.output(pdf_filepath)
@@ -73,7 +73,7 @@ def pdf_file_abc(tmp_path):
7373
for char in [chr(i) for i in range(ord("a"), ord("z") + 1)]:
7474
pdf.add_page()
7575
pdf.set_font("helvetica", size=12)
76-
pdf.cell(200, 10, txt=f"{char}", ln=True, align="C")
76+
pdf.cell(200, 10, text=f"{char}", ln=True, align="C")
7777

7878
pdf_filepath = tmp_path / "abc.pdf"
7979
pdf.output(pdf_filepath)

0 commit comments

Comments
 (0)