1
- #!/usr/bin/env python
2
1
"""
3
2
Updates offsets and lengths in a simple PDF file.
4
3
20
19
It expects that the /Length-entries have default values containing
21
20
enough digits, e.g. /Length 000 when the stream consists of 576 bytes.
22
21
23
- EXAMPLE
22
+ Example:
24
23
update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf
24
+
25
25
"""
26
26
27
- from collections .abc import Iterable
28
- from pathlib import Path
29
- from rich .console import Console
30
27
import re
31
28
import sys
29
+ from pathlib import Path
30
+
31
+ if sys .version_info >= (3 , 9 ):
32
+ List = list
33
+ else : # Support for Python 3.8
34
+ from typing import List
35
+
36
+ from rich .console import Console
32
37
33
38
# Here, only simple regular expressions are used.
34
39
# Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better.
41
46
42
47
43
48
def update_lines (
44
- lines_in : Iterable [str ], encoding : str , console : Console , verbose : bool
45
- ) -> Iterable [str ]:
46
- """Iterates over the lines of a pdf-files and updates offsets.
49
+ lines_in : List [str ], encoding : str , console : Console , verbose : bool
50
+ ) -> List [str ]:
51
+ """
52
+ Iterates over the lines of a pdf-files and updates offsets.
47
53
48
54
The input is expected to be a pdf without binary-sections.
49
55
50
- :param lines_in: An Iterable over the lines including line-breaks.
56
+ :param lines_in: A list over the lines including line-breaks.
51
57
:param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
52
58
:param console: Console used to print messages.
53
59
:param verbose: True to activate logging of info-messages.
54
60
:return The output is a list of lines to be written
55
61
in the given encoding.
56
62
"""
57
-
58
63
lines_out = [] # lines to be written
59
64
map_line_offset = {} # map from line-number to offset
60
65
map_obj_offset = {} # map from object-number to offset
@@ -184,22 +189,27 @@ def update_lines(
184
189
185
190
for curr_obj , stream_len in map_stream_len .items ():
186
191
if curr_obj in map_obj_length_line :
187
- m_length = RE_LENGTH .match (map_obj_length_line [curr_obj ])
192
+ line = map_obj_length_line [curr_obj ]
193
+ m_length = RE_LENGTH .match (line )
194
+ if m_length is None :
195
+ raise RuntimeError (
196
+ f"Invalid PDF file: line '{ line } ' does not contain a valid /Length."
197
+ )
188
198
prev_length = m_length .group (2 )
189
199
len_digits = len (prev_length )
190
200
len_format = "%%0%dd" % len_digits
191
201
updated_length = len_format % stream_len
192
202
if len (updated_length ) > len_digits :
193
203
raise RuntimeError (
194
204
f"Not enough digits in /Length-entry { prev_length } "
195
- + f" of object { curr_obj } :"
196
- + f" too short to take /Length { updated_length } "
205
+ f" of object { curr_obj } :"
206
+ f" too short to take /Length { updated_length } "
197
207
)
198
208
line = m_length .group (1 ) + updated_length + m_length .group (3 )
199
209
lines_out [map_obj_length_line_no [curr_obj ] - 1 ] = line
200
210
elif curr_obj in map_obj_length_ref :
201
211
len_obj = map_obj_length_ref [curr_obj ]
202
- if not len_obj in map_obj_line :
212
+ if len_obj not in map_obj_line :
203
213
raise RuntimeError (
204
214
f"obj { curr_obj } has unknown length-obj { len_obj } "
205
215
)
@@ -211,8 +221,8 @@ def update_lines(
211
221
if len (updated_length ) > len_digits :
212
222
raise RuntimeError (
213
223
f"Not enough digits in /Length-ref-entry { prev_length } "
214
- + f" of object { curr_obj } and len-object { len_obj } :"
215
- + f" too short to take /Length { updated_length } "
224
+ f" of object { curr_obj } and len-object { len_obj } :"
225
+ f" too short to take /Length { updated_length } "
216
226
)
217
227
if prev_length != updated_length :
218
228
if verbose :
@@ -223,22 +233,23 @@ def update_lines(
223
233
else :
224
234
raise RuntimeError (
225
235
f"obj { curr_obj } with stream-len { stream_len } "
226
- + f" has no object-length-line: { map_obj_length_line } "
236
+ f" has no object-length-line: { map_obj_length_line } "
227
237
)
228
238
229
239
return lines_out
230
240
231
241
232
- def read_binary_file (file_path : str , encoding : str ) -> Iterable [str ]:
233
- """Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
242
+ def read_binary_file (file_path : Path , encoding : str ) -> List [str ]:
243
+ """
244
+ Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
234
245
Encoding utf-8 can't be used to read random binary data.
235
246
236
247
:param file_path: file to be read line by line
237
248
:param encoding: encoding to be used (e.g. "iso-8859-1")
238
249
:return lines including line-breaks
239
250
"""
240
- chunks = []
241
- with open (file_path , "rb" ) as file :
251
+ chunks : List [ str ] = []
252
+ with file_path . open ("rb" ) as file :
242
253
buffer = bytearray ()
243
254
while True :
244
255
chunk = file .read (4096 ) # Read in chunks of 4096 bytes
@@ -253,7 +264,7 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
253
264
if not match :
254
265
break # No more line breaks found, process the remaining buffer
255
266
256
- start , end = match . start (), match .end ()
267
+ end = match .end ()
257
268
chunk_str = buffer [:end ].decode (encoding , errors = "strict" )
258
269
buffer = buffer [end :]
259
270
@@ -277,4 +288,4 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
277
288
for line in lines_out :
278
289
f .write (line .encode (encoding ))
279
290
280
- console .print (f"Wrote { file_out } " )
291
+ console .print (f"Wrote { file_out } " , soft_wrap = True )
0 commit comments