24
24
update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf
25
25
"""
26
26
27
- from collections .abc import Iterable
27
+ if sys .version_info >= (3 , 9 ):
28
+ List = list
29
+ else :
30
+ from typing import List
28
31
from pathlib import Path
29
32
from rich .console import Console
30
33
import re
41
44
42
45
43
46
def update_lines (
44
- lines_in : Iterable [str ], encoding : str , console : Console , verbose : bool
45
- ) -> Iterable [str ]:
47
+ lines_in : List [str ], encoding : str , console : Console , verbose : bool
48
+ ) -> List [str ]:
46
49
"""Iterates over the lines of a pdf-files and updates offsets.
47
50
48
51
The input is expected to be a pdf without binary-sections.
49
52
50
- :param lines_in: An Iterable over the lines including line-breaks.
53
+ :param lines_in: A list of the lines including line-breaks.
51
54
:param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
52
55
:param console: Console used to print messages.
53
56
:param verbose: True to activate logging of info-messages.
@@ -184,7 +187,12 @@ def update_lines(
184
187
185
188
for curr_obj , stream_len in map_stream_len .items ():
186
189
if curr_obj in map_obj_length_line :
187
- m_length = RE_LENGTH .match (map_obj_length_line [curr_obj ])
190
+ line = map_obj_length_line [curr_obj ]
191
+ m_length = RE_LENGTH .match (line )
192
+ if m_length is None :
193
+ raise RuntimeError (
194
+ f"Invalid PDF file: line '{ line } ' does not contain a valid /Length."
195
+ )
188
196
prev_length = m_length .group (2 )
189
197
len_digits = len (prev_length )
190
198
len_format = "%%0%dd" % len_digits
@@ -229,16 +237,16 @@ def update_lines(
229
237
return lines_out
230
238
231
239
232
- def read_binary_file (file_path : str , encoding : str ) -> Iterable [str ]:
240
+ def read_binary_file (file_path : Path , encoding : str ) -> List [str ]:
233
241
"""Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
234
242
Encoding utf-8 can't be used to read random binary data.
235
243
236
244
:param file_path: file to be read line by line
237
245
:param encoding: encoding to be used (e.g. "iso-8859-1")
238
246
:return: lines including line-breaks
239
247
"""
240
- chunks = []
241
- with open (file_path , "rb" ) as file :
248
+ chunks : List [ str ] = []
249
+ with file_path . open ("rb" ) as file :
242
250
buffer = bytearray ()
243
251
while True :
244
252
chunk = file .read (4096 ) # Read in chunks of 4096 bytes
@@ -253,7 +261,7 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
253
261
if not match :
254
262
break # No more line breaks found, process the remaining buffer
255
263
256
- start , end = match . start (), match .end ()
264
+ end = match .end ()
257
265
chunk_str = buffer [:end ].decode (encoding , errors = "strict" )
258
266
buffer = buffer [end :]
259
267
@@ -277,4 +285,4 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
277
285
for line in lines_out :
278
286
f .write (line .encode (encoding ))
279
287
280
- console .print (f"Wrote { file_out } " )
288
+ console .print (f"Wrote { file_out } " , soft_wrap = True )
0 commit comments