24
24
update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf
25
25
"""
26
26
27
- from collections .abc import Iterable
28
27
from pathlib import Path
29
- from rich .console import Console
30
28
import re
31
29
import sys
30
+ if sys .version_info >= (3 , 9 ):
31
+ List = list
32
+ else : # Support for Python 3.8
33
+ from typing import List
34
+
35
+ from rich .console import Console
32
36
33
37
# Here, only simple regular expressions are used.
34
38
# Beyond a certain level of complexity, switching to a proper PDF dictionary parser would be better.
41
45
42
46
43
47
def update_lines (
44
- lines_in : Iterable [str ], encoding : str , console : Console , verbose : bool
45
- ) -> Iterable [str ]:
48
+ lines_in : List [str ], encoding : str , console : Console , verbose : bool
49
+ ) -> List [str ]:
46
50
"""Iterates over the lines of a PDF file and updates offsets.
47
51
48
52
The input is expected to be a pdf without binary-sections.
49
53
50
- :param lines_in: An Iterable over the lines including line-breaks.
54
+ :param lines_in: A list of the lines, including line-breaks.
51
55
:param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
52
56
:param console: Console used to print messages.
53
57
:param verbose: True to activate logging of info-messages.
@@ -184,7 +188,12 @@ def update_lines(
184
188
185
189
for curr_obj , stream_len in map_stream_len .items ():
186
190
if curr_obj in map_obj_length_line :
187
- m_length = RE_LENGTH .match (map_obj_length_line [curr_obj ])
191
+ line = map_obj_length_line [curr_obj ]
192
+ m_length = RE_LENGTH .match (line )
193
+ if m_length is None :
194
+ raise RuntimeError (
195
+ f"Invalid PDF file: line '{ line } ' does not contain a valid /Length."
196
+ )
188
197
prev_length = m_length .group (2 )
189
198
len_digits = len (prev_length )
190
199
len_format = "%%0%dd" % len_digits
@@ -229,16 +238,16 @@ def update_lines(
229
238
return lines_out
230
239
231
240
232
- def read_binary_file (file_path : str , encoding : str ) -> Iterable [str ]:
241
+ def read_binary_file (file_path : Path , encoding : str ) -> List [str ]:
233
242
"""Reads a binary file line by line and returns these lines as a list of strings in the given encoding.
234
243
Encoding utf-8 can't be used to read random binary data.
235
244
236
245
:param file_path: file to be read line by line
237
246
:param encoding: encoding to be used (e.g. "iso-8859-1")
238
247
:return: lines including line-breaks
239
248
"""
240
- chunks = []
241
- with open (file_path , "rb" ) as file :
249
+ chunks : List [ str ] = []
250
+ with file_path . open ("rb" ) as file :
242
251
buffer = bytearray ()
243
252
while True :
244
253
chunk = file .read (4096 ) # Read in chunks of 4096 bytes
@@ -253,7 +262,7 @@ def read_binary_file(file_path: str, encoding: str) -> Iterable[str]:
253
262
if not match :
254
263
break # No more line breaks found, process the remaining buffer
255
264
256
- start , end = match . start (), match .end ()
265
+ end = match .end ()
257
266
chunk_str = buffer [:end ].decode (encoding , errors = "strict" )
258
267
buffer = buffer [end :]
259
268
@@ -277,4 +286,4 @@ def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
277
286
for line in lines_out :
278
287
f .write (line .encode (encoding ))
279
288
280
- console .print (f"Wrote { file_out } " )
289
+ console .print (f"Wrote { file_out } " , soft_wrap = True )
0 commit comments