|
| 1 | +""" |
| 2 | +Updates offsets and lengths in a simple PDF file. |
| 3 | +
|
| 4 | +The PDF specification requires that the xref section at the end |
| 5 | +of a PDF file has the correct offsets of the PDF's objects. |
| 6 | +It further requires that the dictionary of a stream object |
| 7 | +contains a /Length-entry giving the length of the encoded stream. |
| 8 | +
|
| 9 | +When editing a PDF file using a text-editor (e.g. vim) it is |
| 10 | +tedious to compute or adjust these offsets and lengths by hand. |
| 11 | +
|
| 12 | +This command tries to compute /Length-entries of the stream dictionaries |
| 13 | +and the offsets in the xref-section automatically. |
| 14 | +
|
| 15 | +It expects that the PDF file has ASCII encoding only. It may |
| 16 | +use ISO-8859-1 or UTF-8 in its comments. |
| 17 | +The current implementation incorrectly replaces CR (0x0d) by LF (0x0a) in binary data. |
| 18 | +It expects that there is one xref-section only. |
| 19 | +It expects that the /Length-entries have default values containing |
| 20 | +enough digits, e.g. /Length 000 when the stream consists of 576 bytes. |
| 21 | +
|
| 22 | +Example: |
| 23 | + update-offsets --verbose --encoding ISO-8859-1 issue-297.pdf issue-297.out.pdf |
| 24 | +
|
| 25 | +""" |
| 26 | + |
| 27 | +import re |
| 28 | +import sys |
| 29 | +from pathlib import Path |
| 30 | + |
| 31 | +if sys.version_info >= (3, 9): |
| 32 | + List = list |
| 33 | +else: # Support for Python 3.8 |
| 34 | + from typing import List |
| 35 | + |
| 36 | +from rich.console import Console |
| 37 | + |
# Only simple regular expressions are used here.
# Beyond a certain level of complexity, switching to a proper PDF dictionary
# parser would be better.

# Start of an indirect object: "<object-number> <generation-number> obj".
RE_OBJ = re.compile(r"^([0-9]+) ([0-9]+) obj *")

# Content of a line: everything before its first CR or LF.
RE_CONTENT = re.compile(r"^([^\r\n]*)", flags=re.DOTALL)

# Indirect length "/Length <object-number> <generation-number> R".
# Groups: text up to and including "/Length ", object-number,
# generation-number, remainder of the line.
RE_LENGTH_REF = re.compile(
    r"^(.*/Length )([0-9]+) ([0-9]+) R(.*)",
    flags=re.DOTALL,
)

# Direct length "/Length <digits>" followed by a delimiter character.
# Groups: text up to and including "/Length ", the digits, remainder of the
# line (starting with the delimiter).
RE_LENGTH = re.compile(
    r"^(.*/Length )([0-9]+)([ />\x00\t\f\r\n].*)",
    flags=re.DOTALL,
)
| 46 | + |
| 47 | + |
def _eol_of(line: str) -> str:
    """Return the line-break (CRLF, LF or CR) terminating *line*, or "" if none."""
    for eol in ("\r\n", "\n", "\r"):
        if line.endswith(eol):
            return eol
    return ""


def update_lines(
    lines_in: List[str], encoding: str, console: Console, verbose: bool
) -> List[str]:
    """
    Iterate over the lines of a PDF file and update offsets and lengths.

    The input is expected to be a PDF without binary sections.

    In a first pass every line is copied to the output while the byte offset
    of each "N 0 obj" line and the byte length of each stream are recorded;
    the entries of the xref-section and the value following "startxref" are
    rewritten on the fly. In a second pass the /Length-entries (direct values
    or referenced length-objects) are patched in place. A patched length keeps
    the character count of the value it replaces, so the offsets computed in
    the first pass stay valid.

    :param lines_in: The lines of the file, each including its line-break.
    :param encoding: The encoding, e.g. "iso-8859-1" or "UTF-8".
    :param console: Console used to print messages.
    :param verbose: True to activate logging of info-messages.
    :return: The list of lines to be written in the given encoding.
    :raises RuntimeError: if the file contains no objects, no xref-section or
        no startxref-line, uses a generation other than 0, has an endstream
        without matching object/stream, or a /Length placeholder that is
        missing or too short for the computed length.
    :raises NotImplementedError: if startxref is not preceded by an
        xref-section.
    """
    lines_out = []  # lines to be written
    map_obj_offset = {}  # map from object-number to offset
    map_obj_line = {}  # map from object-number to line-number (1-based)
    offset_out = 0  # current offset in output-file
    line_xref = None  # line-number of xref-line (in xref-section only)
    line_startxref = None  # line-number of startxref-line
    curr_obj = None  # number of current object
    len_stream = None  # length of stream (in stream only)
    offset_xref = None  # offset of xref-section
    map_stream_len = {}  # map from object-number to /Length of stream
    map_obj_length_line = {}  # map from object-number to /Length-line
    map_obj_length_ref = {}  # map from object-number to /Length-reference (e.g. "3")
    map_obj_length_line_no = {}  # map from object-number to line_no of /Length-line

    for idx, line in enumerate(lines_in):
        line_no = idx + 1  # line-numbers are 1-based
        m_content = RE_CONTENT.match(line)
        if m_content is None:
            raise RuntimeError(
                f"Invalid PDF file: line {line_no} without line-break."
            )
        content = m_content.group(1)

        m_obj = RE_OBJ.match(line)
        if m_obj is not None:
            curr_obj = m_obj.group(1)
            curr_gen = m_obj.group(2)
            if verbose:
                console.print(f"line {line_no}: object {curr_obj}")
            if curr_gen != "0":
                raise RuntimeError(
                    f"Invalid PDF file: generation {curr_gen} of object {curr_obj} in line {line_no} is not supported."
                )
            map_obj_offset[curr_obj] = offset_out
            map_obj_line[curr_obj] = line_no
            len_stream = None

        if content == "xref":
            offset_xref = offset_out
            line_xref = line_no
        elif content == "startxref":
            line_startxref = line_no
            line_xref = None
        elif content == "stream":
            if verbose:
                console.print(f"line {line_no}: start stream")
            len_stream = 0
        elif content == "endstream":
            if verbose:
                console.print(f"line {line_no}: end stream")
            if curr_obj is None:
                raise RuntimeError(
                    f"Invalid PDF file: line {line_no}: endstream without object-start."
                )
            if len_stream is None:
                raise RuntimeError(
                    f"Invalid PDF file: line {line_no}: endstream without stream."
                )
            if len_stream > 0:
                # Ignore the last EOL: the line-break before "endstream"
                # is not counted as part of the stream.
                if lines_in[idx - 1].endswith("\r\n"):
                    len_stream -= 2
                else:
                    len_stream -= 1
            if verbose:
                console.print(
                    f"line {line_no}: Computed /Length {len_stream} of obj {curr_obj}"
                )
            map_stream_len[curr_obj] = len_stream
        elif content == "endobj":
            curr_obj = None
        elif curr_obj is not None and len_stream is None:
            # Inside an object but before its stream: look for /Length.
            # The reference form is tested first because a reference
            # "/Length 5 0 R" would also match the direct-value pattern.
            m_length_ref = RE_LENGTH_REF.match(line)
            if m_length_ref is not None:
                len_obj = m_length_ref.group(2)
                len_obj_gen = m_length_ref.group(3)
                if verbose:
                    console.print(
                        f"line {line_no}, /Length-reference {len_obj} {len_obj_gen} R: {content}"
                    )
                map_obj_length_ref[curr_obj] = len_obj
            else:
                m_length = RE_LENGTH.match(line)
                if m_length is not None:
                    if verbose:
                        console.print(f"line {line_no}, /Length: {content}")
                    map_obj_length_line[curr_obj] = line
                    map_obj_length_line_no[curr_obj] = line_no
        elif curr_obj is not None and len_stream is not None:
            # Inside a stream: accumulate its encoded size.
            len_stream += len(line.encode(encoding))
        elif line_xref is not None and line_no > line_xref + 2:
            # The two lines after "xref" are skipped, so the third one maps
            # to object 1 (presumably subsection header and the entry of
            # object 0 - this assumes a single subsection starting at 0).
            obj_no = line_no - line_xref - 2
            if obj_no <= len(map_obj_offset) and str(obj_no) in map_obj_offset:
                eol = line[-2:]  # keep the last two characters of the entry
                xref_upd = f"{map_obj_offset[str(obj_no)]:010d} 00000 n"
                if verbose:
                    console.print(f"{content} -> {xref_upd}")
                line = xref_upd + eol
        elif line_startxref is not None and line_no == line_startxref + 1:
            if offset_xref is None:
                raise NotImplementedError(
                    "Unsupported file: startxref without preceding xref-section (probable cross-reference stream)"
                )
            # Keep the line-break style of the input instead of forcing LF.
            line = f"{offset_xref}{_eol_of(line) or chr(10)}"
        lines_out.append(line)

        offset_out += len(line.encode(encoding))

    # Some checks
    if not map_obj_offset:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find any PDF objects."
        )
    if offset_xref is None:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find a xref-section"
        )
    if line_startxref is None:
        raise RuntimeError(
            "Invalid PDF file: the command didn't find a startxref-section"
        )

    # Second pass: patch the /Length values with the computed stream lengths.
    for obj_no, stream_len in map_stream_len.items():
        if obj_no in map_obj_length_line:
            line = map_obj_length_line[obj_no]
            m_length = RE_LENGTH.match(line)
            if m_length is None:
                raise RuntimeError(
                    f"Invalid PDF file: line '{line}' does not contain a valid /Length."
                )
            prev_length = m_length.group(2)
            len_digits = len(prev_length)
            # Zero-pad to the width of the placeholder so that no offset moves.
            updated_length = f"{stream_len:0{len_digits}d}"
            if len(updated_length) > len_digits:
                raise RuntimeError(
                    f"Not enough digits in /Length-entry {prev_length}"
                    f" of object {obj_no}:"
                    f" too short to take /Length {updated_length}"
                )
            line = m_length.group(1) + updated_length + m_length.group(3)
            lines_out[map_obj_length_line_no[obj_no] - 1] = line
        elif obj_no in map_obj_length_ref:
            len_obj = map_obj_length_ref[obj_no]
            if len_obj not in map_obj_line:
                raise RuntimeError(
                    f"obj {obj_no} has unknown length-obj {len_obj}"
                )
            # map_obj_line is 1-based while lines_out is 0-based, so this
            # index addresses the line following the "N 0 obj" line.
            # NOTE(review): that line is assumed to hold only the length
            # value; it is overwritten without further validation.
            len_obj_line = map_obj_line[len_obj]
            if len_obj_line >= len(lines_out):
                raise RuntimeError(
                    f"Invalid PDF file: length-obj {len_obj} of obj {obj_no}"
                    f" has no line following its object-start."
                )
            old_line = lines_out[len_obj_line]
            eol = _eol_of(old_line)
            prev_length = old_line[: len(old_line) - len(eol)]
            len_digits = len(prev_length)
            updated_length = f"{stream_len:0{len_digits}d}"
            if len(updated_length) > len_digits:
                raise RuntimeError(
                    f"Not enough digits in /Length-ref-entry {prev_length}"
                    f" of object {obj_no} and len-object {len_obj}:"
                    f" too short to take /Length {updated_length}"
                )
            if prev_length != updated_length:
                if verbose:
                    console.print(
                        f"line {len_obj_line + 1}, ref-len {len_obj} of {obj_no}: {prev_length} -> {updated_length}"
                    )
                lines_out[len_obj_line] = updated_length + eol
        else:
            raise RuntimeError(
                f"obj {obj_no} with stream-len {stream_len}"
                f" has no object-length-line: {map_obj_length_line}"
            )

    return lines_out
| 240 | + |
| 241 | + |
def read_binary_file(file_path: Path, encoding: str) -> List[str]:
    """
    Read a binary file and return its lines as strings in the given encoding.

    Lines are split at LF, CR or CRLF and keep their line-break; a CRLF is
    always kept together in one line. A trailing part without line-break is
    returned as the last line.
    Encoding utf-8 can't be used to read random binary data.

    :param file_path: file to be read line by line
    :param encoding: encoding to be used (e.g. "iso-8859-1")
    :return: lines including line-breaks
    :raises UnicodeDecodeError: if a line can't be decoded in the given encoding
    """
    data = file_path.read_bytes()
    # bytes.splitlines breaks at "\n", "\r" and "\r\n" only. Splitting the
    # complete content at once avoids tearing a CRLF pair apart, which
    # happened when the pair straddled the boundary of two read-chunks.
    return [
        raw_line.decode(encoding, errors="strict")
        for raw_line in data.splitlines(keepends=True)
    ]
| 278 | + |
| 279 | + |
def main(file_in: Path, file_out: Path, encoding: str, verbose: bool) -> None:
    """
    Read a PDF file, update its offsets and lengths and write the result.

    :param file_in: PDF file to be read.
    :param file_out: file the updated lines are written to.
    :param encoding: encoding used to decode the input and encode the output.
    :param verbose: True to activate logging of info-messages.
    """
    console = Console()
    console.print(f"Read {file_in}")

    updated = update_lines(
        read_binary_file(file_in, encoding), encoding, console, verbose
    )

    with open(file_out, "wb") as out:
        out.writelines(text.encode(encoding) for text in updated)

    console.print(f"Wrote {file_out}", soft_wrap=True)
0 commit comments