Skip to content

Commit d82649a

Browse files
authored
MAINT: Upgrade from PyPDF2 to pypdf + add system metadata (#16)
1 parent 45e1fcb commit d82649a

11 files changed

+139
-38
lines changed

.isort.cfg

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ multi_line_output=3
55
length_sort=0
66
include_trailing_comma=True
77
skip=docs
8-
known_third_party = PIL,PyPDF2,pydantic,setuptools,typer
8+
known_third_party = PIL,pypdf,pydantic,setuptools,typer

.pre-commit-config.yaml

+20-21
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# pre-commit run --all-files
22
repos:
33
- repo: https://github.com/pre-commit/pre-commit-hooks
4-
rev: v4.2.0
4+
rev: v4.4.0
55
hooks:
66
- id: check-ast
77
- id: check-byte-order-marker
@@ -10,39 +10,38 @@ repos:
1010
- id: check-yaml
1111
- id: debug-statements
1212
- id: end-of-file-fixer
13+
exclude: "resources/.*|docs/make.bat"
1314
- id: trailing-whitespace
1415
- id: mixed-line-ending
16+
args: ['--fix=lf']
17+
exclude: "docs/make.bat"
1518
- id: check-added-large-files
1619
args: ['--maxkb=1000']
20+
# - repo: https://github.com/pre-commit/mirrors-mypy
21+
# rev: v0.942
22+
# hooks:
23+
# - id: mypy
1724
- repo: https://github.com/psf/black
18-
rev: 22.3.0
25+
rev: 23.1.0
1926
hooks:
2027
- id: black
28+
args: [--target-version, py36]
2129
- repo: https://github.com/asottile/blacken-docs
22-
rev: v1.12.1
30+
rev: 1.13.0
2331
hooks:
2432
- id: blacken-docs
2533
additional_dependencies: [black==22.1.0]
26-
- repo: https://gitlab.com/pycqa/flake8
27-
rev: 3.9.2
34+
- repo: https://github.com/charliermarsh/ruff-pre-commit
35+
rev: 'v0.0.254'
2836
hooks:
29-
- id: flake8
30-
- repo: https://github.com/pre-commit/mirrors-mypy
31-
rev: v0.942
32-
hooks:
33-
- id: mypy
34-
additional_dependencies:
35-
- 'pydantic'
36-
- repo: https://github.com/asottile/seed-isort-config
37-
rev: v2.2.0
38-
hooks:
39-
- id: seed-isort-config
40-
- repo: https://github.com/pre-commit/mirrors-isort
41-
rev: v5.10.1
42-
hooks:
43-
- id: isort
37+
- id: ruff
4438
- repo: https://github.com/asottile/pyupgrade
45-
rev: v2.32.0
39+
rev: v3.3.1
4640
hooks:
4741
- id: pyupgrade
4842
args: [--py36-plus]
43+
- repo: https://github.com/pycqa/flake8
44+
rev: 6.0.0
45+
hooks:
46+
- id: flake8
47+
args: ["--ignore", "E,W,F"]

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ upload:
1010
clean:
1111
python setup.py clean --all
1212
pyclean .
13-
rm -rf Tests/__pycache__ PyPDF2/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf PyPDF2.egg-info PyPDF2_pdfLocation.txt
13+
rm -rf Tests/__pycache__ pypdf/__pycache__ Image9.png htmlcov docs/_build dist dont_commit_merged.pdf dont_commit_writer.pdf pypdf.egg-info pypdf_pdfLocation.txt
1414

1515
test:
1616
pytest Tests --cov --cov-report term-missing -vv --cov-report html --durations=3 --timeout=30

pdfly/cat.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
"""
4242
# Copyright (c) 2014, Steve Witham <[email protected]>.
4343
# All rights reserved. This software is available under a BSD license;
44-
# see https://github.com/py-pdf/PyPDF2/LICENSE
44+
# see https://github.com/py-pdf/pypdf/LICENSE
4545

4646

4747
import os
@@ -50,7 +50,7 @@
5050
from sys import exit, stderr, stdout
5151
from typing import List
5252

53-
from PyPDF2 import PdfMerger, parse_filename_page_ranges
53+
from pypdf import PdfMerger, parse_filename_page_ranges
5454

5555

5656
def main(filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool) -> None:
@@ -66,7 +66,7 @@ def main(filename: Path, fn_pgrgs: List[str], output: Path, verbose: bool) -> No
6666
merger = PdfMerger()
6767
in_fs = {}
6868
try:
69-
for (filename, page_range) in filename_page_ranges: # type: ignore
69+
for filename, page_range in filename_page_ranges: # type: ignore
7070
if verbose:
7171
print(filename, page_range, file=stderr)
7272
if filename not in in_fs:

pdfly/cli.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def metadata(
7171
@entry_point.command(name="extract-text") # type: ignore[misc]
7272
def extract_text(pdf: Path):
7373
"""Extract text from a PDF file."""
74-
from PyPDF2 import PdfReader
74+
from pypdf import PdfReader
7575

7676
reader = PdfReader(str(pdf))
7777
for page in reader.pages:

pdfly/compress.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import os
44
from pathlib import Path
55

6-
from PyPDF2 import PdfReader, PdfWriter
6+
from pypdf import PdfReader, PdfWriter
77

88

99
def main(pdf: Path, output: Path):

pdfly/extract_images.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pathlib import Path
99

1010
from PIL import Image
11-
from PyPDF2 import PdfReader
11+
from pypdf import PdfReader
1212

1313

1414
def main(pdf: Path) -> None:

pdfly/metadata.py

+34-7
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,33 @@
11
"""Show metadata of a PDF file"""
22

3-
import os
3+
import stat
4+
from datetime import datetime
45
from enum import Enum
56
from pathlib import Path
67
from typing import Optional, Set, Tuple
78

89
from pydantic import BaseModel
9-
from PyPDF2 import PdfReader
10+
from pypdf import PdfReader
1011

1112

1213
class MetaInfo(BaseModel):
1314
title: Optional[str] = None
1415
producer: Optional[str] = None
16+
author: Optional[str] = None
1517
pages: int
1618
encrypted: bool
17-
file_size: int # in bytes
1819
page_size: Tuple[float, float] # (width, height)
1920
pdf_file_version: str
2021
page_mode: Optional[str]
2122
page_layout: Optional[str]
2223

24+
# OS Information
25+
file_permissions: str
26+
file_size: int # in bytes
27+
creation_time: datetime
28+
modification_time: datetime
29+
access_time: datetime
30+
2331

2432
class OutputOptions(Enum):
2533
json = "json"
@@ -33,35 +41,41 @@ def main(pdf: Path, output: OutputOptions) -> None:
3341

3442
reader.stream.seek(0)
3543
pdf_file_version = reader.stream.read(8).decode("utf-8")
44+
pdf_stat = pdf.stat()
3645
meta = MetaInfo(
3746
pages=len(reader.pages),
3847
encrypted=reader.is_encrypted,
39-
file_size=pdf.stat().st_size,
4048
page_size=(x2 - x1, y2 - y1),
4149
page_mode=reader.page_mode,
4250
pdf_file_version=pdf_file_version,
4351
page_layout=reader.page_layout,
52+
# OS Info
53+
file_permissions=f"{stat.filemode(pdf_stat.st_mode)}",
54+
file_size=pdf_stat.st_size,
55+
creation_time=datetime.fromtimestamp(pdf_stat.st_ctime),
56+
modification_time=datetime.fromtimestamp(pdf_stat.st_mtime),
57+
access_time=datetime.fromtimestamp(pdf_stat.st_atime),
4458
)
4559
if info is not None:
4660
meta.title = info.title
4761
meta.producer = info.producer
62+
meta.author = info.author
4863

4964
if output == OutputOptions.json:
5065
print(meta.json())
5166
else:
5267
from rich.console import Console
5368
from rich.table import Table
5469

55-
table = Table(title=f"{pdf}")
70+
table = Table(title="PDF Data")
5671
table.add_column("Attribute", justify="right", style="cyan", no_wrap=True)
5772
table.add_column("Value", style="white")
5873

59-
table.add_row("File Size", f"{os.path.getsize(pdf):,} Bytes")
6074
table.add_row("Title", meta.title)
6175
table.add_row("Producer", meta.producer)
76+
table.add_row("Author", meta.author)
6277
table.add_row("Pages", f"{meta.pages:,}")
6378
table.add_row("Encrypted", f"{meta.encrypted}")
64-
table.add_row("File size", f"{meta.file_size:,} bytes")
6579
table.add_row(
6680
"Page size", f"{meta.page_size[0]} x {meta.page_size[1]} pts (w x h)"
6781
)
@@ -77,5 +91,18 @@ def main(pdf: Path, output: OutputOptions) -> None:
7791
table.add_row("Fonts (unembedded)", ", ".join(sorted(unemedded_fonts)))
7892
table.add_row("Fonts (embedded)", ", ".join(sorted(embedded_fonts)))
7993

94+
os_table = Table(title="Operating System Data")
95+
os_table.add_column("Attribute", justify="right", style="cyan", no_wrap=True)
96+
os_table.add_column("Value", style="white")
97+
os_table.add_row("File Name", f"{pdf}")
98+
os_table.add_row("File Permissions", f"{meta.file_permissions}")
99+
os_table.add_row("File Size", f"{meta.file_size:,} bytes")
100+
os_table.add_row("Creation Time", f"{meta.creation_time:%Y-%m-%d %H:%M:%S}")
101+
os_table.add_row(
102+
"Modification Time", f"{meta.modification_time:%Y-%m-%d %H:%M:%S}"
103+
)
104+
os_table.add_row("Access Time", f"{meta.access_time:%Y-%m-%d %H:%M:%S}")
105+
80106
console = Console()
107+
console.print(os_table)
81108
console.print(table)

pdfly/up2.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import sys
1010
from pathlib import Path
1111

12-
from PyPDF2 import PdfReader, PdfWriter
12+
from pypdf import PdfReader, PdfWriter
1313

1414

1515
def main(pdf: Path, output: Path) -> None:

pypproject.toml

+75
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,77 @@
11
[tool.black]
22
line-length = 79
3+
4+
[tool.ruff]
5+
line-length = 120
6+
select = ["ALL"]
7+
ignore = [
8+
"D404", # First word of the docstring should not be "This"
9+
# I would like to have it, but there are a few annoying exceptions:
10+
"D401", # First line of docstring should be in imperative mood - false positives
11+
"ERA001",
12+
"UP031",
13+
"D205", # 1 blank line required between summary line and description
14+
"D400", # First line should end with a period
15+
"D415", # First line should end with a period, question mark, or exclamation point
16+
# Introduces bugs
17+
"RUF001", "RUF002", "RUF005",
18+
"ARG",
19+
"DTZ001", # The use of `datetime.datetime()` without `tzinfo` is necessary
20+
# Personal preference
21+
"D406", # Section name should end with a newline ("Returns")
22+
"D212", # I want multiline-docstrings to start at the second line
23+
"D407", # google-style docstrings don't have dashes
24+
"N806", # Variable `NO` in function should be lowercase
25+
"N814", # Camelcase `PageAttributes` imported as constant `PG`
26+
"N817", # CamelCase `PagesAttributes` imported as acronym `PA`
27+
"ANN101", # annotating 'self' seems weird (at least before 3.11)
28+
"ANN102", # Missing type annotation for `cls` in classmethod
29+
"ANN204", # Missing return type annotation for special method `__init__`
30+
"ANN401", # Dynamically typed expressions (typing.Any) are disallowed
31+
"BLE", # we want to capture Exception sometimes
32+
"COM812", # yes, they make the diff smaller
33+
"D105", # Missing docstring in magic method
34+
"D106", # Missing docstring in public nested class
35+
"D107", # Missing docstring in `__init__`
36+
"D203", # one-blank-line-before-class
37+
"EM", # exception messages
38+
"G004", # f-string in logging statement
39+
"RET",
40+
"S110", # `try`-`except`-`pass` detected, consider logging the exception
41+
"SIM105", # contextlib.suppress
42+
"SIM108", # don't enforce ternary operators
43+
"SIM300", # yoda conditions
44+
"TID252", # we want relative imports
45+
"TRY", # I don't know what this is about
46+
# As long as we are not on Python 3.9+
47+
"UP035", # PEP 585
48+
# As long as we are not on Python 3.10+
49+
"UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)` - PEP 604
50+
# As long as we are not on Python 3.11+
51+
"UP006", "UP007",
52+
# for the moment, fix it later:
53+
"A", # Variable is shadowing a built-in
54+
"B904", # Within an `except` clause, raise exceptions with `raise ... from err`
55+
"B905", # `zip()` without an explicit `strict=` parameter
56+
"C901",
57+
"D101", # Missing docstring in public class
58+
"D102", # Missing docstring in public method
59+
"D103", # Missing docstring in public function
60+
"D417", # Missing argument descriptions in the docstring
61+
"FBT001", # Boolean positional arg in function definition
62+
"FBT002", # Boolean default value in function definition
63+
"FBT003", # Boolean positional value in function call
64+
"PGH", # Use specific error messages
65+
"PLE", # too many arguments for logging
66+
"PLR0911", # Too many return statements
67+
"PLR0912", # Too many branches
68+
"PLR0913", # Too many arguments to function call
69+
"PLR0915", # Too many statements
70+
"PLR2004", # Magic value
71+
"PLW", # global variables
72+
"PT011", # `pytest.raises(ValueError)` is too broad, set the `match`
73+
"PT012", # `pytest.raises()` block should contain a single simple statement
74+
"PTH123", # `open()` should be replaced by `Path.open()`
75+
"S101", # Use of `assert` detected
76+
"SLF001", # Private member accessed
77+
]

setup.cfg

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ classifiers =
3434
[options]
3535
python_requires = >=3.6.1
3636
install_requires =
37-
PyPDF2
37+
pypdf>3.0.0
3838
typer
3939
pillow
4040
pydantic

0 commit comments

Comments
 (0)