Skip to content

Commit 19063f8

Browse files
authored
ENH: Add metadata extractor (#3)
1 parent ddac7bb commit 19063f8

File tree

7 files changed

+95
-3
lines changed

7 files changed

+95
-3
lines changed

.github/workflows/code-quality.yaml

+6-2
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,17 @@ jobs:
3232
run: |
3333
pip install -r requirements/ci.txt
3434
35+
- name: Install cpdf
36+
run: |
37+
pip install .
38+
3539
- name: Test with black
3640
run: black --check .
3741

3842
- name: Test with flake8
3943
run: |
40-
flake8
44+
flake8 . --exclude build
4145
4246
- name: Test with mypy
4347
run: |
44-
mypy . --ignore-missing-imports
48+
mypy . --ignore-missing-imports --exclude build

.isort.cfg

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ multi_line_output=3
55
length_sort=0
66
include_trailing_comma=True
77
skip=docs
8-
known_third_party = PIL,PyPDF2,setuptools,typer
8+
known_third_party = PIL,PyPDF2,pydantic,setuptools,typer

.pre-commit-config.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ repos:
3131
rev: v0.942
3232
hooks:
3333
- id: mypy
34+
additional_dependencies:
35+
- 'pydantic'
3436
- repo: https://github.com/asottile/seed-isort-config
3537
rev: v2.2.0
3638
hooks:

cpdf/cli.py

+16
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import cpdf.cat
77
import cpdf.extract_images
8+
import cpdf.metadata
89
import cpdf.up2
910

1011

@@ -52,6 +53,21 @@ def cat(
5253
cpdf.cat.main(filename, fn_pgrgs, output, verbose)
5354

5455

56+
@entry_point.command(name="meta")  # type: ignore[misc]
def metadata(
    pdf: Path,
    output: cpdf.metadata.OutputOptions = typer.Option(  # noqa
        # Default to the human-readable table so that `cpdf meta FILE` works
        # without any flag. The option was previously required (`...`), which
        # made `show_default=True` meaningless.
        cpdf.metadata.OutputOptions.text.value,
        "--output",
        "-o",
        help="output format",
        show_default=True,
    ),
) -> None:
    # No docstring on purpose: the command's help text is assigned from the
    # cpdf.metadata module docstring further down in this module.
    cpdf.metadata.main(pdf, output)
68+
69+
5570
up2.__doc__ = cpdf.up2.__doc__
5671
extract_images.__doc__ = cpdf.extract_images.__doc__
5772
cat.__doc__ = cpdf.cat.__doc__
73+
metadata.__doc__ = cpdf.metadata.__doc__

cpdf/metadata.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""Show metadata of a PDF file"""
2+
3+
from enum import Enum
4+
from pathlib import Path
5+
from typing import Optional, Tuple
6+
7+
from pydantic import BaseModel
8+
from PyPDF2 import PdfFileReader
9+
10+
11+
# Metadata collected for one PDF file; `main` fills it in and prints it
# either as JSON or as a table.
class MetaInfo(BaseModel):
    # Taken from the document information; None when not available.
    title: Optional[str] = None
    producer: Optional[str] = None

    pages: int
    encrypted: bool
    file_size: int  # in bytes
    page_size: Tuple[float, float]  # (width, height)
    pdf_file_version: str
19+
20+
21+
class OutputOptions(Enum):
    """Output formats the metadata report can be printed in."""

    json = "json"  # serialized model, printed with print()
    text = "text"  # table rendered on the console
24+
25+
26+
def main(pdf: Path, output: OutputOptions) -> None:
    """Print the metadata of a PDF file.

    Args:
        pdf: Path of the PDF file to inspect.
        output: ``OutputOptions.json`` prints the metadata as JSON; any other
            value renders it as a table on the console.
    """
    with open(pdf, "rb") as f:
        reader = PdfFileReader(f)

        # getDocumentInfo() returns None when the file carries no document
        # information; report title/producer as missing instead of crashing
        # with an AttributeError.
        info = reader.getDocumentInfo()
        title = info.title if info is not None else None
        producer = info.producer if info is not None else None

        # The page size is derived from the media box of the first page.
        x1, y1, x2, y2 = reader.getPage(0).mediaBox

        # A PDF starts with a header of the form "%PDF-M.m" (8 bytes). Read
        # exactly that instead of a whole line: readline() keeps the trailing
        # newline and, for files without an early "\n", may return binary
        # data that cannot be decoded.
        f.seek(0)
        pdf_file_version = f.read(8).decode("ascii", errors="replace")

        meta = MetaInfo(
            title=title,
            producer=producer,
            pages=reader.getNumPages(),
            encrypted=reader.isEncrypted,
            file_size=pdf.stat().st_size,
            page_size=(x2 - x1, y2 - y1),
            pdf_file_version=pdf_file_version,
        )

    if output == OutputOptions.json:
        print(meta.json())
        return

    # rich is only needed for the table output, so import it lazily.
    from rich.console import Console
    from rich.table import Table

    table = Table(title=f"{pdf}")
    table.add_column("Attribute", justify="right", style="cyan", no_wrap=True)
    table.add_column("Value", style="white")

    # title/producer may be None; add_row() is handed None in that case.
    table.add_row("Title", meta.title)
    table.add_row("Producer", meta.producer)
    table.add_row("Pages", f"{meta.pages:,}")
    table.add_row("Encrypted", f"{meta.encrypted}")
    table.add_row("File size", f"{meta.file_size:,} bytes")
    table.add_row(
        "Page size", f"{meta.page_size[0]} x {meta.page_size[1]} pts (w x h)"
    )
    table.add_row("PDF File Version", meta.pdf_file_version)

    console = Console()
    console.print(table)

mypy.ini

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[mypy]
2+
plugins = pydantic.mypy

setup.cfg

+2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ install_requires =
3737
PyPDF2
3838
typer
3939
pillow
40+
pydantic
41+
rich
4042

4143
[options.entry_points]
4244
console_scripts =

0 commit comments

Comments
 (0)