Skip to content

Commit 96b6e3b

Browse files
committed
init
0 parents  commit 96b6e3b

File tree

424 files changed

+41885
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

424 files changed

+41885
-0
lines changed

.chalice/config.json

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"version": "2.0",
3+
"app_name": "pdf2image",
4+
"stages": {
5+
"dev": {
6+
"api_gateway_stage": "api",
7+
"lambda_memory_size": 2048,
8+
"lambda_timeout": 900
9+
}
10+
},
11+
"environment_variables": {
12+
"ORIGIN_BUCKET": "s3-sample-input",
13+
"DESTINATION_BUCKET": "s3-write-data-999",
14+
"FMT": "png",
15+
"DPI": "300"
16+
}
17+
}

.chalice/deployed/dev.json

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
{
2+
"resources": [
3+
{
4+
"name": "default-role",
5+
"resource_type": "iam_role",
6+
"role_arn": "arn:aws:iam::581522787838:role/pdf2image-dev",
7+
"role_name": "pdf2image-dev"
8+
},
9+
{
10+
"name": "pdf_to_image",
11+
"resource_type": "lambda_function",
12+
"lambda_arn": "arn:aws:lambda:us-west-2:581522787838:function:pdf2image-dev-pdf_to_image"
13+
},
14+
{
15+
"name": "pdf_to_image-s3event",
16+
"resource_type": "s3_event",
17+
"bucket": "s3-sample-input",
18+
"lambda_arn": "arn:aws:lambda:us-west-2:581522787838:function:pdf2image-dev-pdf_to_image"
19+
}
20+
],
21+
"schema_version": "2.0",
22+
"backend": "api"
23+
}
Binary file not shown.
Binary file not shown.
Binary file not shown.

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
__pycache__
2+
env

app.py

+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import boto3
2+
import logging
3+
import os
4+
5+
from chalice import Chalice, Response
6+
from io import BytesIO
7+
from pdf2image import convert_from_bytes
8+
9+
app = Chalice(app_name='pdf2image')
10+
11+
12+
# ---------------------------------------------------------------------------
# Runtime configuration, read once at import time from environment variables.
# ---------------------------------------------------------------------------

# Rendering resolution handed to pdf2image. Falls back to 300 when the DPI
# variable is unset or is not a valid integer.
DPI = 300
if 'DPI' in os.environ:
    try:
        DPI = int(os.environ['DPI'])
    except ValueError as e:
        # int() on a string can only fail with ValueError. Logged as a warning
        # so a misconfigured value is visible under the default log threshold.
        logging.warning(f"Couldn't process DPI environment variable: {str(e)}. Using the default: DPI=300")
else:
    logging.info("No DPI environment variable set. Using the default: DPI=300")

# Output image format. Only values listed in _SUPPORTED_IMAGE_EXTENSIONS are
# accepted; anything else falls back to "png".
_SUPPORTED_IMAGE_EXTENSIONS = ["ppm", "jpeg", "png", "tiff"]
FMT = "png"
if 'FMT' in os.environ:
    environ_fmt = str(os.environ['FMT'])
    if environ_fmt in _SUPPORTED_IMAGE_EXTENSIONS:
        FMT = environ_fmt
    else:
        logging.warning(f"Couldn't process FMT variable. "
                        f"Only the following formats are supported: {','.join(_SUPPORTED_IMAGE_EXTENSIONS)}. "
                        f"Using the default: FMT='png'")
else:
    logging.info("No FMT environment variable set. Using the default: FMT='png'")

# Bucket that receives the rendered page images. This one is mandatory:
# importing the module fails when it is missing.
DESTINATION_BUCKET = None
if 'DESTINATION_BUCKET' in os.environ:
    DESTINATION_BUCKET = str(os.environ['DESTINATION_BUCKET'])
    logging.info(f"Setting the destination bucket: {DESTINATION_BUCKET}. "
                 f"Be sure to set the S3 bucket trigger on the Lambda's configuration")
else:
    raise Exception("Couldn't process the DESTINATION_BUCKET environment variable. "
                    "The DESTINATION_BUCKET needs to be set to a valid S3 bucket to which the user has full access.")

# Bucket whose object-created events trigger the conversion. Optional here:
# when unset it stays an empty string and only an informational message is logged.
ORIGIN_BUCKET = ''
if 'ORIGIN_BUCKET' in os.environ:
    ORIGIN_BUCKET = str(os.environ['ORIGIN_BUCKET'])
    logging.info(f"Setting the origin bucket: {ORIGIN_BUCKET}. "
                 f"Be sure to set the S3 bucket trigger on the Lambda's configuration")
else:
    logging.info("Couldn't process the ORIGIN_BUCKET environment variable. "
                 "Be sure to set the S3 bucket trigger on the Lambda's configuration.")

# Only object keys ending with this suffix are converted.
_SUPPORTED_FILE_EXTENSION = '.pdf'
53+
54+
55+
@app.on_s3_event(bucket=ORIGIN_BUCKET,
                 events=['s3:ObjectCreated:*'])
def pdf_to_image(event):
    """Convert a PDF uploaded to S3 into one image per page and store the images in S3.

    Each page is written to DESTINATION_BUCKET at
    ``<key without .pdf>-num_pages-<page count>/<key without .pdf>-page<N>.<FMT>``,
    where ``N`` is the zero-based page index.

    :param event: The S3 object-created event; its ``bucket`` and ``key``
        attributes identify the uploaded object.
    :return: A ``Response`` whose body confirms the conversion.
    :raises Exception: If the object key does not end with ``.pdf``.
    """
    if not event.key.endswith(_SUPPORTED_FILE_EXTENSION):
        raise Exception("Only .pdf files are supported by this module.")

    logging.info(f"Fetching item (bucket: '{event.bucket}', key: '{event.key}') from S3.")

    # Fetch the PDF bytes
    s3 = boto3.resource('s3')
    obj = s3.Object(event.bucket, event.key)
    infile = obj.get()['Body'].read()
    logging.info("Successfully retrieved S3 object.")

    # Location of the poppler binaries that pdf2image is told to use
    poppler_path = "/var/task/lib/poppler-utils-0.26/usr/bin"
    images = convert_from_bytes(infile,
                                dpi=DPI,
                                fmt=FMT,
                                poppler_path=poppler_path)
    logging.info("Successfully converted pdf to image.")

    # Strip only the trailing ".pdf". Splitting on the first "." would truncate
    # keys that contain other dots (e.g. "reports/v1.2/doc.pdf") and make
    # distinct documents such as "a.b.pdf" and "a.c.pdf" overwrite each other.
    base_name = event.key[:-len(_SUPPORTED_FILE_EXTENSION)]
    page_count = len(images)

    # The directory is: <name of the pdf>-num_pages-<number of pages in the pdf>
    # (identical for every page, so it is computed once).
    directory = f"{base_name}-num_pages-{page_count}"

    for page_num, image in enumerate(images):

        # Then save the image and name it: <name of the pdf>-page<page number>.FMT
        location = f"{directory}/{base_name}-page{page_num}.{FMT}"

        logging.info(f"Saving page number {page_num} to S3 at location: {DESTINATION_BUCKET}, {location}.")

        # Render the page into an in-memory buffer and upload those bytes to S3
        buffer = BytesIO()
        image.save(buffer, FMT.upper())
        buffer.seek(0)  # rewind so the upload reads from the start of the image data
        s3.Object(
            DESTINATION_BUCKET,
            location
        ).put(
            Body=buffer,
            Metadata={
                'ORIGINAL_DOCUMENT_BUCKET': event.bucket,
                'ORIGINAL_DOCUMENT_KEY': event.key,
                'PAGE_NUMBER': str(page_num),
                'PAGE_COUNT': str(page_count)
            }
        )

    return Response(f"PDF document ({event.key}) successfully converted to a series of images.")

requirements.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
chalice
2+
pdf2image
3+
boto3

vendor/PIL/BdfFontFile.py

+111
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#
2+
# The Python Imaging Library
3+
# $Id$
4+
#
5+
# bitmap distribution font (bdf) file parser
6+
#
7+
# history:
8+
# 1996-05-16 fl created (as bdf2pil)
9+
# 1997-08-25 fl converted to FontFile driver
10+
# 2001-05-25 fl removed bogus __init__ call
11+
# 2002-11-20 fl robustification (from Kevin Cazabon, Dmitry Vasiliev)
12+
# 2003-04-22 fl more robustification (from Graham Dumpleton)
13+
#
14+
# Copyright (c) 1997-2003 by Secret Labs AB.
15+
# Copyright (c) 1997-2003 by Fredrik Lundh.
16+
#
17+
# See the README file for information on usage and redistribution.
18+
#
19+
20+
"""
21+
Parse X Bitmap Distribution Format (BDF)
22+
"""
23+
24+
25+
from . import FontFile, Image
26+
27+
bdf_slant = {
28+
"R": "Roman",
29+
"I": "Italic",
30+
"O": "Oblique",
31+
"RI": "Reverse Italic",
32+
"RO": "Reverse Oblique",
33+
"OT": "Other",
34+
}
35+
36+
bdf_spacing = {"P": "Proportional", "M": "Monospaced", "C": "Cell"}
37+
38+
39+
def bdf_char(f):
    """Read the next glyph record from a BDF stream opened in binary mode.

    Returns ``None`` when the stream ends before another ``STARTCHAR`` line is
    found. Otherwise returns ``(name, encoding, bbox, image)`` where ``name``
    is the text after ``STARTCHAR``, ``encoding`` is the integer ``ENCODING``
    property, ``bbox`` is a 3-tuple built from the ``DWIDTH`` and ``BBX``
    properties, and ``image`` is a mode "1" image of the glyph bitmap.
    """
    # Advance to the next STARTCHAR line; running out of input means no more glyphs.
    line = f.readline()
    while line and line[:9] != b"STARTCHAR":
        line = f.readline()
    if not line:
        return None
    glyph_name = line[9:].strip().decode("ascii")

    # Collect "KEY value" property lines up to (not including) the BITMAP line.
    props = {}
    while True:
        line = f.readline()
        if not line or line[:6] == b"BITMAP":
            break
        sep = line.find(b" ")
        # The value drops the final byte of the line (the line terminator).
        value = line[sep + 1 : -1].decode("ascii")
        props[line[:sep].decode("ascii")] = value

    # Gather the hex-encoded bitmap rows up to the ENDCHAR line.
    rows = []
    while True:
        line = f.readline()
        if not line or line[:7] == b"ENDCHAR":
            break
        rows.append(line[:-1])
    hex_data = b"".join(rows)

    # BBX carries four integers: the first two are used as the image size
    # below; the last two are treated as the x / y offsets of the box.
    width, height, x_off, y_off = [int(p) for p in props["BBX"].split()]
    # DWIDTH carries two integers, passed through as the first bbox element.
    step_x, step_y = [int(p) for p in props["DWIDTH"].split()]

    bbox = (
        (step_x, step_y),
        (x_off, -y_off - height, width + x_off, -y_off),
        (0, 0, width, height),
    )

    try:
        im = Image.frombytes("1", (width, height), hex_data, "hex", "1")
    except ValueError:
        # deal with zero-width characters
        im = Image.new("1", (width, height))

    return glyph_name, int(props["ENCODING"]), bbox, im
79+
80+
81+
class BdfFontFile(FontFile.FontFile):
    """Font file plugin for the X11 BDF format."""

    def __init__(self, fp):
        """Parse a BDF font from ``fp``, a stream opened in binary mode.

        Raises ``SyntaxError`` when the first line is not a ``STARTFONT 2.1``
        header. Glyphs whose encoding falls inside the range of
        ``self.glyph`` are stored there; all others are skipped.
        """
        super().__init__()

        header = fp.readline()
        if header[:13] != b"STARTFONT 2.1":
            msg = "not a valid BDF file"
            raise SyntaxError(msg)

        # Header properties and comment lines are collected here; nothing
        # later in this constructor reads them back.
        props = {}
        comments = []

        # Read "KEY value" header lines until ENDPROPERTIES or end of input.
        while True:
            line = fp.readline()
            if not line or line[:13] == b"ENDPROPERTIES":
                break
            sep = line.find(b" ")
            keyword = line[:sep]
            # The value drops the final byte of the line (the line terminator).
            value = line[sep + 1 : -1].decode("ascii")
            props[keyword.decode("ascii")] = value
            is_comment = keyword in [b"COMMENT", b"COPYRIGHT"]
            if is_comment and line.find(b"LogicalFontDescription") < 0:
                comments.append(line[sep + 1 : -1].decode("ascii"))

        # bdf_char() returns None once the stream holds no further glyphs.
        glyph = bdf_char(fp)
        while glyph:
            _name, code, (xy, dst, src), im = glyph
            if 0 <= code < len(self.glyph):
                self.glyph[code] = xy, dst, src, im
            glyph = bdf_char(fp)

0 commit comments

Comments
 (0)