Skip to content

Commit 6d5a9c7

Browse files
committed Aug 15, 2024
refactor: integrate core functionality into CHSIConverter class
1 parent ac092d6 commit 6d5a9c7

File tree

5 files changed

+272
-350
lines changed

5 files changed

+272
-350
lines changed
 

‎add_float_picture.py

-103
This file was deleted.

‎app.py

+5-108
Original file line numberDiff line numberDiff line change
@@ -1,122 +1,19 @@
1-
from flask import Flask, request, render_template, send_from_directory, make_response
2-
from werkzeug.utils import secure_filename
1+
from flask import Flask, render_template
2+
from utils import CHSIConverter
33
import os
4-
from add_float_picture import add_float_picture
5-
from extract_img import extract_image_from_pdf
6-
from extract_info import extract_info_from_pdf
7-
from docx import Document
8-
from docx.shared import Inches, Pt
9-
from docx.enum.table import WD_ALIGN_VERTICAL
10-
from docx.oxml import parse_xml
11-
import uuid
12-
import shutil
134

145
# Flask application object; the route decorators below register their views on it.
app = Flask(__name__)
156

16-
def convert_to_docx(path):
17-
try:
18-
extracted_info = extract_info_from_pdf(path)
19-
doc = Document("static/template.docx")
20-
21-
paragraph = doc.add_paragraph()
22-
doc.element.body.insert(1, paragraph._element)
23-
paragraph.alignment = 1
24-
paragraph.add_run('Update date:' + extracted_info['Update Date'])
25-
26-
del extracted_info['Update Date']
27-
28-
table = doc.add_table(rows=1, cols=2)
29-
table.autofit = False
30-
31-
for cell in table.columns[0].cells:
32-
cell.width = Inches(0.5)
33-
for cell in table.columns[1].cells:
34-
cell.width = Inches(5.0)
35-
36-
border_xml = '<w:tcBorders xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">' \
37-
'<w:top w:val="nil"/>' \
38-
'<w:left w:val="nil"/>' \
39-
'<w:bottom w:val="nil"/>' \
40-
'<w:right w:val="nil"/>' \
41-
'</w:tcBorders>'
42-
43-
for key, value in extracted_info.items():
44-
cells = table.add_row().cells
45-
for cell in cells:
46-
cell._element.get_or_add_tcPr().append(parse_xml(border_xml))
47-
cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER
48-
49-
is_last = key == list(extracted_info.keys())[-1]
50-
cells[0].text = key + ("" if is_last else "\n")
51-
cells[1].text = value + ("" if is_last else "\n")
52-
53-
cropped_image_1 = extract_image_from_pdf(path, 1, 1898, 583, 2230, 1026)
54-
add_float_picture(doc.add_paragraph(), cropped_image_1, width=Inches(1.2), pos_x=Pt(430), pos_y=Pt(140))
55-
56-
cropped_image_2 = extract_image_from_pdf(path, 1, 300, 2690, 630, 2985)
57-
add_float_picture(doc.add_paragraph(), cropped_image_2, width=Inches(1.2), pos_x=Pt(78), pos_y=Pt(643))
58-
59-
output_path = path.replace(".pdf", ".docx")
60-
doc.save(output_path)
61-
62-
return output_path
63-
64-
except Exception as e:
65-
return make_response(f"<script>alert('Error during DOCX conversion: {e}'); window.location.href = document.referrer;</script>")
66-
677
@app.route('/')
def home():
    """Render and return the `index.html` template for the site root."""
    landing_page = render_template('index.html')
    return landing_page
7010

7111
@app.route('/convert', methods=['POST'])
72-
def convert_file():
73-
if 'file' not in request.files:
74-
return make_response("<script>alert('缺少文件部分'); window.location.href = document.referrer;</script>")
75-
76-
file = request.files['file']
77-
if file.filename == '':
78-
return make_response("<script>alert('没有选中的文件'); window.location.href = document.referrer;</script>")
79-
80-
if not file.filename.lower().endswith('.pdf'):
81-
return make_response("<script>alert('只接受 PDF 文件'); window.location.href = document.referrer;</script>")
82-
83-
if not file.filename.startswith('教育部学籍在线验证报告_'):
84-
return make_response("<script>alert('请不要传入无关文件'); window.location.href = document.referrer;</script>")
85-
86-
try:
87-
filename = secure_filename(file.filename)
88-
filepath = os.path.join(os.getcwd(), 'upload', filename)
89-
file.save(filepath)
90-
91-
output_path = convert_to_docx(filepath)
92-
93-
directory = os.path.dirname(output_path)
94-
filename = os.path.basename(output_path)
95-
output_filename = str(uuid.uuid4()) + '.docx'
96-
97-
response = make_response(send_from_directory(directory, filename, as_attachment=True))
98-
response.headers["Content-Disposition"] = f"attachment; filename={output_filename}"
99-
100-
# 隐私处理
101-
upload_folder = os.path.join(os.getcwd(), 'upload')
102-
for filename in os.listdir(upload_folder):
103-
if filename != '.gitkeep':
104-
file_path = os.path.join(upload_folder, filename)
105-
try:
106-
if os.path.isfile(file_path) or os.path.islink(file_path):
107-
os.unlink(file_path)
108-
elif os.path.isdir(file_path):
109-
shutil.rmtree(file_path)
110-
except Exception as e:
111-
print(f"Failed to delete {file_path}. Reason: {e}")
112-
113-
return response
114-
except Exception as e:
115-
return make_response(f"<script>alert('处理文件时发生错误: {e}'); window.location.href = document.referrer;</script>")
12+
def handle_convert():
    """Delegate the current request to `CHSIConverter.convert_file` and return its result."""
    conversion_response = CHSIConverter.convert_file()
    return conversion_response
11614

11715
if __name__ == '__main__':
    # Debug mode turns on the Werkzeug interactive debugger, which allows
    # arbitrary code execution from the browser. Because the server binds to
    # 0.0.0.0, it must be opt-in (FLASK_DEBUG=true) rather than always on.
    debug_mode = os.getenv('FLASK_DEBUG', 'false').lower() == 'true'
    port = int(os.getenv('FLASK_PORT', 5001))
    app.run(debug=debug_mode, port=port, host='0.0.0.0')
else:
    # When imported (not run as a script), expose the app under the name
    # `application` as well.
    application = app

‎extract_img.py

-28
This file was deleted.

‎extract_info.py

-111
This file was deleted.

‎utils.py

+267
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
from flask import request, make_response, send_from_directory
2+
from werkzeug.utils import secure_filename
3+
from docx import Document
4+
from docx.shared import Inches, Pt
5+
from docx.enum.table import WD_ALIGN_VERTICAL
6+
from docx.oxml import parse_xml, register_element_cls
7+
from docx.oxml.ns import nsdecls
8+
from docx.oxml.shape import CT_Picture
9+
from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne
10+
from pypdf import PdfReader
11+
from pypinyin import lazy_pinyin
12+
from pdf2image import convert_from_path
13+
import re
14+
import os
15+
import uuid
16+
import shutil
17+
18+
class CT_Anchor(BaseOxmlElement):
    """Custom element class for `<wp:anchor>`, used to build a floating picture.

    The element is created from the XML template in `_anchor_xml` and then
    filled in with the picture's size, id/name and `pic:pic` child.
    """

    # Required child elements, exposed as attributes by the xmlchemy helpers.
    extent = OneAndOnlyOne('wp:extent')
    docPr = OneAndOnlyOne('wp:docPr')
    graphic = OneAndOnlyOne('a:graphic')

    @classmethod
    def new(cls, cx, cy, shape_id, pic, pos_x, pos_y):
        """Return a new `<wp:anchor>` element containing *pic*.

        cx, cy       -- values written to the `wp:extent` element.
        shape_id     -- written to `wp:docPr` as id and used in its name.
        pic          -- the picture element inserted under `a:graphicData`.
        pos_x, pos_y -- offsets written into the `wp:posOffset` elements
                        (both positions are declared relative to the page).
        """
        anchor = parse_xml(cls._anchor_xml(pos_x, pos_y))
        anchor.extent.cx = cx
        anchor.extent.cy = cy
        anchor.docPr.id = shape_id
        anchor.docPr.name = f'Picture {shape_id}'
        anchor.graphic.graphicData.uri = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
        anchor.graphic.graphicData._insert_pic(pic)
        return anchor

    @classmethod
    def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y):
        """Build a picture element for *rId*/*filename* and wrap it in an anchor."""
        pic_id = 0
        pic = CT_Picture.new(pic_id, filename, rId, cx, cy)
        # `new` already inserts `pic` under graphicData; the former second
        # `_insert_pic(pic)` call here was redundant and has been dropped.
        return cls.new(cx, cy, shape_id, pic, pos_x, pos_y)

    @classmethod
    def _anchor_xml(cls, pos_x, pos_y):
        """Return the `<wp:anchor>` XML template with namespaces and offsets filled in.

        The extent, docPr and graphicData values in the template are
        placeholders that `new` overwrites.
        """
        return (
            '<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="0" \n'
            ' behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1" \n'
            ' %s>\n'
            ' <wp:simplePos x="0" y="0"/>\n'
            ' <wp:positionH relativeFrom="page">\n'
            ' <wp:posOffset>%d</wp:posOffset>\n'
            ' </wp:positionH>\n'
            ' <wp:positionV relativeFrom="page">\n'
            ' <wp:posOffset>%d</wp:posOffset>\n'
            ' </wp:positionV>\n'
            ' <wp:extent cx="914400" cy="914400"/>\n'
            ' <wp:wrapNone/>\n'
            ' <wp:docPr id="666" name="unnamed"/>\n'
            ' <wp:cNvGraphicFramePr>\n'
            ' <a:graphicFrameLocks noChangeAspect="1"/>\n'
            ' </wp:cNvGraphicFramePr>\n'
            ' <a:graphic>\n'
            ' <a:graphicData uri="URI not set"/>\n'
            ' </a:graphic>\n'
            '</wp:anchor>' % (nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y))
        )
66+
67+
def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y):
    """Add the image to *part* and return a `CT_Anchor` element that displays it.

    The image is registered through `part.get_or_add_image`, its dimensions
    are scaled to *width*/*height*, and the anchor is placed at
    (*pos_x*, *pos_y*).
    """
    relationship_id, picture = part.get_or_add_image(image_descriptor)
    extent_x, extent_y = picture.scaled_dimensions(width, height)
    return CT_Anchor.new_pic_anchor(
        part.next_id,
        relationship_id,
        picture.filename,
        extent_x,
        extent_y,
        pos_x,
        pos_y,
    )
72+
73+
class CHSIConverter:
    """Convert an uploaded CHSI report PDF into an English DOCX.

    `convert_file` is the Flask entry point: it validates the upload, saves
    it, builds the DOCX with `convert_to_docx` and returns it as a download.
    The helper methods raise on failure; `convert_file` is the single place
    where errors are turned into an alert page for the browser.
    """

    @staticmethod
    def _alert_script(message):
        """Return the HTML snippet that alerts *message* and goes back to the referrer.

        The message is encoded as a JavaScript string literal so that quotes,
        newlines or `</script>` inside it (for example from an exception
        text) cannot break out of the alert call.
        """
        import json  # local import: keeps the file-level import block untouched

        literal = json.dumps(str(message)).replace('<', '\\u003c')
        return f"<script>alert({literal}); window.location.href = document.referrer;</script>"

    @classmethod
    def _alert(cls, message):
        """Wrap `_alert_script(message)` in a Flask response."""
        return make_response(cls._alert_script(message))

    @staticmethod
    def _docx_path_for(pdf_path):
        """Return *pdf_path* with its extension replaced by `.docx`.

        Only the final extension is replaced, whatever its case, so a `.PDF`
        upload or a directory name containing ".pdf" is handled correctly.
        """
        return os.path.splitext(pdf_path)[0] + ".docx"

    @staticmethod
    def _remove_artifacts(folder, stem):
        """Best-effort removal of every entry in *folder* whose name starts with *stem*.

        Failures are printed, never raised, so cleanup cannot mask the
        result of the request.
        """
        try:
            names = os.listdir(folder)
        except OSError as e:
            print(f"Failed to list {folder}. Reason: {e}")
            return
        for name in names:
            if not name.startswith(stem):
                continue
            file_path = os.path.join(folder, name)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except OSError as e:
                print(f"Failed to delete {file_path}. Reason: {e}")

    @staticmethod
    def extract_text_from_pdf(pdf_path):
        """Return the concatenated text of every page of the PDF at *pdf_path*.

        Raises whatever `open` or the PDF reader raises; callers must not
        receive a response object in place of text.
        """
        with open(pdf_path, 'rb') as pdf_file_obj:
            pdf_reader = PdfReader(pdf_file_obj)
            return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    @staticmethod
    def extract_info(patterns_dict, text):
        """Apply each pattern to *text* and return `{property: formatted value}`.

        patterns_dict maps a property name to a regex (string or compiled)
        whose first group captures the raw value. A property whose pattern
        does not match maps to None. Dates written as `YYYY年MM月DD日` are
        re-formatted as `MM/DD/YYYY`.
        """
        results = {}
        for prop, pattern in patterns_dict.items():
            match = re.search(pattern, text)
            if not match:
                results[prop] = None
                continue

            value = match.group(1)
            if prop == 'Name':
                pinyin = lazy_pinyin(value)
                if not pinyin:
                    # The capture group may be empty; avoid indexing an empty list.
                    results[prop] = None
                else:
                    # Output order: remaining syllables joined, then the first syllable.
                    results[prop] = f"{''.join(pinyin[1:]).capitalize()} {pinyin[0].capitalize()}"
            elif prop == 'Gender':
                results[prop] = 'Male' if value == '男' else 'Female' if value == '女' else None
            elif prop == 'Ethnic':
                # Drops the last character before transliterating
                # (presumably a trailing "族" - confirm against real reports).
                results[prop] = ''.join(lazy_pinyin(value[:-1])).title()
            elif prop in ['Date of Birth', 'Date of Enrollment', 'Update Date']:
                year, rest = value.split('年')
                month, day = rest.split('月')
                results[prop] = f"{month}/{day.replace('日', '')}/{year}"
            elif prop == 'Levels':
                results[prop] = 'Undergraduate'
            elif prop == 'Form':
                results[prop] = 'General full-time remote study'
            elif prop == 'Educational System':
                results[prop] = f'{value} years'
            elif prop == 'Type':
                results[prop] = 'General higher education'
            elif prop == 'School Status':
                date_part = value.split(":")[1]
                year, rest = date_part.split('年')
                month, day = rest.split('月')
                # NOTE(review): the second replace is a no-op as written;
                # it looks like a character to strip was lost - confirm.
                day = day.replace('日', '').replace('', '')
                formatted_date = f"{month}/{day}/{year}"
                results[prop] = f'Student registration (Expected graduation date: {formatted_date})'
            else:
                results[prop] = value
        return results

    @classmethod
    def extract_info_from_pdf(cls, path):
        """Read the PDF at *path* and return its fields as formatted by `extract_info`."""
        text = cls.extract_text_from_pdf(path)
        # Each label is followed by optional whitespace and then the value,
        # captured as a run of non-whitespace characters.
        patterns_dict = {prop: re.compile(r'{}\s*([^\s]*)'.format(pattern)) for prop, pattern in {
            'Update Date': '更新日期:',
            'Name': '姓名',
            'Gender': '性别',
            'Id Number': '证件号码',
            'Ethnic': '民族',
            'Date of Birth': '出生日期 ',
            'Institution': '院校',
            'Levels': '层次',
            'Faculties': '院系',
            'Class': '班级',
            'Major': '专业',
            'Student Number': '学号',
            'Form': '形式',
            'Date of Enrollment': '入学日期',
            'Educational System': '学制',
            'Type': '类型',
            'School Status': '学籍状态',
        }.items()}
        return cls.extract_info(patterns_dict, text)

    @staticmethod
    def extract_image_from_pdf(path, page_number, left, top, right, bottom):
        """Render one PDF page at 300 dpi, crop it, save the crop and return its path.

        The crop box is given in pixels of the rendered page. The image is
        written next to the PDF as `<pdf name without extension>_image.png`,
        so a later call for the same PDF overwrites the earlier file.
        Raises on rendering or I/O failure.
        """
        images = convert_from_path(path, dpi=300, first_page=page_number, last_page=page_number)
        image = images[0]
        cropped_image = image.crop((left, top, right, bottom))
        file_name = os.path.splitext(path)[0]
        image_path = f"{file_name}_image.png"
        cropped_image.save(image_path)
        return image_path

    @staticmethod
    def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0):
        """Add a floating picture to paragraph *p* at (*pos_x*, *pos_y*).

        Raises on failure instead of returning a response that callers
        would silently ignore.
        """
        run = p.add_run()
        anchor = new_pic_anchor(run.part, image_path_or_stream, width, height, pos_x, pos_y)
        run._r.add_drawing(anchor)

    @classmethod
    def convert_to_docx(cls, path):
        """Build the DOCX for the PDF at *path* and return the DOCX path.

        The document starts from `static/template.docx`, gets an update-date
        line, a borderless two-column table of the extracted fields, and two
        images cropped from the first PDF page. Fields that could not be
        extracted are rendered as empty text. Raises on any failure.
        """
        extracted_info = cls.extract_info_from_pdf(path)
        doc = Document("static/template.docx")

        paragraph = doc.add_paragraph()
        # Move the new paragraph to index 1 of the document body.
        doc.element.body.insert(1, paragraph._element)
        paragraph.alignment = 1
        update_date = extracted_info.pop('Update Date', None)
        paragraph.add_run('Update date:' + (update_date or ''))

        table = doc.add_table(rows=1, cols=2)
        table.autofit = False

        for cell in table.columns[0].cells:
            cell.width = Inches(0.5)
        for cell in table.columns[1].cells:
            cell.width = Inches(5.0)

        border_xml = '<w:tcBorders xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">' \
                     '<w:top w:val="nil"/>' \
                     '<w:left w:val="nil"/>' \
                     '<w:bottom w:val="nil"/>' \
                     '<w:right w:val="nil"/>' \
                     '</w:tcBorders>'

        rows = list(extracted_info.items())
        last_index = len(rows) - 1
        for index, (key, value) in enumerate(rows):
            cells = table.add_row().cells
            for cell in cells:
                cell._element.get_or_add_tcPr().append(parse_xml(border_xml))
                cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER

            # Every row except the last gets a trailing newline in both cells.
            suffix = "" if index == last_index else "\n"
            cells[0].text = key + suffix
            # A field that was not found is None; render it as empty text.
            cells[1].text = (value or "") + suffix

        cropped_image_1 = cls.extract_image_from_pdf(path, 1, 1898, 583, 2230, 1026)
        cls.add_float_picture(doc.add_paragraph(), cropped_image_1, width=Inches(1.2), pos_x=Pt(430), pos_y=Pt(140))

        cropped_image_2 = cls.extract_image_from_pdf(path, 1, 300, 2690, 630, 2985)
        cls.add_float_picture(doc.add_paragraph(), cropped_image_2, width=Inches(1.2), pos_x=Pt(78), pos_y=Pt(643))

        output_path = cls._docx_path_for(path)
        doc.save(output_path)

        return output_path

    @classmethod
    def convert_file(cls):
        """Handle the upload in the current Flask request and return the DOCX download.

        Returns an alert page when the upload is missing, unnamed, not a
        `.pdf`, not named like a CHSI report, or when conversion fails.
        """
        if 'file' not in request.files:
            return cls._alert('缺少文件部分')

        file = request.files['file']
        if file.filename == '':
            return cls._alert('没有选中的文件')

        if not file.filename.lower().endswith('.pdf'):
            return cls._alert('只接受 PDF 文件')

        if not file.filename.startswith('教育部学籍在线验证报告_'):
            return cls._alert('请不要传入无关文件')

        upload_folder = os.path.join(os.getcwd(), 'upload')
        # Store the upload under a random name: the client name is not
        # trusted, and a unique stem keeps concurrent requests apart.
        stem = uuid.uuid4().hex
        filepath = os.path.join(upload_folder, f"{stem}.pdf")

        try:
            file.save(filepath)

            output_path = cls.convert_to_docx(filepath)

            directory, filename = os.path.split(output_path)
            output_filename = str(uuid.uuid4()) + '.docx'

            response = make_response(send_from_directory(directory, filename, as_attachment=True))
            response.headers["Content-Disposition"] = f"attachment; filename={output_filename}"
            return response
        except Exception as e:
            return cls._alert(f'处理文件时发生错误: {e}')
        finally:
            # Privacy: remove this request's PDF, image and DOCX whether or
            # not the conversion succeeded. Other requests' files are left alone.
            cls._remove_artifacts(upload_folder, stem)
266+
267+
# Register CT_Anchor as the custom element class for the `wp:anchor` tag.
register_element_cls('wp:anchor', CT_Anchor)

0 commit comments

Comments
 (0)
Please sign in to comment.