1
+ from flask import request , make_response , send_from_directory
2
+ from werkzeug .utils import secure_filename
3
+ from docx import Document
4
+ from docx .shared import Inches , Pt
5
+ from docx .enum .table import WD_ALIGN_VERTICAL
6
+ from docx .oxml import parse_xml , register_element_cls
7
+ from docx .oxml .ns import nsdecls
8
+ from docx .oxml .shape import CT_Picture
9
+ from docx .oxml .xmlchemy import BaseOxmlElement , OneAndOnlyOne
10
+ from pypdf import PdfReader
11
+ from pypinyin import lazy_pinyin
12
+ from pdf2image import convert_from_path
13
+ import re
14
+ import os
15
+ import uuid
16
+ import shutil
17
+
18
class CT_Anchor(BaseOxmlElement):
    """Element class for ``<wp:anchor>``, the container of a floating picture.

    Instances are created by parsing the XML skeleton from ``_anchor_xml`` and
    then filling in the size, drawing id/name and the picture element.
    """

    # Child elements that are read and written through attribute access below.
    extent = OneAndOnlyOne('wp:extent')
    docPr = OneAndOnlyOne('wp:docPr')
    graphic = OneAndOnlyOne('a:graphic')

    @classmethod
    def new(cls, cx, cy, shape_id, pic, pos_x, pos_y):
        """Return a new ``<wp:anchor>`` element wrapping ``pic``.

        Args:
            cx, cy: Picture extent written to ``wp:extent``.
            shape_id: Id written to ``wp:docPr``; also used in its name.
            pic: Picture element inserted under ``a:graphicData``.
            pos_x, pos_y: Offsets from the page origin, written to the
                horizontal and vertical ``wp:posOffset`` elements.
        """
        anchor = parse_xml(cls._anchor_xml(pos_x, pos_y))
        anchor.extent.cx = cx
        anchor.extent.cy = cy
        anchor.docPr.id = shape_id
        anchor.docPr.name = f'Picture {shape_id}'
        anchor.graphic.graphicData.uri = 'http://schemas.openxmlformats.org/drawingml/2006/picture'
        anchor.graphic.graphicData._insert_pic(pic)
        return anchor

    @classmethod
    def new_pic_anchor(cls, shape_id, rId, filename, cx, cy, pos_x, pos_y):
        """Build the picture element for ``rId`` and return it wrapped in an anchor.

        Args:
            shape_id: Id for the drawing (see ``new``).
            rId: Relationship id of the image part.
            filename: Image file name passed to ``CT_Picture.new``.
            cx, cy: Picture extent.
            pos_x, pos_y: Page-relative offsets.
        """
        # NOTE(review): the picture id is fixed at 0 here; presumably Word does
        # not rely on it being unique -- confirm if several pictures misbehave.
        pic_id = 0
        pic = CT_Picture.new(pic_id, filename, rId, cx, cy)
        # ``new`` already inserts ``pic`` into graphicData, so it must not be
        # inserted a second time here.
        return cls.new(cx, cy, shape_id, pic, pos_x, pos_y)

    @classmethod
    def _anchor_xml(cls, pos_x, pos_y):
        """Return the ``<wp:anchor>`` XML skeleton positioned at (pos_x, pos_y).

        The extent, docPr and graphicData values are placeholders that ``new``
        overwrites. ``behindDoc="1"`` together with ``<wp:wrapNone/>`` places
        the picture behind the text without wrapping. The offsets are coerced
        with ``int()`` before being formatted into the XML.
        """
        return (
            '<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="0" \n'
            '           behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1" \n'
            '           %s>\n'
            '  <wp:simplePos x="0" y="0"/>\n'
            '  <wp:positionH relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionH>\n'
            '  <wp:positionV relativeFrom="page">\n'
            '    <wp:posOffset>%d</wp:posOffset>\n'
            '  </wp:positionV>\n'
            '  <wp:extent cx="914400" cy="914400"/>\n'
            '  <wp:wrapNone/>\n'
            '  <wp:docPr id="666" name="unnamed"/>\n'
            '  <wp:cNvGraphicFramePr>\n'
            '    <a:graphicFrameLocks noChangeAspect="1"/>\n'
            '  </wp:cNvGraphicFramePr>\n'
            '  <a:graphic>\n'
            '    <a:graphicData uri="URI not set"/>\n'
            '  </a:graphic>\n'
            '</wp:anchor>' % (nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y))
        )
66
+
67
def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y):
    """Create a floating-picture anchor element for an image added to ``part``.

    The image is registered on ``part`` (yielding its relationship id), its
    dimensions are scaled from ``width``/``height``, and the next free id on
    ``part`` is used as the shape id.

    Args:
        part: Document part offering ``get_or_add_image`` and ``next_id``.
        image_descriptor: Image path or stream accepted by ``get_or_add_image``.
        width, height: Requested size, forwarded to ``scaled_dimensions``.
        pos_x, pos_y: Page-relative offsets forwarded to the anchor.

    Returns:
        The element produced by ``CT_Anchor.new_pic_anchor``.
    """
    rel_id, picture = part.get_or_add_image(image_descriptor)
    scaled_cx, scaled_cy = picture.scaled_dimensions(width, height)
    next_shape_id = part.next_id
    return CT_Anchor.new_pic_anchor(
        next_shape_id,
        rel_id,
        picture.filename,
        scaled_cx,
        scaled_cy,
        pos_x,
        pos_y,
    )
72
+
73
class CHSIConverter:
    """Turn an uploaded CHSI student-status report (PDF) into an English DOCX.

    Flow (see ``convert_file``): validate and store the upload, extract the
    labelled fields from the PDF text with regular expressions, write them
    into a two-column table appended to ``static/template.docx``, overlay two
    regions cropped from page 1 as floating pictures, and send the document
    back as a download.

    Error convention: the public helpers do not raise. On failure they return
    a Flask response containing a JavaScript ``alert``; on success they return
    their normal value. Callers therefore check the type of the result before
    using it.
    """

    # Finds "<year>年<month>月<day>" anywhere inside a captured value.
    _DATE_RE = re.compile(r'(\d+)年(\d+)月(\d+)')

    # English field name -> Chinese label that precedes the value in the PDF
    # text. Insertion order is the row order of the generated table.
    _FIELD_LABELS = {
        'Update Date': '更新日期:',
        'Name': '姓名',
        'Gender': '性别',
        'Id Number': '证件号码',
        'Ethnic': '民族',
        'Date of Birth': '出生日期 ',
        'Institution': '院校',
        'Levels': '层次',
        'Faculties': '院系',
        'Class': '班级',
        'Major': '专业',
        'Student Number': '学号',
        'Form': '形式',
        'Date of Enrollment': '入学日期',
        'Educational System': '学制',
        'Type': '类型',
        'School Status': '学籍状态',
    }

    _DATE_FIELDS = ('Date of Birth', 'Date of Enrollment', 'Update Date')

    # Cell borders switched off on every side.
    _NO_BORDER_XML = (
        '<w:tcBorders xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
        '<w:top w:val="nil"/>'
        '<w:left w:val="nil"/>'
        '<w:bottom w:val="nil"/>'
        '<w:right w:val="nil"/>'
        '</w:tcBorders>'
    )

    # ((left, top, right, bottom) crop box in pixels of page 1 rendered at
    # 300 dpi, (x, y) position of the floating picture in points).
    # NOTE(review): presumably the portrait photo and the verification QR
    # code of the report -- confirm against a sample PDF.
    _PICTURE_REGIONS = (
        ((1898, 583, 2230, 1026), (430, 140)),
        ((300, 2690, 630, 2985), (78, 643)),
    )

    @staticmethod
    def _alert_script(message):
        """Return the HTML snippet that alerts ``message`` and navigates back.

        The message is emitted as a JSON string literal with ``<``, ``>`` and
        ``&`` escaped, so quotes or ``</script>`` inside exception text cannot
        break out of the script block.
        """
        import json

        literal = json.dumps(str(message), ensure_ascii=False)
        for raw, escaped in (
            ('<', '\\u003c'),
            ('>', '\\u003e'),
            ('&', '\\u0026'),
            ('\u2028', '\\u2028'),
            ('\u2029', '\\u2029'),
        ):
            literal = literal.replace(raw, escaped)
        return f"<script>alert({literal}); window.location.href = document.referrer;</script>"

    @classmethod
    def _alert_response(cls, message):
        """Wrap ``_alert_script(message)`` in a Flask response."""
        return make_response(cls._alert_script(message))

    @staticmethod
    def _format_date(value):
        """Convert ``YYYY年MM月DD日`` found in ``value`` to ``MM/DD/YYYY``.

        Surrounding text (labels, colons, brackets) is ignored. Returns
        ``None`` when ``value`` contains no such date.
        """
        found = CHSIConverter._DATE_RE.search(value)
        if found is None:
            return None
        year, month, day = found.groups()
        return f"{month}/{day}/{year}"

    @staticmethod
    def extract_text_from_pdf(pdf_path):
        """Return the concatenated text of all pages of ``pdf_path``.

        Returns:
            The text as ``str``, or an alert response if reading fails.
        """
        try:
            with open(pdf_path, 'rb') as pdf_file_obj:
                pdf_reader = PdfReader(pdf_file_obj)
                # ``or ""`` keeps the join safe if a page yields no text.
                return "".join(page.extract_text() or "" for page in pdf_reader.pages)
        except Exception as e:
            return CHSIConverter._alert_response(f"从PDF提取文本错误: {e}")

    @staticmethod
    def extract_info(patterns_dict, text):
        """Extract and translate the report fields found in ``text``.

        Args:
            patterns_dict: Maps field name to a regex (compiled or string)
                whose group 1 captures the raw value.
            text: Text extracted from the PDF.

        Returns:
            dict with one entry per key of ``patterns_dict``. A value is
            ``None`` when the pattern does not match, when a gender is not
            recognised, or when a date field holds no parseable date.
        """
        results = {}
        for prop, pattern in patterns_dict.items():
            match = re.search(pattern, text)
            if not match:
                results[prop] = None
                continue

            value = match.group(1)
            if prop == 'Name':
                syllables = lazy_pinyin(value)
                if syllables:
                    # The first syllable is moved to the end (family name
                    # last); this assumes a single-character family name.
                    given = ''.join(syllables[1:]).capitalize()
                    family = syllables[0].capitalize()
                    results[prop] = ' '.join(part for part in (given, family) if part)
                else:
                    results[prop] = None
            elif prop == 'Gender':
                results[prop] = {'男': 'Male', '女': 'Female'}.get(value)
            elif prop == 'Ethnic':
                # Drops the last character before transliterating
                # (presumably the "族" suffix).
                results[prop] = ''.join(lazy_pinyin(value[:-1])).title()
            elif prop in CHSIConverter._DATE_FIELDS:
                results[prop] = CHSIConverter._format_date(value)
            elif prop == 'Levels':
                # Fixed translation, independent of the captured text.
                results[prop] = 'Undergraduate'
            elif prop == 'Form':
                results[prop] = 'General full-time remote study'
            elif prop == 'Educational System':
                results[prop] = f'{value} years'
            elif prop == 'Type':
                results[prop] = 'General higher education'
            elif prop == 'School Status':
                formatted_date = CHSIConverter._format_date(value)
                if formatted_date is None:
                    # No graduation date in the status text: keep the
                    # translation without the parenthetical.
                    results[prop] = 'Student registration'
                else:
                    results[prop] = (
                        f'Student registration (Expected graduation date: {formatted_date})'
                    )
            else:
                results[prop] = value
        return results

    @classmethod
    def extract_info_from_pdf(cls, path):
        """Read ``path`` and return its fields as produced by ``extract_info``.

        Returns:
            The field dict, or the alert response from
            ``extract_text_from_pdf`` if the PDF could not be read.
        """
        text = cls.extract_text_from_pdf(path)
        if not isinstance(text, str):
            return text  # error response: pass it on instead of parsing it

        # Each value is the run of non-whitespace characters after the label.
        patterns_dict = {
            prop: re.compile(r'{}\s*([^\s]*)'.format(label))
            for prop, label in cls._FIELD_LABELS.items()
        }
        return cls.extract_info(patterns_dict, text)

    @staticmethod
    def extract_image_from_pdf(path, page_number, left, top, right, bottom):
        """Render one PDF page at 300 dpi, crop it and save the crop as PNG.

        The PNG is written next to ``path`` as ``<stem>_image.png``; repeated
        calls for the same ``path`` overwrite the same file.

        Returns:
            The PNG path as ``str``, or an alert response on failure.
        """
        try:
            images = convert_from_path(
                path, dpi=300, first_page=page_number, last_page=page_number
            )
            cropped_image = images[0].crop((left, top, right, bottom))
            file_name = os.path.splitext(path)[0]
            image_path = f"{file_name}_image.png"
            cropped_image.save(image_path)
            return image_path
        except Exception as e:
            return CHSIConverter._alert_response(f"从PDF提取图片错误: {e}")

    @staticmethod
    def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0):
        """Add a floating picture to paragraph ``p`` at a page-relative offset.

        Returns:
            ``None`` on success, or an alert response on failure.
        """
        try:
            run = p.add_run()
            anchor = new_pic_anchor(run.part, image_path_or_stream, width, height, pos_x, pos_y)
            run._r.add_drawing(anchor)
        except Exception:
            return CHSIConverter._alert_response('浮动图片添加时发生错误')
        return None

    @classmethod
    def convert_to_docx(cls, path):
        """Build the DOCX for the report at ``path`` and save it beside it.

        Returns:
            Path of the saved ``.docx`` as ``str``, or an alert response if
            any step fails.
        """
        try:
            extracted_info = cls.extract_info_from_pdf(path)
            if not isinstance(extracted_info, dict):
                return extracted_info  # error response from text extraction

            doc = Document("static/template.docx")

            # The update-date line is moved to position 1 of the body.
            paragraph = doc.add_paragraph()
            doc.element.body.insert(1, paragraph._element)
            paragraph.alignment = 1  # numeric value of WD_ALIGN_PARAGRAPH.CENTER
            update_date = extracted_info.pop('Update Date', None)
            paragraph.add_run('Update date:' + (update_date or ''))

            table = doc.add_table(rows=1, cols=2)
            table.autofit = False

            for cell in table.columns[0].cells:
                cell.width = Inches(0.5)
            for cell in table.columns[1].cells:
                cell.width = Inches(5.0)

            last_index = len(extracted_info) - 1
            for index, (key, value) in enumerate(extracted_info.items()):
                cells = table.add_row().cells
                for cell in cells:
                    cell._element.get_or_add_tcPr().append(parse_xml(cls._NO_BORDER_XML))
                    cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER

                # Every row but the last gets a trailing newline as spacing.
                suffix = "" if index == last_index else "\n"
                cells[0].text = key + suffix
                # Fields missing from the PDF are None; render them empty.
                cells[1].text = (value or "") + suffix

            for crop_box, (x_pt, y_pt) in cls._PICTURE_REGIONS:
                image_path = cls.extract_image_from_pdf(path, 1, *crop_box)
                if not isinstance(image_path, str):
                    return image_path  # error response from image extraction
                failure = cls.add_float_picture(
                    doc.add_paragraph(),
                    image_path,
                    width=Inches(1.2),
                    pos_x=Pt(x_pt),
                    pos_y=Pt(y_pt),
                )
                if failure is not None:
                    return failure

            # Swap only the extension; str.replace would also touch ".pdf"
            # elsewhere in the path and do nothing for ".PDF".
            output_path = os.path.splitext(path)[0] + ".docx"
            doc.save(output_path)

            return output_path

        except Exception as e:
            return cls._alert_response(f"Error during DOCX conversion: {e}")

    @classmethod
    def convert_file(cls):
        """Handle the upload request: validate, convert, send, clean up.

        Reads ``request.files['file']``. The upload is processed in a private
        directory under ``upload/`` that is removed afterwards whether the
        conversion succeeded or not, so no personal data is left on disk and
        concurrent requests cannot delete or overwrite each other's files.

        Returns:
            The DOCX download response (random download name), or an alert
            response describing what went wrong.
        """
        import tempfile

        if 'file' not in request.files:
            return cls._alert_response('缺少文件部分')

        file = request.files['file']
        if file.filename == '':
            return cls._alert_response('没有选中的文件')

        if not file.filename.lower().endswith('.pdf'):
            return cls._alert_response('只接受 PDF 文件')

        if not file.filename.startswith('教育部学籍在线验证报告_'):
            return cls._alert_response('请不要传入无关文件')

        upload_folder = os.path.join(os.getcwd(), 'upload')
        work_dir = None
        try:
            os.makedirs(upload_folder, exist_ok=True)
            work_dir = tempfile.mkdtemp(dir=upload_folder)
            # A fixed server-side name: the client file name is not used on
            # disk, so it can neither collide nor collapse to an odd name.
            filepath = os.path.join(work_dir, 'report.pdf')
            file.save(filepath)

            output_path = cls.convert_to_docx(filepath)
            if not isinstance(output_path, str):
                return output_path  # error response from the conversion

            directory, filename = os.path.split(output_path)
            output_filename = str(uuid.uuid4()) + '.docx'

            response = make_response(send_from_directory(directory, filename, as_attachment=True))
            response.headers["Content-Disposition"] = f"attachment; filename={output_filename}"
            return response
        except Exception as e:
            return cls._alert_response(f"处理文件时发生错误: {e}")
        finally:
            # Privacy: drop everything this request wrote.
            # NOTE(review): as before, files are removed once the response
            # object exists but before it is transmitted -- this relies on
            # the file already being open at that point; confirm on the
            # deployment platform.
            if work_dir is not None:
                try:
                    shutil.rmtree(work_dir)
                except Exception as e:
                    print(f"Failed to delete {work_dir}. Reason: {e}")
266
+
267
# Have python-docx build CT_Anchor objects for <wp:anchor> elements, so that
# the result of parse_xml() in CT_Anchor.new exposes .extent/.docPr/.graphic.
register_element_cls('wp:anchor', CT_Anchor)
0 commit comments