107dc260 by 周伟奇

add f3 folder

1 parent a825541a
1 import os
2 import time
3 import json
4 import shutil
5 import base64
6 import signal
7 import requests
8 import traceback
9 from django.core.management import BaseCommand
10 from multiprocessing import Process
11 from openpyxl import load_workbook, Workbook
12
13 from settings import conf
14 from common.mixins import LoggerMixin
15 from common.tools.pdf_to_img import PDFHandler, PDFBuild
16 from apps.doc import consts
17 from apps.doc.exceptions import OCR1Exception, OCR2Exception
18
19
class Command(BaseCommand, LoggerMixin):
    """Management command that polls the F3 input folder and OCRs its sub-folders.

    Each sub-folder found in ``conf.F3_DIR`` is processed file by file: the
    file is handed to ``PDFHandler`` to obtain images, every image is sent to
    the OCR service, recognised fields are written to ``result.xlsx`` and the
    images are grouped per classification into PDF files. Results are written
    under an ``Output`` directory next to the input directory.
    """

    def __init__(self):
        super().__init__()
        self.log_base = '[folder f3 process]'
        # Input folder that is polled for new sub-folders.
        self.input_dir = conf.F3_DIR
        # Processing switch; set to False by the SIGTERM handler to stop polling.
        self.switch = True
        # Sleep time between polls.
        self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
        # File name of the result workbook.
        self.wb_name = 'result.xlsx'

        self.field_map = {
            # classify: (sheet_name, key_field, side_field_order, src_field_order)
            consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2),
        }

        # classify -> base name of the PDF that collects the matching images.
        self.pdf_name_map = {
            consts.IC_CLASSIFY: consts.IC_CN_NAME,
            consts.BL_CLASSIFY: consts.BL_CN_NAME,
        }

        # OCR service URLs.
        self.ocr_url = conf.OCR_URL_FOLDER
        self.ocr_url_2 = conf.OCR2_URL_FOLDER

        # Graceful-exit signal: 15 (SIGTERM).
        # NOTE(review): the handler is installed in the process that builds the
        # command, while handle() runs the polling loop in a child Process;
        # confirm the child actually observes the flag change on shutdown.
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        """Signal callback: clear the switch so the polling loops stop."""
        self.switch = False  # stop processing files

    @staticmethod
    def license1_process(ocr_data, ocr_res, classify):
        """Collect the fields of one first-stage OCR result into ``ocr_res``.

        Only ``consts.IC_CLASSIFY`` results are collected; anything else is
        ignored. The extracted dict is appended to ``ocr_res[classify]``.

        :param ocr_data: one item of the OCR response ``data`` list.
        :param ocr_res: dict mapping classify -> list of field dicts (mutated).
        :param classify: classification code of ``ocr_data``.
        """
        license_data = ocr_data.get('data')
        if not license_data:
            return
        if isinstance(license_data, dict):
            # The embedded image is not needed in the results.
            license_data.pop('base64_img', '')
        if classify != consts.IC_CLASSIFY:
            return

        card_type = license_data.get('type', '')
        is_ic = card_type.startswith('身份证')
        is_info_side = card_type.endswith('信息面')

        # Category: '0' ID card, '1' residence permit.
        id_card_dict = {'类别': '0' if is_ic else '1'}
        if is_ic:
            field_map = consts.IC_MAP_0 if is_info_side else consts.IC_MAP_1
        else:
            field_map = consts.RP_MAP_0 if is_info_side else consts.RP_MAP_1

        words_result = license_data.get('words_result', {})
        for write_field, search_field in field_map:
            id_card_dict[write_field] = words_result.get(search_field, {}).get('words', '')
        if not is_info_side:
            # The non-info side carries the issue/expiry dates; join them into one period.
            start_time = words_result.get('签发日期', {}).get('words', '')
            end_time = words_result.get('失效日期', {}).get('words', '')
            id_card_dict['有效期限'] = '{0}-{1}'.format(start_time, end_time)

        # Appended only for IC results: the dict does not exist for other classes.
        ocr_res.setdefault(classify, []).append(id_card_dict)

    def license2_process(self, ocr_data, ocr_res, classify, img_path):
        """Run the second-stage OCR for a ``consts.BL_CLASSIFY`` image.

        The request is retried up to ``consts.RETRY_TIMES`` times. Successful
        results are appended to ``ocr_res[classify]``.

        :param ocr_data: one item of the first-stage OCR response ``data`` list.
        :param ocr_res: dict mapping classify -> list of result dicts (mutated).
        :param classify: classification code of ``ocr_data``.
        :param img_path: path of the source image, used when ``ocr_data`` has
            no ``section_img``.
        """
        if classify != consts.BL_CLASSIFY:
            return

        pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
        file_data = ocr_data.get('section_img')
        if file_data is None:
            # No cropped section was returned: send the whole image, base64 encoded.
            with open(img_path, 'rb') as f:
                file_data = base64.b64encode(f.read()).decode()
        json_data_2 = {
            "pid": str(pid),
            "filedata": file_data
        }

        for times in range(consts.RETRY_TIMES):
            try:
                start_time = time.time()
                ocr_2_response = requests.post(self.ocr_url_2, data=json_data_2)
                if ocr_2_response.status_code != 200:
                    raise OCR2Exception('ocr_2 status code: {0}'.format(ocr_2_response.status_code))
            except Exception:
                self.folder_log.warn(
                    '{0} [ocr_2 failed] [times={1}] [img_path={2}] [error={3}]'.format(
                        self.log_base, times, img_path, traceback.format_exc()))
            else:
                ocr_res_2 = json.loads(ocr_2_response.text)
                end_time = time.time()
                speed_time = int(end_time - start_time)
                self.folder_log.info(
                    '{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
                        self.log_base, img_path, speed_time))

                if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
                    # ocr_res is a dict keyed by classify (see license1_process),
                    # so results go into the per-classify list.
                    classify_res = ocr_res.setdefault(classify, [])
                    if pid == consts.BC_PID:
                        classify_res.append(ocr_res_2)
                    else:
                        # Business licence and similar documents.
                        for result_dict in ocr_res_2.get('ResultList', []):
                            res_dict = {}
                            for field_dict in result_dict.get('FieldList', []):
                                res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
                            classify_res.append(res_dict)
                # A response was received and handled: no further retries.
                break

    def wb_process(self, ocr_res, output_dir):
        """Append the collected OCR fields to the result workbook.

        The workbook ``self.wb_name`` inside ``output_dir`` is loaded if it
        exists and created otherwise. Any error is logged, not raised.

        :param ocr_res: dict mapping classify -> list of field dicts.
        :param output_dir: directory that holds the workbook.
        """
        excel_path = os.path.join(output_dir, self.wb_name)

        try:
            if os.path.exists(excel_path):
                wb = load_workbook(excel_path)
            else:
                wb = Workbook()

            for c, res_list in ocr_res.items():
                sheet_conf = self.field_map.get(c)
                if sheet_conf is None:
                    # No sheet layout is configured for this classify; skip it
                    # instead of failing the whole workbook build.
                    self.folder_log.info('{0} [wb skip, no field map] [classify={1}]'.format(
                        self.log_base, c))
                    continue
                sheet_name, key_field, side_field_order, src_field_order = sheet_conf
                if sheet_name in wb.sheetnames:
                    ws = wb[sheet_name]
                else:
                    ws = wb.create_sheet(sheet_name)

                for res in res_list:
                    # Results containing key_field use the alternative field order.
                    if key_field is not None and key_field in res:
                        field_order = side_field_order
                    else:
                        field_order = src_field_order
                    for search_field, write_field in field_order:
                        field_value = res.get(search_field, '')
                        if isinstance(field_value, list):
                            ws.append((write_field, *field_value))
                        else:
                            ws.append((write_field, field_value))
                    # Blank row separating consecutive results.
                    ws.append((None,))

            # Drop the default sheet once at least one real sheet exists.
            if 'Sheet' in wb.sheetnames and len(wb.sheetnames) > 1:
                wb.remove(wb['Sheet'])
            wb.save(excel_path)
        except Exception:
            self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
                self.log_base, excel_path, traceback.format_exc()))

    def ocr_process(self, img_path, all_ocr_res, img_res):
        """Send one image to the first-stage OCR service and dispatch the result.

        The request is retried up to ``consts.RETRY_TIMES`` times; when every
        attempt fails a warning is logged. Nothing happens if ``img_path``
        does not exist.

        :param img_path: path of the image to recognise.
        :param all_ocr_res: dict mapping classify -> list of field dicts (mutated).
        :param img_res: dict mapping classify -> set of image paths (mutated).
        """
        if not os.path.exists(img_path):
            return

        with open(img_path, 'rb') as f:
            # Base64-encode the image and decode the bytes to text for the JSON payload.
            file_data = base64.b64encode(f.read()).decode()
        json_data = {
            "file": file_data,
            "channel": consts.AFC_PREFIX,
        }

        for times in range(consts.RETRY_TIMES):
            try:
                start_time = time.time()
                ocr_response = requests.post(self.ocr_url, json=json_data)
                if ocr_response.status_code != 200:
                    raise OCR1Exception('{0} ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
            except Exception:
                self.folder_log.warn('{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'.format(
                    self.log_base, times, img_path, traceback.format_exc()))
            else:
                ocr_res = ocr_response.json()
                end_time = time.time()
                speed_time = int(end_time - start_time)
                self.folder_log.info('{0} [ocr success] [img={1}] [speed_time={2}]'.format(
                    self.log_base, img_path, speed_time))

                if isinstance(ocr_res, dict) and ocr_res.get('code') == 1:
                    data_list = ocr_res.get('data', [])
                    if isinstance(data_list, list):
                        for ocr_data in data_list:
                            classify = ocr_data.get('classify')
                            img_res.setdefault(classify, set()).add(img_path)
                            if classify in consts.LICENSE_CLASSIFY_SET_1:
                                self.license1_process(ocr_data, all_ocr_res, classify)
                            elif classify in consts.LICENSE_CLASSIFY_SET_2:
                                self.license2_process(ocr_data, all_ocr_res, classify, img_path)
                # A response was received and handled: no further retries.
                break
        else:
            # Reached only when no attempt got a 200 response.
            self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))

    def img_process(self, img_res, output_dir):
        """Build one PDF per classification from the collected image paths.

        :param img_res: dict mapping classify -> set of image paths.
        :param output_dir: directory that receives the PDF files.
        """
        for classify, img_path_set in img_res.items():
            pdf_path = os.path.join(output_dir, '{0}.pdf'.format(self.pdf_name_map.get(classify, '其他')))
            pdf_build = PDFBuild(pdf_path)
            # Sets are unordered; sort so the page order is reproducible.
            pdf_build.insert_img(sorted(img_path_set))

    def images_process(self, img_path_list, output_dir):
        """OCR every image, then write the workbook and the per-class PDFs.

        :param img_path_list: iterable of image paths to process.
        :param output_dir: directory that receives the workbook and PDFs.
        """
        ocr_res = dict()
        img_res = dict()
        for img_path in img_path_list:
            self.ocr_process(img_path, ocr_res, img_res)
        self.wb_process(ocr_res, output_dir)
        self.img_process(img_res, output_dir)

    def pdf_process(self, name, path, img_output_dir, output_dir):
        """Run ``PDFHandler.extract_image`` on one file and process its images.

        :param name: file name; used as the image sub-directory name.
        :param path: full path of the file.
        :param img_output_dir: parent directory for the extracted images.
        :param output_dir: directory that receives the workbook and PDFs.
        :raises Exception: whatever ``extract_image`` raises, after logging it.
        """
        pdf_handler = PDFHandler(path, os.path.join(img_output_dir, name))

        try:
            self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
            pdf_handler.extract_image()
            self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
        except Exception:
            self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
            # Bare raise keeps the original traceback for the caller.
            raise
        else:
            self.images_process(pdf_handler.img_path_list, output_dir)

    def folder_process(self, unique_folder_name, folder_path, main_output_dir):
        """Process every entry of one input sub-folder.

        Creates ``<main_output_dir>/<unique_folder_name>`` with ``image`` and
        ``failed`` sub-directories. Files are processed with ``pdf_process``;
        nested directories and files that fail are copied to ``failed``.
        Errors are logged per entry and do not stop the remaining entries.

        :param unique_folder_name: name of the output directory for this folder.
        :param folder_path: path of the input sub-folder.
        :param main_output_dir: parent directory of all output directories.
        """
        output_dir = os.path.join(main_output_dir, unique_folder_name)

        img_output_dir = os.path.join(output_dir, 'image')
        failed_output_dir = os.path.join(output_dir, 'failed')
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(img_output_dir, exist_ok=True)
        os.makedirs(failed_output_dir, exist_ok=True)

        list_dir = os.listdir(folder_path)
        if not list_dir:
            self.folder_log.info('{0} [folder empty, completed] [path={1}]'.format(self.log_base, folder_path))
            return

        for name in list_dir:
            # NOTE(review): fixed pause before each entry; presumably gives files
            # that are still being copied time to finish - confirm.
            time.sleep(5)
            path = os.path.join(folder_path, name)

            try:
                if not os.path.exists(path):
                    self.folder_log.info('{0} [path is not exists] [path={1}]'.format(self.log_base, path))
                    continue
                elif os.path.isfile(path):
                    self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
                    self.pdf_process(name, path, img_output_dir, output_dir)
                    self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
                else:
                    self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, path))
                    failed_path = os.path.join(failed_output_dir, name)
                    # copyfile cannot copy a directory; copy the whole tree instead.
                    shutil.copytree(path, failed_path)
            except OSError:
                self.folder_log.error('{0} [os error] [path={1}] [error={2}]'.format(
                    self.log_base, path, traceback.format_exc()))
            except Exception:
                try:
                    self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(self.log_base, path,
                                                                                           traceback.format_exc()))
                    failed_path = os.path.join(failed_output_dir, name)
                    shutil.copyfile(path, failed_path)
                except Exception:
                    self.folder_log.error('{0} [file move error] [path={1}] [error={2}]'.format(
                        self.log_base, path, traceback.format_exc()))

    def main_folder_process(self, input_dir):
        """Poll ``input_dir`` and process every sub-folder until switched off.

        Waits until ``input_dir`` exists, then loops while ``self.switch`` is
        true. Processed sub-folders are moved to ``Output/Completed``;
        non-directories and failed sub-folders are moved to ``Output/Failed``.

        :param input_dir: directory to poll.
        """
        while not os.path.isdir(input_dir):
            self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
            if self.switch:
                time.sleep(self.sleep_time)
                continue
            else:
                return

        output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
        completed_output_dir = os.path.join(output_dir, 'Completed')
        failed_output_dir = os.path.join(output_dir, 'Failed')
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(completed_output_dir, exist_ok=True)
        os.makedirs(failed_output_dir, exist_ok=True)

        # Names that hit an OS error; they are skipped while other entries
        # exist and retried one at a time once nothing else is left.
        os_error_filename_set = set()
        while self.switch:
            # 1. List the entries of the input dir.
            list_dir = os.listdir(input_dir)
            if not list_dir and len(os_error_filename_set) == 0:
                self.folder_log.info('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
                time.sleep(self.sleep_time)
                continue
            all_file_set = set(list_dir)
            true_file_set = all_file_set - os_error_filename_set
            if len(true_file_set) == 0 and len(os_error_filename_set) > 0:
                true_file_set.add(os_error_filename_set.pop())
            for name in true_file_set:
                # NOTE(review): fixed pause before each entry; presumably gives
                # folders that are still being copied time to finish - confirm.
                time.sleep(10)
                # The timestamp prefix keeps output names unique across runs.
                unique_folder_name = '{0}_{1}'.format(time.time(), name)
                path = os.path.join(input_dir, name)

                try:
                    if not os.path.exists(path):
                        self.folder_log.info('{0} [path is not exists] [path={1}]'.format(self.log_base, path))
                        continue
                    elif os.path.isdir(path):
                        self.folder_log.info('{0} [dir start] [path={1}]'.format(self.log_base, name))
                        self.folder_process(unique_folder_name, path, output_dir)
                        completed_path = os.path.join(completed_output_dir, unique_folder_name)
                        shutil.move(path, completed_path)
                        self.folder_log.info('{0} [dir end] [path={1}]'.format(self.log_base, name))
                    else:
                        self.folder_log.info('{0} [path is not dir] [path={1}]'.format(self.log_base, path))
                        failed_path = os.path.join(failed_output_dir, unique_folder_name)
                        shutil.move(path, failed_path)
                except OSError:
                    os_error_filename_set.add(name)
                    self.folder_log.error('{0} [os error] [path={1}] [error={2}]'.format(
                        self.log_base, path, traceback.format_exc()))
                except Exception:
                    try:
                        self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(self.log_base, path,
                                                                                               traceback.format_exc()))
                        failed_path = os.path.join(failed_output_dir, unique_folder_name)
                        shutil.move(path, failed_path)
                    except Exception:
                        os_error_filename_set.add(name)
                        self.folder_log.error('{0} [file move error] [path={1}] [error={2}]'.format(
                            self.log_base, path, traceback.format_exc()))

    def handle(self, *args, **kwargs):
        """Command entry point: run the polling loop in a child process and wait for it."""
        process_list = []

        process = Process(target=self.main_folder_process, args=(self.input_dir, ))
        process_list.append(process)

        for p in process_list:
            p.start()
        for p in process_list:
            p.join()

        self.folder_log.info('{0} [stop safely]'.format(self.log_base))
...@@ -24,6 +24,23 @@ WH_COUPLE_4 = (100, 300) ...@@ -24,6 +24,23 @@ WH_COUPLE_4 = (100, 300)
24 WH_COUPLE_5 = (100, 200) 24 WH_COUPLE_5 = (100, 200)
25 25
26 26
class PDFBuild:
    """Append images as pages to a PDF file, creating the file when missing."""

    def __init__(self, path):
        # Destination PDF path; opened when it already exists, created otherwise.
        self.path = path

    def insert_img(self, img_path_list):
        """Add one page per image to the PDF at ``self.path`` and save it.

        :param img_path_list: iterable of image paths; each image fills a new page.
        :return: None. Nothing is written when the iterable is empty.
        """
        img_paths = list(img_path_list)
        if not img_paths:
            # Nothing to add; also avoids saving a new document with zero pages.
            return

        is_existing = os.path.exists(self.path)
        pdf = fitz.Document(self.path) if is_existing else fitz.Document()
        try:
            for img_path in img_paths:
                new_page = pdf.newPage()
                new_page.insertImage(new_page.rect, img_path)
            if is_existing:
                # Saving over the file the document was opened from must be incremental.
                pdf.saveIncr()
            else:
                pdf.save(self.path)
        finally:
            # Release the document even when inserting or saving fails.
            pdf.close()
42
43
27 class PDFHandler: 44 class PDFHandler:
28 45
29 def __init__(self, path, img_dir_path, document_name=None): 46 def __init__(self, path, img_dir_path, document_name=None):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!