Merge branch 'feature/main2' into feature/main — showing 3 changed files with 89 additions and 663 deletions.
1 | import os | ||
2 | import time | ||
3 | import json | ||
4 | import signal | ||
5 | import asyncio | ||
6 | import aiohttp | ||
7 | import difflib | ||
8 | import base64 | ||
9 | import requests | ||
10 | from datetime import datetime, date | ||
11 | from collections import Counter | ||
12 | from apps.doc.ocr.wb import BSWorkbook, Workbook | ||
13 | from django.core.management import BaseCommand | ||
14 | |||
15 | from settings import conf | ||
16 | from common.mixins import LoggerMixin | ||
17 | from common.tools.file_tools import write_zip_file | ||
18 | from common.tools.pdf_to_img import PDFHandler | ||
19 | from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords | ||
20 | from apps.doc.named_enum import KeywordsType | ||
21 | from apps.doc import consts | ||
22 | from apps.doc.ocr.edms import EDMS, rh | ||
23 | from apps.doc.exceptions import EDMSException | ||
24 | |||
25 | |||
26 | class Command(BaseCommand, LoggerMixin): | ||
27 | |||
    def __init__(self):
        """Initialize the OCR worker: config, service endpoints, EDMS client, SIGTERM hook."""
        super().__init__()
        self.log_base = '[doc ocr process]'
        # Switch that keeps the main processing loop in handle() running.
        self.switch = True
        # Root data directory for per-document working files.
        self.data_dir = conf.DATA_DIR
        # OCR endpoints: url_1 = classify/extract service, url_2 = licence
        # recognition service, url_3 = bank-card name-presence check (BC_URL).
        self.ocr_url_1 = conf.OCR_URL_1
        self.ocr_url_2 = conf.OCR_URL_2
        self.ocr_url_3 = conf.BC_URL
        # EDMS web-service client used for PDF download and Excel upload.
        self.edms = EDMS()
        # Graceful shutdown on SIGTERM (signal 15): finish the current doc, then exit.
        signal.signal(signal.SIGTERM, self.signal_handler)
43 | |||
    def signal_handler(self, sig, frame):
        """SIGTERM handler: flip the switch so handle() exits after the current document."""
        self.switch = False  # stop processing files
46 | |||
47 | def get_doc_info(self): | ||
48 | task_str, is_priority = rh.dequeue() | ||
49 | if task_str is None: | ||
50 | self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) | ||
51 | return None, None | ||
52 | |||
53 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | ||
54 | doc_id = int(doc_id_str) | ||
55 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | ||
56 | # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values( | ||
57 | # 'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first() | ||
58 | doc = doc_class.objects.filter(id=doc_id).first() | ||
59 | if doc is None: | ||
60 | self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( | ||
61 | self.log_base, task_str, is_priority)) | ||
62 | return None, None | ||
63 | elif doc.status != DocStatus.INIT.value: | ||
64 | self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] ' | ||
65 | '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status)) | ||
66 | return None, None | ||
67 | doc.status = DocStatus.PROCESSING.value | ||
68 | doc.save() | ||
69 | self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format( | ||
70 | self.log_base, task_str, is_priority)) | ||
71 | return doc, business_type | ||
72 | |||
    def pdf_download(self, doc, business_type):
        """Ensure the doc's PDF is on local disk and compute the working file paths.

        Downloads from EDMS with up to consts.RETRY_TIMES attempts. Docs whose
        application_id carries FIXED_APPLICATION_ID_PREFIX skip the download —
        presumably their PDF is already placed locally; confirm with the
        queue producer.

        Returns (doc_data_path, excel_path, src_excel_path, pdf_path).
        Raises EDMSException when every download attempt fails.
        """
        doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
        os.makedirs(doc_data_path, exist_ok=True)
        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
        if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
            for times in range(consts.RETRY_TIMES):
                try:
                    self.edms.download(pdf_path, doc.metadata_version_id)
                except Exception as e:
                    self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
                                          '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
                    edms_exc = str(e)
                else:
                    break
            else:
                # for/else: reached only when no attempt succeeded (no break),
                # so edms_exc is guaranteed to be bound here.
                raise EDMSException(edms_exc)

        excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
        src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
        # NOTE(review): this "download success" line is also logged when the
        # download was skipped for fixed-prefix applications.
        self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
            self.log_base, business_type, doc.id, pdf_path))
        return doc_data_path, excel_path, src_excel_path, pdf_path
95 | |||
    def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino):
        """Write bank-statement OCR sheets into wb and accumulate per-card summaries.

        ocr_data['data'] is a list of sheets. Each sheet has 'cells' (written
        into a new worksheet) and a 7-item 'summary':
        [account name, card number, page no, receipt verify code,
         print time, start date, end date].
        Sheets with a card number are accumulated under bs_summary[card];
        card-less sheets are bucketed under unknown_summary[classify][role].
        Appends one (pno, ino, status) tuple to res_list.
        """
        sheets = ocr_data.get('data', [])
        if not sheets:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
        confidence = ocr_data.get('confidence', 1)
        img_name = 'page_{0}_img_{1}'.format(pno, ino)
        cells_exists = False
        for i, sheet in enumerate(sheets):
            cells = sheet.get('cells')
            if not cells:
                continue
            cells_exists = True
            sheet_name = '{0}_{1}'.format(img_name, i)
            ws = wb.create_sheet(sheet_name)
            for cell in cells:
                c1 = cell.get('start_column')
                r1 = cell.get('start_row')
                words = cell.get('words')
                # OCR coordinates are 0-based; openpyxl rows/columns are 1-based.
                ws.cell(row=r1 + 1, column=c1 + 1, value=words)

            # summary layout: ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间']
            # i.e. [account name, card no, page, verify code, print time, start, end]
            # NOTE(review): assumes 'summary' is always present on a sheet that
            # has cells — indexing would raise on None; confirm OCR contract.
            summary = sheet.get('summary')
            card = summary[1]
            if card is None:
                # No card number: group evidence by classify + account-holder role.
                classify_dict = unknown_summary.setdefault(classify, {})
                role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0]
                role_dict = classify_dict.setdefault(role, {})
                role_dict['classify'] = classify
                role_dict['role'] = role
                role_dict.setdefault('sheet', []).append(sheet_name)
                role_dict.setdefault('confidence', []).append(confidence)
                code_list = role_dict.setdefault('code', [])
                pt_list = role_dict.setdefault('print_time', [])
                sd_list = role_dict.setdefault('start_date', [])
                ed_list = role_dict.setdefault('end_date', [])
                if summary[3] is not None:
                    # Verify code is recorded together with its page number.
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])
            else:
                # Card number present: accumulate votes/evidence per card.
                card_dict = bs_summary.setdefault(card, {})
                card_dict['count'] = card_dict.get('count', 0) + 1
                card_dict.setdefault('classify', []).append(classify)
                card_dict.setdefault('confidence', []).append(confidence)
                card_dict.setdefault('sheet', []).append(sheet_name)
                role_list = card_dict.setdefault('role', [])
                role_set = card_dict.setdefault('role_set', set())
                code_list = card_dict.setdefault('code', [])
                pt_list = card_dict.setdefault('print_time', [])
                sd_list = card_dict.setdefault('start_date', [])
                ed_list = card_dict.setdefault('end_date', [])
                if summary[0] is not None:
                    role_list.append(summary[0])
                    role_set.add(summary[0])
                if summary[3] is not None:
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])

        # Success is 'empty' when no sheet contributed any cells.
        if cells_exists:
            res_list.append((pno, ino, consts.RES_SUCCESS))
        else:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
168 | |||
169 | def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino): | ||
170 | # 类别:'0'身份证, '1'居住证 | ||
171 | license_data = ocr_data.get('data', []) | ||
172 | if not license_data: | ||
173 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | ||
174 | return | ||
175 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
176 | license_summary.setdefault(classify, []).extend(license_data) | ||
177 | |||
178 | def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino): | ||
179 | if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: | ||
180 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
181 | if pid == consts.BC_PID: | ||
182 | # 银行卡 | ||
183 | # res_dict = {} | ||
184 | # for en_key, chn_key in consts.BC_FIELD: | ||
185 | # res_dict[chn_key] = ocr_res_2.get(en_key, '') | ||
186 | license_summary.setdefault(classify, []).append(ocr_res_2) | ||
187 | else: | ||
188 | # 营业执照等 | ||
189 | for result_dict in ocr_res_2.get('ResultList', []): | ||
190 | res_dict = {} | ||
191 | for field_dict in result_dict.get('FieldList', []): | ||
192 | res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '') | ||
193 | license_summary.setdefault(classify, []).append(res_dict) | ||
194 | else: | ||
195 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
196 | |||
    @staticmethod
    async def fetch_ocr_1_result(url, json_data):
        """POST json_data (JSON body) to the classify OCR service.

        Returns the parsed JSON response on HTTP 200, otherwise None.
        NOTE(review): a fresh ClientSession per request is wasteful (aiohttp
        recommends reuse), and certificate verification is disabled (ssl=False).
        """
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.post(url, json=json_data) as response:
                if response.status == 200:
                    return await response.json()
203 | |||
    @staticmethod
    async def fetch_ocr_2_result(url, json_data):
        """POST json_data as form data to the licence OCR service.

        Returns the raw response text on HTTP 200 (caller json.loads it),
        otherwise None. Note: unlike fetch_ocr_1_result this sends `data=`
        (form-encoded), not a JSON body, and returns text, not parsed JSON.
        """
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.post(url, data=json_data) as response:
                if response.status == 200:
                    return await response.text()
210 | |||
    @staticmethod
    async def fetch_bc_name_result(url, json_data):
        """POST json_data (JSON body) to the bank-card name-check service.

        Returns parsed JSON on HTTP 200, otherwise None.
        NOTE(review): byte-for-byte identical to fetch_ocr_1_result — kept
        separate, presumably for per-endpoint evolution; consider merging.
        """
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.post(url, json=json_data) as response:
                if response.status == 200:
                    return await response.json()
217 | |||
    async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list):
        """Run the full OCR pipeline for one image and record the outcome.

        Flow: the first OCR service classifies the image; depending on
        classify it is routed to: 'other' (recorded and skipped), licence
        type 1, licence type 2 (second OCR service, plus a bank-card
        name-presence check for BC_CLASSIFY), or the bank-statement path
        which writes sheets into wb. Every path appends one
        (page_no, img_no, status) tuple to res_list.
        """
        pno, ino = self.parse_img_path(img_path)
        with open(img_path, 'rb') as f:
            base64_data = base64.b64encode(f.read())
            # Decode bytes -> str so the payload is JSON-serialisable.
            file_data = base64_data.decode()
        json_data_1 = {
            "file": file_data
        }
        ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
        if ocr_res_1 is None:
            res_list.append((pno, ino, consts.RES_FAILED))
            self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path))
        else:
            self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format(
                self.log_base, img_path, ocr_res_1))

            if ocr_res_1.get('code') == 1:
                ocr_data = ocr_res_1.get('data', {})
                classify = ocr_data.get('classify')
                if classify is None:
                    res_list.append((pno, ino, consts.RES_FAILED))
                    self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                        self.log_base, img_path, ocr_res_1))
                    return
                elif classify in consts.OTHER_CLASSIFY_SET:  # uninteresting category
                    res_list.append((pno, ino, consts.RES_SUCCESS_OTHER))
                    return
                elif classify in consts.LICENSE_CLASSIFY_SET_1:  # licence type 1
                    self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino)
                elif classify in consts.LICENSE_CLASSIFY_SET_2:  # licence type 2
                    # Mapping value is a 5-tuple; only the pid is needed here.
                    pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
                    json_data_2 = {
                        "pid": str(pid),
                        "filedata": file_data
                    }
                    ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
                    if ocr_res_2 is None:
                        res_list.append((pno, ino, consts.RES_FAILED))
                        self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path))
                    else:
                        # fetch_ocr_2_result returns raw text; parse it here.
                        ocr_res_2 = json.loads(ocr_res_2)
                        self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format(
                            self.log_base, img_path, ocr_res_2))
                        if classify == consts.BC_CLASSIFY:
                            # Bank card: ask the third service whether a holder
                            # name is present on the card face; record '有'/'无'.
                            name = '有'
                            json_data_1['card_res'] = ocr_res_2
                            card_name_res = await self.fetch_bc_name_result(self.ocr_url_3, json_data_1)
                            if isinstance(card_name_res, dict) and \
                                    card_name_res.get('data', {}).get('is_exists_name') == 0:
                                name = '无'
                            ocr_res_2['Name'] = name
                        self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino)
                else:  # everything else is treated as a bank statement
                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino)
            else:
                res_list.append((pno, ino, consts.RES_FAILED))
                self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                    self.log_base, img_path, ocr_res_1))
282 | |||
    # NOTE(review): a commented-out legacy synchronous implementation of
    # img_2_ocr_2_wb (requests-based, with sample OCR response payloads)
    # previously lived here; removed as dead code — recover it from version
    # control if it is ever needed again.
380 | |||
381 | @staticmethod | ||
382 | def parse_img_path(img_path): | ||
383 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | ||
384 | part_list = img_name.split('_') | ||
385 | # page_7_img_11_0 | ||
386 | return int(part_list[1])+1, int(part_list[3])+1 | ||
387 | |||
388 | @staticmethod | ||
389 | def get_most(value_list): | ||
390 | if value_list: | ||
391 | most_common = Counter(value_list).most_common(1) | ||
392 | return most_common[0][0] if most_common else None | ||
393 | |||
394 | @staticmethod | ||
395 | def date_format(date_str, format_str): | ||
396 | try: | ||
397 | date_res = datetime.strptime(date_str, format_str).date() | ||
398 | except Exception as e: | ||
399 | return | ||
400 | else: | ||
401 | return date_res | ||
402 | |||
403 | def get_validate_date(self, date_list): | ||
404 | for date_str in date_list: | ||
405 | for format_str in consts.DATE_FORMAT: | ||
406 | date_res = self.date_format(date_str, format_str) | ||
407 | if isinstance(date_res, date): | ||
408 | return date_res | ||
409 | |||
    def merge_card(self, bs_summary):
        """Merge per-card summaries whose card numbers look alike (OCR variance).

        Cards are visited in descending occurrence count; every remaining card
        whose textual similarity to the current main card exceeds
        consts.CARD_RATIO is folded into it. classify/role vote lists are
        collapsed to their modal value and the 'count' key is dropped.
        Consumes (empties) bs_summary and returns the merged mapping.
        """
        merged_bs_summary = {}
        sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True)
        for main_card in sorted_card:
            if bs_summary.get(main_card) is None:
                # Already folded into an earlier, more frequent main card.
                continue
            merged_bs_summary[main_card] = bs_summary.pop(main_card)
            del merged_bs_summary[main_card]['count']
            merge_cards = []
            for card in bs_summary.keys():
                # quick_ratio() is an upper bound on similarity — cheap but
                # optimistic, so borderline cards may over-merge.
                if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO:
                    merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify'])
                    merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
                    merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet'])
                    merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role'])
                    merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set'])
                    merged_bs_summary[main_card]['code'].extend(bs_summary[card]['code'])
                    merged_bs_summary[main_card]['print_time'].extend(bs_summary[card]['print_time'])
                    merged_bs_summary[main_card]['start_date'].extend(bs_summary[card]['start_date'])
                    merged_bs_summary[main_card]['end_date'].extend(bs_summary[card]['end_date'])
                    merge_cards.append(card)
            # Delete after iteration — mutating while iterating would break.
            for card in merge_cards:
                del bs_summary[card]
            merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify'])
            merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role'])
        del bs_summary  # unbinds the local name only; caller's dict was emptied above
        return merged_bs_summary
437 | |||
438 | def prune_bs_summary(self, bs_summary): | ||
439 | for summary in bs_summary.values(): | ||
440 | del summary['count'] | ||
441 | summary['classify'] = self.get_most(summary['classify']) | ||
442 | summary['role'] = self.get_most(summary['role']) | ||
443 | return bs_summary | ||
444 | |||
445 | def rebuild_bs_summary(self, bs_summary, unknown_summary): | ||
446 | # bs_summary = { | ||
447 | # '卡号': { | ||
448 | # 'count': 100, | ||
449 | # 'classify': [], | ||
450 | # 'confidence': [], | ||
451 | # 'role': [], | ||
452 | # 'code': [('page', 'code')], | ||
453 | # 'print_time': [], | ||
454 | # 'start_date': [], | ||
455 | # 'end_date': [], | ||
456 | # 'sheet': ['sheet_name'] | ||
457 | # } | ||
458 | # } | ||
459 | # | ||
460 | # unknown_summary = { | ||
461 | # 0: { | ||
462 | # '户名': { | ||
463 | # 'classify': 0, | ||
464 | # 'confidence': [], | ||
465 | # 'role': '户名', | ||
466 | # 'code': [('page', 'code')], | ||
467 | # 'print_time': [], | ||
468 | # 'start_date': [], | ||
469 | # 'end_date': [], | ||
470 | # 'sheet': ['sheet_name'] | ||
471 | # } | ||
472 | # } | ||
473 | # } | ||
474 | # 无卡号 | ||
475 | if len(bs_summary) == 0: | ||
476 | del bs_summary | ||
477 | merged_bs_summary = {} | ||
478 | card_num = 1 | ||
479 | for role_dict in unknown_summary.values(): | ||
480 | if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict: | ||
481 | summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {}) | ||
482 | for summary in role_dict.values(): | ||
483 | summary_dict['confidence'].extend(summary['confidence']) | ||
484 | summary_dict['role'] = summary['role'] | ||
485 | summary_dict['code'].extend(summary['code']) | ||
486 | summary_dict['print_time'].extend(summary['print_time']) | ||
487 | summary_dict['start_date'].extend(summary['start_date']) | ||
488 | summary_dict['end_date'].extend(summary['end_date']) | ||
489 | summary_dict['sheet'].extend(summary['sheet']) | ||
490 | card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num) | ||
491 | merged_bs_summary[card] = summary_dict | ||
492 | else: | ||
493 | for summary in role_dict.values(): | ||
494 | card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num) | ||
495 | card_num += 1 | ||
496 | merged_bs_summary[card] = summary | ||
497 | else: | ||
498 | # 1卡号 | ||
499 | one_card = False | ||
500 | if len(bs_summary) == 1: | ||
501 | merged_bs_summary = self.prune_bs_summary(bs_summary) | ||
502 | one_card = True | ||
503 | # 多卡号 | ||
504 | else: | ||
505 | merged_bs_summary = self.merge_card(bs_summary) | ||
506 | |||
507 | for card_summary in merged_bs_summary.values(): | ||
508 | merge_role = [] | ||
509 | classify_summary = unknown_summary.get(card_summary['classify'], {}) | ||
510 | for role, summary in classify_summary.items(): | ||
511 | if one_card or role in card_summary['role_set']: | ||
512 | merge_role.append(role) | ||
513 | card_summary['confidence'].extend(summary['confidence']) | ||
514 | card_summary['sheet'].extend(summary['sheet']) | ||
515 | card_summary['code'].extend(summary['code']) | ||
516 | card_summary['print_time'].extend(summary['print_time']) | ||
517 | card_summary['start_date'].extend(summary['start_date']) | ||
518 | card_summary['end_date'].extend(summary['end_date']) | ||
519 | |||
520 | for role in merge_role: | ||
521 | del classify_summary[role] | ||
522 | |||
523 | card_num = 1 | ||
524 | for role_dict in unknown_summary.values(): | ||
525 | for summary in role_dict.values(): | ||
526 | card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num) | ||
527 | card_num += 1 | ||
528 | merged_bs_summary[card] = summary | ||
529 | |||
530 | del unknown_summary | ||
531 | for summary in merged_bs_summary.values(): | ||
532 | if summary.get('role_set') is not None: | ||
533 | del summary['role_set'] | ||
534 | summary['print_time'] = self.get_validate_date(summary['print_time']) | ||
535 | summary['start_date'] = self.get_validate_date(summary['start_date']) | ||
536 | summary['end_date'] = self.get_validate_date(summary['end_date']) | ||
537 | summary['confidence'] = max(summary['confidence']) | ||
538 | return merged_bs_summary | ||
539 | |||
    # TODO refine per-doc status: requeue / retry differently per failure kind
    # TODO failure e-mail notification
    #   recognition failure: ordinary errors (broken PDF, workbook build error)
    #   EDMS failure: download error -> requeue -> mail; upload error ->
    #                 re-upload queue -> mail
    #   algorithm failure: stage-1 / stage-2 errors -> mark failed -> mail
    # TODO retry OCR interface calls
    def handle(self, *args, **kwargs):
        """Main worker loop: poll the queue, OCR each document, upload the result.

        Runs until self.switch is cleared by the SIGTERM handler. The poll
        sleep backs off by 5 s per empty dequeue, capped at MAX_SLEEP_SECOND,
        and resets once a task arrives.
        """
        sleep_second = int(conf.SLEEP_SECOND)
        max_sleep_second = int(conf.MAX_SLEEP_SECOND)

        while self.switch:
            # 1. Fetch one document task from the queue.
            doc, business_type = self.get_doc_info()
            # Empty queue: back off before polling again.
            if doc is None:
                time.sleep(sleep_second)
                sleep_second = min(max_sleep_second, sleep_second + 5)
                continue
            sleep_second = int(conf.SLEEP_SECOND)

            try:
                start_time = time.time()
                # 2. Fetch the PDF from EDMS.
                doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type)

                # 3. Extract images from the PDF.
                img_save_path = os.path.join(doc_data_path, 'img')
                self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format(
                    self.log_base, business_type, doc.id))
                pdf_handler = PDFHandler(pdf_path, img_save_path)
                pdf_handler.extract_image()
                self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format(
                    self.log_base, business_type, doc.id))

                # 4. OCR every image and build the Excel workbook.
                bs_summary = {}
                license_summary = {}
                unknown_summary = {}
                res_list = []
                # Keyword lists drive the statement-sheet rebuild inside BSWorkbook.
                interest_keyword = Keywords.objects.filter(
                    type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
                salary_keyword = Keywords.objects.filter(
                    type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
                loan_keyword = Keywords.objects.filter(
                    type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list(
                    'keyword', flat=True)
                wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)

                # 4.1 Gather OCR results concurrently, one coroutine per image.
                # NOTE(review): asyncio.get_event_loop() and passing bare
                # coroutines to asyncio.wait() are deprecated on modern Python;
                # asyncio.run(asyncio.gather(*tasks)) is the replacement.
                # asyncio.wait also raises on an empty task list (a PDF with no
                # images) — confirm extract_image always yields at least one.
                loop = asyncio.get_event_loop()
                tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
                         for img_path in pdf_handler.img_path_list]
                loop.run_until_complete(asyncio.wait(tasks))

                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
                                      '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
                                                                     unknown_summary, license_summary))

                merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)

                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
                                      '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type,
                                                                                    doc.id, merged_bs_summary,
                                                                                    unknown_summary, res_list))
                del unknown_summary

                # 4.2 Save the raw sheets, then rebuild into the final layout.
                wb.save(src_excel_path)
                wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
                wb.save(excel_path)
            except EDMSException as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            except Exception as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            else:
                try:
                    # 5. Upload the workbook to EDMS, with retries.
                    for times in range(consts.RETRY_TIMES):
                        try:
                            self.edms.upload(excel_path, doc, business_type)
                        except Exception as e:
                            self.cronjob_log.warn(
                                '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
                                '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
                            edms_exc = str(e)
                        else:
                            break
                    else:
                        # for/else: every retry failed.
                        raise EDMSException(edms_exc)
                except Exception as e:
                    doc.status = DocStatus.UPLOAD_FAILED.value
                    doc.save()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
                    self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
                                           '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id,
                                                                               speed_time, e))
                    # Archive the extracted images either way for later inspection.
                    write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

                else:
                    doc.status = DocStatus.COMPLETE.value
                    doc.save()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
                    self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] '
                                          '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time))
                    write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

        self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
1 | import re | ||
2 | import os | ||
3 | import ast | ||
4 | import datetime | ||
5 | from openpyxl import Workbook | ||
6 | from django.core.management import BaseCommand | ||
7 | from settings import conf | ||
8 | from common.mixins import LoggerMixin | ||
9 | from apps.doc.models import HILDoc, AFCDoc | ||
10 | from apps.doc import consts | ||
11 | |||
12 | |||
class Command(BaseCommand, LoggerMixin):
    """Dump the ID-card fields recorded in one day's idcard log into Excel.

    Reads conf.LOG_DIR/idcard.log.<date>, parses each '[task=...] [idcard=...]'
    record, resolves the application id from the matching doc row, and appends
    one row per recognised licence to the AFC/HIL sheet of the output workbook.
    """

    def __init__(self):
        super().__init__()
        # One worksheet per business line.
        self.sheet_names = ('AFC', 'HIL')
        # Column headers: application no, ID number, ethnicity, timestamp.
        self.header = ('申请号', '身份证号', '民族', '时间戳')

    def add_arguments(self, parser):
        parser.add_argument(
            '--date',
            default=datetime.date.today() - datetime.timedelta(days=1),
            dest='date',
            help='将要计算的日期,格式: 2018-01-01'
        )

    def handle(self, *args, **kwargs):
        # --date arrives as a str from the CLI, or as a date (the default).
        date = kwargs.get('date')
        if isinstance(date, str):
            if not re.match(r'\d{4}-\d{2}-\d{2}', date):
                print('date format error')
                return
            date_str = date
        else:
            date_str = date.strftime('%Y-%m-%d')

        excel_dir = os.path.join(conf.DATA_DIR, 'AFC', 'Logs')
        if not os.path.exists(excel_dir):
            print('excel dir not exists')
            return
        excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(date_str))
        log_path = os.path.join(conf.LOG_DIR, 'idcard.log.{0}'.format(date_str))
        if not os.path.exists(log_path):
            print('log_path not exists')
            return

        wb = Workbook()
        for name in self.sheet_names:
            ws = wb.create_sheet(name)
            ws.append(self.header)
        # Drop the default sheet openpyxl creates.
        wb.remove(wb.get_sheet_by_name('Sheet'))

        # BUG FIX: the original pattern r'[(.*)] [task=(.*)] [idcard=(.*)]'
        # left the brackets unescaped, so regex read them as character
        # classes and the pattern could never match the literal
        # '[task=...] [idcard=...]' fields; it also called .group() on a
        # possibly-None match. Escape the brackets, capture the task
        # non-greedily, and skip lines that do not match.
        line_pattern = re.compile(r'\[task=(.*?)\] \[idcard=(.*)\]')
        with open(log_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                search_obj = line_pattern.search(line)
                if search_obj is None:
                    continue  # unrelated or malformed log line
                task_str = search_obj.group(1)
                # NOTE(review): assumes the logged value parses back into the
                # classify-keyed mapping that license_summary.get() expects
                # below — confirm against the writer of the idcard log.
                license_summary = ast.literal_eval(search_obj.group(2))
                business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
                doc_id = int(doc_id_str)
                doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
                application_id = doc_class.objects.filter(id=doc_id).values_list('application_id', flat=True)
                if not application_id:
                    # Doc row is gone; nothing to attribute the record to.
                    continue

                for classify, (_, name, field_order, side_diff, _, _) in consts.LICENSE_ORDER:
                    license_list = license_summary.get(classify)
                    if not license_list:
                        continue
                    ws = wb.get_sheet_by_name(name)
                    for license_dict in license_list:
                        # Subtype '1' under IC_CLASSIFY is a residence permit:
                        # reroute it to its own classify bucket.
                        if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1':
                            license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict)
                            continue
                        if side_diff:
                            # Field order depends on which side of the licence
                            # was recognised, detected via a marker key.
                            key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify)
                            field_order = field_order_yes if key in license_dict else field_order_no
                        all_value = []
                        for search_field, write_field in field_order:
                            if write_field is None:
                                continue
                            field_value = license_dict.get(search_field, '')
                            if isinstance(field_value, list):
                                all_value.append('\n'.join(field_value))
                            else:
                                all_value.append(field_value)
                        ws.append((application_id[0], *all_value))
        wb.save(excel_path)
... | @@ -641,14 +641,14 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -641,14 +641,14 @@ class Command(BaseCommand, LoggerMixin): |
641 | '[license_summary={4}]'.format(self.log_base, task_str, bs_summary, | 641 | '[license_summary={4}]'.format(self.log_base, task_str, bs_summary, |
642 | unknown_summary, license_summary)) | 642 | unknown_summary, license_summary)) |
643 | 643 | ||
644 | self.license_log.info('[license_summary={0}]'.format(license_summary)) | 644 | self.license_log.info('[task={0}] [license_summary={1}]'.format(task_str, license_summary)) |
645 | idcard_list = license_summary.get(consts.IC_CLASSIFY) | 645 | idcard_list = license_summary.get(consts.IC_CLASSIFY) |
646 | if idcard_list: | 646 | if idcard_list: |
647 | self.idcard_log.info('[idcard={0}]'.format(idcard_list)) | 647 | self.idcard_log.info('[task={0}] [idcard={1}]'.format(task_str, idcard_list)) |
648 | 648 | ||
649 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | 649 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) |
650 | 650 | ||
651 | self.bs_log.info('[bs_summary={0}]'.format(merged_bs_summary)) | 651 | self.bs_log.info('[task={0}] [bs_summary={1}]'.format(task_str, merged_bs_summary)) |
652 | 652 | ||
653 | self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] ' | 653 | self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] ' |
654 | '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary, | 654 | '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary, | ... | ... |
-
Please register or sign in to post a comment