9e8023a0 by 周伟奇

Merge branch 'feature/main2' into feature/main

2 parents 548dc7a0 c001972a
1 import os
2 import time
3 import json
4 import signal
5 import asyncio
6 import aiohttp
7 import difflib
8 import base64
9 import requests
10 from datetime import datetime, date
11 from collections import Counter
12 from apps.doc.ocr.wb import BSWorkbook, Workbook
13 from django.core.management import BaseCommand
14
15 from settings import conf
16 from common.mixins import LoggerMixin
17 from common.tools.file_tools import write_zip_file
18 from common.tools.pdf_to_img import PDFHandler
19 from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
20 from apps.doc.named_enum import KeywordsType
21 from apps.doc import consts
22 from apps.doc.ocr.edms import EDMS, rh
23 from apps.doc.exceptions import EDMSException
24
25
26 class Command(BaseCommand, LoggerMixin):
27
28 def __init__(self):
29 super().__init__()
30 self.log_base = '[doc ocr process]'
31 # switch that controls whether files keep being processed
32 self.switch = True
33 # data directory
34 self.data_dir = conf.DATA_DIR
35 # OCR service endpoints
36 self.ocr_url_1 = conf.OCR_URL_1
37 self.ocr_url_2 = conf.OCR_URL_2
38 self.ocr_url_3 = conf.BC_URL
39 # EDMS web_service_api
40 self.edms = EDMS()
41 # graceful shutdown on signal 15 (SIGTERM)
42 signal.signal(signal.SIGTERM, self.signal_handler)
43
44 def signal_handler(self, sig, frame):
45 self.switch = False # stop picking up new files
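# handle() re-checks self.switch at the top of its while-loop, so the document currently
# being processed is finished (and uploaded) before the worker exits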
46
47 def get_doc_info(self):
48 task_str, is_priority = rh.dequeue()
49 if task_str is None:
50 self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
51 return None, None
52
53 business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
54 doc_id = int(doc_id_str)
55 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
56 # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
57 # 'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
58 doc = doc_class.objects.filter(id=doc_id).first()
59 if doc is None:
60 self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
61 self.log_base, task_str, is_priority))
62 return None, None
63 elif doc.status != DocStatus.INIT.value:
64 self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
65 '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
66 return None, None
67 doc.status = DocStatus.PROCESSING.value
68 doc.save()
69 self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
70 self.log_base, task_str, is_priority))
71 return doc, business_type
72
73 def pdf_download(self, doc, business_type):
74 doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
75 os.makedirs(doc_data_path, exist_ok=True)
76 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
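# applications whose ID carries the fixed prefix skip the EDMS download entirely
# (the PDF is presumably already in place); everything else is retried up to
# consts.RETRY_TIMES, and the for/else below raises EDMSException only if every attempt failed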
77 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
78 for times in range(consts.RETRY_TIMES):
79 try:
80 self.edms.download(pdf_path, doc.metadata_version_id)
81 except Exception as e:
82 self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
83 '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
84 edms_exc = str(e)
85 else:
86 break
87 else:
88 raise EDMSException(edms_exc)
89
90 excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
91 src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
92 self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
93 self.log_base, business_type, doc.id, pdf_path))
94 return doc_data_path, excel_path, src_excel_path, pdf_path
95
96 def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino):
97 sheets = ocr_data.get('data', [])
98 if not sheets:
99 res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
100 return
101 confidence = ocr_data.get('confidence', 1)
102 img_name = 'page_{0}_img_{1}'.format(pno, ino)
103 cells_exists = False
104 for i, sheet in enumerate(sheets):
105 cells = sheet.get('cells')
106 if not cells:
107 continue
108 cells_exists = True
109 sheet_name = '{0}_{1}'.format(img_name, i)
110 ws = wb.create_sheet(sheet_name)
111 for cell in cells:
112 c1 = cell.get('start_column')
113 r1 = cell.get('start_row')
114 words = cell.get('words')
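# openpyxl cells are 1-based, so the (presumably 0-based) OCR start_row/start_column are shifted by one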
115 ws.cell(row=r1 + 1, column=c1 + 1, value=words)
116
117 # summary layout: [0] account name (used as the role), [1] card number, [2] page number, [3] receipt verification code, [4] print time, [5] start date, [6] end date
118 summary = sheet.get('summary')
119 card = summary[1]
120 if card is None:
121 classify_dict = unknown_summary.setdefault(classify, {})
122 role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0]
123 role_dict = classify_dict.setdefault(role, {})
124 role_dict['classify'] = classify
125 role_dict['role'] = role
126 role_dict.setdefault('sheet', []).append(sheet_name)
127 role_dict.setdefault('confidence', []).append(confidence)
128 code_list = role_dict.setdefault('code', [])
129 pt_list = role_dict.setdefault('print_time', [])
130 sd_list = role_dict.setdefault('start_date', [])
131 ed_list = role_dict.setdefault('end_date', [])
132 if summary[3] is not None:
133 code_list.append((summary[2], summary[3]))
134 if summary[4] is not None:
135 pt_list.append(summary[4])
136 if summary[5] is not None:
137 sd_list.append(summary[5])
138 if summary[6] is not None:
139 ed_list.append(summary[6])
140 else:
141 card_dict = bs_summary.setdefault(card, {})
142 card_dict['count'] = card_dict.get('count', 0) + 1
143 card_dict.setdefault('classify', []).append(classify)
144 card_dict.setdefault('confidence', []).append(confidence)
145 card_dict.setdefault('sheet', []).append(sheet_name)
146 role_list = card_dict.setdefault('role', [])
147 role_set = card_dict.setdefault('role_set', set())
148 code_list = card_dict.setdefault('code', [])
149 pt_list = card_dict.setdefault('print_time', [])
150 sd_list = card_dict.setdefault('start_date', [])
151 ed_list = card_dict.setdefault('end_date', [])
152 if summary[0] is not None:
153 role_list.append(summary[0])
154 role_set.add(summary[0])
155 if summary[3] is not None:
156 code_list.append((summary[2], summary[3]))
157 if summary[4] is not None:
158 pt_list.append(summary[4])
159 if summary[5] is not None:
160 sd_list.append(summary[5])
161 if summary[6] is not None:
162 ed_list.append(summary[6])
163
164 if cells_exists:
165 res_list.append((pno, ino, consts.RES_SUCCESS))
166 else:
167 res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
168
169 def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino):
170 # category: '0' = ID card, '1' = residence permit
171 license_data = ocr_data.get('data', [])
172 if not license_data:
173 res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
174 return
175 res_list.append((pno, ino, consts.RES_SUCCESS))
176 license_summary.setdefault(classify, []).extend(license_data)
177
178 def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino):
179 if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
180 res_list.append((pno, ino, consts.RES_SUCCESS))
181 if pid == consts.BC_PID:
182 # bank card
183 # res_dict = {}
184 # for en_key, chn_key in consts.BC_FIELD:
185 # res_dict[chn_key] = ocr_res_2.get(en_key, '')
186 license_summary.setdefault(classify, []).append(ocr_res_2)
187 else:
188 # business licence and similar documents
189 for result_dict in ocr_res_2.get('ResultList', []):
190 res_dict = {}
191 for field_dict in result_dict.get('FieldList', []):
192 res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
193 license_summary.setdefault(classify, []).append(res_dict)
194 else:
195 res_list.append((pno, ino, consts.RES_FAILED))
196
197 @staticmethod
198 async def fetch_ocr_1_result(url, json_data):
199 async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
200 async with session.post(url, json=json_data) as response:
201 if response.status == 200:
202 return await response.json()
203
204 @staticmethod
205 async def fetch_ocr_2_result(url, json_data):
206 async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
207 async with session.post(url, data=json_data) as response:
208 if response.status == 200:
209 return await response.text()
210
211 @staticmethod
212 async def fetch_bc_name_result(url, json_data):
213 async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
214 async with session.post(url, json=json_data) as response:
215 if response.status == 200:
216 return await response.json()
217
218 async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list):
219 pno, ino = self.parse_img_path(img_path)
220 with open(img_path, 'rb') as f:
221 base64_data = base64.b64encode(f.read())
222 # decode the base64 bytes into a str for the JSON payload
223 file_data = base64_data.decode()
224 json_data_1 = {
225 "file": file_data
226 }
227 ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
228 if ocr_res_1 is None:
229 res_list.append((pno, ino, consts.RES_FAILED))
230 self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path))
231 # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
232 else:
233 self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format(
234 self.log_base, img_path, ocr_res_1))
235
236 if ocr_res_1.get('code') == 1:
237 ocr_data = ocr_res_1.get('data', {})
238 classify = ocr_data.get('classify')
239 if classify is None:
240 res_list.append((pno, ino, consts.RES_FAILED))
241 self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
242 self.log_base, img_path, ocr_res_1))
243 return
244 elif classify in consts.OTHER_CLASSIFY_SET: # 'other' category
245 res_list.append((pno, ino, consts.RES_SUCCESS_OTHER))
246 return
247 elif classify in consts.LICENSE_CLASSIFY_SET_1: # licence group 1
248 self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino)
249 elif classify in consts.LICENSE_CLASSIFY_SET_2: # licence group 2
250 pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
251 json_data_2 = {
252 "pid": str(pid),
253 # "key": conf.OCR_KEY,
254 # "secret": conf.OCR_SECRET,
255 "filedata": file_data
256 }
257 ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
258 if ocr_res_2 is None:
259 res_list.append((pno, ino, consts.RES_FAILED))
260 self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path))
261 # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
262 else:
263 # recognition result
264 ocr_res_2 = json.loads(ocr_res_2)
265 self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format(
266 self.log_base, img_path, ocr_res_2))
267 if classify == consts.BC_CLASSIFY:
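# bank card: ask the third service (ocr_url_3 / BC_URL) whether a holder name is present
# on the card image and record '有'/'无' ("present"/"absent") in the recognition result below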
268 name = '有'
269 json_data_1['card_res'] = ocr_res_2
270 card_name_res = await self.fetch_bc_name_result(self.ocr_url_3, json_data_1)
271 if isinstance(card_name_res, dict) and \
272 card_name_res.get('data', {}).get('is_exists_name') == 0:
273 name = '无'
274 ocr_res_2['Name'] = name
275 self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino)
276 else: # bank statement processing
277 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino)
278 else:
279 res_list.append((pno, ino, consts.RES_FAILED))
280 self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
281 self.log_base, img_path, ocr_res_1))
282
283 # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
284 # # bank statement
285 # # res = {
286 # # 'code': 1,
287 # # 'msg': 'success',
288 # # 'data': {
289 # # 'classify': 0,
290 # # 'confidence': 0.999,
291 # # 'data': [
292 # # {
293 # # 'summary': ['account name', 'card number', 'page number', 'receipt verification code', 'print time', 'start date', 'end date'],
294 # # 'cells': []
295 # # },
296 # # {
297 # # 'summary': ['account name', 'card number', 'page number', 'receipt verification code', 'print time', 'start date', 'end date'],
298 # # 'cells': []
299 # # }
300 # # ]
301 # # }
302 # # }
303 # #
304 # # # licence group 1
305 # # res = {
306 # # 'code': 1,
307 # # 'msg': 'success',
308 # # 'data': {
309 # # 'classify': 0,
310 # # 'confidence': 0.999,
311 # # 'data': [
312 # # {
313 # # 'cn_key': 'value',
314 # # 'cn_key': 'value',
315 # # },
316 # # {
317 # # 'cn_key': 'value',
318 # # 'cn_key': 'value',
319 # # },
320 # # ]
321 # # }
322 # # }
323 # #
324 # # # licence group 2 or 'other' category
325 # # res = {
326 # # 'code': 1,
327 # # 'msg': 'success',
328 # # 'data': {
329 # # 'classify': 0,
330 # # 'confidence': 0.999,
331 # # }
332 # # }
333 # with open(img_path, 'rb') as f:
334 # base64_data = base64.b64encode(f.read())
335 # # decode the base64 bytes into a str
336 # file_data = base64_data.decode()
337 # json_data_1 = {
338 # "file": file_data
339 # }
340 # response_1 = requests.post(self.ocr_url_1, json=json_data_1)
341 # if response_1.status_code == 200:
342 # ocr_res_1 = response_1.json()
343 # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
344 # self.log_base, img_path, ocr_res_1))
345 #
346 # if ocr_res_1.get('code') == 1:
347 # ocr_data = ocr_res_1.get('data', {})
348 # classify = ocr_data.get('classify')
349 # if classify is None:
350 # skip_img.append(self.parse_img_path(img_path))
351 # return
352 # elif classify in consts.OTHER_CLASSIFY_SET: # 'other' category
353 # skip_img.append(self.parse_img_path(img_path))
354 # return
355 # elif classify in consts.LICENSE_CLASSIFY_SET_1: # licence group 1
356 # self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
357 # elif classify in consts.LICENSE_CLASSIFY_SET_2: # licence group 2
358 # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
359 # json_data_2 = {
360 # "pid": str(pid),
361 # "key": conf.OCR_KEY,
362 # "secret": conf.OCR_SECRET,
363 # "file": file_data
364 # }
365 # response_2 = requests.post(self.ocr_url_2, data=json_data_2)
366 # if response_2.status_code == 200:
367 # # recognition result
368 # ocr_res_2 = response_2.json()
369 # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
370 # self.log_base, img_path, ocr_res_2))
371 # self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
372 # else:
373 # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
374 # else: # bank statement processing
375 # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
376 # else:
377 # skip_img.append(self.parse_img_path(img_path))
378 # else:
379 # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
380
381 @staticmethod
382 def parse_img_path(img_path):
383 img_name, _ = os.path.splitext(os.path.basename(img_path))
384 part_list = img_name.split('_')
385 # page_7_img_11_0
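# e.g. 'page_7_img_11_0' -> (8, 12): the 0-based page/image indices are shifted to 1-based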
386 return int(part_list[1])+1, int(part_list[3])+1
387
388 @staticmethod
389 def get_most(value_list):
390 if value_list:
391 most_common = Counter(value_list).most_common(1)
392 return most_common[0][0] if most_common else None
393
394 @staticmethod
395 def date_format(date_str, format_str):
396 try:
397 date_res = datetime.strptime(date_str, format_str).date()
398 except Exception as e:
399 return
400 else:
401 return date_res
402
403 def get_validate_date(self, date_list):
404 for date_str in date_list:
405 for format_str in consts.DATE_FORMAT:
406 date_res = self.date_format(date_str, format_str)
407 if isinstance(date_res, date):
408 return date_res
409
410 def merge_card(self, bs_summary):
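# cards are processed from most to least frequent; any remaining card number that is
# sufficiently similar (difflib quick_ratio above consts.CARD_RATIO, e.g. an OCR misread
# of a single digit) is folded into the more frequent card's summary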
411 merged_bs_summary = {}
412 sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True)
413 for main_card in sorted_card:
414 if bs_summary.get(main_card) is None:
415 continue
416 merged_bs_summary[main_card] = bs_summary.pop(main_card)
417 del merged_bs_summary[main_card]['count']
418 merge_cards = []
419 for card in bs_summary.keys():
420 if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO:
421 merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify'])
422 merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
423 merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet'])
424 merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role'])
425 merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set'])
426 merged_bs_summary[main_card]['code'].extend(bs_summary[card]['code'])
427 merged_bs_summary[main_card]['print_time'].extend(bs_summary[card]['print_time'])
428 merged_bs_summary[main_card]['start_date'].extend(bs_summary[card]['start_date'])
429 merged_bs_summary[main_card]['end_date'].extend(bs_summary[card]['end_date'])
430 merge_cards.append(card)
431 for card in merge_cards:
432 del bs_summary[card]
433 merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify'])
434 merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role'])
435 del bs_summary
436 return merged_bs_summary
437
438 def prune_bs_summary(self, bs_summary):
439 for summary in bs_summary.values():
440 del summary['count']
441 summary['classify'] = self.get_most(summary['classify'])
442 summary['role'] = self.get_most(summary['role'])
443 return bs_summary
444
445 def rebuild_bs_summary(self, bs_summary, unknown_summary):
446 # bs_summary = {
447 # '卡号': {
448 # 'count': 100,
449 # 'classify': [],
450 # 'confidence': [],
451 # 'role': [],
452 # 'code': [('page', 'code')],
453 # 'print_time': [],
454 # 'start_date': [],
455 # 'end_date': [],
456 # 'sheet': ['sheet_name']
457 # }
458 # }
459 #
460 # unknown_summary = {
461 # 0: {
462 # '户名': {
463 # 'classify': 0,
464 # 'confidence': [],
465 # 'role': '户名',
466 # 'code': [('page', 'code')],
467 # 'print_time': [],
468 # 'start_date': [],
469 # 'end_date': [],
470 # 'sheet': ['sheet_name']
471 # }
472 # }
473 # }
474 # no card number recognised
475 if len(bs_summary) == 0:
476 del bs_summary
477 merged_bs_summary = {}
478 card_num = 1
479 for role_dict in unknown_summary.values():
480 if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict:
481 summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {})
482 for summary in role_dict.values():
483 summary_dict['confidence'].extend(summary['confidence'])
484 summary_dict['role'] = summary['role']
485 summary_dict['code'].extend(summary['code'])
486 summary_dict['print_time'].extend(summary['print_time'])
487 summary_dict['start_date'].extend(summary['start_date'])
488 summary_dict['end_date'].extend(summary['end_date'])
489 summary_dict['sheet'].extend(summary['sheet'])
490 card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
491 card_num += 1  # advance the counter so a later unknown group cannot overwrite this key
492 merged_bs_summary[card] = summary_dict
492 else:
493 for summary in role_dict.values():
494 card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
495 card_num += 1
496 merged_bs_summary[card] = summary
497 else:
498 # exactly one card number
499 one_card = False
500 if len(bs_summary) == 1:
501 merged_bs_summary = self.prune_bs_summary(bs_summary)
502 one_card = True
503 # multiple card numbers
504 else:
505 merged_bs_summary = self.merge_card(bs_summary)
506
507 for card_summary in merged_bs_summary.values():
508 merge_role = []
509 classify_summary = unknown_summary.get(card_summary['classify'], {})
510 for role, summary in classify_summary.items():
511 if one_card or role in card_summary['role_set']:
512 merge_role.append(role)
513 card_summary['confidence'].extend(summary['confidence'])
514 card_summary['sheet'].extend(summary['sheet'])
515 card_summary['code'].extend(summary['code'])
516 card_summary['print_time'].extend(summary['print_time'])
517 card_summary['start_date'].extend(summary['start_date'])
518 card_summary['end_date'].extend(summary['end_date'])
519
520 for role in merge_role:
521 del classify_summary[role]
522
523 card_num = 1
524 for role_dict in unknown_summary.values():
525 for summary in role_dict.values():
526 card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
527 card_num += 1
528 merged_bs_summary[card] = summary
529
530 del unknown_summary
531 for summary in merged_bs_summary.values():
532 if summary.get('role_set') is not None:
533 del summary['role_set']
534 summary['print_time'] = self.get_validate_date(summary['print_time'])
535 summary['start_date'] = self.get_validate_date(summary['start_date'])
536 summary['end_date'] = self.get_validate_date(summary['end_date'])
537 summary['confidence'] = max(summary['confidence'])
538 return merged_bs_summary
539
540 # TODO refine document statuses: treat different failure states differently (return to queue, retry strategy)
541 # TODO email notification on failures
542 # recognition failure: ordinary exceptions such as a broken PDF or an error while building the workbook
543 # EDMS failure: download error --> back to queue --> email; upload error --> re-upload queue --> email
544 # algorithm failure: first OCR pass error --> recognition failed --> email; second OCR pass error --> recognition failed --> email
545 # TODO retry OCR API calls
546 def handle(self, *args, **kwargs):
547 sleep_second = int(conf.SLEEP_SECOND)
548 max_sleep_second = int(conf.MAX_SLEEP_SECOND)
549
550 while self.switch:
551 # 1. fetch the next document task from the queue
552 doc, business_type = self.get_doc_info()
553 # queue empty: back off before polling again
554 if doc is None:
555 time.sleep(sleep_second)
556 sleep_second = min(max_sleep_second, sleep_second + 5)
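# linear back-off: +5 seconds per empty poll, capped at MAX_SLEEP_SECOND; reset below once a task arrives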
557 continue
558 sleep_second = int(conf.SLEEP_SECOND)
559
560 try:
561 start_time = time.time()
562 # 2. download the PDF from EDMS
563 doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type)
564
565 # 3. extract images from the PDF
566 img_save_path = os.path.join(doc_data_path, 'img')
567 self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format(
568 self.log_base, business_type, doc.id))
569 pdf_handler = PDFHandler(pdf_path, img_save_path)
570 pdf_handler.extract_image()
571 self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format(
572 self.log_base, business_type, doc.id))
573
574 # 4. run OCR and build the Excel workbook
575 bs_summary = {}
576 license_summary = {}
577 unknown_summary = {}
578 res_list = []
579 interest_keyword = Keywords.objects.filter(
580 type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
581 salary_keyword = Keywords.objects.filter(
582 type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
583 loan_keyword = Keywords.objects.filter(
584 type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list(
585 'keyword', flat=True)
586 wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)
587
588 # wb = Workbook()
589
590 # 4.1 collect the OCR results
591 loop = asyncio.get_event_loop()
592 tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
593 for img_path in pdf_handler.img_path_list]
594 loop.run_until_complete(asyncio.wait(tasks))
595 # loop.close()
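# note: passing bare coroutines to asyncio.wait() is deprecated on newer Python (3.8+)
# and rejected from 3.11 on; if the runtime is upgraded, wrapping each coroutine with
# asyncio.ensure_future() or switching to loop.run_until_complete(asyncio.gather(*tasks))
# would be the equivalent, safer call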
596
597 # for img_path in pdf_handler.img_path_list:
598 # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
599
600 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
601 '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
602 unknown_summary, license_summary))
603
604 merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
605
606 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
607 '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type,
608 doc.id, merged_bs_summary,
609 unknown_summary, res_list))
610 del unknown_summary
611
612 # 4.2 rebuild the Excel file
613 wb.save(src_excel_path)
614 wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
615 wb.save(excel_path)
616 except EDMSException as e:
617 doc.status = DocStatus.PROCESS_FAILED.value
618 doc.save()
619 self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
620 '[err={3}]'.format(self.log_base, business_type, doc.id, e))
621 except Exception as e:
622 doc.status = DocStatus.PROCESS_FAILED.value
623 doc.save()
624 self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
625 '[err={3}]'.format(self.log_base, business_type, doc.id, e))
626 else:
627 try:
628 # 5. upload to EDMS
629 for times in range(consts.RETRY_TIMES):
630 try:
631 self.edms.upload(excel_path, doc, business_type)
632 except Exception as e:
633 self.cronjob_log.warn(
634 '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
635 '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
636 edms_exc = str(e)
637 else:
638 break
639 else:
640 raise EDMSException(edms_exc)
641 except Exception as e:
642 doc.status = DocStatus.UPLOAD_FAILED.value
643 doc.save()
644 end_time = time.time()
645 speed_time = int(end_time - start_time)
646 self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
647 '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id,
648 speed_time, e))
649 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
650
651 else:
652 doc.status = DocStatus.COMPLETE.value
653 doc.save()
654 end_time = time.time()
655 speed_time = int(end_time - start_time)
656 self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] '
657 '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time))
658 write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
659
660 self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
1 import re
2 import os
3 import ast
4 import datetime
5 from openpyxl import Workbook
6 from django.core.management import BaseCommand
7 from settings import conf
8 from common.mixins import LoggerMixin
9 from apps.doc.models import HILDoc, AFCDoc
10 from apps.doc import consts
11
12
13 class Command(BaseCommand, LoggerMixin):
14
15 def __init__(self):
16 super().__init__()
17 self.sheet_names = ('AFC', 'HIL')
18 self.header = ('申请号', '身份证号', '民族', '时间戳')
19
20 def add_arguments(self, parser):
21 parser.add_argument(
22 '--date',
23 default=datetime.date.today() - datetime.timedelta(days=1),
24 dest='date',
25 help='Date to process, format: 2018-01-01'
26 )
27
28 def handle(self, *args, **kwargs):
29 date = kwargs.get('date')
30 if isinstance(date, str):
31 if not re.match(r'\d{4}-\d{2}-\d{2}', date):
32 print('date format error')
33 return
34 date_str = date
35 else:
36 date_str = date.strftime('%Y-%m-%d')
37
38 excel_dir = os.path.join(conf.DATA_DIR, 'AFC', 'Logs')
39 if not os.path.exists(excel_dir):
40 print('excel dir not exists')
41 return
42 excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(date_str))
43 log_path = os.path.join(conf.LOG_DIR, 'idcard.log.{0}'.format(date_str))
44 if not os.path.exists(log_path):
45 print('log_path not exists')
46 return
47
48 wb = Workbook()
49 for name in self.sheet_names:
50 ws = wb.create_sheet(name)
51 ws.append(self.header)
52 wb.remove(wb.get_sheet_by_name('Sheet'))
53
54 with open(log_path, 'r', encoding='utf-8') as fp:
55 for line in fp:
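# each line of idcard.log is expected to carry the idcard_log format
# '[task=...] [idcard=[...]]' (plus whatever prefix the logger itself adds)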
56 search_obj = re.search(r'\[task=(.*?)\] \[idcard=(.*)\]', line)
57 task_str = search_obj.group(1)
58 license_summary = ast.literal_eval(search_obj.group(2))
59 business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
60 doc_id = int(doc_id_str)
61 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
62 application_id = doc_class.objects.filter(id=doc_id).values_list('application_id', flat=True)
63
64 for classify, (_, name, field_order, side_diff, _, _) in consts.LICENSE_ORDER:
65 license_list = license_summary.get(classify)
66 if not license_list:
67 continue
68 ws = wb.get_sheet_by_name(name)
69 for license_dict in license_list:
70 if classify == consts.IC_CLASSIFY and license_dict.get('类别') == '1': # residence permit handling
71 license_summary.setdefault(consts.RP_CLASSIFY, []).append(license_dict)
72 continue
73 if side_diff:
74 key, field_order_yes, field_order_no = consts.FIELD_ORDER_MAP.get(classify)
75 field_order = field_order_yes if key in license_dict else field_order_no
76 all_value = []
77 for search_field, write_field in field_order:
78 if write_field is None:
79 continue
80 field_value = license_dict.get(search_field, '')
81 if isinstance(field_value, list):
82 all_value.append('\n'.join(field_value))
83 else:
84 all_value.append(field_value)
85 ws.append((application_id[0], *all_value))
86 wb.save(excel_path)
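# usage sketch (the management command's module name is not shown in this diff;
# 'export_idcard' below is only a placeholder):
#   python manage.py export_idcard --date 2020-06-01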
@@ -641,14 +641,14 @@ class Command(BaseCommand, LoggerMixin):
     '[license_summary={4}]'.format(self.log_base, task_str, bs_summary,
                                    unknown_summary, license_summary))

-    self.license_log.info('[license_summary={0}]'.format(license_summary))
+    self.license_log.info('[task={0}] [license_summary={1}]'.format(task_str, license_summary))
     idcard_list = license_summary.get(consts.IC_CLASSIFY)
     if idcard_list:
-        self.idcard_log.info('[idcard={0}]'.format(idcard_list))
+        self.idcard_log.info('[task={0}] [idcard={1}]'.format(task_str, idcard_list))

     merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)

-    self.bs_log.info('[bs_summary={0}]'.format(merged_bs_summary))
+    self.bs_log.info('[task={0}] [bs_summary={1}]'.format(task_str, merged_bs_summary))

     self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] '
                           '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary,