15dccc97 by 周伟奇

Merge branch 'feature/main' into feature/mssql

2 parents 236b64e0 5e463cbd
...@@ -152,7 +152,7 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果') ...@@ -152,7 +152,7 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果')
152 # '借贷': ('贷', '借'), # 竖版-无表格-广发银行 152 # '借贷': ('贷', '借'), # 竖版-无表格-广发银行
153 # '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行 153 # '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行
154 # '收/支': ('收入', '支出'), # 横版-表格-北京银行 154 # '收/支': ('收入', '支出'), # 横版-表格-北京银行
155 BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支', '收支标志'} 155 BORROW_HEADERS_SET = {'借贷', '借\n贷', '借贷状态', '收/支', '收支标志'}
156 BORROW_INCOME_SET = {'贷', '收入', '收', '收(Cr)'} 156 BORROW_INCOME_SET = {'贷', '收入', '收', '收(Cr)'}
157 BORROW_OUTLAY_SET = {'借', '支出', '支', '付(Dr)'} 157 BORROW_OUTLAY_SET = {'借', '支出', '支', '付(Dr)'}
158 INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'} 158 INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'}
...@@ -165,6 +165,7 @@ HEADERS_MAPPING = {} ...@@ -165,6 +165,7 @@ HEADERS_MAPPING = {}
165 HEADERS_MAPPING.update( 165 HEADERS_MAPPING.update(
166 { 166 {
167 '借贷': BORROW_KEY, 167 '借贷': BORROW_KEY,
168 '借\n贷': BORROW_KEY,
168 '借贷状态': BORROW_KEY, 169 '借贷状态': BORROW_KEY,
169 '收支标志': BORROW_KEY, 170 '收支标志': BORROW_KEY,
170 '收/支': BORROW_KEY, 171 '收/支': BORROW_KEY,
......
...@@ -40,7 +40,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -40,7 +40,8 @@ class Command(BaseCommand, LoggerMixin):
40 print('excel dir not exists') 40 print('excel dir not exists')
41 return 41 return
42 excel_path = os.path.join(excel_dir, 'bs_{0}.xlsx'.format(date_str)) 42 excel_path = os.path.join(excel_dir, 'bs_{0}.xlsx'.format(date_str))
43 log_path = os.path.join(conf.LOG_DIR, 'bs.log.{0}'.format(date_str)) 43 # log_path = os.path.join(conf.LOG_DIR, 'bs.log.{0}'.format(date_str))
44 log_path = os.path.join(conf.LOG_DIR, 'bs_statistics.log.{0}'.format(date_str))
44 if not os.path.exists(log_path): 45 if not os.path.exists(log_path):
45 print('log_path not exists') 46 print('log_path not exists')
46 return 47 return
...@@ -48,7 +49,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -48,7 +49,8 @@ class Command(BaseCommand, LoggerMixin):
48 summary_dict = {} 49 summary_dict = {}
49 with open(log_path, 'r', encoding='utf-8') as fp: 50 with open(log_path, 'r', encoding='utf-8') as fp:
50 for line in fp: 51 for line in fp:
51 search_obj = re.search(r'task=(.*) merged_bs_summary=(.*)', line) 52 # search_obj = re.search(r'task=(.*) merged_bs_summary=(.*)', line)
53 search_obj = re.search(r'\[task=(.*)] \[bs_summary=(.*)]', line)
52 task_str = search_obj.group(1) 54 task_str = search_obj.group(1)
53 business_type, doc_id_str = task_str.split(consts.SPLIT_STR) 55 business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
54 doc_id = int(doc_id_str) 56 doc_id = int(doc_id_str)
......
1 import os
2 import time
3 import json
4 import signal
5 import asyncio
6 import aiohttp
7 import difflib
8 import base64
9 import requests
10 from datetime import datetime, date
11 from collections import Counter
12 from apps.doc.ocr.wb import BSWorkbook, Workbook
13 from django.core.management import BaseCommand
14
15 from settings import conf
16 from common.mixins import LoggerMixin
17 from common.tools.file_tools import write_zip_file
18 from common.tools.pdf_to_img import PDFHandler
19 from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
20 from apps.doc.named_enum import KeywordsType
21 from apps.doc import consts
22 from apps.doc.ocr.edms import EDMS, rh
23 from apps.doc.exceptions import EDMSException
24
25
class Command(BaseCommand, LoggerMixin):
    """Long-running worker: dequeues documents, downloads their PDF from EDMS,
    runs OCR on every extracted image, rebuilds a bank-statement excel workbook
    and uploads the result back to EDMS.
    """

    def __init__(self):
        """Wire up configuration, OCR endpoints, the EDMS client and SIGTERM handling."""
        super().__init__()
        self.log_base = '[doc ocr process]'
        # master switch polled by handle(); cleared by SIGTERM for a graceful exit
        self.switch = True
        # root directory for per-document working files
        self.data_dir = conf.DATA_DIR
        # OCR endpoints: 1 = classification/statement OCR, 2 = license OCR,
        # 3 = bank-card holder-name check (see img_2_ocr_2_wb)
        self.ocr_url_1 = conf.OCR_URL_1
        self.ocr_url_2 = conf.OCR_URL_2
        self.ocr_url_3 = conf.BC_URL
        # EDMS web-service client (PDF download / excel upload)
        self.edms = EDMS()
        # graceful shutdown on signal 15 (SIGTERM)
        signal.signal(signal.SIGTERM, self.signal_handler)
43
    def signal_handler(self, sig, frame):
        """SIGTERM handler: flip the run switch so handle() exits after the current document."""
        self.switch = False  # stop picking up new files
46
    def get_doc_info(self):
        """Pop the next task from the queue and claim its document row.

        Returns (doc, business_type), or (None, None) when the queue is empty,
        the document row does not exist, or the document is no longer in INIT
        state.  Side effect: flips the claimed document's status to PROCESSING.
        """
        task_str, is_priority = rh.dequeue()
        if task_str is None:
            self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
            return None, None

        # task format: "<business_type><SPLIT_STR><doc_id>"
        business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
        doc_id = int(doc_id_str)
        doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
        # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
        #     'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
        doc = doc_class.objects.filter(id=doc_id).first()
        if doc is None:
            self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
                self.log_base, task_str, is_priority))
            return None, None
        elif doc.status != DocStatus.INIT.value:
            # already claimed/processed by someone else (or manually changed)
            self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
                                  '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
            return None, None
        # claim the document before doing any work
        doc.status = DocStatus.PROCESSING.value
        doc.save()
        self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
            self.log_base, task_str, is_priority))
        return doc, business_type
72
    def pdf_download(self, doc, business_type):
        """Ensure the document's PDF is on disk and compute the working paths.

        Downloads from EDMS with up to consts.RETRY_TIMES attempts; raises
        EDMSException when every attempt fails.  Documents whose application id
        carries FIXED_APPLICATION_ID_PREFIX skip the download entirely
        (presumably their PDF is pre-placed on disk — TODO confirm).
        Returns (doc_data_path, excel_path, src_excel_path, pdf_path).
        """
        doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
        os.makedirs(doc_data_path, exist_ok=True)
        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
        if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
            # for/else retry loop: the else arm only runs when no attempt
            # broke out of the loop, i.e. every download failed
            for times in range(consts.RETRY_TIMES):
                try:
                    self.edms.download(pdf_path, doc.metadata_version_id)
                except Exception as e:
                    self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
                                          '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
                    edms_exc = str(e)
                else:
                    break
            else:
                raise EDMSException(edms_exc)

        excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
        src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
        self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
            self.log_base, business_type, doc.id, pdf_path))
        return doc_data_path, excel_path, src_excel_path, pdf_path
95
    def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino):
        """Write one image's bank-statement OCR tables into *wb* and accumulate
        summary information per card (or per role when no card number was read).

        wb              -- workbook receiving one sheet per recognized table
        ocr_data        -- OCR payload: {'data': [sheet, ...], 'confidence': float}
        bs_summary      -- accumulator keyed by card number, shared across images
        unknown_summary -- accumulator keyed by classify then role, for sheets
                           whose card number could not be read
        classify        -- statement classification id from the first OCR pass
        res_list        -- collects (page_no, image_no, result_code) tuples
        pno, ino        -- 1-based page and image numbers of the source image
        """
        sheets = ocr_data.get('data', [])
        if not sheets:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
        confidence = ocr_data.get('confidence', 1)
        img_name = 'page_{0}_img_{1}'.format(pno, ino)
        cells_exists = False
        for i, sheet in enumerate(sheets):
            cells = sheet.get('cells')
            if not cells:
                continue
            cells_exists = True
            sheet_name = '{0}_{1}'.format(img_name, i)
            ws = wb.create_sheet(sheet_name)
            # copy every OCR cell into the sheet; OCR coordinates are 0-based,
            # openpyxl rows/columns are 1-based
            for cell in cells:
                c1 = cell.get('start_column')
                r1 = cell.get('start_row')
                words = cell.get('words')
                ws.cell(row=r1 + 1, column=c1 + 1, value=words)

            # summary layout by index (original: ['户名', '卡号', '页码', '回单验证码',
            # '打印时间', '起始时间', '终止时间']): [0] account holder, [1] card number,
            # [2] page number, [3] receipt verification code, [4] print time,
            # [5] start date, [6] end date
            summary = sheet.get('summary')
            card = summary[1]
            if card is None:
                # no card number: aggregate under (classify, role) instead
                classify_dict = unknown_summary.setdefault(classify, {})
                role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0]
                role_dict = classify_dict.setdefault(role, {})
                role_dict['classify'] = classify
                role_dict['role'] = role
                role_dict.setdefault('sheet', []).append(sheet_name)
                role_dict.setdefault('confidence', []).append(confidence)
                code_list = role_dict.setdefault('code', [])
                pt_list = role_dict.setdefault('print_time', [])
                sd_list = role_dict.setdefault('start_date', [])
                ed_list = role_dict.setdefault('end_date', [])
                if summary[3] is not None:
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])
            else:
                # card number known: aggregate votes under the card
                card_dict = bs_summary.setdefault(card, {})
                card_dict['count'] = card_dict.get('count', 0) + 1
                card_dict.setdefault('classify', []).append(classify)
                card_dict.setdefault('confidence', []).append(confidence)
                card_dict.setdefault('sheet', []).append(sheet_name)
                role_list = card_dict.setdefault('role', [])
                role_set = card_dict.setdefault('role_set', set())
                code_list = card_dict.setdefault('code', [])
                pt_list = card_dict.setdefault('print_time', [])
                sd_list = card_dict.setdefault('start_date', [])
                ed_list = card_dict.setdefault('end_date', [])
                if summary[0] is not None:
                    role_list.append(summary[0])
                    role_set.add(summary[0])
                if summary[3] is not None:
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])

        # success only when at least one sheet contained cells
        if cells_exists:
            res_list.append((pno, ino, consts.RES_SUCCESS))
        else:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
168
169 def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino):
170 # 类别:'0'身份证, '1'居住证
171 license_data = ocr_data.get('data', [])
172 if not license_data:
173 res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
174 return
175 res_list.append((pno, ino, consts.RES_SUCCESS))
176 license_summary.setdefault(classify, []).extend(license_data)
177
178 def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino):
179 if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
180 res_list.append((pno, ino, consts.RES_SUCCESS))
181 if pid == consts.BC_PID:
182 # 银行卡
183 # res_dict = {}
184 # for en_key, chn_key in consts.BC_FIELD:
185 # res_dict[chn_key] = ocr_res_2.get(en_key, '')
186 license_summary.setdefault(classify, []).append(ocr_res_2)
187 else:
188 # 营业执照等
189 for result_dict in ocr_res_2.get('ResultList', []):
190 res_dict = {}
191 for field_dict in result_dict.get('FieldList', []):
192 res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
193 license_summary.setdefault(classify, []).append(res_dict)
194 else:
195 res_list.append((pno, ino, consts.RES_FAILED))
196
197 @staticmethod
198 async def fetch_ocr_1_result(url, json_data):
199 async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
200 async with session.post(url, json=json_data) as response:
201 if response.status == 200:
202 return await response.json()
203
204 @staticmethod
205 async def fetch_ocr_2_result(url, json_data):
206 async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
207 async with session.post(url, data=json_data) as response:
208 if response.status == 200:
209 return await response.text()
210
211 @staticmethod
212 async def fetch_bc_name_result(url, json_data):
213 async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
214 async with session.post(url, json=json_data) as response:
215 if response.status == 200:
216 return await response.json()
217
    async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list):
        """Run the full OCR pipeline for a single image and route the result.

        OCR-1 classifies the image; depending on the classify id the payload is
        handled as license type 1, license type 2 (second OCR call, plus a
        holder-name check for bank cards), or as a bank statement.  Every
        outcome is appended to *res_list* as (page_no, image_no, result_code).
        """
        pno, ino = self.parse_img_path(img_path)
        with open(img_path, 'rb') as f:
            base64_data = base64.b64encode(f.read())
            # decoded base64 payload, shared by all OCR requests for this image
            file_data = base64_data.decode()
        json_data_1 = {
            "file": file_data
        }
        ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
        if ocr_res_1 is None:
            res_list.append((pno, ino, consts.RES_FAILED))
            self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path))
            # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
        else:
            self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format(
                self.log_base, img_path, ocr_res_1))

            if ocr_res_1.get('code') == 1:
                ocr_data = ocr_res_1.get('data', {})
                classify = ocr_data.get('classify')
                if classify is None:
                    res_list.append((pno, ino, consts.RES_FAILED))
                    self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                        self.log_base, img_path, ocr_res_1))
                    return
                elif classify in consts.OTHER_CLASSIFY_SET:  # uninteresting classes
                    res_list.append((pno, ino, consts.RES_SUCCESS_OTHER))
                    return
                elif classify in consts.LICENSE_CLASSIFY_SET_1:  # license type 1
                    self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino)
                elif classify in consts.LICENSE_CLASSIFY_SET_2:  # license type 2
                    pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
                    json_data_2 = {
                        "pid": str(pid),
                        # "key": conf.OCR_KEY,
                        # "secret": conf.OCR_SECRET,
                        "filedata": file_data
                    }
                    ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
                    if ocr_res_2 is None:
                        res_list.append((pno, ino, consts.RES_FAILED))
                        self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path))
                        # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
                    else:
                        # OCR-2 responds with text; decode it as JSON
                        ocr_res_2 = json.loads(ocr_res_2)
                        self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format(
                            self.log_base, img_path, ocr_res_2))
                        if classify == consts.BC_CLASSIFY:
                            # bank card: ask the name-check service whether a
                            # holder name exists on the card ('有'/'无' = yes/no)
                            name = '有'
                            json_data_1['card_res'] = ocr_res_2
                            card_name_res = await self.fetch_bc_name_result(self.ocr_url_3, json_data_1)
                            if isinstance(card_name_res, dict) and \
                                    card_name_res.get('data', {}).get('is_exists_name') == 0:
                                name = '无'
                            ocr_res_2['Name'] = name
                        self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino)
                else:  # bank statement
                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino)
            else:
                res_list.append((pno, ino, consts.RES_FAILED))
                self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                    self.log_base, img_path, ocr_res_1))
282
283 # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
284 # # # 流水
285 # # res = {
286 # # 'code': 1,
287 # # 'msg': 'success',
288 # # 'data': {
289 # # 'classify': 0,
290 # # 'confidence': 0.999,
291 # # 'data': [
292 # # {
293 # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
294 # # 'cells': []
295 # # },
296 # # {
297 # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
298 # # 'cells': []
299 # # }
300 # # ]
301 # # }
302 # # }
303 # #
304 # # # 证件-1
305 # # res = {
306 # # 'code': 1,
307 # # 'msg': 'success',
308 # # 'data': {
309 # # 'classify': 0,
310 # # 'confidence': 0.999,
311 # # 'data': [
312 # # {
313 # # 'cn_key': 'value',
314 # # 'cn_key': 'value',
315 # # },
316 # # {
317 # # 'cn_key': 'value',
318 # # 'cn_key': 'value',
319 # # },
320 # # ]
321 # # }
322 # # }
323 # #
324 # # # 证件-2 or 其他类
325 # # res = {
326 # # 'code': 1,
327 # # 'msg': 'success',
328 # # 'data': {
329 # # 'classify': 0,
330 # # 'confidence': 0.999,
331 # # }
332 # # }
333 # with open(img_path, 'rb') as f:
334 # base64_data = base64.b64encode(f.read())
335 # # 获取解码后的base64值
336 # file_data = base64_data.decode()
337 # json_data_1 = {
338 # "file": file_data
339 # }
340 # response_1 = requests.post(self.ocr_url_1, json=json_data_1)
341 # if response_1.status_code == 200:
342 # ocr_res_1 = response_1.json()
343 # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
344 # self.log_base, img_path, ocr_res_1))
345 #
346 # if ocr_res_1.get('code') == 1:
347 # ocr_data = ocr_res_1.get('data', {})
348 # classify = ocr_data.get('classify')
349 # if classify is None:
350 # skip_img.append(self.parse_img_path(img_path))
351 # return
352 # elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
353 # skip_img.append(self.parse_img_path(img_path))
354 # return
355 # elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
356 # self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
357 # elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
358 # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
359 # json_data_2 = {
360 # "pid": str(pid),
361 # "key": conf.OCR_KEY,
362 # "secret": conf.OCR_SECRET,
363 # "file": file_data
364 # }
365 # response_2 = requests.post(self.ocr_url_2, data=json_data_2)
366 # if response_2.status_code == 200:
367 # # 识别结果
368 # ocr_res_2 = response_2.json()
369 # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
370 # self.log_base, img_path, ocr_res_2))
371 # self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
372 # else:
373 # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
374 # else: # 流水处理
375 # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
376 # else:
377 # skip_img.append(self.parse_img_path(img_path))
378 # else:
379 # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
380
381 @staticmethod
382 def parse_img_path(img_path):
383 img_name, _ = os.path.splitext(os.path.basename(img_path))
384 part_list = img_name.split('_')
385 # page_7_img_11_0
386 return int(part_list[1])+1, int(part_list[3])+1
387
388 @staticmethod
389 def get_most(value_list):
390 if value_list:
391 most_common = Counter(value_list).most_common(1)
392 return most_common[0][0] if most_common else None
393
394 @staticmethod
395 def date_format(date_str, format_str):
396 try:
397 date_res = datetime.strptime(date_str, format_str).date()
398 except Exception as e:
399 return
400 else:
401 return date_res
402
403 def get_validate_date(self, date_list):
404 for date_str in date_list:
405 for format_str in consts.DATE_FORMAT:
406 date_res = self.date_format(date_str, format_str)
407 if isinstance(date_res, date):
408 return date_res
409
    def merge_card(self, bs_summary):
        """Merge cards whose numbers are near-duplicates (OCR noise) into one entry.

        Cards are visited in descending sheet count; every remaining card whose
        number is similar enough to the current main card (SequenceMatcher
        quick_ratio above consts.CARD_RATIO) is folded into it.  Consumes
        *bs_summary* and returns the merged dict (without the 'count' field).
        """
        merged_bs_summary = {}
        sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True)
        for main_card in sorted_card:
            # the card may already have been merged into an earlier main card
            if bs_summary.get(main_card) is None:
                continue
            merged_bs_summary[main_card] = bs_summary.pop(main_card)
            del merged_bs_summary[main_card]['count']
            merge_cards = []
            for card in bs_summary.keys():
                if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO:
                    merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify'])
                    merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
                    merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet'])
                    merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role'])
                    merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set'])
                    merged_bs_summary[main_card]['code'].extend(bs_summary[card]['code'])
                    merged_bs_summary[main_card]['print_time'].extend(bs_summary[card]['print_time'])
                    merged_bs_summary[main_card]['start_date'].extend(bs_summary[card]['start_date'])
                    merged_bs_summary[main_card]['end_date'].extend(bs_summary[card]['end_date'])
                    merge_cards.append(card)
            # deletion is deferred so the dict is not mutated while iterating
            for card in merge_cards:
                del bs_summary[card]
            # collapse the vote lists to the most common value
            merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify'])
            merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role'])
        del bs_summary
        return merged_bs_summary
437
438 def prune_bs_summary(self, bs_summary):
439 for summary in bs_summary.values():
440 del summary['count']
441 summary['classify'] = self.get_most(summary['classify'])
442 summary['role'] = self.get_most(summary['role'])
443 return bs_summary
444
445 def rebuild_bs_summary(self, bs_summary, unknown_summary):
446 # bs_summary = {
447 # '卡号': {
448 # 'count': 100,
449 # 'classify': [],
450 # 'confidence': [],
451 # 'role': [],
452 # 'code': [('page', 'code')],
453 # 'print_time': [],
454 # 'start_date': [],
455 # 'end_date': [],
456 # 'sheet': ['sheet_name']
457 # }
458 # }
459 #
460 # unknown_summary = {
461 # 0: {
462 # '户名': {
463 # 'classify': 0,
464 # 'confidence': [],
465 # 'role': '户名',
466 # 'code': [('page', 'code')],
467 # 'print_time': [],
468 # 'start_date': [],
469 # 'end_date': [],
470 # 'sheet': ['sheet_name']
471 # }
472 # }
473 # }
474 # 无卡号
475 if len(bs_summary) == 0:
476 del bs_summary
477 merged_bs_summary = {}
478 card_num = 1
479 for role_dict in unknown_summary.values():
480 if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict:
481 summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {})
482 for summary in role_dict.values():
483 summary_dict['confidence'].extend(summary['confidence'])
484 summary_dict['role'] = summary['role']
485 summary_dict['code'].extend(summary['code'])
486 summary_dict['print_time'].extend(summary['print_time'])
487 summary_dict['start_date'].extend(summary['start_date'])
488 summary_dict['end_date'].extend(summary['end_date'])
489 summary_dict['sheet'].extend(summary['sheet'])
490 card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
491 merged_bs_summary[card] = summary_dict
492 else:
493 for summary in role_dict.values():
494 card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
495 card_num += 1
496 merged_bs_summary[card] = summary
497 else:
498 # 1卡号
499 one_card = False
500 if len(bs_summary) == 1:
501 merged_bs_summary = self.prune_bs_summary(bs_summary)
502 one_card = True
503 # 多卡号
504 else:
505 merged_bs_summary = self.merge_card(bs_summary)
506
507 for card_summary in merged_bs_summary.values():
508 merge_role = []
509 classify_summary = unknown_summary.get(card_summary['classify'], {})
510 for role, summary in classify_summary.items():
511 if one_card or role in card_summary['role_set']:
512 merge_role.append(role)
513 card_summary['confidence'].extend(summary['confidence'])
514 card_summary['sheet'].extend(summary['sheet'])
515 card_summary['code'].extend(summary['code'])
516 card_summary['print_time'].extend(summary['print_time'])
517 card_summary['start_date'].extend(summary['start_date'])
518 card_summary['end_date'].extend(summary['end_date'])
519
520 for role in merge_role:
521 del classify_summary[role]
522
523 card_num = 1
524 for role_dict in unknown_summary.values():
525 for summary in role_dict.values():
526 card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
527 card_num += 1
528 merged_bs_summary[card] = summary
529
530 del unknown_summary
531 for summary in merged_bs_summary.values():
532 if summary.get('role_set') is not None:
533 del summary['role_set']
534 summary['print_time'] = self.get_validate_date(summary['print_time'])
535 summary['start_date'] = self.get_validate_date(summary['start_date'])
536 summary['end_date'] = self.get_validate_date(summary['end_date'])
537 summary['confidence'] = max(summary['confidence'])
538 return merged_bs_summary
539
    # TODO refine per-failure document states; on retry, requeue and handle each
    #      failure kind differently
    # TODO failure email notification
    # recognition failure: generic errors such as PDF extraction or workbook build
    # EDMS errors: download -> requeue -> mail; upload -> re-upload queue -> mail
    # OCR errors: pass 1 -> mark failed -> mail; pass 2 -> mark failed -> mail
    # TODO retry OCR service calls
    def handle(self, *args, **kwargs):
        """Main worker loop: dequeue documents and run download -> image
        extraction -> OCR -> excel rebuild -> upload, until SIGTERM clears
        self.switch.  Failures flip the document status accordingly."""
        sleep_second = int(conf.SLEEP_SECOND)
        max_sleep_second = int(conf.MAX_SLEEP_SECOND)

        while self.switch:
            # 1. fetch the next document task from the queue
            doc, business_type = self.get_doc_info()
            # back off (up to the max) while the queue stays empty
            if doc is None:
                time.sleep(sleep_second)
                sleep_second = min(max_sleep_second, sleep_second + 5)
                continue
            sleep_second = int(conf.SLEEP_SECOND)

            try:
                start_time = time.time()
                # 2. fetch the PDF from EDMS
                doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type)

                # 3. extract images from the PDF
                img_save_path = os.path.join(doc_data_path, 'img')
                self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format(
                    self.log_base, business_type, doc.id))
                pdf_handler = PDFHandler(pdf_path, img_save_path)
                pdf_handler.extract_image()
                self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format(
                    self.log_base, business_type, doc.id))

                # 4. run OCR on every image and build the excel workbook
                bs_summary = {}
                license_summary = {}
                unknown_summary = {}
                res_list = []
                interest_keyword = Keywords.objects.filter(
                    type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
                salary_keyword = Keywords.objects.filter(
                    type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
                loan_keyword = Keywords.objects.filter(
                    type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list(
                    'keyword', flat=True)
                wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)

                # wb = Workbook()

                # 4.1 gather OCR results concurrently, one coroutine per image;
                # the coroutines mutate the shared summary dicts in place
                loop = asyncio.get_event_loop()
                tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
                         for img_path in pdf_handler.img_path_list]
                loop.run_until_complete(asyncio.wait(tasks))
                # loop.close()

                # for img_path in pdf_handler.img_path_list:
                #     self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)

                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
                                      '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
                                                                     unknown_summary, license_summary))

                merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)

                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
                                      '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type,
                                                                                    doc.id, merged_bs_summary,
                                                                                    unknown_summary, res_list))
                del unknown_summary

                # 4.2 rebuild the excel file from the merged summaries
                wb.save(src_excel_path)
                wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
                wb.save(excel_path)
            except EDMSException as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            except Exception as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            else:
                try:
                    # 5. upload the result excel to EDMS (for/else retry, same
                    # pattern as pdf_download)
                    for times in range(consts.RETRY_TIMES):
                        try:
                            self.edms.upload(excel_path, doc, business_type)
                        except Exception as e:
                            self.cronjob_log.warn(
                                '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
                                '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
                            edms_exc = str(e)
                        else:
                            break
                    else:
                        raise EDMSException(edms_exc)
                except Exception as e:
                    doc.status = DocStatus.UPLOAD_FAILED.value
                    doc.save()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
                    self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
                                           '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id,
                                                                               speed_time, e))
                    write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

                else:
                    doc.status = DocStatus.COMPLETE.value
                    doc.save()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
                    self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] '
                                          '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time))
                    write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

        self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
1 import re
2 import os
3 import ast
4 import datetime
5 from openpyxl import Workbook
6 from django.core.management import BaseCommand
7 from settings import conf
8 from common.mixins import LoggerMixin
9 from apps.doc.models import HILDoc, AFCDoc
10 from apps.doc import consts
11
12
class Command(BaseCommand, LoggerMixin):
    """Extract id-card records (application id, id number, nation, timestamp)
    from one day's idcard log file into per-business-line excel workbooks
    (AFC and HIL)."""

    def __init__(self):
        super().__init__()
        # output sheet name and header row
        self.sheet_name = '身份证'
        self.header = ('申请号', '身份证号', '民族', '时间戳')

    def add_arguments(self, parser):
        parser.add_argument(
            '--date',
            default=datetime.date.today() - datetime.timedelta(days=1),
            dest='date',
            help='将要计算的日期,格式: 2018-01-01'
        )

    def handle(self, *args, **kwargs):
        # --date arrives as a str from the CLI, or as a date object from the default
        date = kwargs.get('date')
        if isinstance(date, str):
            if not re.match(r'\d{4}-\d{2}-\d{2}', date):
                print('date format error')
                return
            date_str = date
        else:
            date_str = date.strftime('%Y-%m-%d')

        afc_excel_dir = os.path.join(conf.DATA_DIR, 'AFC', 'IdCard')
        hil_excel_dir = os.path.join(conf.DATA_DIR, 'HIL', 'IdCard')
        if not os.path.exists(afc_excel_dir) or not os.path.exists(hil_excel_dir):
            print('excel_dir not exist')
            return

        log_path = os.path.join(conf.LOG_DIR, 'idcard.log.{0}'.format(date_str))
        if not os.path.exists(log_path):
            print('log_path not exists')
            return

        # one workbook per business line; drop the default sheet created by
        # Workbook() — wb['Sheet'] replaces the deprecated get_sheet_by_name()
        wb_afc = Workbook()
        ws_afc = wb_afc.create_sheet(self.sheet_name)
        ws_afc.append(self.header)
        wb_afc.remove(wb_afc['Sheet'])

        wb_hil = Workbook()
        ws_hil = wb_hil.create_sheet(self.sheet_name)
        ws_hil.append(self.header)
        wb_hil.remove(wb_hil['Sheet'])

        with open(log_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                search_obj = re.match(r'\[(.*)] \[task=(.*)] \[idcard=(.*)]', line)
                if search_obj is None:
                    # skip lines that are not idcard records; previously this
                    # crashed with AttributeError on the first non-matching line
                    continue
                idcard_str = search_obj.group(3)
                # the log stores a Python-literal list of dicts
                idcard_list = ast.literal_eval(idcard_str)
                content_list = []
                for idcard_dict in idcard_list:
                    nation = idcard_dict.get('民族')
                    if nation is None:
                        continue
                    # '类别' == '1' marks non-id-card records — skip them
                    if idcard_dict.get('类别') == '1':
                        continue
                    content_list.append((idcard_dict.get('公民身份号码'), nation))
                if len(content_list) == 0:
                    continue

                time_str = search_obj.group(1)
                task_str = search_obj.group(2)
                business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
                doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
                application_id = doc_class.objects.filter(id=int(doc_id_str)).values_list('application_id', flat=True)
                if not application_id:
                    # document row no longer exists: nothing to attribute the record to
                    continue

                target_ws = ws_hil if business_type == consts.HIL_PREFIX else ws_afc
                for id_num, nation in content_list:
                    target_ws.append((application_id[0], id_num, nation, time_str))

        afc_excel_path = os.path.join(afc_excel_dir, 'idcard_{0}.xlsx'.format(date_str))
        hil_excel_path = os.path.join(hil_excel_dir, 'idcard_{0}.xlsx'.format(date_str))
        wb_afc.save(afc_excel_path)
        wb_hil.save(hil_excel_path)
1 import os
2 import datetime
3 from calendar import monthrange
4 from openpyxl import Workbook, load_workbook
5 from django.core.management import BaseCommand
6 from settings import conf
7 from common.mixins import LoggerMixin
8
9
class Command(BaseCommand, LoggerMixin):
    """Merge the previous month's daily id-card excel files into one monthly
    workbook (one sheet per day) for each business line (AFC / HIL)."""

    def __init__(self):
        super().__init__()
        # business-line sub-directories under DATA_DIR
        self.dirs = ('AFC', 'HIL')

    def handle(self, *args, **kwargs):
        now_time = datetime.datetime.now()
        # first day of the current month minus one day lands in the previous month
        first_day_of_month = now_time.replace(day=1)
        prev_month = first_day_of_month - datetime.timedelta(days=1)

        for target_dir in self.dirs:
            excel_dir = os.path.join(conf.DATA_DIR, target_dir, 'IdCard')
            if not os.path.exists(excel_dir):
                # NOTE(review): returning here also skips the remaining business
                # lines — confirm this all-or-nothing behavior is intended
                print('excel dir not exists: {0}'.format(excel_dir))
                return

            monthly_wb = Workbook()

            # monthrange(...)[1] is the number of days in the month
            for day in range(1, monthrange(prev_month.year, prev_month.month)[1] + 1):
                date_str = '{:04d}-{:02d}-{:02d}'.format(prev_month.year, prev_month.month, day)
                daily_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(date_str))
                if not os.path.exists(daily_excel_path):
                    print('daily excel path not exists: {0}'.format(daily_excel_path))
                    continue

                # copy the whole daily sheet into a per-day sheet of the monthly
                # workbook; wb[name] replaces the deprecated get_sheet_by_name()
                monthly_ws = monthly_wb.create_sheet(date_str)
                daily_wb = load_workbook(daily_excel_path)
                daily_ws = daily_wb['身份证']
                for row in daily_ws.iter_rows(min_row=1, values_only=True):
                    monthly_ws.append(row)

            monthly_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(prev_month.strftime('%Y-%m')))
            # drop the default sheet created by Workbook()
            monthly_wb.remove(monthly_wb['Sheet'])
            monthly_wb.save(monthly_excel_path)
...@@ -14,7 +14,6 @@ class Command(BaseCommand, LoggerMixin): ...@@ -14,7 +14,6 @@ class Command(BaseCommand, LoggerMixin):
14 14
15 def __init__(self): 15 def __init__(self):
16 super().__init__() 16 super().__init__()
17 self.log_base = '[license statistics]'
18 self.header_map = { 17 self.header_map = {
19 consts.MVI_CLASSIFY: [('申请ID', '发票代码', '发票号码', '开票日期', '不含税价', '发票联', '购买方名称', 18 consts.MVI_CLASSIFY: [('申请ID', '发票代码', '发票号码', '开票日期', '不含税价', '发票联', '购买方名称',
20 '购买方证件号码', '纳税人识别号', '车架号', '价税合计小写', '销货单位名称', '增值税税额', 19 '购买方证件号码', '纳税人识别号', '车架号', '价税合计小写', '销货单位名称', '增值税税额',
...@@ -75,7 +74,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -75,7 +74,8 @@ class Command(BaseCommand, LoggerMixin):
75 print('excel dir not exists') 74 print('excel dir not exists')
76 return 75 return
77 excel_path = os.path.join(excel_dir, 'license_{0}.xlsx'.format(date_str)) 76 excel_path = os.path.join(excel_dir, 'license_{0}.xlsx'.format(date_str))
78 log_path = os.path.join(conf.LOG_DIR, 'license.log.{0}'.format(date_str)) 77 # log_path = os.path.join(conf.LOG_DIR, 'license.log.{0}'.format(date_str))
78 log_path = os.path.join(conf.LOG_DIR, 'license_statistics.log.{0}'.format(date_str))
79 if not os.path.exists(log_path): 79 if not os.path.exists(log_path):
80 print('log_path not exists') 80 print('log_path not exists')
81 return 81 return
...@@ -92,7 +92,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -92,7 +92,8 @@ class Command(BaseCommand, LoggerMixin):
92 92
93 with open(log_path, 'r', encoding='utf-8') as fp: 93 with open(log_path, 'r', encoding='utf-8') as fp:
94 for line in fp: 94 for line in fp:
95 search_obj = re.search(r'task=(.*) license_summary=(.*)', line) 95 # search_obj = re.search(r'task=(.*) license_summary=(.*)', line)
96 search_obj = re.search(r'\[task=(.*)] \[license_summary=(.*)]', line)
96 task_str = search_obj.group(1) 97 task_str = search_obj.group(1)
97 license_summary = ast.literal_eval(search_obj.group(2)) 98 license_summary = ast.literal_eval(search_obj.group(2))
98 business_type, doc_id_str = task_str.split(consts.SPLIT_STR) 99 business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
......
...@@ -689,8 +689,15 @@ class Command(BaseCommand, LoggerMixin): ...@@ -689,8 +689,15 @@ class Command(BaseCommand, LoggerMixin):
689 '[license_summary={4}]'.format(self.log_base, task_str, bs_summary, 689 '[license_summary={4}]'.format(self.log_base, task_str, bs_summary,
690 unknown_summary, license_summary)) 690 unknown_summary, license_summary))
691 691
692 self.license_log.info('[task={0}] [license_summary={1}]'.format(task_str, license_summary))
693 idcard_list = license_summary.get(consts.IC_CLASSIFY)
694 if idcard_list:
695 self.idcard_log.info('[task={0}] [idcard={1}]'.format(task_str, idcard_list))
696
692 merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) 697 merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
693 698
699 self.bs_log.info('[task={0}] [bs_summary={1}]'.format(task_str, merged_bs_summary))
700
694 self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] ' 701 self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] '
695 '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary, 702 '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary,
696 unknown_summary, res_list)) 703 unknown_summary, res_list))
......
...@@ -40,6 +40,9 @@ class LoggerMixin: ...@@ -40,6 +40,9 @@ class LoggerMixin:
40 exception_log = logging.getLogger('exception') 40 exception_log = logging.getLogger('exception')
41 cronjob_log = logging.getLogger('cronjob') 41 cronjob_log = logging.getLogger('cronjob')
42 folder_log = logging.getLogger('folder') 42 folder_log = logging.getLogger('folder')
43 bs_log = logging.getLogger('bs')
44 license_log = logging.getLogger('license')
45 idcard_log = logging.getLogger('idcard')
43 46
44 47
45 class GenericView(LoggerMixin, GenericExceptionMixin, GenericAPIView): 48 class GenericView(LoggerMixin, GenericExceptionMixin, GenericAPIView):
......
...@@ -84,6 +84,12 @@ class PDFHandler: ...@@ -84,6 +84,12 @@ class PDFHandler:
84 def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0): 84 def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
85 pix = self.recover_pix(pdf, xref, smask, colorspace) 85 pix = self.recover_pix(pdf, xref, smask, colorspace)
86 ext, img_data = self.get_img_data(pix) 86 ext, img_data = self.get_img_data(pix)
87 if ext == 'jpx':
88 img_save_path = self.get_img_save_path(pno, img_index=img_index, ext='jpeg')
89 jpx_pix = fitz.Pixmap(img_data)
90 jpx_pix.writeImage(img_save_path)
91 jpx_pix = None
92 else:
87 img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext) 93 img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
88 with open(img_save_path, "wb") as f: 94 with open(img_save_path, "wb") as f:
89 f.write(img_data) 95 f.write(img_data)
......
1 [loggers] 1 [loggers]
2 keys=root, running, exception, cronjob, folder, django.db.backends 2 keys=root, running, exception, cronjob, folder, bs, license, idcard, django.db.backends
3 3
4 [handlers] 4 [handlers]
5 keys=consoleHandler, django_rotateFileHandler, exceptionFileHandler, cronjobFileHandler, folderFileHandler, djangodbFileHandler 5 keys=consoleHandler, django_rotateFileHandler, exceptionFileHandler, cronjobFileHandler, folderFileHandler, bsFileHandler, licenseFileHandler, idcardFileHandler, djangodbFileHandler
6 6
7 [formatters] 7 [formatters]
8 keys=SituFormatter, dataLogFormatter 8 keys=SituFormatter, dataLogFormatter, SimpleFormatter
9 9
10 [formatter_SituFormatter] 10 [formatter_SituFormatter]
11 format=[%(asctime)s] [%(process)d] [%(thread)d] [%(threadName)s] [%(filename)s:%(lineno)d] %(levelname)s %(message)s 11 format=[%(asctime)s] [%(process)d] [%(thread)d] [%(threadName)s] [%(filename)s:%(lineno)d] %(levelname)s %(message)s
...@@ -15,6 +15,10 @@ datefmt= ...@@ -15,6 +15,10 @@ datefmt=
15 class=situlogger.JsonFormatter 15 class=situlogger.JsonFormatter
16 format=%(asctime)s %(levelname)s %(funcName)s 16 format=%(asctime)s %(levelname)s %(funcName)s
17 17
18 [formatter_SimpleFormatter]
19 format=[%(asctime)s] %(message)s
20 datefmt=
21
18 [handler_consoleHandler] 22 [handler_consoleHandler]
19 class=StreamHandler 23 class=StreamHandler
20 level=ERROR 24 level=ERROR
...@@ -45,6 +49,24 @@ level=DEBUG ...@@ -45,6 +49,24 @@ level=DEBUG
45 formatter=SituFormatter 49 formatter=SituFormatter
46 args=('../logs/folder_ocr.log',) 50 args=('../logs/folder_ocr.log',)
47 51
52 [handler_bsFileHandler]
53 class=situlogger.SituRotatingFileHandler
54 level=DEBUG
55 formatter=SimpleFormatter
56 args=('../logs/bs_statistics.log',)
57
58 [handler_licenseFileHandler]
59 class=situlogger.SituRotatingFileHandler
60 level=DEBUG
61 formatter=SimpleFormatter
62 args=('../logs/license_statistics.log',)
63
64 [handler_idcardFileHandler]
65 class=situlogger.SituRotatingFileHandler
66 level=DEBUG
67 formatter=SimpleFormatter
68 args=('../logs/idcard.log',)
69
48 [handler_djangodbFileHandler] 70 [handler_djangodbFileHandler]
49 class=situlogger.SituRotatingFileHandler 71 class=situlogger.SituRotatingFileHandler
50 level=DEBUG 72 level=DEBUG
...@@ -79,6 +101,24 @@ handlers=folderFileHandler ...@@ -79,6 +101,24 @@ handlers=folderFileHandler
79 qualname=folder 101 qualname=folder
80 propagate=0 102 propagate=0
81 103
104 [logger_bs]
105 level=INFO
106 handlers=bsFileHandler
107 qualname=bs
108 propagate=0
109
110 [logger_license]
111 level=INFO
112 handlers=licenseFileHandler
113 qualname=license
114 propagate=0
115
116 [logger_idcard]
117 level=INFO
118 handlers=idcardFileHandler
119 qualname=idcard
120 propagate=0
121
82 [logger_django.db.backends] 122 [logger_django.db.backends]
83 level=DEBUG 123 level=DEBUG
84 handlers=djangodbFileHandler 124 handlers=djangodbFileHandler
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!