Merge branch 'feature/main' into feature/mssql
Showing 10 changed files with 207 additions and 672 deletions
... | @@ -152,7 +152,7 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果') | ... | @@ -152,7 +152,7 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果') |
152 | # '借贷': ('贷', '借'), # 竖版-无表格-广发银行 | 152 | # '借贷': ('贷', '借'), # 竖版-无表格-广发银行 |
153 | # '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行 | 153 | # '借贷状态': ('贷', '借'), # 竖版-特殊-交通银行 |
154 | # '收/支': ('收入', '支出'), # 横版-表格-北京银行 | 154 | # '收/支': ('收入', '支出'), # 横版-表格-北京银行 |
155 | BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支', '收支标志'} | 155 | BORROW_HEADERS_SET = {'借贷', '借\n贷', '借贷状态', '收/支', '收支标志'} |
156 | BORROW_INCOME_SET = {'贷', '收入', '收', '收(Cr)'} | 156 | BORROW_INCOME_SET = {'贷', '收入', '收', '收(Cr)'} |
157 | BORROW_OUTLAY_SET = {'借', '支出', '支', '付(Dr)'} | 157 | BORROW_OUTLAY_SET = {'借', '支出', '支', '付(Dr)'} |
158 | INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'} | 158 | INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'} |
... | @@ -165,6 +165,7 @@ HEADERS_MAPPING = {} | ... | @@ -165,6 +165,7 @@ HEADERS_MAPPING = {} |
165 | HEADERS_MAPPING.update( | 165 | HEADERS_MAPPING.update( |
166 | { | 166 | { |
167 | '借贷': BORROW_KEY, | 167 | '借贷': BORROW_KEY, |
168 | '借\n贷': BORROW_KEY, | ||
168 | '借贷状态': BORROW_KEY, | 169 | '借贷状态': BORROW_KEY, |
169 | '收支标志': BORROW_KEY, | 170 | '收支标志': BORROW_KEY, |
170 | '收/支': BORROW_KEY, | 171 | '收/支': BORROW_KEY, | ... | ... |
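Context for this hunk: the new '借\n贷' entries cover OCR output where the vertical debit/credit header comes back with an embedded newline inside a single cell. A minimal sketch of the lookup this enables, reusing BORROW_HEADERS_SET, HEADERS_MAPPING, and BORROW_KEY from the hunk above (the sample cell value is illustrative):

    header_cell = '借\n贷'                 # header split across two lines by OCR
    assert header_cell in BORROW_HEADERS_SET
    key = HEADERS_MAPPING[header_cell]     # resolves to BORROW_KEY, same as the plain '借贷' header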
... | @@ -40,7 +40,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -40,7 +40,8 @@ class Command(BaseCommand, LoggerMixin): |
40 | print('excel dir not exists') | 40 | print('excel dir not exists') |
41 | return | 41 | return |
42 | excel_path = os.path.join(excel_dir, 'bs_{0}.xlsx'.format(date_str)) | 42 | excel_path = os.path.join(excel_dir, 'bs_{0}.xlsx'.format(date_str)) |
43 | log_path = os.path.join(conf.LOG_DIR, 'bs.log.{0}'.format(date_str)) | 43 | # log_path = os.path.join(conf.LOG_DIR, 'bs.log.{0}'.format(date_str)) |
44 | log_path = os.path.join(conf.LOG_DIR, 'bs_statistics.log.{0}'.format(date_str)) | ||
44 | if not os.path.exists(log_path): | 45 | if not os.path.exists(log_path): |
45 | print('log_path not exists') | 46 | print('log_path not exists') |
46 | return | 47 | return |
... | @@ -48,7 +49,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -48,7 +49,8 @@ class Command(BaseCommand, LoggerMixin): |
48 | summary_dict = {} | 49 | summary_dict = {} |
49 | with open(log_path, 'r', encoding='utf-8') as fp: | 50 | with open(log_path, 'r', encoding='utf-8') as fp: |
50 | for line in fp: | 51 | for line in fp: |
51 | search_obj = re.search(r'task=(.*) merged_bs_summary=(.*)', line) | 52 | # search_obj = re.search(r'task=(.*) merged_bs_summary=(.*)', line) |
53 | search_obj = re.search(r'\[task=(.*)] \[bs_summary=(.*)]', line) | ||
52 | task_str = search_obj.group(1) | 54 | task_str = search_obj.group(1) |
53 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | 55 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) |
54 | doc_id = int(doc_id_str) | 56 | doc_id = int(doc_id_str) | ... | ... |
1 | import os | ||
2 | import time | ||
3 | import json | ||
4 | import signal | ||
5 | import asyncio | ||
6 | import aiohttp | ||
7 | import difflib | ||
8 | import base64 | ||
9 | import requests | ||
10 | from datetime import datetime, date | ||
11 | from collections import Counter | ||
12 | from apps.doc.ocr.wb import BSWorkbook, Workbook | ||
13 | from django.core.management import BaseCommand | ||
14 | |||
15 | from settings import conf | ||
16 | from common.mixins import LoggerMixin | ||
17 | from common.tools.file_tools import write_zip_file | ||
18 | from common.tools.pdf_to_img import PDFHandler | ||
19 | from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords | ||
20 | from apps.doc.named_enum import KeywordsType | ||
21 | from apps.doc import consts | ||
22 | from apps.doc.ocr.edms import EDMS, rh | ||
23 | from apps.doc.exceptions import EDMSException | ||
24 | |||
25 | |||
26 | class Command(BaseCommand, LoggerMixin): | ||
27 | |||
28 | def __init__(self): | ||
29 | super().__init__() | ||
30 | self.log_base = '[doc ocr process]' | ||
31 | # switch controlling whether files keep being processed | ||
32 | self.switch = True | ||
33 | # data directory | ||
34 | self.data_dir = conf.DATA_DIR | ||
35 | # OCR service endpoints | ||
36 | self.ocr_url_1 = conf.OCR_URL_1 | ||
37 | self.ocr_url_2 = conf.OCR_URL_2 | ||
38 | self.ocr_url_3 = conf.BC_URL | ||
39 | # EDMS web_service_api | ||
40 | self.edms = EDMS() | ||
41 | # graceful shutdown on signal 15 (SIGTERM) | ||
42 | signal.signal(signal.SIGTERM, self.signal_handler) | ||
43 | |||
44 | def signal_handler(self, sig, frame): | ||
45 | self.switch = False # stop processing files | ||
46 | |||
47 | def get_doc_info(self): | ||
48 | task_str, is_priority = rh.dequeue() | ||
49 | if task_str is None: | ||
50 | self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) | ||
51 | return None, None | ||
52 | |||
53 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | ||
54 | doc_id = int(doc_id_str) | ||
55 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | ||
56 | # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values( | ||
57 | # 'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first() | ||
58 | doc = doc_class.objects.filter(id=doc_id).first() | ||
59 | if doc is None: | ||
60 | self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( | ||
61 | self.log_base, task_str, is_priority)) | ||
62 | return None, None | ||
63 | elif doc.status != DocStatus.INIT.value: | ||
64 | self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] ' | ||
65 | '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status)) | ||
66 | return None, None | ||
67 | doc.status = DocStatus.PROCESSING.value | ||
68 | doc.save() | ||
69 | self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format( | ||
70 | self.log_base, task_str, is_priority)) | ||
71 | return doc, business_type | ||
72 | |||
73 | def pdf_download(self, doc, business_type): | ||
74 | doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id)) | ||
75 | os.makedirs(doc_data_path, exist_ok=True) | ||
76 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | ||
77 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | ||
78 | for times in range(consts.RETRY_TIMES): | ||
79 | try: | ||
80 | self.edms.download(pdf_path, doc.metadata_version_id) | ||
81 | except Exception as e: | ||
82 | self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
83 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
84 | edms_exc = str(e) | ||
85 | else: | ||
86 | break | ||
87 | else: | ||
88 | raise EDMSException(edms_exc) | ||
89 | |||
90 | excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id)) | ||
91 | src_excel_path = os.path.join(doc_data_path, 'src.xlsx') | ||
92 | self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format( | ||
93 | self.log_base, business_type, doc.id, pdf_path)) | ||
94 | return doc_data_path, excel_path, src_excel_path, pdf_path | ||
95 | |||
96 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino): | ||
97 | sheets = ocr_data.get('data', []) | ||
98 | if not sheets: | ||
99 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | ||
100 | return | ||
101 | confidence = ocr_data.get('confidence', 1) | ||
102 | img_name = 'page_{0}_img_{1}'.format(pno, ino) | ||
103 | cells_exists = False | ||
104 | for i, sheet in enumerate(sheets): | ||
105 | cells = sheet.get('cells') | ||
106 | if not cells: | ||
107 | continue | ||
108 | cells_exists = True | ||
109 | sheet_name = '{0}_{1}'.format(img_name, i) | ||
110 | ws = wb.create_sheet(sheet_name) | ||
111 | for cell in cells: | ||
112 | c1 = cell.get('start_column') | ||
113 | r1 = cell.get('start_row') | ||
114 | words = cell.get('words') | ||
115 | ws.cell(row=r1 + 1, column=c1 + 1, value=words) | ||
116 | |||
117 | # summary fields: [account name, card number, page number, receipt verification code, print time, start date, end date] | ||
118 | summary = sheet.get('summary') | ||
119 | card = summary[1] | ||
120 | if card is None: | ||
121 | classify_dict = unknown_summary.setdefault(classify, {}) | ||
122 | role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0] | ||
123 | role_dict = classify_dict.setdefault(role, {}) | ||
124 | role_dict['classify'] = classify | ||
125 | role_dict['role'] = role | ||
126 | role_dict.setdefault('sheet', []).append(sheet_name) | ||
127 | role_dict.setdefault('confidence', []).append(confidence) | ||
128 | code_list = role_dict.setdefault('code', []) | ||
129 | pt_list = role_dict.setdefault('print_time', []) | ||
130 | sd_list = role_dict.setdefault('start_date', []) | ||
131 | ed_list = role_dict.setdefault('end_date', []) | ||
132 | if summary[3] is not None: | ||
133 | code_list.append((summary[2], summary[3])) | ||
134 | if summary[4] is not None: | ||
135 | pt_list.append(summary[4]) | ||
136 | if summary[5] is not None: | ||
137 | sd_list.append(summary[5]) | ||
138 | if summary[6] is not None: | ||
139 | ed_list.append(summary[6]) | ||
140 | else: | ||
141 | card_dict = bs_summary.setdefault(card, {}) | ||
142 | card_dict['count'] = card_dict.get('count', 0) + 1 | ||
143 | card_dict.setdefault('classify', []).append(classify) | ||
144 | card_dict.setdefault('confidence', []).append(confidence) | ||
145 | card_dict.setdefault('sheet', []).append(sheet_name) | ||
146 | role_list = card_dict.setdefault('role', []) | ||
147 | role_set = card_dict.setdefault('role_set', set()) | ||
148 | code_list = card_dict.setdefault('code', []) | ||
149 | pt_list = card_dict.setdefault('print_time', []) | ||
150 | sd_list = card_dict.setdefault('start_date', []) | ||
151 | ed_list = card_dict.setdefault('end_date', []) | ||
152 | if summary[0] is not None: | ||
153 | role_list.append(summary[0]) | ||
154 | role_set.add(summary[0]) | ||
155 | if summary[3] is not None: | ||
156 | code_list.append((summary[2], summary[3])) | ||
157 | if summary[4] is not None: | ||
158 | pt_list.append(summary[4]) | ||
159 | if summary[5] is not None: | ||
160 | sd_list.append(summary[5]) | ||
161 | if summary[6] is not None: | ||
162 | ed_list.append(summary[6]) | ||
163 | |||
164 | if cells_exists: | ||
165 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
166 | else: | ||
167 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | ||
168 | |||
169 | def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino): | ||
170 | # category: '0' ID card, '1' residence permit | ||
171 | license_data = ocr_data.get('data', []) | ||
172 | if not license_data: | ||
173 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | ||
174 | return | ||
175 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
176 | license_summary.setdefault(classify, []).extend(license_data) | ||
177 | |||
178 | def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino): | ||
179 | if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: | ||
180 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
181 | if pid == consts.BC_PID: | ||
182 | # bank card | ||
183 | # res_dict = {} | ||
184 | # for en_key, chn_key in consts.BC_FIELD: | ||
185 | # res_dict[chn_key] = ocr_res_2.get(en_key, '') | ||
186 | license_summary.setdefault(classify, []).append(ocr_res_2) | ||
187 | else: | ||
188 | # business license and similar documents | ||
189 | for result_dict in ocr_res_2.get('ResultList', []): | ||
190 | res_dict = {} | ||
191 | for field_dict in result_dict.get('FieldList', []): | ||
192 | res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '') | ||
193 | license_summary.setdefault(classify, []).append(res_dict) | ||
194 | else: | ||
195 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
196 | |||
197 | @staticmethod | ||
198 | async def fetch_ocr_1_result(url, json_data): | ||
199 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: | ||
200 | async with session.post(url, json=json_data) as response: | ||
201 | if response.status == 200: | ||
202 | return await response.json() | ||
203 | |||
204 | @staticmethod | ||
205 | async def fetch_ocr_2_result(url, json_data): | ||
206 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: | ||
207 | async with session.post(url, data=json_data) as response: | ||
208 | if response.status == 200: | ||
209 | return await response.text() | ||
210 | |||
211 | @staticmethod | ||
212 | async def fetch_bc_name_result(url, json_data): | ||
213 | async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session: | ||
214 | async with session.post(url, json=json_data) as response: | ||
215 | if response.status == 200: | ||
216 | return await response.json() | ||
217 | |||
218 | async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list): | ||
219 | pno, ino = self.parse_img_path(img_path) | ||
220 | with open(img_path, 'rb') as f: | ||
221 | base64_data = base64.b64encode(f.read()) | ||
222 | # decode the base64 bytes into a str | ||
223 | file_data = base64_data.decode() | ||
224 | json_data_1 = { | ||
225 | "file": file_data | ||
226 | } | ||
227 | ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1) | ||
228 | if ocr_res_1 is None: | ||
229 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
230 | self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path)) | ||
231 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
232 | else: | ||
233 | self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format( | ||
234 | self.log_base, img_path, ocr_res_1)) | ||
235 | |||
236 | if ocr_res_1.get('code') == 1: | ||
237 | ocr_data = ocr_res_1.get('data', {}) | ||
238 | classify = ocr_data.get('classify') | ||
239 | if classify is None: | ||
240 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
241 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format( | ||
242 | self.log_base, img_path, ocr_res_1)) | ||
243 | return | ||
244 | elif classify in consts.OTHER_CLASSIFY_SET: # other categories | ||
245 | res_list.append((pno, ino, consts.RES_SUCCESS_OTHER)) | ||
246 | return | ||
247 | elif classify in consts.LICENSE_CLASSIFY_SET_1: # license set 1 | ||
248 | self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino) | ||
249 | elif classify in consts.LICENSE_CLASSIFY_SET_2: # license set 2 | ||
250 | pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) | ||
251 | json_data_2 = { | ||
252 | "pid": str(pid), | ||
253 | # "key": conf.OCR_KEY, | ||
254 | # "secret": conf.OCR_SECRET, | ||
255 | "filedata": file_data | ||
256 | } | ||
257 | ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2) | ||
258 | if ocr_res_2 is None: | ||
259 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
260 | self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path)) | ||
261 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
262 | else: | ||
263 | # recognition result | ||
264 | ocr_res_2 = json.loads(ocr_res_2) | ||
265 | self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format( | ||
266 | self.log_base, img_path, ocr_res_2)) | ||
267 | if classify == consts.BC_CLASSIFY: | ||
268 | name = '有' # default: cardholder name present | ||
269 | json_data_1['card_res'] = ocr_res_2 | ||
270 | card_name_res = await self.fetch_bc_name_result(self.ocr_url_3, json_data_1) | ||
271 | if isinstance(card_name_res, dict) and \ | ||
272 | card_name_res.get('data', {}).get('is_exists_name') == 0: | ||
273 | name = '无' # cardholder name absent | ||
274 | ocr_res_2['Name'] = name | ||
275 | self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino) | ||
276 | else: # bank statement processing | ||
277 | self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino) | ||
278 | else: | ||
279 | res_list.append((pno, ino, consts.RES_FAILED)) | ||
280 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format( | ||
281 | self.log_base, img_path, ocr_res_1)) | ||
282 | |||
283 | # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): | ||
284 | # # bank statement | ||
285 | # # res = { | ||
286 | # # 'code': 1, | ||
287 | # # 'msg': 'success', | ||
288 | # # 'data': { | ||
289 | # # 'classify': 0, | ||
290 | # # 'confidence': 0.999, | ||
291 | # # 'data': [ | ||
292 | # # { | ||
293 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
294 | # # 'cells': [] | ||
295 | # # }, | ||
296 | # # { | ||
297 | # # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'], | ||
298 | # # 'cells': [] | ||
299 | # # } | ||
300 | # # ] | ||
301 | # # } | ||
302 | # # } | ||
303 | # # | ||
304 | # # # license-1 | ||
305 | # # res = { | ||
306 | # # 'code': 1, | ||
307 | # # 'msg': 'success', | ||
308 | # # 'data': { | ||
309 | # # 'classify': 0, | ||
310 | # # 'confidence': 0.999, | ||
311 | # # 'data': [ | ||
312 | # # { | ||
313 | # # 'cn_key': 'value', | ||
314 | # # 'cn_key': 'value', | ||
315 | # # }, | ||
316 | # # { | ||
317 | # # 'cn_key': 'value', | ||
318 | # # 'cn_key': 'value', | ||
319 | # # }, | ||
320 | # # ] | ||
321 | # # } | ||
322 | # # } | ||
323 | # # | ||
324 | # # # license-2 or other | ||
325 | # # res = { | ||
326 | # # 'code': 1, | ||
327 | # # 'msg': 'success', | ||
328 | # # 'data': { | ||
329 | # # 'classify': 0, | ||
330 | # # 'confidence': 0.999, | ||
331 | # # } | ||
332 | # # } | ||
333 | # with open(img_path, 'rb') as f: | ||
334 | # base64_data = base64.b64encode(f.read()) | ||
335 | # # decode the base64 bytes into a str | ||
336 | # file_data = base64_data.decode() | ||
337 | # json_data_1 = { | ||
338 | # "file": file_data | ||
339 | # } | ||
340 | # response_1 = requests.post(self.ocr_url_1, json=json_data_1) | ||
341 | # if response_1.status_code == 200: | ||
342 | # ocr_res_1 = response_1.json() | ||
343 | # self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | ||
344 | # self.log_base, img_path, ocr_res_1)) | ||
345 | # | ||
346 | # if ocr_res_1.get('code') == 1: | ||
347 | # ocr_data = ocr_res_1.get('data', {}) | ||
348 | # classify = ocr_data.get('classify') | ||
349 | # if classify is None: | ||
350 | # skip_img.append(self.parse_img_path(img_path)) | ||
351 | # return | ||
352 | # elif classify in consts.OTHER_CLASSIFY_SET: # other categories | ||
353 | # skip_img.append(self.parse_img_path(img_path)) | ||
354 | # return | ||
355 | # elif classify in consts.LICENSE_CLASSIFY_SET_1: # license set 1 | ||
356 | # self.license1_process(ocr_data, license_summary, classify, skip_img, img_path) | ||
357 | # elif classify in consts.LICENSE_CLASSIFY_SET_2: # license set 2 | ||
358 | # pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) | ||
359 | # json_data_2 = { | ||
360 | # "pid": str(pid), | ||
361 | # "key": conf.OCR_KEY, | ||
362 | # "secret": conf.OCR_SECRET, | ||
363 | # "file": file_data | ||
364 | # } | ||
365 | # response_2 = requests.post(self.ocr_url_2, data=json_data_2) | ||
366 | # if response_2.status_code == 200: | ||
367 | # # recognition result | ||
368 | # ocr_res_2 = response_2.json() | ||
369 | # self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | ||
370 | # self.log_base, img_path, ocr_res_2)) | ||
371 | # self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) | ||
372 | # else: | ||
373 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
374 | # else: # bank statement processing | ||
375 | # self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) | ||
376 | # else: | ||
377 | # skip_img.append(self.parse_img_path(img_path)) | ||
378 | # else: | ||
379 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
380 | |||
381 | @staticmethod | ||
382 | def parse_img_path(img_path): | ||
383 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | ||
384 | part_list = img_name.split('_') | ||
385 | # page_7_img_11_0 | ||
386 | return int(part_list[1])+1, int(part_list[3])+1 | ||
387 | |||
388 | @staticmethod | ||
389 | def get_most(value_list): | ||
390 | if value_list: | ||
391 | most_common = Counter(value_list).most_common(1) | ||
392 | return most_common[0][0] if most_common else None | ||
393 | |||
394 | @staticmethod | ||
395 | def date_format(date_str, format_str): | ||
396 | try: | ||
397 | date_res = datetime.strptime(date_str, format_str).date() | ||
398 | except Exception: | ||
399 | return | ||
400 | else: | ||
401 | return date_res | ||
402 | |||
403 | def get_validate_date(self, date_list): | ||
404 | for date_str in date_list: | ||
405 | for format_str in consts.DATE_FORMAT: | ||
406 | date_res = self.date_format(date_str, format_str) | ||
407 | if isinstance(date_res, date): | ||
408 | return date_res | ||
409 | |||
410 | def merge_card(self, bs_summary): | ||
411 | merged_bs_summary = {} | ||
412 | sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True) | ||
413 | for main_card in sorted_card: | ||
414 | if bs_summary.get(main_card) is None: | ||
415 | continue | ||
416 | merged_bs_summary[main_card] = bs_summary.pop(main_card) | ||
417 | del merged_bs_summary[main_card]['count'] | ||
418 | merge_cards = [] | ||
419 | for card in bs_summary.keys(): | ||
420 | if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO: | ||
421 | merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify']) | ||
422 | merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence']) | ||
423 | merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet']) | ||
424 | merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role']) | ||
425 | merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set']) | ||
426 | merged_bs_summary[main_card]['code'].extend(bs_summary[card]['code']) | ||
427 | merged_bs_summary[main_card]['print_time'].extend(bs_summary[card]['print_time']) | ||
428 | merged_bs_summary[main_card]['start_date'].extend(bs_summary[card]['start_date']) | ||
429 | merged_bs_summary[main_card]['end_date'].extend(bs_summary[card]['end_date']) | ||
430 | merge_cards.append(card) | ||
431 | for card in merge_cards: | ||
432 | del bs_summary[card] | ||
433 | merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify']) | ||
434 | merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role']) | ||
435 | del bs_summary | ||
436 | return merged_bs_summary | ||
437 | |||
438 | def prune_bs_summary(self, bs_summary): | ||
439 | for summary in bs_summary.values(): | ||
440 | del summary['count'] | ||
441 | summary['classify'] = self.get_most(summary['classify']) | ||
442 | summary['role'] = self.get_most(summary['role']) | ||
443 | return bs_summary | ||
444 | |||
445 | def rebuild_bs_summary(self, bs_summary, unknown_summary): | ||
446 | # bs_summary = { | ||
447 | # '卡号': { | ||
448 | # 'count': 100, | ||
449 | # 'classify': [], | ||
450 | # 'confidence': [], | ||
451 | # 'role': [], | ||
452 | # 'code': [('page', 'code')], | ||
453 | # 'print_time': [], | ||
454 | # 'start_date': [], | ||
455 | # 'end_date': [], | ||
456 | # 'sheet': ['sheet_name'] | ||
457 | # } | ||
458 | # } | ||
459 | # | ||
460 | # unknown_summary = { | ||
461 | # 0: { | ||
462 | # '户名': { | ||
463 | # 'classify': 0, | ||
464 | # 'confidence': [], | ||
465 | # 'role': '户名', | ||
466 | # 'code': [('page', 'code')], | ||
467 | # 'print_time': [], | ||
468 | # 'start_date': [], | ||
469 | # 'end_date': [], | ||
470 | # 'sheet': ['sheet_name'] | ||
471 | # } | ||
472 | # } | ||
473 | # } | ||
474 | # no card number found | ||
475 | if len(bs_summary) == 0: | ||
476 | del bs_summary | ||
477 | merged_bs_summary = {} | ||
478 | card_num = 1 | ||
479 | for role_dict in unknown_summary.values(): | ||
480 | if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict: | ||
481 | summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {}) | ||
482 | for summary in role_dict.values(): | ||
483 | summary_dict['confidence'].extend(summary['confidence']) | ||
484 | summary_dict['role'] = summary['role'] | ||
485 | summary_dict['code'].extend(summary['code']) | ||
486 | summary_dict['print_time'].extend(summary['print_time']) | ||
487 | summary_dict['start_date'].extend(summary['start_date']) | ||
488 | summary_dict['end_date'].extend(summary['end_date']) | ||
489 | summary_dict['sheet'].extend(summary['sheet']) | ||
490 | card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num) | ||
491 | merged_bs_summary[card] = summary_dict | ||
492 | else: | ||
493 | for summary in role_dict.values(): | ||
494 | card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num) | ||
495 | card_num += 1 | ||
496 | merged_bs_summary[card] = summary | ||
497 | else: | ||
498 | # exactly one card number | ||
499 | one_card = False | ||
500 | if len(bs_summary) == 1: | ||
501 | merged_bs_summary = self.prune_bs_summary(bs_summary) | ||
502 | one_card = True | ||
503 | # multiple card numbers | ||
504 | else: | ||
505 | merged_bs_summary = self.merge_card(bs_summary) | ||
506 | |||
507 | for card_summary in merged_bs_summary.values(): | ||
508 | merge_role = [] | ||
509 | classify_summary = unknown_summary.get(card_summary['classify'], {}) | ||
510 | for role, summary in classify_summary.items(): | ||
511 | if one_card or role in card_summary['role_set']: | ||
512 | merge_role.append(role) | ||
513 | card_summary['confidence'].extend(summary['confidence']) | ||
514 | card_summary['sheet'].extend(summary['sheet']) | ||
515 | card_summary['code'].extend(summary['code']) | ||
516 | card_summary['print_time'].extend(summary['print_time']) | ||
517 | card_summary['start_date'].extend(summary['start_date']) | ||
518 | card_summary['end_date'].extend(summary['end_date']) | ||
519 | |||
520 | for role in merge_role: | ||
521 | del classify_summary[role] | ||
522 | |||
523 | card_num = 1 | ||
524 | for role_dict in unknown_summary.values(): | ||
525 | for summary in role_dict.values(): | ||
526 | card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num) | ||
527 | card_num += 1 | ||
528 | merged_bs_summary[card] = summary | ||
529 | |||
530 | del unknown_summary | ||
531 | for summary in merged_bs_summary.values(): | ||
532 | if summary.get('role_set') is not None: | ||
533 | del summary['role_set'] | ||
534 | summary['print_time'] = self.get_validate_date(summary['print_time']) | ||
535 | summary['start_date'] = self.get_validate_date(summary['start_date']) | ||
536 | summary['end_date'] = self.get_validate_date(summary['end_date']) | ||
537 | summary['confidence'] = max(summary['confidence']) | ||
538 | return merged_bs_summary | ||
539 | |||
540 | # TODO refine document states: distinguish failure types, requeue, and retry each differently | ||
541 | # TODO email notification on failures | ||
542 | # recognition failure: ordinary exceptions, e.g. PDF errors or workbook-build errors | ||
543 | # EDMS failure: download error --> requeue --> email; upload error --> re-upload queue --> email | ||
544 | # algorithm failure: stage-1 error --> mark failed --> email; stage-2 error --> mark failed --> email | ||
545 | # TODO retry OCR API calls | ||
546 | def handle(self, *args, **kwargs): | ||
547 | sleep_second = int(conf.SLEEP_SECOND) | ||
548 | max_sleep_second = int(conf.MAX_SLEEP_SECOND) | ||
549 | |||
550 | while self.switch: | ||
551 | # 1. fetch document info from the queue | ||
552 | doc, business_type = self.get_doc_info() | ||
553 | # back off while the queue is empty | ||
554 | if doc is None: | ||
555 | time.sleep(sleep_second) | ||
556 | sleep_second = min(max_sleep_second, sleep_second + 5) | ||
557 | continue | ||
558 | sleep_second = int(conf.SLEEP_SECOND) | ||
559 | |||
560 | try: | ||
561 | start_time = time.time() | ||
562 | # 2. download the PDF from EDMS | ||
563 | doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type) | ||
564 | |||
566 | # 3. extract images from the PDF | ||
566 | img_save_path = os.path.join(doc_data_path, 'img') | ||
567 | self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format( | ||
568 | self.log_base, business_type, doc.id)) | ||
569 | pdf_handler = PDFHandler(pdf_path, img_save_path) | ||
570 | pdf_handler.extract_image() | ||
571 | self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format( | ||
572 | self.log_base, business_type, doc.id)) | ||
573 | |||
574 | # 4. run OCR and build the Excel workbook | ||
575 | bs_summary = {} | ||
576 | license_summary = {} | ||
577 | unknown_summary = {} | ||
578 | res_list = [] | ||
579 | interest_keyword = Keywords.objects.filter( | ||
580 | type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True) | ||
581 | salary_keyword = Keywords.objects.filter( | ||
582 | type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True) | ||
583 | loan_keyword = Keywords.objects.filter( | ||
584 | type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list( | ||
585 | 'keyword', flat=True) | ||
586 | wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword) | ||
587 | |||
588 | # wb = Workbook() | ||
589 | |||
590 | # 4.1 fetch OCR results | ||
591 | loop = asyncio.get_event_loop() | ||
592 | tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list) | ||
593 | for img_path in pdf_handler.img_path_list] | ||
594 | loop.run_until_complete(asyncio.wait(tasks)) | ||
595 | # loop.close() | ||
596 | |||
597 | # for img_path in pdf_handler.img_path_list: | ||
598 | # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list) | ||
599 | |||
600 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' | ||
601 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, | ||
602 | unknown_summary, license_summary)) | ||
603 | |||
604 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | ||
605 | |||
606 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] ' | ||
607 | '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type, | ||
608 | doc.id, merged_bs_summary, | ||
609 | unknown_summary, res_list)) | ||
610 | del unknown_summary | ||
611 | |||
612 | # 4.2 rebuild the Excel file | ||
613 | wb.save(src_excel_path) | ||
614 | wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme) | ||
615 | wb.save(excel_path) | ||
616 | except EDMSException as e: | ||
617 | doc.status = DocStatus.PROCESS_FAILED.value | ||
618 | doc.save() | ||
619 | self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] ' | ||
620 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) | ||
621 | except Exception as e: | ||
622 | doc.status = DocStatus.PROCESS_FAILED.value | ||
623 | doc.save() | ||
624 | self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] ' | ||
625 | '[err={3}]'.format(self.log_base, business_type, doc.id, e)) | ||
626 | else: | ||
627 | try: | ||
628 | # 5. upload to EDMS | ||
629 | for times in range(consts.RETRY_TIMES): | ||
630 | try: | ||
631 | self.edms.upload(excel_path, doc, business_type) | ||
632 | except Exception as e: | ||
633 | self.cronjob_log.warn( | ||
634 | '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] ' | ||
635 | '[error={4}]'.format(self.log_base, times, business_type, doc.id, e)) | ||
636 | edms_exc = str(e) | ||
637 | else: | ||
638 | break | ||
639 | else: | ||
640 | raise EDMSException(edms_exc) | ||
641 | except Exception as e: | ||
642 | doc.status = DocStatus.UPLOAD_FAILED.value | ||
643 | doc.save() | ||
644 | end_time = time.time() | ||
645 | speed_time = int(end_time - start_time) | ||
646 | self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] ' | ||
647 | '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id, | ||
648 | speed_time, e)) | ||
649 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
650 | |||
651 | else: | ||
652 | doc.status = DocStatus.COMPLETE.value | ||
653 | doc.save() | ||
654 | end_time = time.time() | ||
655 | speed_time = int(end_time - start_time) | ||
656 | self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] ' | ||
657 | '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time)) | ||
658 | write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id))) | ||
659 | |||
660 | self.cronjob_log.info('{0} [stop safely]'.format(self.log_base)) |
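One idiom worth calling out in the file above: both the EDMS download (pdf_download) and the upload step in handle use Python's for/else retry pattern, where the else clause of a for loop runs only if no break occurred, that is, only when every attempt failed. A standalone sketch with illustrative names:

    last_err = None
    for attempt in range(3):            # stands in for consts.RETRY_TIMES
        try:
            do_request()                # hypothetical fallible call
        except Exception as e:
            last_err = str(e)
        else:
            break                       # success skips the loop's else clause
    else:
        raise RuntimeError(last_err)    # reached only if every attempt raised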
1 | import re | ||
2 | import os | ||
3 | import ast | ||
4 | import datetime | ||
5 | from openpyxl import Workbook | ||
6 | from django.core.management import BaseCommand | ||
7 | from settings import conf | ||
8 | from common.mixins import LoggerMixin | ||
9 | from apps.doc.models import HILDoc, AFCDoc | ||
10 | from apps.doc import consts | ||
11 | |||
12 | |||
13 | class Command(BaseCommand, LoggerMixin): | ||
14 | |||
15 | def __init__(self): | ||
16 | super().__init__() | ||
17 | self.sheet_name = '身份证' | ||
18 | self.header = ('申请号', '身份证号', '民族', '时间戳') | ||
19 | |||
20 | def add_arguments(self, parser): | ||
21 | parser.add_argument( | ||
22 | '--date', | ||
23 | default=datetime.date.today() - datetime.timedelta(days=1), | ||
24 | dest='date', | ||
25 | help='Date to process, format: 2018-01-01' | ||
26 | ) | ||
27 | |||
28 | def handle(self, *args, **kwargs): | ||
29 | date = kwargs.get('date') | ||
30 | if isinstance(date, str): | ||
31 | if not re.match(r'\d{4}-\d{2}-\d{2}', date): | ||
32 | print('date format error') | ||
33 | return | ||
34 | date_str = date | ||
35 | else: | ||
36 | date_str = date.strftime('%Y-%m-%d') | ||
37 | |||
38 | afc_excel_dir = os.path.join(conf.DATA_DIR, 'AFC', 'IdCard') | ||
39 | hil_excel_dir = os.path.join(conf.DATA_DIR, 'HIL', 'IdCard') | ||
40 | if not os.path.exists(afc_excel_dir) or not os.path.exists(hil_excel_dir): | ||
41 | print('excel_dir not exist') | ||
42 | return | ||
43 | |||
44 | log_path = os.path.join(conf.LOG_DIR, 'idcard.log.{0}'.format(date_str)) | ||
45 | if not os.path.exists(log_path): | ||
46 | print('log_path not exists') | ||
47 | return | ||
48 | |||
49 | wb_afc = Workbook() | ||
50 | ws_afc = wb_afc.create_sheet(self.sheet_name) | ||
51 | ws_afc.append(self.header) | ||
52 | wb_afc.remove(wb_afc.get_sheet_by_name('Sheet')) | ||
53 | |||
54 | wb_hil = Workbook() | ||
55 | ws_hil = wb_hil.create_sheet(self.sheet_name) | ||
56 | ws_hil.append(self.header) | ||
57 | wb_hil.remove(wb_hil.get_sheet_by_name('Sheet')) | ||
58 | |||
59 | with open(log_path, 'r', encoding='utf-8') as fp: | ||
60 | for line in fp: | ||
61 | search_obj = re.match(r'\[(.*)] \[task=(.*)] \[idcard=(.*)]', line) | ||
62 | idcard_str = search_obj.group(3) | ||
63 | idcard_list = ast.literal_eval(idcard_str) | ||
64 | content_list = [] | ||
65 | for idcard_dict in idcard_list: | ||
66 | nation = idcard_dict.get('民族') | ||
67 | if nation is None: | ||
68 | continue | ||
69 | if idcard_dict.get('类别') == '1': # '1' = residence permit, skip | ||
70 | continue | ||
71 | content_list.append((idcard_dict.get('公民身份号码'), nation)) | ||
72 | if len(content_list) == 0: | ||
73 | continue | ||
74 | |||
75 | time_str = search_obj.group(1) | ||
76 | task_str = search_obj.group(2) | ||
77 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | ||
78 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | ||
79 | application_id = doc_class.objects.filter(id=int(doc_id_str)).values_list('application_id', flat=True) | ||
80 | |||
81 | if business_type == consts.HIL_PREFIX: | ||
82 | for id_num, nation in content_list: | ||
83 | ws_hil.append((application_id[0], id_num, nation, time_str)) | ||
84 | else: | ||
85 | for id_num, nation in content_list: | ||
86 | ws_afc.append((application_id[0], id_num, nation, time_str)) | ||
87 | |||
88 | afc_excel_path = os.path.join(afc_excel_dir, 'idcard_{0}.xlsx'.format(date_str)) | ||
89 | hil_excel_path = os.path.join(hil_excel_dir, 'idcard_{0}.xlsx'.format(date_str)) | ||
90 | wb_afc.save(afc_excel_path) | ||
91 | wb_hil.save(hil_excel_path) |
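This command assumes every line of idcard.log has the exact shape produced by the idcard logger and SimpleFormatter added below in logging.cfg. A sketch of the parse on a hypothetical line:

    import ast
    import re

    line = "[2020-05-20 12:00:00,000] [task=AFC_123] [idcard=[{'公民身份号码': '11010119900101001X', '民族': '汉', '类别': '0'}]]"
    m = re.match(r'\[(.*)] \[task=(.*)] \[idcard=(.*)]', line)
    time_str, task_str = m.group(1), m.group(2)
    idcard_list = ast.literal_eval(m.group(3))   # list of dicts, as stored by license1_process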
1 | import os | ||
2 | import datetime | ||
3 | from calendar import monthrange | ||
4 | from openpyxl import Workbook, load_workbook | ||
5 | from django.core.management import BaseCommand | ||
6 | from settings import conf | ||
7 | from common.mixins import LoggerMixin | ||
8 | |||
9 | |||
10 | class Command(BaseCommand, LoggerMixin): | ||
11 | |||
12 | def __init__(self): | ||
13 | super().__init__() | ||
14 | self.dirs = ('AFC', 'HIL') | ||
15 | |||
16 | def handle(self, *args, **kwargs): | ||
17 | now_time = datetime.datetime.now() | ||
18 | first_day_of_month = now_time.replace(day=1) | ||
19 | pre_month = first_day_of_month - datetime.timedelta(days=1) | ||
20 | |||
21 | for target_dir in self.dirs: | ||
22 | excel_dir = os.path.join(conf.DATA_DIR, target_dir, 'IdCard') | ||
23 | if not os.path.exists(excel_dir): | ||
24 | print('excel dir not exists: {0}'.format(excel_dir)) | ||
25 | return | ||
26 | |||
27 | monthly_wb = Workbook() | ||
28 | |||
29 | for d in range(1, monthrange(pre_month.year, pre_month.month)[1] + 1): | ||
30 | date_str = '{:04d}-{:02d}-{:02d}'.format(pre_month.year, pre_month.month, d) | ||
31 | daily_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(date_str)) | ||
32 | if not os.path.exists(daily_excel_path): | ||
33 | print('daily excel path not exists: {0}'.format(daily_excel_path)) | ||
34 | continue | ||
35 | |||
36 | monthly_ws = monthly_wb.create_sheet(date_str) | ||
37 | daily_wb = load_workbook(daily_excel_path) | ||
38 | daily_ws = daily_wb.get_sheet_by_name('身份证') | ||
39 | for row in daily_ws.iter_rows(min_row=1, values_only=True): | ||
40 | monthly_ws.append(row) | ||
41 | |||
42 | monthly_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(pre_month.strftime('%Y-%m'))) | ||
43 | monthly_wb.remove(monthly_wb.get_sheet_by_name('Sheet')) | ||
44 | monthly_wb.save(monthly_excel_path) |
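For reference, calendar.monthrange(year, month) returns a pair (weekday of the first day, number of days in that month), so monthrange(...)[1] + 1 lets range() cover every day of the previous month:

    from calendar import monthrange

    monthrange(2020, 4)   # (2, 30): 2020-04-01 was a Wednesday, and April has 30 days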
... | @@ -14,7 +14,6 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -14,7 +14,6 @@ class Command(BaseCommand, LoggerMixin): |
14 | 14 | ||
15 | def __init__(self): | 15 | def __init__(self): |
16 | super().__init__() | 16 | super().__init__() |
17 | self.log_base = '[license statistics]' | ||
18 | self.header_map = { | 17 | self.header_map = { |
19 | consts.MVI_CLASSIFY: [('申请ID', '发票代码', '发票号码', '开票日期', '不含税价', '发票联', '购买方名称', | 18 | consts.MVI_CLASSIFY: [('申请ID', '发票代码', '发票号码', '开票日期', '不含税价', '发票联', '购买方名称', |
20 | '购买方证件号码', '纳税人识别号', '车架号', '价税合计小写', '销货单位名称', '增值税税额', | 19 | '购买方证件号码', '纳税人识别号', '车架号', '价税合计小写', '销货单位名称', '增值税税额', |
... | @@ -75,7 +74,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -75,7 +74,8 @@ class Command(BaseCommand, LoggerMixin): |
75 | print('excel dir not exists') | 74 | print('excel dir not exists') |
76 | return | 75 | return |
77 | excel_path = os.path.join(excel_dir, 'license_{0}.xlsx'.format(date_str)) | 76 | excel_path = os.path.join(excel_dir, 'license_{0}.xlsx'.format(date_str)) |
78 | log_path = os.path.join(conf.LOG_DIR, 'license.log.{0}'.format(date_str)) | 77 | # log_path = os.path.join(conf.LOG_DIR, 'license.log.{0}'.format(date_str)) |
78 | log_path = os.path.join(conf.LOG_DIR, 'license_statistics.log.{0}'.format(date_str)) | ||
79 | if not os.path.exists(log_path): | 79 | if not os.path.exists(log_path): |
80 | print('log_path not exists') | 80 | print('log_path not exists') |
81 | return | 81 | return |
... | @@ -92,7 +92,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -92,7 +92,8 @@ class Command(BaseCommand, LoggerMixin): |
92 | 92 | ||
93 | with open(log_path, 'r', encoding='utf-8') as fp: | 93 | with open(log_path, 'r', encoding='utf-8') as fp: |
94 | for line in fp: | 94 | for line in fp: |
95 | search_obj = re.search(r'task=(.*) license_summary=(.*)', line) | 95 | # search_obj = re.search(r'task=(.*) license_summary=(.*)', line) |
96 | search_obj = re.search(r'\[task=(.*)] \[license_summary=(.*)]', line) | ||
96 | task_str = search_obj.group(1) | 97 | task_str = search_obj.group(1) |
97 | license_summary = ast.literal_eval(search_obj.group(2)) | 98 | license_summary = ast.literal_eval(search_obj.group(2)) |
98 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | 99 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | ... | ... |
... | @@ -689,8 +689,15 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -689,8 +689,15 @@ class Command(BaseCommand, LoggerMixin): |
689 | '[license_summary={4}]'.format(self.log_base, task_str, bs_summary, | 689 | '[license_summary={4}]'.format(self.log_base, task_str, bs_summary, |
690 | unknown_summary, license_summary)) | 690 | unknown_summary, license_summary)) |
691 | 691 | ||
692 | self.license_log.info('[task={0}] [license_summary={1}]'.format(task_str, license_summary)) | ||
693 | idcard_list = license_summary.get(consts.IC_CLASSIFY) | ||
694 | if idcard_list: | ||
695 | self.idcard_log.info('[task={0}] [idcard={1}]'.format(task_str, idcard_list)) | ||
696 | |||
692 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | 697 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) |
693 | 698 | ||
699 | self.bs_log.info('[task={0}] [bs_summary={1}]'.format(task_str, merged_bs_summary)) | ||
700 | |||
694 | self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] ' | 701 | self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] ' |
695 | '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary, | 702 | '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary, |
696 | unknown_summary, res_list)) | 703 | unknown_summary, res_list)) | ... | ... |
... | @@ -40,6 +40,9 @@ class LoggerMixin: | ... | @@ -40,6 +40,9 @@ class LoggerMixin: |
40 | exception_log = logging.getLogger('exception') | 40 | exception_log = logging.getLogger('exception') |
41 | cronjob_log = logging.getLogger('cronjob') | 41 | cronjob_log = logging.getLogger('cronjob') |
42 | folder_log = logging.getLogger('folder') | 42 | folder_log = logging.getLogger('folder') |
43 | bs_log = logging.getLogger('bs') | ||
44 | license_log = logging.getLogger('license') | ||
45 | idcard_log = logging.getLogger('idcard') | ||
43 | 46 | ||
44 | 47 | ||
45 | class GenericView(LoggerMixin, GenericExceptionMixin, GenericAPIView): | 48 | class GenericView(LoggerMixin, GenericExceptionMixin, GenericAPIView): | ... | ... |
... | @@ -84,9 +84,15 @@ class PDFHandler: | ... | @@ -84,9 +84,15 @@ class PDFHandler: |
84 | def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0): | 84 | def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0): |
85 | pix = self.recover_pix(pdf, xref, smask, colorspace) | 85 | pix = self.recover_pix(pdf, xref, smask, colorspace) |
86 | ext, img_data = self.get_img_data(pix) | 86 | ext, img_data = self.get_img_data(pix) |
87 | img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext) | 87 | if ext == 'jpx': |
88 | with open(img_save_path, "wb") as f: | 88 | img_save_path = self.get_img_save_path(pno, img_index=img_index, ext='jpeg') |
89 | f.write(img_data) | 89 | jpx_pix = fitz.Pixmap(img_data) |
90 | jpx_pix.writeImage(img_save_path) | ||
91 | jpx_pix = None | ||
92 | else: | ||
93 | img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext) | ||
94 | with open(img_save_path, "wb") as f: | ||
95 | f.write(img_data) | ||
90 | self.xref_set.add(xref) | 96 | self.xref_set.add(xref) |
91 | self.img_path_list.append(img_save_path) | 97 | self.img_path_list.append(img_save_path) |
92 | 98 | ... | ... |
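The new branch above re-encodes JPEG2000 ('jpx') image streams as JPEG by round-tripping through a PyMuPDF Pixmap, presumably because a downstream consumer cannot handle JPEG2000. Outside the extraction flow, the same conversion looks roughly like this (file names are hypothetical; Pixmap(img_data) and writeImage are the calls the hunk itself uses):

    import fitz  # PyMuPDF

    with open('page_0_img_0.jpx', 'rb') as f:   # hypothetical jpx stream dumped from a PDF
        jpx_pix = fitz.Pixmap(f.read())         # decode, as in the hunk above
    jpx_pix.writeImage('page_0_img_0.jpeg')     # re-encode; output format chosen by extension
    jpx_pix = None                              # drop the reference, mirroring the hunk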
1 | [loggers] | 1 | [loggers] |
2 | keys=root, running, exception, cronjob, folder, django.db.backends | 2 | keys=root, running, exception, cronjob, folder, bs, license, idcard, django.db.backends |
3 | 3 | ||
4 | [handlers] | 4 | [handlers] |
5 | keys=consoleHandler, django_rotateFileHandler, exceptionFileHandler, cronjobFileHandler, folderFileHandler, djangodbFileHandler | 5 | keys=consoleHandler, django_rotateFileHandler, exceptionFileHandler, cronjobFileHandler, folderFileHandler, bsFileHandler, licenseFileHandler, idcardFileHandler, djangodbFileHandler |
6 | 6 | ||
7 | [formatters] | 7 | [formatters] |
8 | keys=SituFormatter, dataLogFormatter | 8 | keys=SituFormatter, dataLogFormatter, SimpleFormatter |
9 | 9 | ||
10 | [formatter_SituFormatter] | 10 | [formatter_SituFormatter] |
11 | format=[%(asctime)s] [%(process)d] [%(thread)d] [%(threadName)s] [%(filename)s:%(lineno)d] %(levelname)s %(message)s | 11 | format=[%(asctime)s] [%(process)d] [%(thread)d] [%(threadName)s] [%(filename)s:%(lineno)d] %(levelname)s %(message)s |
... | @@ -15,6 +15,10 @@ datefmt= | ... | @@ -15,6 +15,10 @@ datefmt= |
15 | class=situlogger.JsonFormatter | 15 | class=situlogger.JsonFormatter |
16 | format=%(asctime)s %(levelname)s %(funcName)s | 16 | format=%(asctime)s %(levelname)s %(funcName)s |
17 | 17 | ||
18 | [formatter_SimpleFormatter] | ||
19 | format=[%(asctime)s] %(message)s | ||
20 | datefmt= | ||
21 | |||
18 | [handler_consoleHandler] | 22 | [handler_consoleHandler] |
19 | class=StreamHandler | 23 | class=StreamHandler |
20 | level=ERROR | 24 | level=ERROR |
... | @@ -45,6 +49,24 @@ level=DEBUG | ... | @@ -45,6 +49,24 @@ level=DEBUG |
45 | formatter=SituFormatter | 49 | formatter=SituFormatter |
46 | args=('../logs/folder_ocr.log',) | 50 | args=('../logs/folder_ocr.log',) |
47 | 51 | ||
52 | [handler_bsFileHandler] | ||
53 | class=situlogger.SituRotatingFileHandler | ||
54 | level=DEBUG | ||
55 | formatter=SimpleFormatter | ||
56 | args=('../logs/bs_statistics.log',) | ||
57 | |||
58 | [handler_licenseFileHandler] | ||
59 | class=situlogger.SituRotatingFileHandler | ||
60 | level=DEBUG | ||
61 | formatter=SimpleFormatter | ||
62 | args=('../logs/license_statistics.log',) | ||
63 | |||
64 | [handler_idcardFileHandler] | ||
65 | class=situlogger.SituRotatingFileHandler | ||
66 | level=DEBUG | ||
67 | formatter=SimpleFormatter | ||
68 | args=('../logs/idcard.log',) | ||
69 | |||
48 | [handler_djangodbFileHandler] | 70 | [handler_djangodbFileHandler] |
49 | class=situlogger.SituRotatingFileHandler | 71 | class=situlogger.SituRotatingFileHandler |
50 | level=DEBUG | 72 | level=DEBUG |
... | @@ -79,6 +101,24 @@ handlers=folderFileHandler | ... | @@ -79,6 +101,24 @@ handlers=folderFileHandler |
79 | qualname=folder | 101 | qualname=folder |
80 | propagate=0 | 102 | propagate=0 |
81 | 103 | ||
104 | [logger_bs] | ||
105 | level=INFO | ||
106 | handlers=bsFileHandler | ||
107 | qualname=bs | ||
108 | propagate=0 | ||
109 | |||
110 | [logger_license] | ||
111 | level=INFO | ||
112 | handlers=licenseFileHandler | ||
113 | qualname=license | ||
114 | propagate=0 | ||
115 | |||
116 | [logger_idcard] | ||
117 | level=INFO | ||
118 | handlers=idcardFileHandler | ||
119 | qualname=idcard | ||
120 | propagate=0 | ||
121 | |||
82 | [logger_django.db.backends] | 122 | [logger_django.db.backends] |
83 | level=DEBUG | 123 | level=DEBUG |
84 | handlers=djangodbFileHandler | 124 | handlers=djangodbFileHandler | ... | ... |
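Taken together: each new logger writes through SimpleFormatter, so every line of the three statistics logs is just '[asctime] message', which is the shape the updated regexes in bs_statistics, license_statistics, and idcard_daily expect. A sketch of the round trip, with an illustrative config path and task id:

    import logging
    import logging.config

    logging.config.fileConfig('logging.cfg')
    logging.getLogger('bs').info('[task=AFC_123] [bs_summary={}]')
    # appends to ../logs/bs_statistics.log a line like:
    # [2020-05-20 12:00:00,000] [task=AFC_123] [bs_summary={}]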