Merge branch 'feature/main' into feature/mssql

Showing 10 changed files with 204 additions and 669 deletions
@@ -152,7 +152,7 @@ RESULT_IDX = FIXED_HEADERS.index('核对结果')
 # '借贷': ('贷', '借'),  # vertical - no table - 广发银行
 # '借贷状态': ('贷', '借'),  # vertical - special - 交通银行
 # '收/支': ('收入', '支出'),  # horizontal - table - 北京银行
-BORROW_HEADERS_SET = {'借贷', '借贷状态', '收/支', '收支标志'}
+BORROW_HEADERS_SET = {'借贷', '借\n贷', '借贷状态', '收/支', '收支标志'}
 BORROW_INCOME_SET = {'贷', '收入', '收', '收(Cr)'}
 BORROW_OUTLAY_SET = {'借', '支出', '支', '付(Dr)'}
 INCOME_HEADERS_SET = {'收入金额', '收入', '存入', '存入金额(贷)', '存入金额(贷)'}
@@ -165,6 +165,7 @@ HEADERS_MAPPING = {}
 HEADERS_MAPPING.update(
     {
         '借贷': BORROW_KEY,
+        '借\n贷': BORROW_KEY,
         '借贷状态': BORROW_KEY,
         '收支标志': BORROW_KEY,
         '收/支': BORROW_KEY,
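Context for the new '借\n贷' entries: when a statement header cell is printed vertically, OCR can return the two characters with an embedded newline, so both spellings have to resolve to the same canonical key. A minimal sketch of the lookup; BORROW_KEY's real value and the canonical_header helper are illustrative, not code from this repo:

import sys

BORROW_KEY = 'borrow_flag'  # assumed; the real constant lives elsewhere in the module
HEADERS_MAPPING = {
    '借贷': BORROW_KEY,
    '借\n贷': BORROW_KEY,   # the variant this merge adds
    '借贷状态': BORROW_KEY,
}

def canonical_header(raw_cell):
    # try the raw OCR text first, then retry with the newline stripped
    return HEADERS_MAPPING.get(raw_cell) or HEADERS_MAPPING.get(raw_cell.replace('\n', ''), raw_cell)

assert canonical_header('借\n贷') == BORROW_KEY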
@@ -40,7 +40,8 @@ class Command(BaseCommand, LoggerMixin):
             print('excel dir not exists')
             return
         excel_path = os.path.join(excel_dir, 'bs_{0}.xlsx'.format(date_str))
-        log_path = os.path.join(conf.LOG_DIR, 'bs.log.{0}'.format(date_str))
+        # log_path = os.path.join(conf.LOG_DIR, 'bs.log.{0}'.format(date_str))
+        log_path = os.path.join(conf.LOG_DIR, 'bs_statistics.log.{0}'.format(date_str))
         if not os.path.exists(log_path):
             print('log_path not exists')
             return
@@ -48,7 +49,8 @@ class Command(BaseCommand, LoggerMixin):
         summary_dict = {}
         with open(log_path, 'r', encoding='utf-8') as fp:
             for line in fp:
-                search_obj = re.search(r'task=(.*) merged_bs_summary=(.*)', line)
+                # search_obj = re.search(r'task=(.*) merged_bs_summary=(.*)', line)
+                search_obj = re.search(r'\[task=(.*)] \[bs_summary=(.*)]', line)
                 task_str = search_obj.group(1)
                 business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
                 doc_id = int(doc_id_str)
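The new regex matches the dedicated bs_statistics log format rather than the old cronjob line. A quick sketch of what it parses; the '[<timestamp>] ' prefix comes from the SimpleFormatter added to logging.conf in this merge, and 'AFC_1234' stands in for the real business_type + consts.SPLIT_STR + doc_id task string:

import ast
import re

line = "[2020-06-01 10:20:30,123] [task=AFC_1234] [bs_summary={'6217***': {'role': None}}]"

search_obj = re.search(r'\[task=(.*)] \[bs_summary=(.*)]', line)
if search_obj:  # guard added for the sketch; the command assumes every line matches
    task_str = search_obj.group(1)                   # 'AFC_1234'
    summary = ast.literal_eval(search_obj.group(2))  # back to a dict, as the stats commands do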
import os
import time
import json
import signal
import asyncio
import aiohttp
import difflib
import base64
import requests
from datetime import datetime, date
from collections import Counter
from apps.doc.ocr.wb import BSWorkbook, Workbook
from django.core.management import BaseCommand

from settings import conf
from common.mixins import LoggerMixin
from common.tools.file_tools import write_zip_file
from common.tools.pdf_to_img import PDFHandler
from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
from apps.doc.named_enum import KeywordsType
from apps.doc import consts
from apps.doc.ocr.edms import EDMS, rh
from apps.doc.exceptions import EDMSException


class Command(BaseCommand, LoggerMixin):

    def __init__(self):
        super().__init__()
        self.log_base = '[doc ocr process]'
        # processing switch: set to False to stop picking up files
        self.switch = True
        # data directory
        self.data_dir = conf.DATA_DIR
        # OCR endpoints
        self.ocr_url_1 = conf.OCR_URL_1
        self.ocr_url_2 = conf.OCR_URL_2
        self.ocr_url_3 = conf.BC_URL
        # EDMS web_service_api
        self.edms = EDMS()
        # graceful-shutdown signal: 15 (SIGTERM)
        signal.signal(signal.SIGTERM, self.signal_handler)

    def signal_handler(self, sig, frame):
        self.switch = False  # stop processing files

    def get_doc_info(self):
        task_str, is_priority = rh.dequeue()
        if task_str is None:
            self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
            return None, None

        business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
        doc_id = int(doc_id_str)
        doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
        # doc_info = doc_class.objects.filter(id=doc_id, status=DocStatus.INIT.value).values(
        #     'id', 'metadata_version_id', 'application_id', 'document_name', 'document_scheme').first()
        doc = doc_class.objects.filter(id=doc_id).first()
        if doc is None:
            self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
                self.log_base, task_str, is_priority))
            return None, None
        elif doc.status != DocStatus.INIT.value:
            self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
                                  '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
            return None, None
        doc.status = DocStatus.PROCESSING.value
        doc.save()
        self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
            self.log_base, task_str, is_priority))
        return doc, business_type

    def pdf_download(self, doc, business_type):
        doc_data_path = os.path.join(self.data_dir, business_type, str(doc.id))
        os.makedirs(doc_data_path, exist_ok=True)
        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
        if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
            for times in range(consts.RETRY_TIMES):
                try:
                    self.edms.download(pdf_path, doc.metadata_version_id)
                except Exception as e:
                    self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
                                          '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
                    edms_exc = str(e)
                else:
                    break
            else:
                raise EDMSException(edms_exc)

        excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
        src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
        self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
            self.log_base, business_type, doc.id, pdf_path))
        return doc_data_path, excel_path, src_excel_path, pdf_path

    def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino):
        sheets = ocr_data.get('data', [])
        if not sheets:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
        confidence = ocr_data.get('confidence', 1)
        img_name = 'page_{0}_img_{1}'.format(pno, ino)
        cells_exists = False
        for i, sheet in enumerate(sheets):
            cells = sheet.get('cells')
            if not cells:
                continue
            cells_exists = True
            sheet_name = '{0}_{1}'.format(img_name, i)
            ws = wb.create_sheet(sheet_name)
            for cell in cells:
                c1 = cell.get('start_column')
                r1 = cell.get('start_row')
                words = cell.get('words')
                ws.cell(row=r1 + 1, column=c1 + 1, value=words)

            # summary fields: ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间']
            summary = sheet.get('summary')
            card = summary[1]
            if card is None:
                classify_dict = unknown_summary.setdefault(classify, {})
                role = consts.UNKNOWN_ROLE if summary[0] is None else summary[0]
                role_dict = classify_dict.setdefault(role, {})
                role_dict['classify'] = classify
                role_dict['role'] = role
                role_dict.setdefault('sheet', []).append(sheet_name)
                role_dict.setdefault('confidence', []).append(confidence)
                code_list = role_dict.setdefault('code', [])
                pt_list = role_dict.setdefault('print_time', [])
                sd_list = role_dict.setdefault('start_date', [])
                ed_list = role_dict.setdefault('end_date', [])
                if summary[3] is not None:
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])
            else:
                card_dict = bs_summary.setdefault(card, {})
                card_dict['count'] = card_dict.get('count', 0) + 1
                card_dict.setdefault('classify', []).append(classify)
                card_dict.setdefault('confidence', []).append(confidence)
                card_dict.setdefault('sheet', []).append(sheet_name)
                role_list = card_dict.setdefault('role', [])
                role_set = card_dict.setdefault('role_set', set())
                code_list = card_dict.setdefault('code', [])
                pt_list = card_dict.setdefault('print_time', [])
                sd_list = card_dict.setdefault('start_date', [])
                ed_list = card_dict.setdefault('end_date', [])
                if summary[0] is not None:
                    role_list.append(summary[0])
                    role_set.add(summary[0])
                if summary[3] is not None:
                    code_list.append((summary[2], summary[3]))
                if summary[4] is not None:
                    pt_list.append(summary[4])
                if summary[5] is not None:
                    sd_list.append(summary[5])
                if summary[6] is not None:
                    ed_list.append(summary[6])

        if cells_exists:
            res_list.append((pno, ino, consts.RES_SUCCESS))
        else:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))

    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino):
        # classes: '0' ID card, '1' residence permit
        license_data = ocr_data.get('data', [])
        if not license_data:
            res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
            return
        res_list.append((pno, ino, consts.RES_SUCCESS))
        license_summary.setdefault(classify, []).extend(license_data)

    def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino):
        if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
            res_list.append((pno, ino, consts.RES_SUCCESS))
            if pid == consts.BC_PID:
                # bank card
                # res_dict = {}
                # for en_key, chn_key in consts.BC_FIELD:
                #     res_dict[chn_key] = ocr_res_2.get(en_key, '')
                license_summary.setdefault(classify, []).append(ocr_res_2)
            else:
                # business licenses and similar documents
                for result_dict in ocr_res_2.get('ResultList', []):
                    res_dict = {}
                    for field_dict in result_dict.get('FieldList', []):
                        res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
                    license_summary.setdefault(classify, []).append(res_dict)
        else:
            res_list.append((pno, ino, consts.RES_FAILED))

    @staticmethod
    async def fetch_ocr_1_result(url, json_data):
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.post(url, json=json_data) as response:
                if response.status == 200:
                    return await response.json()

    @staticmethod
    async def fetch_ocr_2_result(url, json_data):
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.post(url, data=json_data) as response:
                if response.status == 200:
                    return await response.text()

    @staticmethod
    async def fetch_bc_name_result(url, json_data):
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
            async with session.post(url, json=json_data) as response:
                if response.status == 200:
                    return await response.json()

    async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list):
        pno, ino = self.parse_img_path(img_path)
        with open(img_path, 'rb') as f:
            base64_data = base64.b64encode(f.read())
            # decode the base64 bytes into a str
            file_data = base64_data.decode()
        json_data_1 = {
            "file": file_data
        }
        ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
        if ocr_res_1 is None:
            res_list.append((pno, ino, consts.RES_FAILED))
            self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path))
            # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
        else:
            self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format(
                self.log_base, img_path, ocr_res_1))

            if ocr_res_1.get('code') == 1:
                ocr_data = ocr_res_1.get('data', {})
                classify = ocr_data.get('classify')
                if classify is None:
                    res_list.append((pno, ino, consts.RES_FAILED))
                    self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                        self.log_base, img_path, ocr_res_1))
                    return
                elif classify in consts.OTHER_CLASSIFY_SET:  # other classes
                    res_list.append((pno, ino, consts.RES_SUCCESS_OTHER))
                    return
                elif classify in consts.LICENSE_CLASSIFY_SET_1:  # license group 1
                    self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino)
                elif classify in consts.LICENSE_CLASSIFY_SET_2:  # license group 2
                    pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
                    json_data_2 = {
                        "pid": str(pid),
                        # "key": conf.OCR_KEY,
                        # "secret": conf.OCR_SECRET,
                        "filedata": file_data
                    }
                    ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
                    if ocr_res_2 is None:
                        res_list.append((pno, ino, consts.RES_FAILED))
                        self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path))
                        # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
                    else:
                        # recognition result
                        ocr_res_2 = json.loads(ocr_res_2)
                        self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format(
                            self.log_base, img_path, ocr_res_2))
                        if classify == consts.BC_CLASSIFY:
                            name = '有'
                            json_data_1['card_res'] = ocr_res_2
                            card_name_res = await self.fetch_bc_name_result(self.ocr_url_3, json_data_1)
                            if isinstance(card_name_res, dict) and \
                                    card_name_res.get('data', {}).get('is_exists_name') == 0:
                                name = '无'
                            ocr_res_2['Name'] = name
                        self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino)
                else:  # bank statement processing
                    self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino)
            else:
                res_list.append((pno, ino, consts.RES_FAILED))
                self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
                    self.log_base, img_path, ocr_res_1))

    # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
    #     # # bank statement
    #     # res = {
    #     #     'code': 1,
    #     #     'msg': 'success',
    #     #     'data': {
    #     #         'classify': 0,
    #     #         'confidence': 0.999,
    #     #         'data': [
    #     #             {
    #     #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
    #     #                 'cells': []
    #     #             },
    #     #             {
    #     #                 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
    #     #                 'cells': []
    #     #             }
    #     #         ]
    #     #     }
    #     # }
    #     #
    #     # # license group 1
    #     # res = {
    #     #     'code': 1,
    #     #     'msg': 'success',
    #     #     'data': {
    #     #         'classify': 0,
    #     #         'confidence': 0.999,
    #     #         'data': [
    #     #             {
    #     #                 'cn_key': 'value',
    #     #                 'cn_key': 'value',
    #     #             },
    #     #             {
    #     #                 'cn_key': 'value',
    #     #                 'cn_key': 'value',
    #     #             },
    #     #         ]
    #     #     }
    #     # }
    #     #
    #     # # license group 2 or other classes
    #     # res = {
    #     #     'code': 1,
    #     #     'msg': 'success',
    #     #     'data': {
    #     #         'classify': 0,
    #     #         'confidence': 0.999,
    #     #     }
    #     # }
    #     with open(img_path, 'rb') as f:
    #         base64_data = base64.b64encode(f.read())
    #         # decode the base64 bytes into a str
    #         file_data = base64_data.decode()
    #     json_data_1 = {
    #         "file": file_data
    #     }
    #     response_1 = requests.post(self.ocr_url_1, json=json_data_1)
    #     if response_1.status_code == 200:
    #         ocr_res_1 = response_1.json()
    #         self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
    #             self.log_base, img_path, ocr_res_1))
    #
    #         if ocr_res_1.get('code') == 1:
    #             ocr_data = ocr_res_1.get('data', {})
    #             classify = ocr_data.get('classify')
    #             if classify is None:
    #                 skip_img.append(self.parse_img_path(img_path))
    #                 return
    #             elif classify in consts.OTHER_CLASSIFY_SET:  # other classes
    #                 skip_img.append(self.parse_img_path(img_path))
    #                 return
    #             elif classify in consts.LICENSE_CLASSIFY_SET_1:  # license group 1
    #                 self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
    #             elif classify in consts.LICENSE_CLASSIFY_SET_2:  # license group 2
    #                 pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
    #                 json_data_2 = {
    #                     "pid": str(pid),
    #                     "key": conf.OCR_KEY,
    #                     "secret": conf.OCR_SECRET,
    #                     "file": file_data
    #                 }
    #                 response_2 = requests.post(self.ocr_url_2, data=json_data_2)
    #                 if response_2.status_code == 200:
    #                     # recognition result
    #                     ocr_res_2 = response_2.json()
    #                     self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
    #                         self.log_base, img_path, ocr_res_2))
    #                     self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
    #                 else:
    #                     raise Exception('ocr 2 error, img_path={0}'.format(img_path))
    #             else:  # bank statement processing
    #                 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
    #         else:
    #             skip_img.append(self.parse_img_path(img_path))
    #     else:
    #         raise Exception('ocr 1 error, img_path={0}'.format(img_path))

    @staticmethod
    def parse_img_path(img_path):
        img_name, _ = os.path.splitext(os.path.basename(img_path))
        part_list = img_name.split('_')
        # e.g. page_7_img_11_0
        return int(part_list[1]) + 1, int(part_list[3]) + 1

    @staticmethod
    def get_most(value_list):
        if value_list:
            most_common = Counter(value_list).most_common(1)
            return most_common[0][0] if most_common else None

    @staticmethod
    def date_format(date_str, format_str):
        try:
            date_res = datetime.strptime(date_str, format_str).date()
        except Exception as e:
            return
        else:
            return date_res

    def get_validate_date(self, date_list):
        for date_str in date_list:
            for format_str in consts.DATE_FORMAT:
                date_res = self.date_format(date_str, format_str)
                if isinstance(date_res, date):
                    return date_res

    def merge_card(self, bs_summary):
        merged_bs_summary = {}
        sorted_card = sorted(bs_summary.keys(), key=lambda x: bs_summary[x]['count'], reverse=True)
        for main_card in sorted_card:
            if bs_summary.get(main_card) is None:
                continue
            merged_bs_summary[main_card] = bs_summary.pop(main_card)
            del merged_bs_summary[main_card]['count']
            merge_cards = []
            for card in bs_summary.keys():
                if difflib.SequenceMatcher(None, main_card, card).quick_ratio() > consts.CARD_RATIO:
                    merged_bs_summary[main_card]['classify'].extend(bs_summary[card]['classify'])
                    merged_bs_summary[main_card]['confidence'].extend(bs_summary[card]['confidence'])
                    merged_bs_summary[main_card]['sheet'].extend(bs_summary[card]['sheet'])
                    merged_bs_summary[main_card]['role'].extend(bs_summary[card]['role'])
                    merged_bs_summary[main_card]['role_set'].update(bs_summary[card]['role_set'])
                    merged_bs_summary[main_card]['code'].extend(bs_summary[card]['code'])
                    merged_bs_summary[main_card]['print_time'].extend(bs_summary[card]['print_time'])
                    merged_bs_summary[main_card]['start_date'].extend(bs_summary[card]['start_date'])
                    merged_bs_summary[main_card]['end_date'].extend(bs_summary[card]['end_date'])
                    merge_cards.append(card)
            for card in merge_cards:
                del bs_summary[card]
            merged_bs_summary[main_card]['classify'] = self.get_most(merged_bs_summary[main_card]['classify'])
            merged_bs_summary[main_card]['role'] = self.get_most(merged_bs_summary[main_card]['role'])
        del bs_summary
        return merged_bs_summary

    def prune_bs_summary(self, bs_summary):
        for summary in bs_summary.values():
            del summary['count']
            summary['classify'] = self.get_most(summary['classify'])
            summary['role'] = self.get_most(summary['role'])
        return bs_summary

    def rebuild_bs_summary(self, bs_summary, unknown_summary):
        # bs_summary = {
        #     '卡号': {
        #         'count': 100,
        #         'classify': [],
        #         'confidence': [],
        #         'role': [],
        #         'code': [('page', 'code')],
        #         'print_time': [],
        #         'start_date': [],
        #         'end_date': [],
        #         'sheet': ['sheet_name']
        #     }
        # }
        #
        # unknown_summary = {
        #     0: {
        #         '户名': {
        #             'classify': 0,
        #             'confidence': [],
        #             'role': '户名',
        #             'code': [('page', 'code')],
        #             'print_time': [],
        #             'start_date': [],
        #             'end_date': [],
        #             'sheet': ['sheet_name']
        #         }
        #     }
        # }
        # no card number at all
        if len(bs_summary) == 0:
            del bs_summary
            merged_bs_summary = {}
            card_num = 1
            for role_dict in unknown_summary.values():
                if len(role_dict) == 2 and consts.UNKNOWN_ROLE in role_dict:
                    summary_dict = role_dict.pop(consts.UNKNOWN_ROLE, {})
                    for summary in role_dict.values():
                        summary_dict['confidence'].extend(summary['confidence'])
                        summary_dict['role'] = summary['role']
                        summary_dict['code'].extend(summary['code'])
                        summary_dict['print_time'].extend(summary['print_time'])
                        summary_dict['start_date'].extend(summary['start_date'])
                        summary_dict['end_date'].extend(summary['end_date'])
                        summary_dict['sheet'].extend(summary['sheet'])
                    card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
                    merged_bs_summary[card] = summary_dict
                else:
                    for summary in role_dict.values():
                        card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
                        card_num += 1
                        merged_bs_summary[card] = summary
        else:
            # exactly one card number
            one_card = False
            if len(bs_summary) == 1:
                merged_bs_summary = self.prune_bs_summary(bs_summary)
                one_card = True
            # multiple card numbers
            else:
                merged_bs_summary = self.merge_card(bs_summary)

            for card_summary in merged_bs_summary.values():
                merge_role = []
                classify_summary = unknown_summary.get(card_summary['classify'], {})
                for role, summary in classify_summary.items():
                    if one_card or role in card_summary['role_set']:
                        merge_role.append(role)
                        card_summary['confidence'].extend(summary['confidence'])
                        card_summary['sheet'].extend(summary['sheet'])
                        card_summary['code'].extend(summary['code'])
                        card_summary['print_time'].extend(summary['print_time'])
                        card_summary['start_date'].extend(summary['start_date'])
                        card_summary['end_date'].extend(summary['end_date'])

                for role in merge_role:
                    del classify_summary[role]

            card_num = 1
            for role_dict in unknown_summary.values():
                for summary in role_dict.values():
                    card = '{0}_{1}'.format(consts.UNKNOWN_CARD, card_num)
                    card_num += 1
                    merged_bs_summary[card] = summary

        del unknown_summary
        for summary in merged_bs_summary.values():
            if summary.get('role_set') is not None:
                del summary['role_set']
            summary['print_time'] = self.get_validate_date(summary['print_time'])
            summary['start_date'] = self.get_validate_date(summary['start_date'])
            summary['end_date'] = self.get_validate_date(summary['end_date'])
            summary['confidence'] = max(summary['confidence'])
        return merged_bs_summary

    # TODO refine doc statuses: distinguish failure states, requeue, and handle retries differently
    # TODO email notification on exceptions
    #  recognition failure: ordinary exceptions, e.g. PDF errors or workbook-build errors
    #  EDMS failure: download error --> back to queue --> email; upload error --> re-upload queue --> email
    #  algorithm failure: stage-1 error --> recognition failed --> email; stage-2 error --> recognition failed --> email
    # TODO retry OCR API calls
    def handle(self, *args, **kwargs):
        sleep_second = int(conf.SLEEP_SECOND)
        max_sleep_second = int(conf.MAX_SLEEP_SECOND)

        while self.switch:
            # 1. fetch the task from the queue
            doc, business_type = self.get_doc_info()
            # back off while the queue is empty
            if doc is None:
                time.sleep(sleep_second)
                sleep_second = min(max_sleep_second, sleep_second + 5)
                continue
            sleep_second = int(conf.SLEEP_SECOND)

            try:
                start_time = time.time()
                # 2. download the PDF from EDMS
                doc_data_path, excel_path, src_excel_path, pdf_path = self.pdf_download(doc, business_type)

                # 3. extract images from the PDF
                img_save_path = os.path.join(doc_data_path, 'img')
                self.cronjob_log.info('{0} [pdf to img start] [business_type={1}] [doc_id={2}]'.format(
                    self.log_base, business_type, doc.id))
                pdf_handler = PDFHandler(pdf_path, img_save_path)
                pdf_handler.extract_image()
                self.cronjob_log.info('{0} [pdf to img end] [business_type={1}] [doc_id={2}]'.format(
                    self.log_base, business_type, doc.id))

                # 4. collect OCR results and build the excel file
                bs_summary = {}
                license_summary = {}
                unknown_summary = {}
                res_list = []
                interest_keyword = Keywords.objects.filter(
                    type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
                salary_keyword = Keywords.objects.filter(
                    type=KeywordsType.SALARY.value, on_off=True).values_list('keyword', flat=True)
                loan_keyword = Keywords.objects.filter(
                    type__in=[KeywordsType.LOAN.value, KeywordsType.ALI_WECHART.value], on_off=True).values_list(
                    'keyword', flat=True)
                wb = BSWorkbook(interest_keyword, salary_keyword, loan_keyword)

                # wb = Workbook()

                # 4.1 collect OCR results
                loop = asyncio.get_event_loop()
                tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
                         for img_path in pdf_handler.img_path_list]
                loop.run_until_complete(asyncio.wait(tasks))
                # loop.close()

                # for img_path in pdf_handler.img_path_list:
                #     self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)

                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
                                      '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
                                                                     unknown_summary, license_summary))

                merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)

                self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
                                      '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type,
                                                                                    doc.id, merged_bs_summary,
                                                                                    unknown_summary, res_list))
                del unknown_summary

                # 4.2 rebuild the excel file
                wb.save(src_excel_path)
                wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
                wb.save(excel_path)
            except EDMSException as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            except Exception as e:
                doc.status = DocStatus.PROCESS_FAILED.value
                doc.save()
                self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
                                       '[err={3}]'.format(self.log_base, business_type, doc.id, e))
            else:
                try:
                    # 5. upload to EDMS
                    for times in range(consts.RETRY_TIMES):
                        try:
                            self.edms.upload(excel_path, doc, business_type)
                        except Exception as e:
                            self.cronjob_log.warn(
                                '{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
                                '[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
                            edms_exc = str(e)
                        else:
                            break
                    else:
                        raise EDMSException(edms_exc)
                except Exception as e:
                    doc.status = DocStatus.UPLOAD_FAILED.value
                    doc.save()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
                    self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
                                           '[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id,
                                                                               speed_time, e))
                    write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
                else:
                    doc.status = DocStatus.COMPLETE.value
                    doc.save()
                    end_time = time.time()
                    speed_time = int(end_time - start_time)
                    self.cronjob_log.info('{0} [process complete] [business_type={1}] [doc_id={2}] '
                                          '[speed_time={3}]'.format(self.log_base, business_type, doc.id, speed_time))
                    write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))

        self.cronjob_log.info('{0} [stop safely]'.format(self.log_base))
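The command above leans twice (EDMS download and upload) on Python's for/else retry shape: the inner except remembers the error, the inner else breaks on success, and the loop's own else clause raises only when every attempt failed. A self-contained sketch of that shape; RetryError and flaky_download stand in for EDMSException and self.edms.download:

class RetryError(Exception):
    pass

RETRY_TIMES = 3  # the real value lives in apps.doc.consts
attempts = {'n': 0}

def flaky_download():
    attempts['n'] += 1
    if attempts['n'] < 3:
        raise IOError('simulated EDMS hiccup')

for times in range(RETRY_TIMES):
    try:
        flaky_download()
    except Exception as e:
        last_exc = str(e)  # remember the last failure, then retry
    else:
        break              # success: leave the retry loop
else:
    # reached only when the loop exhausted RETRY_TIMES without a break
    raise RetryError(last_exc)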
import re
import os
import ast
import datetime
from openpyxl import Workbook
from django.core.management import BaseCommand
from settings import conf
from common.mixins import LoggerMixin
from apps.doc.models import HILDoc, AFCDoc
from apps.doc import consts


class Command(BaseCommand, LoggerMixin):

    def __init__(self):
        super().__init__()
        self.sheet_name = '身份证'
        self.header = ('申请号', '身份证号', '民族', '时间戳')

    def add_arguments(self, parser):
        parser.add_argument(
            '--date',
            default=datetime.date.today() - datetime.timedelta(days=1),
            dest='date',
            help='date to compute, format: 2018-01-01'
        )

    def handle(self, *args, **kwargs):
        date = kwargs.get('date')
        if isinstance(date, str):
            if not re.match(r'\d{4}-\d{2}-\d{2}', date):
                print('date format error')
                return
            date_str = date
        else:
            date_str = date.strftime('%Y-%m-%d')

        afc_excel_dir = os.path.join(conf.DATA_DIR, 'AFC', 'IdCard')
        hil_excel_dir = os.path.join(conf.DATA_DIR, 'HIL', 'IdCard')
        if not os.path.exists(afc_excel_dir) or not os.path.exists(hil_excel_dir):
            print('excel_dir not exist')
            return

        log_path = os.path.join(conf.LOG_DIR, 'idcard.log.{0}'.format(date_str))
        if not os.path.exists(log_path):
            print('log_path not exists')
            return

        wb_afc = Workbook()
        ws_afc = wb_afc.create_sheet(self.sheet_name)
        ws_afc.append(self.header)
        wb_afc.remove(wb_afc.get_sheet_by_name('Sheet'))

        wb_hil = Workbook()
        ws_hil = wb_hil.create_sheet(self.sheet_name)
        ws_hil.append(self.header)
        wb_hil.remove(wb_hil.get_sheet_by_name('Sheet'))

        with open(log_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                search_obj = re.match(r'\[(.*)] \[task=(.*)] \[idcard=(.*)]', line)
                idcard_str = search_obj.group(3)
                idcard_list = ast.literal_eval(idcard_str)
                content_list = []
                for idcard_dict in idcard_list:
                    nation = idcard_dict.get('民族')
                    if nation is None:
                        continue
                    if idcard_dict.get('类别') == '1':
                        continue
                    content_list.append((idcard_dict.get('公民身份号码'), nation))
                if len(content_list) == 0:
                    continue

                time_str = search_obj.group(1)
                task_str = search_obj.group(2)
                business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
                doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
                application_id = doc_class.objects.filter(id=int(doc_id_str)).values_list('application_id', flat=True)

                if business_type == consts.HIL_PREFIX:
                    for id_num, nation in content_list:
                        ws_hil.append((application_id[0], id_num, nation, time_str))
                else:
                    for id_num, nation in content_list:
                        ws_afc.append((application_id[0], id_num, nation, time_str))

        afc_excel_path = os.path.join(afc_excel_dir, 'idcard_{0}.xlsx'.format(date_str))
        hil_excel_path = os.path.join(hil_excel_dir, 'idcard_{0}.xlsx'.format(date_str))
        wb_afc.save(afc_excel_path)
        wb_hil.save(hil_excel_path)
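One openpyxl detail this command relies on: Workbook() always starts with a default sheet named 'Sheet', so after appending to the named sheet the default has to be removed, or the saved file carries an empty tab. A minimal sketch; the output path is illustrative, and wb['Sheet'] is the non-deprecated spelling of get_sheet_by_name('Sheet'):

from openpyxl import Workbook

wb = Workbook()                 # starts with one sheet named 'Sheet'
ws = wb.create_sheet('身份证')  # the sheet the command actually writes to
ws.append(('申请号', '身份证号', '民族', '时间戳'))
wb.remove(wb['Sheet'])          # drop the unused default sheet
wb.save('idcard_example.xlsx')  # illustrative path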
import os
import datetime
from calendar import monthrange
from openpyxl import Workbook, load_workbook
from django.core.management import BaseCommand
from settings import conf
from common.mixins import LoggerMixin


class Command(BaseCommand, LoggerMixin):

    def __init__(self):
        super().__init__()
        self.dirs = ('AFC', 'HIL')

    def handle(self, *args, **kwargs):
        now_time = datetime.datetime.now()
        first_day_in_month = now_time.replace(day=1)
        pre_month = first_day_in_month - datetime.timedelta(days=1)

        for target_dir in self.dirs:
            excel_dir = os.path.join(conf.DATA_DIR, target_dir, 'IdCard')
            if not os.path.exists(excel_dir):
                print('excel dir not exists: {0}'.format(excel_dir))
                return

            monthly_wb = Workbook()

            for d in range(1, monthrange(pre_month.year, pre_month.month)[1] + 1):
                date_str = '{:04d}-{:02d}-{:02d}'.format(pre_month.year, pre_month.month, d)
                daily_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(date_str))
                if not os.path.exists(daily_excel_path):
                    print('daily excel path not exists: {0}'.format(daily_excel_path))
                    continue

                monthly_ws = monthly_wb.create_sheet(date_str)
                daily_wb = load_workbook(daily_excel_path)
                daily_ws = daily_wb.get_sheet_by_name('身份证')
                for row in daily_ws.iter_rows(min_row=1, values_only=True):
                    monthly_ws.append(row)

            monthly_excel_path = os.path.join(excel_dir, 'idcard_{0}.xlsx'.format(pre_month.strftime('%Y-%m')))
            monthly_wb.remove(monthly_wb.get_sheet_by_name('Sheet'))
            monthly_wb.save(monthly_excel_path)
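The previous-month arithmetic here is worth spelling out: snapping to the 1st of the current month and stepping back one day always lands in the prior month regardless of month length, and monthrange returns (weekday of day 1, number of days in the month). A short worked check with an assumed "today":

import datetime
from calendar import monthrange

now_time = datetime.datetime(2020, 3, 15)                # assumed "today" for the example
first_of_month = now_time.replace(day=1)                 # 2020-03-01
pre_month = first_of_month - datetime.timedelta(days=1)  # 2020-02-29, i.e. in the previous month
days = monthrange(pre_month.year, pre_month.month)[1]    # second element: days in that month
assert (pre_month.year, pre_month.month, days) == (2020, 2, 29)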
@@ -14,7 +14,6 @@ class Command(BaseCommand, LoggerMixin):
 
     def __init__(self):
         super().__init__()
-        self.log_base = '[license statistics]'
         self.header_map = {
             consts.MVI_CLASSIFY: [('申请ID', '发票代码', '发票号码', '开票日期', '不含税价', '发票联', '购买方名称',
                                    '购买方证件号码', '纳税人识别号', '车架号', '价税合计小写', '销货单位名称', '增值税税额',
@@ -75,7 +74,8 @@ class Command(BaseCommand, LoggerMixin):
             print('excel dir not exists')
             return
         excel_path = os.path.join(excel_dir, 'license_{0}.xlsx'.format(date_str))
-        log_path = os.path.join(conf.LOG_DIR, 'license.log.{0}'.format(date_str))
+        # log_path = os.path.join(conf.LOG_DIR, 'license.log.{0}'.format(date_str))
+        log_path = os.path.join(conf.LOG_DIR, 'license_statistics.log.{0}'.format(date_str))
         if not os.path.exists(log_path):
             print('log_path not exists')
             return
@@ -92,7 +92,8 @@ class Command(BaseCommand, LoggerMixin):
 
         with open(log_path, 'r', encoding='utf-8') as fp:
             for line in fp:
-                search_obj = re.search(r'task=(.*) license_summary=(.*)', line)
+                # search_obj = re.search(r'task=(.*) license_summary=(.*)', line)
+                search_obj = re.search(r'\[task=(.*)] \[license_summary=(.*)]', line)
                 task_str = search_obj.group(1)
                 license_summary = ast.literal_eval(search_obj.group(2))
                 business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
@@ -689,8 +689,15 @@ class Command(BaseCommand, LoggerMixin):
                                       '[license_summary={4}]'.format(self.log_base, task_str, bs_summary,
                                                                      unknown_summary, license_summary))
 
+                self.license_log.info('[task={0}] [license_summary={1}]'.format(task_str, license_summary))
+                idcard_list = license_summary.get(consts.IC_CLASSIFY)
+                if idcard_list:
+                    self.idcard_log.info('[task={0}] [idcard={1}]'.format(task_str, idcard_list))
+
                 merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
 
+                self.bs_log.info('[task={0}] [bs_summary={1}]'.format(task_str, merged_bs_summary))
+
                 self.cronjob_log.info('{0} [task={1}] [merged_bs_summary={2}] [unknown_summary={3}] '
                                       '[res_list={4}]'.format(self.log_base, task_str, merged_bs_summary,
                                                               unknown_summary, res_list))
@@ -40,6 +40,9 @@ class LoggerMixin:
     exception_log = logging.getLogger('exception')
     cronjob_log = logging.getLogger('cronjob')
     folder_log = logging.getLogger('folder')
+    bs_log = logging.getLogger('bs')
+    license_log = logging.getLogger('license')
+    idcard_log = logging.getLogger('idcard')
 
 
 class GenericView(LoggerMixin, GenericExceptionMixin, GenericAPIView):
@@ -84,6 +84,12 @@ class PDFHandler:
     def extract_single_image(self, pdf, xref, smask, colorspace, pno, img_index=0):
         pix = self.recover_pix(pdf, xref, smask, colorspace)
         ext, img_data = self.get_img_data(pix)
+        if ext == 'jpx':
+            img_save_path = self.get_img_save_path(pno, img_index=img_index, ext='jpeg')
+            jpx_pix = fitz.Pixmap(img_data)
+            jpx_pix.writeImage(img_save_path)
+            jpx_pix = None
+        else:
             img_save_path = self.get_img_save_path(pno, img_index=img_index, ext=ext)
             with open(img_save_path, "wb") as f:
                 f.write(img_data)
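Why this branch exists: JPEG2000 ('jpx') streams extracted from a PDF are awkward for downstream viewers, so instead of writing the raw bytes the handler decodes them into a PyMuPDF Pixmap and re-saves as JPEG. A sketch mirroring the diff's own calls; the input and output paths are illustrative, and the handler gets its bytes from the PDF's image xref rather than a file on disk:

import fitz  # PyMuPDF

with open('sample.jpx', 'rb') as f:   # illustrative source of JPEG2000 bytes
    img_data = f.read()

pix = fitz.Pixmap(img_data)           # decode the in-memory JPEG2000 stream
pix.writeImage('page_0_img_0.jpeg')   # re-encode as JPEG; newer PyMuPDF spells this pix.save()
pix = None                            # drop the reference, as the handler does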
@@ -1,11 +1,11 @@
 [loggers]
-keys=root, running, exception, cronjob, folder, django.db.backends
+keys=root, running, exception, cronjob, folder, bs, license, idcard, django.db.backends
 
 [handlers]
-keys=consoleHandler, django_rotateFileHandler, exceptionFileHandler, cronjobFileHandler, folderFileHandler, djangodbFileHandler
+keys=consoleHandler, django_rotateFileHandler, exceptionFileHandler, cronjobFileHandler, folderFileHandler, bsFileHandler, licenseFileHandler, idcardFileHandler, djangodbFileHandler
 
 [formatters]
-keys=SituFormatter, dataLogFormatter
+keys=SituFormatter, dataLogFormatter, SimpleFormatter
 
 [formatter_SituFormatter]
 format=[%(asctime)s] [%(process)d] [%(thread)d] [%(threadName)s] [%(filename)s:%(lineno)d] %(levelname)s %(message)s
@@ -15,6 +15,10 @@ datefmt=
 class=situlogger.JsonFormatter
 format=%(asctime)s %(levelname)s %(funcName)s
 
+[formatter_SimpleFormatter]
+format=[%(asctime)s] %(message)s
+datefmt=
+
 [handler_consoleHandler]
 class=StreamHandler
 level=ERROR
@@ -45,6 +49,24 @@ level=DEBUG
 formatter=SituFormatter
 args=('../logs/folder_ocr.log',)
 
+[handler_bsFileHandler]
+class=situlogger.SituRotatingFileHandler
+level=DEBUG
+formatter=SimpleFormatter
+args=('../logs/bs_statistics.log',)
+
+[handler_licenseFileHandler]
+class=situlogger.SituRotatingFileHandler
+level=DEBUG
+formatter=SimpleFormatter
+args=('../logs/license_statistics.log',)
+
+[handler_idcardFileHandler]
+class=situlogger.SituRotatingFileHandler
+level=DEBUG
+formatter=SimpleFormatter
+args=('../logs/idcard.log',)
+
 [handler_djangodbFileHandler]
 class=situlogger.SituRotatingFileHandler
 level=DEBUG
@@ -79,6 +101,24 @@ handlers=folderFileHandler
 qualname=folder
 propagate=0
 
+[logger_bs]
+level=INFO
+handlers=bsFileHandler
+qualname=bs
+propagate=0
+
+[logger_license]
+level=INFO
+handlers=licenseFileHandler
+qualname=license
+propagate=0
+
+[logger_idcard]
+level=INFO
+handlers=idcardFileHandler
+qualname=idcard
+propagate=0
+
 [logger_django.db.backends]
 level=DEBUG
 handlers=djangodbFileHandler
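How the pieces connect: LoggerMixin's logging.getLogger('bs') resolves through qualname=bs to bsFileHandler, whose SimpleFormatter produces exactly the '[timestamp] message' lines the statistics commands parse back out of bs_statistics.log. A minimal wiring sketch, assuming 'logging.conf' and the relative ../logs directory exist and situlogger is importable, as the existing handlers already require; 'AFC_1234' is an illustrative task id:

import logging
import logging.config

logging.config.fileConfig('logging.conf', disable_existing_loggers=False)

bs_log = logging.getLogger('bs')  # matches qualname=bs, handled by bsFileHandler
bs_log.info('[task={0}] [bs_summary={1}]'.format('AFC_1234', {}))
# SimpleFormatter writes: [<timestamp>] [task=AFC_1234] [bs_summary={}]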