add res sheet
Showing 3 changed files with 56 additions and 36 deletions
| ... | @@ -72,8 +72,12 @@ TRANS_MAP = { | ... | @@ -72,8 +72,12 @@ TRANS_MAP = { |
| 72 | } | 72 | } |
| 73 | TRANS = str.maketrans(TRANS_MAP) | 73 | TRANS = str.maketrans(TRANS_MAP) |
| 74 | ERROR_CHARS = {'.', '。', ':', ':', '•', '·', ',', ','} | 74 | ERROR_CHARS = {'.', '。', ':', ':', '•', '·', ',', ','} |
| 75 | SKIP_IMG_SHEET_NAME = '未处理图片' | 75 | RES_SHEET_NAME = '结果统计' |
| 76 | SKIP_IMG_SHEET_HEADER = ('页码', '序号') | 76 | RES_SHEET_HEADER = ('页码', '序号', '结果') |
| 77 | RES_SUCCESS = '识别成功' | ||
| 78 | RES_SUCCESS_OTHER = '识别成功(其他类)' | ||
| 79 | RES_SUCCESS_EMPTY = '识别成功(空数据)' | ||
| 80 | RES_FAILED = '识别失败' | ||
| 77 | 81 | ||
| 78 | CARD_RATIO = 0.9 | 82 | CARD_RATIO = 0.9 |
| 79 | UNKNOWN_CARD = '未知卡号' | 83 | UNKNOWN_CARD = '未知卡号' | ... | ... |
| ... | @@ -93,18 +93,19 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -93,18 +93,19 @@ class Command(BaseCommand, LoggerMixin): |
| 93 | self.log_base, business_type, doc.id, pdf_path)) | 93 | self.log_base, business_type, doc.id, pdf_path)) |
| 94 | return doc_data_path, excel_path, src_excel_path, pdf_path | 94 | return doc_data_path, excel_path, src_excel_path, pdf_path |
| 95 | 95 | ||
| 96 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img): | 96 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino): |
| 97 | sheets = ocr_data.get('data', []) | 97 | sheets = ocr_data.get('data', []) |
| 98 | if not sheets: | 98 | if not sheets: |
| 99 | skip_img.append(self.parse_img_path(img_path)) | 99 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) |
| 100 | return | 100 | return |
| 101 | confidence = ocr_data.get('confidence', 1) | 101 | confidence = ocr_data.get('confidence', 1) |
| 102 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | 102 | img_name = 'page_{0}_img_{1}'.format(pno, ino) |
| 103 | cells_exists = False | ||
| 103 | for i, sheet in enumerate(sheets): | 104 | for i, sheet in enumerate(sheets): |
| 104 | cells = sheet.get('cells') | 105 | cells = sheet.get('cells') |
| 105 | if not cells: | 106 | if not cells: |
| 106 | skip_img.append(self.parse_img_path(img_path)) | ||
| 107 | continue | 107 | continue |
| 108 | cells_exists = True | ||
| 108 | sheet_name = '{0}_{1}'.format(img_name, i) | 109 | sheet_name = '{0}_{1}'.format(img_name, i) |
| 109 | ws = wb.create_sheet(sheet_name) | 110 | ws = wb.create_sheet(sheet_name) |
| 110 | for cell in cells: | 111 | for cell in cells: |
| ... | @@ -160,16 +161,23 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -160,16 +161,23 @@ class Command(BaseCommand, LoggerMixin): |
| 160 | if summary[6] is not None: | 161 | if summary[6] is not None: |
| 161 | ed_list.append(summary[6]) | 162 | ed_list.append(summary[6]) |
| 162 | 163 | ||
| 163 | def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path): | 164 | if cells_exists: |
| 165 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
| 166 | else: | ||
| 167 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | ||
| 168 | |||
| 169 | def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino): | ||
| 164 | # 类别:'0'身份证, '1'居住证 | 170 | # 类别:'0'身份证, '1'居住证 |
| 165 | license_data = ocr_data.get('data', []) | 171 | license_data = ocr_data.get('data', []) |
| 166 | if not license_data: | 172 | if not license_data: |
| 167 | skip_img.append(self.parse_img_path(img_path)) | 173 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) |
| 168 | return | 174 | return |
| 175 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
| 169 | license_summary.setdefault(classify, []).extend(license_data) | 176 | license_summary.setdefault(classify, []).extend(license_data) |
| 170 | 177 | ||
| 171 | def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path): | 178 | def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino): |
| 172 | if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: | 179 | if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: |
| 180 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
| 173 | if pid == consts.BC_PID: | 181 | if pid == consts.BC_PID: |
| 174 | # 银行卡 | 182 | # 银行卡 |
| 175 | # res_dict = {} | 183 | # res_dict = {} |
| ... | @@ -184,7 +192,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -184,7 +192,7 @@ class Command(BaseCommand, LoggerMixin): |
| 184 | res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '') | 192 | res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '') |
| 185 | license_summary.setdefault(classify, []).append(res_dict) | 193 | license_summary.setdefault(classify, []).append(res_dict) |
| 186 | else: | 194 | else: |
| 187 | skip_img.append(self.parse_img_path(img_path)) | 195 | res_list.append((pno, ino, consts.RES_FAILED)) |
| 188 | 196 | ||
| 189 | @staticmethod | 197 | @staticmethod |
| 190 | async def fetch_ocr_1_result(url, json_data): | 198 | async def fetch_ocr_1_result(url, json_data): |
| ... | @@ -207,7 +215,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -207,7 +215,8 @@ class Command(BaseCommand, LoggerMixin): |
| 207 | if response.status == 200: | 215 | if response.status == 200: |
| 208 | return await response.json() | 216 | return await response.json() |
| 209 | 217 | ||
| 210 | async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): | 218 | async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list): |
| 219 | pno, ino = self.parse_img_path(img_path) | ||
| 211 | with open(img_path, 'rb') as f: | 220 | with open(img_path, 'rb') as f: |
| 212 | base64_data = base64.b64encode(f.read()) | 221 | base64_data = base64.b64encode(f.read()) |
| 213 | # 获取解码后的base64值 | 222 | # 获取解码后的base64值 |
| ... | @@ -217,23 +226,26 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -217,23 +226,26 @@ class Command(BaseCommand, LoggerMixin): |
| 217 | } | 226 | } |
| 218 | ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1) | 227 | ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1) |
| 219 | if ocr_res_1 is None: | 228 | if ocr_res_1 is None: |
| 220 | skip_img.append(self.parse_img_path(img_path)) | 229 | res_list.append((pno, ino, consts.RES_FAILED)) |
| 221 | raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | 230 | self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path)) |
| 231 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
| 222 | else: | 232 | else: |
| 223 | self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | 233 | self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format( |
| 224 | self.log_base, img_path, ocr_res_1)) | 234 | self.log_base, img_path, ocr_res_1)) |
| 225 | 235 | ||
| 226 | if ocr_res_1.get('code') == 1: | 236 | if ocr_res_1.get('code') == 1: |
| 227 | ocr_data = ocr_res_1.get('data', {}) | 237 | ocr_data = ocr_res_1.get('data', {}) |
| 228 | classify = ocr_data.get('classify') | 238 | classify = ocr_data.get('classify') |
| 229 | if classify is None: | 239 | if classify is None: |
| 230 | skip_img.append(self.parse_img_path(img_path)) | 240 | res_list.append((pno, ino, consts.RES_FAILED)) |
| 241 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format( | ||
| 242 | self.log_base, img_path, ocr_res_1)) | ||
| 231 | return | 243 | return |
| 232 | elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 | 244 | elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 |
| 233 | skip_img.append(self.parse_img_path(img_path)) | 245 | res_list.append((pno, ino, consts.RES_SUCCESS_OTHER)) |
| 234 | return | 246 | return |
| 235 | elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 | 247 | elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 |
| 236 | self.license1_process(ocr_data, license_summary, classify, skip_img, img_path) | 248 | self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino) |
| 237 | elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 | 249 | elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 |
| 238 | pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) | 250 | pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) |
| 239 | json_data_2 = { | 251 | json_data_2 = { |
| ... | @@ -244,11 +256,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -244,11 +256,13 @@ class Command(BaseCommand, LoggerMixin): |
| 244 | } | 256 | } |
| 245 | ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2) | 257 | ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2) |
| 246 | if ocr_res_2 is None: | 258 | if ocr_res_2 is None: |
| 247 | raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | 259 | res_list.append((pno, ino, consts.RES_FAILED)) |
| 260 | self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path)) | ||
| 261 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
| 248 | else: | 262 | else: |
| 249 | # 识别结果 | 263 | # 识别结果 |
| 250 | ocr_res_2 = json.loads(ocr_res_2) | 264 | ocr_res_2 = json.loads(ocr_res_2) |
| 251 | self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | 265 | self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format( |
| 252 | self.log_base, img_path, ocr_res_2)) | 266 | self.log_base, img_path, ocr_res_2)) |
| 253 | if classify == consts.BC_CLASSIFY: | 267 | if classify == consts.BC_CLASSIFY: |
| 254 | name = '有' | 268 | name = '有' |
| ... | @@ -258,11 +272,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -258,11 +272,13 @@ class Command(BaseCommand, LoggerMixin): |
| 258 | card_name_res.get('data', {}).get('is_exists_name') == 0: | 272 | card_name_res.get('data', {}).get('is_exists_name') == 0: |
| 259 | name = '无' | 273 | name = '无' |
| 260 | ocr_res_2['Name'] = name | 274 | ocr_res_2['Name'] = name |
| 261 | self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) | 275 | self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino) |
| 262 | else: # 流水处理 | 276 | else: # 流水处理 |
| 263 | self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) | 277 | self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino) |
| 264 | else: | 278 | else: |
| 265 | skip_img.append(self.parse_img_path(img_path)) | 279 | res_list.append((pno, ino, consts.RES_FAILED)) |
| 280 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format( | ||
| 281 | self.log_base, img_path, ocr_res_1)) | ||
| 266 | 282 | ||
| 267 | # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): | 283 | # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): |
| 268 | # # # 流水 | 284 | # # # 流水 |
| ... | @@ -559,7 +575,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -559,7 +575,7 @@ class Command(BaseCommand, LoggerMixin): |
| 559 | bs_summary = {} | 575 | bs_summary = {} |
| 560 | license_summary = {} | 576 | license_summary = {} |
| 561 | unknown_summary = {} | 577 | unknown_summary = {} |
| 562 | skip_img = [] | 578 | res_list = [] |
| 563 | interest_keyword = Keywords.objects.filter( | 579 | interest_keyword = Keywords.objects.filter( |
| 564 | type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True) | 580 | type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True) |
| 565 | salary_keyword = Keywords.objects.filter( | 581 | salary_keyword = Keywords.objects.filter( |
| ... | @@ -573,13 +589,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -573,13 +589,13 @@ class Command(BaseCommand, LoggerMixin): |
| 573 | 589 | ||
| 574 | # 4.1 获取OCR结果 | 590 | # 4.1 获取OCR结果 |
| 575 | loop = asyncio.get_event_loop() | 591 | loop = asyncio.get_event_loop() |
| 576 | tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) | 592 | tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list) |
| 577 | for img_path in pdf_handler.img_path_list] | 593 | for img_path in pdf_handler.img_path_list] |
| 578 | loop.run_until_complete(asyncio.wait(tasks)) | 594 | loop.run_until_complete(asyncio.wait(tasks)) |
| 579 | # loop.close() | 595 | # loop.close() |
| 580 | 596 | ||
| 581 | # for img_path in pdf_handler.img_path_list: | 597 | # for img_path in pdf_handler.img_path_list: |
| 582 | # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) | 598 | # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list) |
| 583 | 599 | ||
| 584 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' | 600 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' |
| 585 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, | 601 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, |
| ... | @@ -588,14 +604,14 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -588,14 +604,14 @@ class Command(BaseCommand, LoggerMixin): |
| 588 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | 604 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) |
| 589 | 605 | ||
| 590 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] ' | 606 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] ' |
| 591 | '[unknown_summary={4}] [skip_img={5}]'.format(self.log_base, business_type, | 607 | '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type, |
| 592 | doc.id, merged_bs_summary, | 608 | doc.id, merged_bs_summary, |
| 593 | unknown_summary, skip_img)) | 609 | unknown_summary, res_list)) |
| 594 | del unknown_summary | 610 | del unknown_summary |
| 595 | 611 | ||
| 596 | # 4.2 重构Excel文件 | 612 | # 4.2 重构Excel文件 |
| 597 | wb.save(src_excel_path) | 613 | wb.save(src_excel_path) |
| 598 | wb.rebuild(merged_bs_summary, license_summary, skip_img, doc.document_scheme) | 614 | wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme) |
| 599 | wb.save(excel_path) | 615 | wb.save(excel_path) |
| 600 | except EDMSException as e: | 616 | except EDMSException as e: |
| 601 | doc.status = DocStatus.PROCESS_FAILED.value | 617 | doc.status = DocStatus.PROCESS_FAILED.value | ... | ... |
| ... | @@ -502,19 +502,19 @@ class BSWorkbook(Workbook): | ... | @@ -502,19 +502,19 @@ class BSWorkbook(Workbook): |
| 502 | ws.append((write_field, license_dict.get(search_field, ''))) | 502 | ws.append((write_field, license_dict.get(search_field, ''))) |
| 503 | ws.append((None, )) | 503 | ws.append((None, )) |
| 504 | 504 | ||
| 505 | def skip_img_sheet(self, skip_img): | 505 | def res_sheet(self, res_list): |
| 506 | if skip_img: | 506 | if res_list: |
| 507 | ws = self.create_sheet(consts.SKIP_IMG_SHEET_NAME) | 507 | ws = self.create_sheet(consts.RES_SHEET_NAME) |
| 508 | ws.append(consts.SKIP_IMG_SHEET_HEADER) | 508 | ws.append(consts.RES_SHEET_HEADER) |
| 509 | for img_tuple in skip_img: | 509 | for res_tuple in res_list: |
| 510 | ws.append(img_tuple) | 510 | ws.append(res_tuple) |
| 511 | 511 | ||
| 512 | def remove_base_sheet(self): | 512 | def remove_base_sheet(self): |
| 513 | if len(self.sheetnames) > 1: | 513 | if len(self.sheetnames) > 1: |
| 514 | self.remove(self.get_sheet_by_name('Sheet')) | 514 | self.remove(self.get_sheet_by_name('Sheet')) |
| 515 | 515 | ||
| 516 | def rebuild(self, bs_summary, license_summary, skip_img, document_scheme): | 516 | def rebuild(self, bs_summary, license_summary, res_list, document_scheme): |
| 517 | self.bs_rebuild(bs_summary) | 517 | self.bs_rebuild(bs_summary) |
| 518 | self.license_rebuild(license_summary, document_scheme) | 518 | self.license_rebuild(license_summary, document_scheme) |
| 519 | self.skip_img_sheet(skip_img) | 519 | self.res_sheet(res_list) |
| 520 | self.remove_base_sheet() | 520 | self.remove_base_sheet() | ... | ... |
-
Please register or sign in to post a comment