add res sheet
Showing 3 changed files with 56 additions and 36 deletions
... | @@ -72,8 +72,12 @@ TRANS_MAP = { | ... | @@ -72,8 +72,12 @@ TRANS_MAP = { |
72 | } | 72 | } |
73 | TRANS = str.maketrans(TRANS_MAP) | 73 | TRANS = str.maketrans(TRANS_MAP) |
74 | ERROR_CHARS = {'.', '。', ':', ':', '•', '·', ',', ','} | 74 | ERROR_CHARS = {'.', '。', ':', ':', '•', '·', ',', ','} |
75 | SKIP_IMG_SHEET_NAME = '未处理图片' | 75 | RES_SHEET_NAME = '结果统计' |
76 | SKIP_IMG_SHEET_HEADER = ('页码', '序号') | 76 | RES_SHEET_HEADER = ('页码', '序号', '结果') |
77 | RES_SUCCESS = '识别成功' | ||
78 | RES_SUCCESS_OTHER = '识别成功(其他类)' | ||
79 | RES_SUCCESS_EMPTY = '识别成功(空数据)' | ||
80 | RES_FAILED = '识别失败' | ||
77 | 81 | ||
78 | CARD_RATIO = 0.9 | 82 | CARD_RATIO = 0.9 |
79 | UNKNOWN_CARD = '未知卡号' | 83 | UNKNOWN_CARD = '未知卡号' | ... | ... |
... | @@ -93,18 +93,19 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -93,18 +93,19 @@ class Command(BaseCommand, LoggerMixin): |
93 | self.log_base, business_type, doc.id, pdf_path)) | 93 | self.log_base, business_type, doc.id, pdf_path)) |
94 | return doc_data_path, excel_path, src_excel_path, pdf_path | 94 | return doc_data_path, excel_path, src_excel_path, pdf_path |
95 | 95 | ||
96 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img): | 96 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino): |
97 | sheets = ocr_data.get('data', []) | 97 | sheets = ocr_data.get('data', []) |
98 | if not sheets: | 98 | if not sheets: |
99 | skip_img.append(self.parse_img_path(img_path)) | 99 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) |
100 | return | 100 | return |
101 | confidence = ocr_data.get('confidence', 1) | 101 | confidence = ocr_data.get('confidence', 1) |
102 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | 102 | img_name = 'page_{0}_img_{1}'.format(pno, ino) |
103 | cells_exists = False | ||
103 | for i, sheet in enumerate(sheets): | 104 | for i, sheet in enumerate(sheets): |
104 | cells = sheet.get('cells') | 105 | cells = sheet.get('cells') |
105 | if not cells: | 106 | if not cells: |
106 | skip_img.append(self.parse_img_path(img_path)) | ||
107 | continue | 107 | continue |
108 | cells_exists = True | ||
108 | sheet_name = '{0}_{1}'.format(img_name, i) | 109 | sheet_name = '{0}_{1}'.format(img_name, i) |
109 | ws = wb.create_sheet(sheet_name) | 110 | ws = wb.create_sheet(sheet_name) |
110 | for cell in cells: | 111 | for cell in cells: |
... | @@ -160,16 +161,23 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -160,16 +161,23 @@ class Command(BaseCommand, LoggerMixin): |
160 | if summary[6] is not None: | 161 | if summary[6] is not None: |
161 | ed_list.append(summary[6]) | 162 | ed_list.append(summary[6]) |
162 | 163 | ||
163 | def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path): | 164 | if cells_exists: |
165 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
166 | else: | ||
167 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) | ||
168 | |||
169 | def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino): | ||
164 | # 类别:'0'身份证, '1'居住证 | 170 | # 类别:'0'身份证, '1'居住证 |
165 | license_data = ocr_data.get('data', []) | 171 | license_data = ocr_data.get('data', []) |
166 | if not license_data: | 172 | if not license_data: |
167 | skip_img.append(self.parse_img_path(img_path)) | 173 | res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY)) |
168 | return | 174 | return |
175 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
169 | license_summary.setdefault(classify, []).extend(license_data) | 176 | license_summary.setdefault(classify, []).extend(license_data) |
170 | 177 | ||
171 | def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path): | 178 | def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino): |
172 | if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: | 179 | if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: |
180 | res_list.append((pno, ino, consts.RES_SUCCESS)) | ||
173 | if pid == consts.BC_PID: | 181 | if pid == consts.BC_PID: |
174 | # 银行卡 | 182 | # 银行卡 |
175 | # res_dict = {} | 183 | # res_dict = {} |
... | @@ -184,7 +192,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -184,7 +192,7 @@ class Command(BaseCommand, LoggerMixin): |
184 | res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '') | 192 | res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '') |
185 | license_summary.setdefault(classify, []).append(res_dict) | 193 | license_summary.setdefault(classify, []).append(res_dict) |
186 | else: | 194 | else: |
187 | skip_img.append(self.parse_img_path(img_path)) | 195 | res_list.append((pno, ino, consts.RES_FAILED)) |
188 | 196 | ||
189 | @staticmethod | 197 | @staticmethod |
190 | async def fetch_ocr_1_result(url, json_data): | 198 | async def fetch_ocr_1_result(url, json_data): |
... | @@ -207,7 +215,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -207,7 +215,8 @@ class Command(BaseCommand, LoggerMixin): |
207 | if response.status == 200: | 215 | if response.status == 200: |
208 | return await response.json() | 216 | return await response.json() |
209 | 217 | ||
210 | async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): | 218 | async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list): |
219 | pno, ino = self.parse_img_path(img_path) | ||
211 | with open(img_path, 'rb') as f: | 220 | with open(img_path, 'rb') as f: |
212 | base64_data = base64.b64encode(f.read()) | 221 | base64_data = base64.b64encode(f.read()) |
213 | # 获取解码后的base64值 | 222 | # 获取解码后的base64值 |
... | @@ -217,23 +226,26 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -217,23 +226,26 @@ class Command(BaseCommand, LoggerMixin): |
217 | } | 226 | } |
218 | ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1) | 227 | ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1) |
219 | if ocr_res_1 is None: | 228 | if ocr_res_1 is None: |
220 | skip_img.append(self.parse_img_path(img_path)) | 229 | res_list.append((pno, ino, consts.RES_FAILED)) |
221 | raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | 230 | self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path)) |
231 | # raise Exception('ocr 1 error, img_path={0}'.format(img_path)) | ||
222 | else: | 232 | else: |
223 | self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( | 233 | self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format( |
224 | self.log_base, img_path, ocr_res_1)) | 234 | self.log_base, img_path, ocr_res_1)) |
225 | 235 | ||
226 | if ocr_res_1.get('code') == 1: | 236 | if ocr_res_1.get('code') == 1: |
227 | ocr_data = ocr_res_1.get('data', {}) | 237 | ocr_data = ocr_res_1.get('data', {}) |
228 | classify = ocr_data.get('classify') | 238 | classify = ocr_data.get('classify') |
229 | if classify is None: | 239 | if classify is None: |
230 | skip_img.append(self.parse_img_path(img_path)) | 240 | res_list.append((pno, ino, consts.RES_FAILED)) |
241 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format( | ||
242 | self.log_base, img_path, ocr_res_1)) | ||
231 | return | 243 | return |
232 | elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 | 244 | elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 |
233 | skip_img.append(self.parse_img_path(img_path)) | 245 | res_list.append((pno, ino, consts.RES_SUCCESS_OTHER)) |
234 | return | 246 | return |
235 | elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 | 247 | elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 |
236 | self.license1_process(ocr_data, license_summary, classify, skip_img, img_path) | 248 | self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino) |
237 | elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 | 249 | elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 |
238 | pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) | 250 | pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) |
239 | json_data_2 = { | 251 | json_data_2 = { |
... | @@ -244,11 +256,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -244,11 +256,13 @@ class Command(BaseCommand, LoggerMixin): |
244 | } | 256 | } |
245 | ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2) | 257 | ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2) |
246 | if ocr_res_2 is None: | 258 | if ocr_res_2 is None: |
247 | raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | 259 | res_list.append((pno, ino, consts.RES_FAILED)) |
260 | self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path)) | ||
261 | # raise Exception('ocr 2 error, img_path={0}'.format(img_path)) | ||
248 | else: | 262 | else: |
249 | # 识别结果 | 263 | # 识别结果 |
250 | ocr_res_2 = json.loads(ocr_res_2) | 264 | ocr_res_2 = json.loads(ocr_res_2) |
251 | self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( | 265 | self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format( |
252 | self.log_base, img_path, ocr_res_2)) | 266 | self.log_base, img_path, ocr_res_2)) |
253 | if classify == consts.BC_CLASSIFY: | 267 | if classify == consts.BC_CLASSIFY: |
254 | name = '有' | 268 | name = '有' |
... | @@ -258,11 +272,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -258,11 +272,13 @@ class Command(BaseCommand, LoggerMixin): |
258 | card_name_res.get('data', {}).get('is_exists_name') == 0: | 272 | card_name_res.get('data', {}).get('is_exists_name') == 0: |
259 | name = '无' | 273 | name = '无' |
260 | ocr_res_2['Name'] = name | 274 | ocr_res_2['Name'] = name |
261 | self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) | 275 | self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino) |
262 | else: # 流水处理 | 276 | else: # 流水处理 |
263 | self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) | 277 | self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino) |
264 | else: | 278 | else: |
265 | skip_img.append(self.parse_img_path(img_path)) | 279 | res_list.append((pno, ino, consts.RES_FAILED)) |
280 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format( | ||
281 | self.log_base, img_path, ocr_res_1)) | ||
266 | 282 | ||
267 | # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): | 283 | # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): |
268 | # # # 流水 | 284 | # # # 流水 |
... | @@ -559,7 +575,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -559,7 +575,7 @@ class Command(BaseCommand, LoggerMixin): |
559 | bs_summary = {} | 575 | bs_summary = {} |
560 | license_summary = {} | 576 | license_summary = {} |
561 | unknown_summary = {} | 577 | unknown_summary = {} |
562 | skip_img = [] | 578 | res_list = [] |
563 | interest_keyword = Keywords.objects.filter( | 579 | interest_keyword = Keywords.objects.filter( |
564 | type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True) | 580 | type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True) |
565 | salary_keyword = Keywords.objects.filter( | 581 | salary_keyword = Keywords.objects.filter( |
... | @@ -573,13 +589,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -573,13 +589,13 @@ class Command(BaseCommand, LoggerMixin): |
573 | 589 | ||
574 | # 4.1 获取OCR结果 | 590 | # 4.1 获取OCR结果 |
575 | loop = asyncio.get_event_loop() | 591 | loop = asyncio.get_event_loop() |
576 | tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) | 592 | tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list) |
577 | for img_path in pdf_handler.img_path_list] | 593 | for img_path in pdf_handler.img_path_list] |
578 | loop.run_until_complete(asyncio.wait(tasks)) | 594 | loop.run_until_complete(asyncio.wait(tasks)) |
579 | # loop.close() | 595 | # loop.close() |
580 | 596 | ||
581 | # for img_path in pdf_handler.img_path_list: | 597 | # for img_path in pdf_handler.img_path_list: |
582 | # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) | 598 | # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list) |
583 | 599 | ||
584 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' | 600 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' |
585 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, | 601 | '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, |
... | @@ -588,14 +604,14 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -588,14 +604,14 @@ class Command(BaseCommand, LoggerMixin): |
588 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) | 604 | merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) |
589 | 605 | ||
590 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] ' | 606 | self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] ' |
591 | '[unknown_summary={4}] [skip_img={5}]'.format(self.log_base, business_type, | 607 | '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type, |
592 | doc.id, merged_bs_summary, | 608 | doc.id, merged_bs_summary, |
593 | unknown_summary, skip_img)) | 609 | unknown_summary, res_list)) |
594 | del unknown_summary | 610 | del unknown_summary |
595 | 611 | ||
596 | # 4.2 重构Excel文件 | 612 | # 4.2 重构Excel文件 |
597 | wb.save(src_excel_path) | 613 | wb.save(src_excel_path) |
598 | wb.rebuild(merged_bs_summary, license_summary, skip_img, doc.document_scheme) | 614 | wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme) |
599 | wb.save(excel_path) | 615 | wb.save(excel_path) |
600 | except EDMSException as e: | 616 | except EDMSException as e: |
601 | doc.status = DocStatus.PROCESS_FAILED.value | 617 | doc.status = DocStatus.PROCESS_FAILED.value | ... | ... |
... | @@ -502,19 +502,19 @@ class BSWorkbook(Workbook): | ... | @@ -502,19 +502,19 @@ class BSWorkbook(Workbook): |
502 | ws.append((write_field, license_dict.get(search_field, ''))) | 502 | ws.append((write_field, license_dict.get(search_field, ''))) |
503 | ws.append((None, )) | 503 | ws.append((None, )) |
504 | 504 | ||
505 | def skip_img_sheet(self, skip_img): | 505 | def res_sheet(self, res_list): |
506 | if skip_img: | 506 | if res_list: |
507 | ws = self.create_sheet(consts.SKIP_IMG_SHEET_NAME) | 507 | ws = self.create_sheet(consts.RES_SHEET_NAME) |
508 | ws.append(consts.SKIP_IMG_SHEET_HEADER) | 508 | ws.append(consts.RES_SHEET_HEADER) |
509 | for img_tuple in skip_img: | 509 | for res_tuple in res_list: |
510 | ws.append(img_tuple) | 510 | ws.append(res_tuple) |
511 | 511 | ||
512 | def remove_base_sheet(self): | 512 | def remove_base_sheet(self): |
513 | if len(self.sheetnames) > 1: | 513 | if len(self.sheetnames) > 1: |
514 | self.remove(self.get_sheet_by_name('Sheet')) | 514 | self.remove(self.get_sheet_by_name('Sheet')) |
515 | 515 | ||
516 | def rebuild(self, bs_summary, license_summary, skip_img, document_scheme): | 516 | def rebuild(self, bs_summary, license_summary, res_list, document_scheme): |
517 | self.bs_rebuild(bs_summary) | 517 | self.bs_rebuild(bs_summary) |
518 | self.license_rebuild(license_summary, document_scheme) | 518 | self.license_rebuild(license_summary, document_scheme) |
519 | self.skip_img_sheet(skip_img) | 519 | self.res_sheet(res_list) |
520 | self.remove_base_sheet() | 520 | self.remove_base_sheet() | ... | ... |
-
Please register or sign in to post a comment