bffd2595 by 周伟奇

add res sheet

1 parent 01efbccb
...@@ -72,8 +72,12 @@ TRANS_MAP = { ...@@ -72,8 +72,12 @@ TRANS_MAP = {
72 } 72 }
73 TRANS = str.maketrans(TRANS_MAP) 73 TRANS = str.maketrans(TRANS_MAP)
74 ERROR_CHARS = {'.', '。', ':', ':', '•', '·', ',', ','} 74 ERROR_CHARS = {'.', '。', ':', ':', '•', '·', ',', ','}
75 SKIP_IMG_SHEET_NAME = '未处理图片' 75 RES_SHEET_NAME = '结果统计'
76 SKIP_IMG_SHEET_HEADER = ('页码', '序号') 76 RES_SHEET_HEADER = ('页码', '序号', '结果')
77 RES_SUCCESS = '识别成功'
78 RES_SUCCESS_OTHER = '识别成功(其他类)'
79 RES_SUCCESS_EMPTY = '识别成功(空数据)'
80 RES_FAILED = '识别失败'
77 81
78 CARD_RATIO = 0.9 82 CARD_RATIO = 0.9
79 UNKNOWN_CARD = '未知卡号' 83 UNKNOWN_CARD = '未知卡号'
......
...@@ -93,18 +93,19 @@ class Command(BaseCommand, LoggerMixin): ...@@ -93,18 +93,19 @@ class Command(BaseCommand, LoggerMixin):
93 self.log_base, business_type, doc.id, pdf_path)) 93 self.log_base, business_type, doc.id, pdf_path))
94 return doc_data_path, excel_path, src_excel_path, pdf_path 94 return doc_data_path, excel_path, src_excel_path, pdf_path
95 95
96 def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img): 96 def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino):
97 sheets = ocr_data.get('data', []) 97 sheets = ocr_data.get('data', [])
98 if not sheets: 98 if not sheets:
99 skip_img.append(self.parse_img_path(img_path)) 99 res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
100 return 100 return
101 confidence = ocr_data.get('confidence', 1) 101 confidence = ocr_data.get('confidence', 1)
102 img_name, _ = os.path.splitext(os.path.basename(img_path)) 102 img_name = 'page_{0}_img_{1}'.format(pno, ino)
103 cells_exists = False
103 for i, sheet in enumerate(sheets): 104 for i, sheet in enumerate(sheets):
104 cells = sheet.get('cells') 105 cells = sheet.get('cells')
105 if not cells: 106 if not cells:
106 skip_img.append(self.parse_img_path(img_path))
107 continue 107 continue
108 cells_exists = True
108 sheet_name = '{0}_{1}'.format(img_name, i) 109 sheet_name = '{0}_{1}'.format(img_name, i)
109 ws = wb.create_sheet(sheet_name) 110 ws = wb.create_sheet(sheet_name)
110 for cell in cells: 111 for cell in cells:
...@@ -160,16 +161,23 @@ class Command(BaseCommand, LoggerMixin): ...@@ -160,16 +161,23 @@ class Command(BaseCommand, LoggerMixin):
160 if summary[6] is not None: 161 if summary[6] is not None:
161 ed_list.append(summary[6]) 162 ed_list.append(summary[6])
162 163
163 def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path): 164 if cells_exists:
165 res_list.append((pno, ino, consts.RES_SUCCESS))
166 else:
167 res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
168
169 def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino):
164 # 类别:'0'身份证, '1'居住证 170 # 类别:'0'身份证, '1'居住证
165 license_data = ocr_data.get('data', []) 171 license_data = ocr_data.get('data', [])
166 if not license_data: 172 if not license_data:
167 skip_img.append(self.parse_img_path(img_path)) 173 res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
168 return 174 return
175 res_list.append((pno, ino, consts.RES_SUCCESS))
169 license_summary.setdefault(classify, []).extend(license_data) 176 license_summary.setdefault(classify, []).extend(license_data)
170 177
171 def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path): 178 def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino):
172 if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET: 179 if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
180 res_list.append((pno, ino, consts.RES_SUCCESS))
173 if pid == consts.BC_PID: 181 if pid == consts.BC_PID:
174 # 银行卡 182 # 银行卡
175 # res_dict = {} 183 # res_dict = {}
...@@ -184,7 +192,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -184,7 +192,7 @@ class Command(BaseCommand, LoggerMixin):
184 res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '') 192 res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
185 license_summary.setdefault(classify, []).append(res_dict) 193 license_summary.setdefault(classify, []).append(res_dict)
186 else: 194 else:
187 skip_img.append(self.parse_img_path(img_path)) 195 res_list.append((pno, ino, consts.RES_FAILED))
188 196
189 @staticmethod 197 @staticmethod
190 async def fetch_ocr_1_result(url, json_data): 198 async def fetch_ocr_1_result(url, json_data):
...@@ -207,7 +215,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -207,7 +215,8 @@ class Command(BaseCommand, LoggerMixin):
207 if response.status == 200: 215 if response.status == 200:
208 return await response.json() 216 return await response.json()
209 217
210 async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): 218 async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list):
219 pno, ino = self.parse_img_path(img_path)
211 with open(img_path, 'rb') as f: 220 with open(img_path, 'rb') as f:
212 base64_data = base64.b64encode(f.read()) 221 base64_data = base64.b64encode(f.read())
213 # 获取解码后的base64值 222 # 获取解码后的base64值
...@@ -217,23 +226,26 @@ class Command(BaseCommand, LoggerMixin): ...@@ -217,23 +226,26 @@ class Command(BaseCommand, LoggerMixin):
217 } 226 }
218 ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1) 227 ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
219 if ocr_res_1 is None: 228 if ocr_res_1 is None:
220 skip_img.append(self.parse_img_path(img_path)) 229 res_list.append((pno, ino, consts.RES_FAILED))
221 raise Exception('ocr 1 error, img_path={0}'.format(img_path)) 230 self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path))
231 # raise Exception('ocr 1 error, img_path={0}'.format(img_path))
222 else: 232 else:
223 self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format( 233 self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format(
224 self.log_base, img_path, ocr_res_1)) 234 self.log_base, img_path, ocr_res_1))
225 235
226 if ocr_res_1.get('code') == 1: 236 if ocr_res_1.get('code') == 1:
227 ocr_data = ocr_res_1.get('data', {}) 237 ocr_data = ocr_res_1.get('data', {})
228 classify = ocr_data.get('classify') 238 classify = ocr_data.get('classify')
229 if classify is None: 239 if classify is None:
230 skip_img.append(self.parse_img_path(img_path)) 240 res_list.append((pno, ino, consts.RES_FAILED))
241 self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
242 self.log_base, img_path, ocr_res_1))
231 return 243 return
232 elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 244 elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
233 skip_img.append(self.parse_img_path(img_path)) 245 res_list.append((pno, ino, consts.RES_SUCCESS_OTHER))
234 return 246 return
235 elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1 247 elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
236 self.license1_process(ocr_data, license_summary, classify, skip_img, img_path) 248 self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino)
237 elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2 249 elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
238 pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify) 250 pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
239 json_data_2 = { 251 json_data_2 = {
...@@ -244,11 +256,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -244,11 +256,13 @@ class Command(BaseCommand, LoggerMixin):
244 } 256 }
245 ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2) 257 ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
246 if ocr_res_2 is None: 258 if ocr_res_2 is None:
247 raise Exception('ocr 2 error, img_path={0}'.format(img_path)) 259 res_list.append((pno, ino, consts.RES_FAILED))
260 self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path))
261 # raise Exception('ocr 2 error, img_path={0}'.format(img_path))
248 else: 262 else:
249 # 识别结果 263 # 识别结果
250 ocr_res_2 = json.loads(ocr_res_2) 264 ocr_res_2 = json.loads(ocr_res_2)
251 self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format( 265 self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format(
252 self.log_base, img_path, ocr_res_2)) 266 self.log_base, img_path, ocr_res_2))
253 if classify == consts.BC_CLASSIFY: 267 if classify == consts.BC_CLASSIFY:
254 name = '有' 268 name = '有'
...@@ -258,11 +272,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -258,11 +272,13 @@ class Command(BaseCommand, LoggerMixin):
258 card_name_res.get('data', {}).get('is_exists_name') == 0: 272 card_name_res.get('data', {}).get('is_exists_name') == 0:
259 name = '无' 273 name = '无'
260 ocr_res_2['Name'] = name 274 ocr_res_2['Name'] = name
261 self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path) 275 self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino)
262 else: # 流水处理 276 else: # 流水处理
263 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img) 277 self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino)
264 else: 278 else:
265 skip_img.append(self.parse_img_path(img_path)) 279 res_list.append((pno, ino, consts.RES_FAILED))
280 self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
281 self.log_base, img_path, ocr_res_1))
266 282
267 # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img): 283 # def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
268 # # # 流水 284 # # # 流水
...@@ -559,7 +575,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -559,7 +575,7 @@ class Command(BaseCommand, LoggerMixin):
559 bs_summary = {} 575 bs_summary = {}
560 license_summary = {} 576 license_summary = {}
561 unknown_summary = {} 577 unknown_summary = {}
562 skip_img = [] 578 res_list = []
563 interest_keyword = Keywords.objects.filter( 579 interest_keyword = Keywords.objects.filter(
564 type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True) 580 type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
565 salary_keyword = Keywords.objects.filter( 581 salary_keyword = Keywords.objects.filter(
...@@ -573,13 +589,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -573,13 +589,13 @@ class Command(BaseCommand, LoggerMixin):
573 589
574 # 4.1 获取OCR结果 590 # 4.1 获取OCR结果
575 loop = asyncio.get_event_loop() 591 loop = asyncio.get_event_loop()
576 tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) 592 tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
577 for img_path in pdf_handler.img_path_list] 593 for img_path in pdf_handler.img_path_list]
578 loop.run_until_complete(asyncio.wait(tasks)) 594 loop.run_until_complete(asyncio.wait(tasks))
579 # loop.close() 595 # loop.close()
580 596
581 # for img_path in pdf_handler.img_path_list: 597 # for img_path in pdf_handler.img_path_list:
582 # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img) 598 # self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
583 599
584 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] ' 600 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
585 '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary, 601 '[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
...@@ -588,14 +604,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -588,14 +604,14 @@ class Command(BaseCommand, LoggerMixin):
588 merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary) 604 merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
589 605
590 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] ' 606 self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
591 '[unknown_summary={4}] [skip_img={5}]'.format(self.log_base, business_type, 607 '[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type,
592 doc.id, merged_bs_summary, 608 doc.id, merged_bs_summary,
593 unknown_summary, skip_img)) 609 unknown_summary, res_list))
594 del unknown_summary 610 del unknown_summary
595 611
596 # 4.2 重构Excel文件 612 # 4.2 重构Excel文件
597 wb.save(src_excel_path) 613 wb.save(src_excel_path)
598 wb.rebuild(merged_bs_summary, license_summary, skip_img, doc.document_scheme) 614 wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
599 wb.save(excel_path) 615 wb.save(excel_path)
600 except EDMSException as e: 616 except EDMSException as e:
601 doc.status = DocStatus.PROCESS_FAILED.value 617 doc.status = DocStatus.PROCESS_FAILED.value
......
...@@ -502,19 +502,19 @@ class BSWorkbook(Workbook): ...@@ -502,19 +502,19 @@ class BSWorkbook(Workbook):
502 ws.append((write_field, license_dict.get(search_field, ''))) 502 ws.append((write_field, license_dict.get(search_field, '')))
503 ws.append((None, )) 503 ws.append((None, ))
504 504
505 def skip_img_sheet(self, skip_img): 505 def res_sheet(self, res_list):
506 if skip_img: 506 if res_list:
507 ws = self.create_sheet(consts.SKIP_IMG_SHEET_NAME) 507 ws = self.create_sheet(consts.RES_SHEET_NAME)
508 ws.append(consts.SKIP_IMG_SHEET_HEADER) 508 ws.append(consts.RES_SHEET_HEADER)
509 for img_tuple in skip_img: 509 for res_tuple in res_list:
510 ws.append(img_tuple) 510 ws.append(res_tuple)
511 511
512 def remove_base_sheet(self): 512 def remove_base_sheet(self):
513 if len(self.sheetnames) > 1: 513 if len(self.sheetnames) > 1:
514 self.remove(self.get_sheet_by_name('Sheet')) 514 self.remove(self.get_sheet_by_name('Sheet'))
515 515
516 def rebuild(self, bs_summary, license_summary, skip_img, document_scheme): 516 def rebuild(self, bs_summary, license_summary, res_list, document_scheme):
517 self.bs_rebuild(bs_summary) 517 self.bs_rebuild(bs_summary)
518 self.license_rebuild(license_summary, document_scheme) 518 self.license_rebuild(license_summary, document_scheme)
519 self.skip_img_sheet(skip_img) 519 self.res_sheet(res_list)
520 self.remove_base_sheet() 520 self.remove_base_sheet()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!