bffd2595 by 周伟奇

add res sheet

1 parent 01efbccb
......@@ -72,8 +72,12 @@ TRANS_MAP = {
}
TRANS = str.maketrans(TRANS_MAP)
ERROR_CHARS = {'.', '。', ':', ':', '•', '·', ',', ','}
SKIP_IMG_SHEET_NAME = '未处理图片'
SKIP_IMG_SHEET_HEADER = ('页码', '序号')
# Result-statistics sheet: sheet title, header row, and the outcome labels
# stored as the third element of each (page no, image no, result) tuple.
RES_SHEET_NAME = '结果统计'
RES_SHEET_HEADER = ('页码', '序号', '结果')
RES_SUCCESS = '识别成功'
RES_SUCCESS_OTHER = '识别成功(其他类)'
RES_SUCCESS_EMPTY = '识别成功(空数据)'
# Fixed typo: the label previously read '识别识别' instead of "recognition failed".
RES_FAILED = '识别失败'
CARD_RATIO = 0.9
UNKNOWN_CARD = '未知卡号'
......
......@@ -93,18 +93,19 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, business_type, doc.id, pdf_path))
return doc_data_path, excel_path, src_excel_path, pdf_path
def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img):
def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino):
sheets = ocr_data.get('data', [])
if not sheets:
skip_img.append(self.parse_img_path(img_path))
res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
return
confidence = ocr_data.get('confidence', 1)
img_name, _ = os.path.splitext(os.path.basename(img_path))
img_name = 'page_{0}_img_{1}'.format(pno, ino)
cells_exists = False
for i, sheet in enumerate(sheets):
cells = sheet.get('cells')
if not cells:
skip_img.append(self.parse_img_path(img_path))
continue
cells_exists = True
sheet_name = '{0}_{1}'.format(img_name, i)
ws = wb.create_sheet(sheet_name)
for cell in cells:
......@@ -160,16 +161,23 @@ class Command(BaseCommand, LoggerMixin):
if summary[6] is not None:
ed_list.append(summary[6])
def license1_process(self, ocr_data, license_summary, classify, skip_img, img_path):
if cells_exists:
res_list.append((pno, ino, consts.RES_SUCCESS))
else:
res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino):
# 类别:'0'身份证, '1'居住证
license_data = ocr_data.get('data', [])
if not license_data:
skip_img.append(self.parse_img_path(img_path))
res_list.append((pno, ino, consts.RES_SUCCESS_EMPTY))
return
res_list.append((pno, ino, consts.RES_SUCCESS))
license_summary.setdefault(classify, []).extend(license_data)
def license2_process(self, ocr_res_2, license_summary, pid, classify, skip_img, img_path):
def license2_process(self, ocr_res_2, license_summary, pid, classify, res_list, pno, ino):
if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
res_list.append((pno, ino, consts.RES_SUCCESS))
if pid == consts.BC_PID:
# 银行卡
# res_dict = {}
......@@ -184,7 +192,7 @@ class Command(BaseCommand, LoggerMixin):
res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
license_summary.setdefault(classify, []).append(res_dict)
else:
skip_img.append(self.parse_img_path(img_path))
res_list.append((pno, ino, consts.RES_FAILED))
@staticmethod
async def fetch_ocr_1_result(url, json_data):
......@@ -207,7 +215,8 @@ class Command(BaseCommand, LoggerMixin):
if response.status == 200:
return await response.json()
async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, res_list):
pno, ino = self.parse_img_path(img_path)
with open(img_path, 'rb') as f:
base64_data = base64.b64encode(f.read())
# 获取解码后的base64值
......@@ -217,23 +226,26 @@ class Command(BaseCommand, LoggerMixin):
}
ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
if ocr_res_1 is None:
skip_img.append(self.parse_img_path(img_path))
raise Exception('ocr 1 error, img_path={0}'.format(img_path))
res_list.append((pno, ino, consts.RES_FAILED))
self.cronjob_log.info('{0} [ocr_1 failed] [img={1}]'.format(self.log_base, img_path))
# raise Exception('ocr 1 error, img_path={0}'.format(img_path))
else:
self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
self.cronjob_log.info('{0} [ocr_1 success] [img={1}] [res={2}]'.format(
self.log_base, img_path, ocr_res_1))
if ocr_res_1.get('code') == 1:
ocr_data = ocr_res_1.get('data', {})
classify = ocr_data.get('classify')
if classify is None:
skip_img.append(self.parse_img_path(img_path))
res_list.append((pno, ino, consts.RES_FAILED))
self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
self.log_base, img_path, ocr_res_1))
return
elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
skip_img.append(self.parse_img_path(img_path))
res_list.append((pno, ino, consts.RES_SUCCESS_OTHER))
return
elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
self.license1_process(ocr_data, license_summary, classify, res_list, pno, ino)
elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
pid, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
json_data_2 = {
......@@ -244,11 +256,13 @@ class Command(BaseCommand, LoggerMixin):
}
ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
if ocr_res_2 is None:
raise Exception('ocr 2 error, img_path={0}'.format(img_path))
res_list.append((pno, ino, consts.RES_FAILED))
self.cronjob_log.info('{0} [ocr_2 failed] [img={1}]'.format(self.log_base, img_path))
# raise Exception('ocr 2 error, img_path={0}'.format(img_path))
else:
# 识别结果
ocr_res_2 = json.loads(ocr_res_2)
self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
self.cronjob_log.info('{0} [ocr_2 success] [img={1}] [res={2}]'.format(
self.log_base, img_path, ocr_res_2))
if classify == consts.BC_CLASSIFY:
name = '有'
......@@ -258,11 +272,13 @@ class Command(BaseCommand, LoggerMixin):
card_name_res.get('data', {}).get('is_exists_name') == 0:
name = '无'
ocr_res_2['Name'] = name
self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
self.license2_process(ocr_res_2, license_summary, pid, classify, res_list, pno, ino)
else: # 流水处理
self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
self.bs_process(wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino)
else:
skip_img.append(self.parse_img_path(img_path))
res_list.append((pno, ino, consts.RES_FAILED))
self.cronjob_log.info('{0} [ocr_1 res error] [img={1}] [res={2}]'.format(
self.log_base, img_path, ocr_res_1))
# def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
# # # 流水
......@@ -559,7 +575,7 @@ class Command(BaseCommand, LoggerMixin):
bs_summary = {}
license_summary = {}
unknown_summary = {}
skip_img = []
res_list = []
interest_keyword = Keywords.objects.filter(
type=KeywordsType.INTEREST.value, on_off=True).values_list('keyword', flat=True)
salary_keyword = Keywords.objects.filter(
......@@ -573,13 +589,13 @@ class Command(BaseCommand, LoggerMixin):
# 4.1 获取OCR结果
loop = asyncio.get_event_loop()
tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
for img_path in pdf_handler.img_path_list]
loop.run_until_complete(asyncio.wait(tasks))
# loop.close()
# for img_path in pdf_handler.img_path_list:
# self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
# self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, res_list)
self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
'[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
......@@ -588,14 +604,14 @@ class Command(BaseCommand, LoggerMixin):
merged_bs_summary = self.rebuild_bs_summary(bs_summary, unknown_summary)
self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [merged_bs_summary={3}] '
'[unknown_summary={4}] [skip_img={5}]'.format(self.log_base, business_type,
'[unknown_summary={4}] [res_list={5}]'.format(self.log_base, business_type,
doc.id, merged_bs_summary,
unknown_summary, skip_img))
unknown_summary, res_list))
del unknown_summary
# 4.2 重构Excel文件
wb.save(src_excel_path)
wb.rebuild(merged_bs_summary, license_summary, skip_img, doc.document_scheme)
wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
wb.save(excel_path)
except EDMSException as e:
doc.status = DocStatus.PROCESS_FAILED.value
......
......@@ -502,19 +502,19 @@ class BSWorkbook(Workbook):
ws.append((write_field, license_dict.get(search_field, '')))
ws.append((None, ))
def skip_img_sheet(self, skip_img):
if skip_img:
ws = self.create_sheet(consts.SKIP_IMG_SHEET_NAME)
ws.append(consts.SKIP_IMG_SHEET_HEADER)
for img_tuple in skip_img:
ws.append(img_tuple)
def res_sheet(self, res_list):
    """Write the collected per-image results to their own worksheet.

    Nothing is created when ``res_list`` is empty. Otherwise a sheet named
    ``consts.RES_SHEET_NAME`` is created, ``consts.RES_SHEET_HEADER`` is
    written as its first row, and every entry of ``res_list`` is appended
    as one row, in order.
    """
    if not res_list:
        return
    sheet = self.create_sheet(consts.RES_SHEET_NAME)
    sheet.append(consts.RES_SHEET_HEADER)
    for row in res_list:
        sheet.append(row)
def remove_base_sheet(self):
    """Remove the worksheet titled 'Sheet' once other sheets exist.

    The sheet is kept when it is the only one in the workbook. If no sheet
    is titled 'Sheet', the call is a no-op instead of raising ``KeyError``.
    """
    names = self.sheetnames
    if len(names) > 1 and 'Sheet' in names:
        # Index access replaces the deprecated Workbook.get_sheet_by_name().
        self.remove(self['Sheet'])
def rebuild(self, bs_summary, license_summary, skip_img, document_scheme):
def rebuild(self, bs_summary, license_summary, res_list, document_scheme):
self.bs_rebuild(bs_summary)
self.license_rebuild(license_summary, document_scheme)
self.skip_img_sheet(skip_img)
self.res_sheet(res_list)
self.remove_base_sheet()
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!