e570371a by 周伟奇

add edms retry

1 parent 04020101
......@@ -35,6 +35,8 @@ DEALER_CODE_META_FIELD_id = 13
BUSINESS_TYPE_META_FIELD_id = 93
DEALER_CODE = 'ocr_situ_group'
RETRY_TIMES = 3
# ---------银行流水模板相关--------------------------------------------------------------------------------------------
TRANS_MAP = {
......
class EDMSException(Exception):
pass
......@@ -19,6 +19,7 @@ from apps.doc.models import DocStatus, HILDoc, AFCDoc, Keywords
from apps.doc.named_enum import KeywordsType
from apps.doc import consts
from apps.doc.ocr.edms import EDMS, rh
from apps.doc.exceptions import EDMSException
class Command(BaseCommand, LoggerMixin):
......@@ -72,11 +73,21 @@ class Command(BaseCommand, LoggerMixin):
os.makedirs(doc_data_path, exist_ok=True)
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
self.edms.download(pdf_path, doc.metadata_version_id)
for times in range(consts.RETRY_TIMES):
try:
self.edms.download(pdf_path, doc.metadata_version_id)
except Exception as e:
self.cronjob_log.warn('{0} [edms download failed] [times={1}] [business_type={2}] [doc_id={3}] '
'[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
edms_exc = str(e)
else:
break
else:
raise EDMSException(edms_exc)
excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
self.cronjob_log.info('{0} [pdf download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
self.cronjob_log.info('{0} [edms download success] [business_type={1}] [doc_id={2}] [pdf_path={3}]'.format(
self.log_base, business_type, doc.id, pdf_path))
return doc_data_path, excel_path, src_excel_path, pdf_path
......@@ -177,105 +188,21 @@ class Command(BaseCommand, LoggerMixin):
else:
skip_img.append(self.parse_img_path(img_path))
# async def fetch_ocr_result(self, url, json_data):
# async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
# async with session.post(url, json=json_data) as response:
# if response.status == 200:
# return await response.json()
#
# async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary):
# with open(img_path, 'rb') as f:
# base64_data = base64.b64encode(f.read())
# # 获取解码后的base64值
# file_data = base64_data.decode()
# json_data_1 = {
# "file": file_data
# }
# ocr_res_1 = await self.fetch_ocr_result(self.ocr_url_1, json_data_1)
# if ocr_res_1 is None:
# raise Exception('ocr 1 error, img_path={0}'.format(img_path))
# else:
# self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_1))
#
# if ocr_res_1.get('code') == 1:
# ocr_data = ocr_res_1.get('data', {})
# classify = ocr_data.get('classify')
# if classify is None:
# return
# elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
# return
# elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
# self.license1_process(ocr_data, license_summary, classify)
# elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
# pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
# json_data_2 = {
# "pid": str(pid),
# "key": conf.OCR_KEY,
# "secret": conf.OCR_SECRET,
# "file": file_data
# }
# ocr_res_2 = await self.fetch_ocr_result(self.ocr_url_2, json_data_2)
# if ocr_res_2 is None:
# raise Exception('ocr 2 error, img_path={0}'.format(img_path))
# else:
# # 识别结果
# self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_2))
# self.license2_process(ocr_res_2, license_summary, pid, classify)
# else: # 流水处理
# self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify)
def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
# # 流水
# res = {
# 'code': 1,
# 'msg': 'success',
# 'data': {
# 'classify': 0,
# 'confidence': 0.999,
# 'data': [
# {
# 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# 'cells': []
# },
# {
# 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# 'cells': []
# }
# ]
# }
# }
#
# # 证件-1
# res = {
# 'code': 1,
# 'msg': 'success',
# 'data': {
# 'classify': 0,
# 'confidence': 0.999,
# 'data': [
# {
# 'cn_key': 'value',
# 'cn_key': 'value',
# },
# {
# 'cn_key': 'value',
# 'cn_key': 'value',
# },
# ]
# }
# }
#
# # 证件-2 or 其他类
# res = {
# 'code': 1,
# 'msg': 'success',
# 'data': {
# 'classify': 0,
# 'confidence': 0.999,
# }
# }
@staticmethod
async def fetch_ocr_1_result(url, json_data):
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
async with session.post(url, json=json_data) as response:
if response.status == 200:
return await response.json()
@staticmethod
async def fetch_ocr_2_result(url, json_data):
async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
async with session.post(url, data=json_data) as response:
if response.status == 200:
return await response.json()
async def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
with open(img_path, 'rb') as f:
base64_data = base64.b64encode(f.read())
# 获取解码后的base64值
......@@ -283,9 +210,10 @@ class Command(BaseCommand, LoggerMixin):
json_data_1 = {
"file": file_data
}
response_1 = requests.post(self.ocr_url_1, json=json_data_1)
if response_1.status_code == 200:
ocr_res_1 = response_1.json()
ocr_res_1 = await self.fetch_ocr_1_result(self.ocr_url_1, json_data_1)
if ocr_res_1 is None:
raise Exception('ocr 1 error, img_path={0}'.format(img_path))
else:
self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
self.log_base, img_path, ocr_res_1))
......@@ -308,26 +236,121 @@ class Command(BaseCommand, LoggerMixin):
"secret": conf.OCR_SECRET,
"file": file_data
}
response_2 = requests.post(self.ocr_url_2, data=json_data_2)
if response_2.status_code == 200:
ocr_res_2 = await self.fetch_ocr_2_result(self.ocr_url_2, json_data_2)
if ocr_res_2 is None:
raise Exception('ocr 2 error, img_path={0}'.format(img_path))
else:
# 识别结果
ocr_res_2 = response_2.json()
self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
self.log_base, img_path, ocr_res_2))
self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
else:
raise Exception('ocr 2 error, img_path={0}'.format(img_path))
else: # 流水处理
self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
else:
skip_img.append(self.parse_img_path(img_path))
else:
raise Exception('ocr 1 error, img_path={0}'.format(img_path))
# def img_2_ocr_2_wb(self, wb, img_path, bs_summary, unknown_summary, license_summary, skip_img):
# # # 流水
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # 'data': [
# # {
# # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# # 'cells': []
# # },
# # {
# # 'summary': ['户名', '卡号', '页码', '回单验证码', '打印时间', '起始时间', '终止时间'],
# # 'cells': []
# # }
# # ]
# # }
# # }
# #
# # # 证件-1
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # 'data': [
# # {
# # 'cn_key': 'value',
# # 'cn_key': 'value',
# # },
# # {
# # 'cn_key': 'value',
# # 'cn_key': 'value',
# # },
# # ]
# # }
# # }
# #
# # # 证件-2 or 其他类
# # res = {
# # 'code': 1,
# # 'msg': 'success',
# # 'data': {
# # 'classify': 0,
# # 'confidence': 0.999,
# # }
# # }
# with open(img_path, 'rb') as f:
# base64_data = base64.b64encode(f.read())
# # 获取解码后的base64值
# file_data = base64_data.decode()
# json_data_1 = {
# "file": file_data
# }
# response_1 = requests.post(self.ocr_url_1, json=json_data_1)
# if response_1.status_code == 200:
# ocr_res_1 = response_1.json()
# self.cronjob_log.info('{0} [ocr_1 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_1))
#
# if ocr_res_1.get('code') == 1:
# ocr_data = ocr_res_1.get('data', {})
# classify = ocr_data.get('classify')
# if classify is None:
# skip_img.append(self.parse_img_path(img_path))
# return
# elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
# skip_img.append(self.parse_img_path(img_path))
# return
# elif classify in consts.LICENSE_CLASSIFY_SET_1: # 证件1
# self.license1_process(ocr_data, license_summary, classify, skip_img, img_path)
# elif classify in consts.LICENSE_CLASSIFY_SET_2: # 证件2
# pid, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
# json_data_2 = {
# "pid": str(pid),
# "key": conf.OCR_KEY,
# "secret": conf.OCR_SECRET,
# "file": file_data
# }
# response_2 = requests.post(self.ocr_url_2, data=json_data_2)
# if response_2.status_code == 200:
# # 识别结果
# ocr_res_2 = response_2.json()
# self.cronjob_log.info('{0} [ocr_2 result] [img={1}] [res={2}]'.format(
# self.log_base, img_path, ocr_res_2))
# self.license2_process(ocr_res_2, license_summary, pid, classify, skip_img, img_path)
# else:
# raise Exception('ocr 2 error, img_path={0}'.format(img_path))
# else: # 流水处理
# self.bs_process(wb, ocr_data, bs_summary, unknown_summary, img_path, classify, skip_img)
# else:
# skip_img.append(self.parse_img_path(img_path))
# else:
# raise Exception('ocr 1 error, img_path={0}'.format(img_path))
@staticmethod
def parse_img_path(img_path):
img_name, _ = os.path.splitext(os.path.basename(img_path))
return img_name[5], img_name[11]
return int(img_name[5])+1, int(img_name[11])+1
@staticmethod
def get_most(value_list):
......@@ -520,14 +543,14 @@ class Command(BaseCommand, LoggerMixin):
# wb = Workbook()
# 4.1 获取OCR结果
# loop = asyncio.get_event_loop()
# tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary)
# for img_path in pdf_handler.img_path_list]
# loop.run_until_complete(asyncio.wait(tasks))
loop = asyncio.get_event_loop()
tasks = [self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
for img_path in pdf_handler.img_path_list]
loop.run_until_complete(asyncio.wait(tasks))
# loop.close()
for img_path in pdf_handler.img_path_list:
self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
# for img_path in pdf_handler.img_path_list:
# self.img_2_ocr_2_wb(wb, img_path, bs_summary, unknown_summary, license_summary, skip_img)
self.cronjob_log.info('{0} [business_type={1}] [doc_id={2}] [bs_summary={3}] [unknown_summary={4}] '
'[license_summary={5}]'.format(self.log_base, business_type, doc.id, bs_summary,
......@@ -545,23 +568,37 @@ class Command(BaseCommand, LoggerMixin):
wb.save(src_excel_path)
wb.rebuild(merged_bs_summary, license_summary, skip_img)
wb.save(excel_path)
except EDMSException as e:
self.cronjob_log.error('{0} [process failed (edms download)] [business_type={1}] [doc_id={2}] '
'[err={3}]'.format(self.log_base, business_type, doc.id, e))
except Exception as e:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.error('{0} [process failed] [business_type={1}] [doc_id={2}] [err={3}]'.format(
self.log_base, business_type, doc.id, e))
self.cronjob_log.error('{0} [process failed (program)] [business_type={1}] [doc_id={2}] '
'[err={3}]'.format(self.log_base, business_type, doc.id, e))
else:
try:
# 5.上传至EDMS
self.edms.upload(excel_path, doc, business_type)
# print('upload pass')
for times in range(consts.RETRY_TIMES):
try:
self.edms.upload(excel_path, doc, business_type)
except Exception as e:
self.cronjob_log.warn(
'{0} [edms upload failed] [times={1}] [business_type={2}] [doc_id={3}] '
'[error={4}]'.format(self.log_base, times, business_type, doc.id, e))
edms_exc = str(e)
else:
break
else:
raise EDMSException(edms_exc)
except Exception as e:
doc.status = DocStatus.UPLOAD_FAILED.value
doc.save()
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.error('{0} [upload failed] [business_type={1}] [doc_id={2}] [speed_time={3}] '
'[err={4}]'.format(self.log_base, business_type, doc.id, speed_time, e))
self.cronjob_log.error('{0} [process failed (edms upload)] [business_type={1}] [doc_id={2}] '
'[speed_time={3}] [err={4}]'.format(self.log_base, business_type, doc.id,
speed_time, e))
write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
else:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!