160ac57d by 冯轩

merge

2 parents 88f01673 d619642f
......@@ -10,4 +10,7 @@ urlpatterns = [
path(r'invoice/downloadExcel', views.InvoiceExcelView.as_view()),
path(r'invoice/queryInfo', views.InvoiceQueryInfoView.as_view()),
path(r'contract/v1', views.SEContractView.as_view()),
path(r'reocr', views.DocReOcrView.as_view()),
path(r'batch/reocr', views.BatchReOcrView.as_view()),
]
......
......@@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin):
if len(info_tuple) == 2:
business_type, doc_id_str = info_tuple
else:
business_type, doc_id_str, classify_1_str = info_tuple
business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
zip_doc = doc_class.objects.filter(id=doc_id).first()
......@@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin):
else:
self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format(
self.log_base, task_str))
return zip_doc, business_type
return zip_doc, business_type, re_ocr_flag
def get_doc_info(self, task_str, is_priority=False):
try:
......@@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin):
classify_1_str = '0'
rebuild_task_str = task_str
else:
business_type, doc_id_str, classify_1_str = info_tuple
business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str)
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
......@@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin):
else:
self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
self.log_base, task_str, is_priority))
return doc, business_type, rebuild_task_str, classify_1_str
return doc, business_type, rebuild_task_str, classify_1_str, re_ocr_flag
# def pdf_download(self, doc, pdf_path):
# if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
......@@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str))
# 2. 修改doc状态: 识别中
zip_doc, business_type = self.get_zip_doc_info(task_str)
zip_doc, business_type, re_ocr_flag = self.get_zip_doc_info(task_str)
if zip_doc is None:
time.sleep(self.sleep_time_doc_get)
continue
......@@ -1339,7 +1339,7 @@ class Command(BaseCommand, LoggerMixin):
try:
# 1. 从队列获取文件信息
doc, business_type, task_str, classify_1_str = self.get_doc_info(task_str, is_priority)
doc, business_type, task_str, classify_1_str, re_ocr_flag = self.get_doc_info(task_str, is_priority)
# 队列为空时的处理
if doc is None:
time.sleep(self.sleep_time_doc_get)
......@@ -1389,7 +1389,8 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
self.log_base, task_str, times))
start_time = time.time()
pdf_handler.extract_image(max_img_count)
max_img_count_or_none = None if re_ocr_flag == 'Y' else max_img_count
pdf_handler.extract_image(max_img_count_or_none)
end_time = time.time()
speed_time = int(end_time - start_time)
self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
......@@ -1407,7 +1408,7 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
self.log_base, task_str))
raise Exception('pdf img empty')
elif pdf_handler.img_count >= max_img_count:
elif re_ocr_flag == 'N' and pdf_handler.img_count >= max_img_count:
self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
self.log_base, task_str, pdf_handler.img_count))
......
......@@ -593,6 +593,16 @@ invoice_download_args = {
'application_ids': fields.Str(required=True),
}
# Request schema consumed by DocReOcrView via @use_args: re-OCR one document.
doc_reocr_args = {
    'doc_id': fields.Int(required=True),
    'application_entity': fields.Str(required=True),
}

# Request schema consumed by BatchReOcrView via @use_args: re-OCR in bulk.
batch_reocr_args = {
    'application_entity': fields.Str(required=True),
}
class UploadDocView(GenericView, DocHandler):
# permission_classes = []
# authentication_classes = []
......@@ -709,7 +719,7 @@ class UploadDocView(GenericView, DocHandler):
or document_name.endswith('.RAR'):
is_zip = True
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
enqueue_res = rh.enqueue([task], is_priority, is_zip)
self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
'[is_priority={3}] [enqueue_res={4}] [is_fsm={5} [classify_1={6}]]'.format(args, prefix, doc.id,
......@@ -1294,7 +1304,7 @@ class DocView(DocGenericView, DocHandler):
is_zip = True
# tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
enqueue_res = rh.enqueue([task], is_priority, is_zip)
self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
......@@ -1983,88 +1993,88 @@ class GoView(GenericView):
else:
return response.error_msg(msg='识别错误')
class InvoiceExcelView(GenericView):
    """Serve the invoice Excel workbook generated by the local Java invoice API.

    ``post`` validates the caller's role against the requested application
    entity, asks the Java API to build the workbook, then returns the file
    it points to as an ``.xlsx`` attachment.
    """
    #permission_classes = [IsAuthenticated]
    #authentication_classes = [OAuth2AuthenticationWithUser]

    # Download the invoice Excel file.
    @use_args(invoice_download_args, location='data')
    def post(self, request, args):
        """Return the invoice workbook for ``application_ids`` as a download.

        Args:
            request: incoming request; its ``HTTP_AUTHORIZATION`` header is
                used to look up the caller's role.
            args: parsed request data; ``application_ids`` and
                ``application_entity`` are read from it.

        Returns:
            An ``HttpResponse`` carrying the workbook bytes as an attachment.

        Raises:
            NoPermissionException: the role is missing, ``'-1'``, or does not
                match the requested entity (role ``'1'`` vs entity ``'2'`` and
                the reverse).
            Exception: any failure while calling the Java API or reading the
                file is logged and re-raised.
        """
        application_ids = args.get('application_ids')
        application_entity = args.get('application_entity')

        # NOTE(review): hard-coded placeholder value; looks like a debugging leftover.
        self.running_log.info('[InvoiceExcelView] [user_role={0}] '.format('111222333'))

        # Role does not match the requested entity: raise an exception.
        token = request.META.get("HTTP_AUTHORIZATION")
        # A request without an Authorization header has no role; previously
        # this crashed with TypeError on ``None[-11:]`` instead of being denied.
        user_role = rh.get_token(token[-11:]) if token is not None else None
        self.running_log.info('[InvoiceExcelView] [user_role={0}] '.format(user_role))
        if user_role is None or user_role == '-1' or (user_role == '1' and application_entity == '2') or (user_role == '2' and application_entity == '1'):
            self.running_log.info('[InvoiceExcelView no permission] [user_role={0}] [application_entity={1}]'.format(user_role, application_entity))
            raise NoPermissionException('no permission')

        url = 'http://127.0.0.1:8088/napi/invoice/downloadExcelOri'
        body = {
            'applicationIds': application_ids,
            'applicationEntity': application_entity
        }
        try:
            self.running_log.info("request java invoice excel api, url:{0}, body:{1}".format(url, json.dumps(body)))
            headers = {
                'Content-Type': 'application/json'
            }
            resp = requests.post(url, headers=headers, json=body)
            self.running_log.info("java invoice excel api finish, applicationIds:{0},{1}".format(application_ids, resp.text))
            res_json = json.loads(resp.text)
            # The Java API answers with the path of the generated workbook.
            file_path = res_json.get('result')
            self.running_log.info("java invoice excel after process, filePath:{0}".format(file_path))

            current_time = time.strftime('%Y-%m-%d_%H_%M_%S', time.localtime())
            download_file_name = "发票信息提取-" + current_time + ".xlsx"

            # Named ``excel_response`` so the module-level ``response`` helper
            # used by other views in this file is not shadowed.
            excel_response = HttpResponse(content_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
            excel_response['Content-Disposition'] = 'attachment; filename={0}'.format(escape_uri_path(download_file_name))
            excel_response['Access-Control-Expose-Headers'] = 'content-disposition'
            # ``with`` guarantees the handle is closed even if the read fails.
            with open(file_path, "rb") as f:
                excel_response.write(f.read())
            return excel_response
        except Exception:
            self.running_log.error("invoice excel request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
                url, json.dumps(body), traceback.format_exc()))
            # Re-raise: falling through would return None from the view and
            # hide the real failure behind a framework error.
            raise
class InvoiceQueryInfoView(GenericView):
    """Return invoice information fetched from the local Java invoice API.

    ``post`` validates the caller's role against the requested application
    entity and relays the ``result`` field of the Java API's JSON answer.
    """
    #permission_classes = [IsAuthenticated]
    #authentication_classes = [OAuth2AuthenticationWithUser]

    @use_args(invoice_download_args, location='data')
    def post(self, request, args):
        """Query invoice info for ``application_ids``.

        Args:
            request: incoming request; its ``HTTP_AUTHORIZATION`` header is
                used to look up the caller's role.
            args: parsed request data; ``application_ids`` and
                ``application_entity`` are read from it.

        Returns:
            ``response2.ok`` wrapping the Java API's ``result`` value.

        Raises:
            NoPermissionException: the role is missing, ``'-1'``, or does not
                match the requested entity.
            Exception: any failure while calling the Java API is logged and
                re-raised.
        """
        application_ids = args.get('application_ids')
        application_entity = args.get('application_entity')

        # NOTE(review): hard-coded placeholder value; looks like a debugging leftover.
        # Tag corrected: it was copy-pasted as '[InvoiceExcelView]'.
        self.running_log.info('[InvoiceQueryInfoView] [user_role={0}] '.format('111222333'))

        # Role does not match the requested entity: raise an exception.
        token = request.META.get("HTTP_AUTHORIZATION")
        # A request without an Authorization header has no role; previously
        # this crashed with TypeError on ``None[-11:]`` instead of being denied.
        user_role = rh.get_token(token[-11:]) if token is not None else None
        self.running_log.info('[InvoiceQueryInfoView] [user_role={0}] '.format(user_role))
        if user_role is None or user_role == '-1' or (user_role == '1' and application_entity == '2') or (user_role == '2' and application_entity == '1'):
            self.running_log.info('[InvoiceQueryInfoView no permission] [user_role={0}] [application_entity={1}]'.format(user_role, application_entity))
            raise NoPermissionException('no permission')

        url = 'http://127.0.0.1:8088/napi/invoice/queryInfoOri'
        body = {
            'applicationIds': application_ids,
            'applicationEntity': application_entity
        }
        try:
            self.running_log.info("request java invoice info api, url:{0}, body:{1}".format(url, json.dumps(body)))
            headers = {
                'Content-Type': 'application/json'
            }
            resp = requests.post(url, headers=headers, json=body)
            self.running_log.info("java invoice info api finish, applicationIds:{0},{1}".format(application_ids, resp.text))
            res_json = json.loads(resp.text)
            java_result = res_json.get('result')
            return response2.ok(data=java_result)
        except Exception:
            self.running_log.error("invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
                url, json.dumps(body), traceback.format_exc()))
            # Re-raise: falling through would return None from the view and
            # hide the real failure behind a framework error.
            raise
# class InvoiceExcelView(GenericView):
# #permission_classes = [IsAuthenticated]
# #authentication_classes = [OAuth2AuthenticationWithUser]
# # 下载发票excel
# @use_args(invoice_download_args, location='data')
# def post(self, request, args):
# application_ids = args.get('application_ids')
# application_entity = args.get('application_entity')
# self.running_log.info('[InvoiceExcelView] [user_role={0}] '.format('111222333'))
# # 角色权限不符,返回异常
# token = request.META.get("HTTP_AUTHORIZATION")
# user_role = rh.get_token(token[-11:])
# self.running_log.info('[InvoiceExcelView] [user_role={0}] '.format(user_role))
# if user_role is None or user_role == '-1' or (user_role == '1' and application_entity == '2') or (user_role == '2' and application_entity == '1'):
# self.running_log.info('[InvoiceExcelView no permission] [user_role={0}] [application_entity={1}]'.format(user_role, application_entity))
# raise NoPermissionException('no permission')
# url = 'http://127.0.0.1:8088/napi/invoice/downloadExcelOri'
# body = {
# 'applicationIds': application_ids,
# 'applicationEntity': application_entity
# }
# try:
# self.running_log.info("request java invoice excel api, url:{0}, body:{1}".format(url, json.dumps(body)))
# headers = {
# 'Content-Type': 'application/json'
# }
# resp = requests.post(url, headers=headers, json=body)
# self.running_log.info("java invoice excel api finish, applicationIds:{0},{1}".format(application_ids, resp.text))
# res_json = json.loads(resp.text)
# file_path = res_json.get('result')
# self.running_log.info("java invoice excel after process, filePath:{0}".format(file_path))
# current_time = time.strftime('%Y-%m-%d_%H_%M_%S', time.localtime())
# download_file_name = "发票信息提取-" + current_time + ".xlsx"
# f = open(file_path,"rb")
# response = HttpResponse(content_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
# response['Content-Disposition'] = 'attachment; filename={0}'.format(escape_uri_path(download_file_name))
# response['Access-Control-Expose-Headers'] = 'content-disposition'
# response.write(f.read())
# f.close()
# return response
# except Exception as e:
# self.running_log.error("invoice excel request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
# url, json.dumps(body), traceback.format_exc()))
# class InvoiceQueryInfoView(GenericView):
# #permission_classes = [IsAuthenticated]
# #authentication_classes = [OAuth2AuthenticationWithUser]
# @use_args(invoice_download_args, location='data')
# def post(self, request, args):
# application_ids = args.get('application_ids')
# application_entity = args.get('application_entity')
# self.running_log.info('[InvoiceExcelView] [user_role={0}] '.format('111222333'))
# # 角色权限不符,返回异常
# token = request.META.get("HTTP_AUTHORIZATION")
# user_role = rh.get_token(token[-11:])
# self.running_log.info('[InvoiceQueryInfoView] [user_role={0}] '.format(user_role))
# if user_role is None or user_role == '-1' or (user_role == '1' and application_entity == '2') or (user_role == '2' and application_entity == '1'):
# self.running_log.info('[InvoiceExcelView no permission] [user_role={0}] [application_entity={1}]'.format(user_role, application_entity))
# raise NoPermissionException('no permission')
# url = 'http://127.0.0.1:8088/napi/invoice/queryInfoOri'
# body = {
# 'applicationIds': application_ids,
# 'applicationEntity': application_entity
# }
# try:
# self.running_log.info("request java invoice info api, url:{0}, body:{1}".format(url, json.dumps(body)))
# headers = {
# 'Content-Type': 'application/json'
# }
# resp = requests.post(url, headers=headers, json=body)
# self.running_log.info("java invoice info api finish, applicationIds:{0},{1}".format(application_ids, resp.text))
# res_json = json.loads(resp.text)
# java_result = res_json.get('result')
# return response2.ok(data=java_result)
# except Exception as e:
# self.running_log.error("invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
# url, json.dumps(body), traceback.format_exc()))
def notifyCmsPass(self, request):
args = request.data
......@@ -2190,4 +2200,194 @@ class DownloadGBHistoryFileView(GenericView):
self.running_log.info('[DownloadGBHistoryFileView] [args={0}] '.format(args))
return response.ok(data=True)
except Exception as e:
return response.ok(data=False)
\ No newline at end of file
return response.ok(data=False)
class InvoiceExcelView(GenericView):
    """Serve the invoice Excel workbook generated by the local Java invoice API.

    ``post`` asks the Java API to build the workbook, then returns the file
    it points to as an ``.xlsx`` attachment.

    NOTE(review): SECURITY - both the DRF permission/authentication classes
    and the in-method role check are commented out, so unless ``GenericView``
    supplies defaults this endpoint performs no authentication or
    authorization. Confirm that this is intentional.
    """
    #permission_classes = [IsAuthenticated]
    #authentication_classes = [OAuth2AuthenticationWithUser]

    # Download the invoice Excel file.
    @use_args(invoice_download_args, location='data')
    def post(self, request, args):
        """Return the invoice workbook for ``application_ids`` as a download.

        Args:
            request: incoming request (currently unused because the role
                check is disabled).
            args: parsed request data; ``application_ids`` and
                ``application_entity`` are read from it.

        Returns:
            An ``HttpResponse`` carrying the workbook bytes as an attachment.

        Raises:
            Exception: any failure while calling the Java API or reading the
                file is logged and re-raised.
        """
        application_ids = args.get('application_ids')
        application_entity = args.get('application_entity')

        # NOTE(review): hard-coded placeholder value; looks like a debugging leftover.
        self.running_log.info('[InvoiceExcelView] [user_role={0}] '.format('111222333'))

        # Role does not match the requested entity: raise an exception.
        # (Check currently disabled - see the SECURITY note on the class.)
        #token = request.META.get("HTTP_AUTHORIZATION")
        #user_role = rh.get_token(token[-11:])
        #self.running_log.info('[InvoiceExcelView] [user_role={0}] '.format(user_role))
        #if user_role is None or user_role == '-1' or (user_role == '1' and application_entity == '2') or (user_role == '2' and application_entity == '1'):
        #    self.running_log.info('[InvoiceExcelView no permission] [user_role={0}] [application_entity={1}]'.format(user_role, application_entity))
        #    raise NoPermissionException('no permission')

        url = 'http://127.0.0.1:8088/napi/invoice/downloadExcelOri'
        body = {
            'applicationIds': application_ids,
            'applicationEntity': application_entity
        }
        try:
            self.running_log.info("request java invoice excel api, url:{0}, body:{1}".format(url, json.dumps(body)))
            headers = {
                'Content-Type': 'application/json'
            }
            resp = requests.post(url, headers=headers, json=body)
            self.running_log.info("java invoice excel api finish, applicationIds:{0},{1}".format(application_ids, resp.text))
            res_json = json.loads(resp.text)
            # The Java API answers with the path of the generated workbook.
            file_path = res_json.get('result')
            self.running_log.info("java invoice excel after process, filePath:{0}".format(file_path))

            current_time = time.strftime('%Y-%m-%d_%H_%M_%S', time.localtime())
            download_file_name = "发票信息提取-" + current_time + ".xlsx"

            # Named ``excel_response`` so the module-level ``response`` helper
            # used by other views in this file is not shadowed.
            excel_response = HttpResponse(content_type='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
            excel_response['Content-Disposition'] = 'attachment; filename={0}'.format(escape_uri_path(download_file_name))
            excel_response['Access-Control-Expose-Headers'] = 'content-disposition'
            # ``with`` guarantees the handle is closed even if the read fails.
            with open(file_path, "rb") as f:
                excel_response.write(f.read())
            return excel_response
        except Exception:
            self.running_log.error("invoice excel request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
                url, json.dumps(body), traceback.format_exc()))
            # Re-raise: falling through would return None from the view and
            # hide the real failure behind a framework error.
            raise
class InvoiceQueryInfoView(GenericView):
    """Return invoice information fetched from the local Java invoice API.

    ``post`` relays the ``result`` field of the Java API's JSON answer.

    NOTE(review): SECURITY - both the DRF permission/authentication classes
    and the in-method role check are commented out, so unless ``GenericView``
    supplies defaults this endpoint performs no authentication or
    authorization. Confirm that this is intentional.
    """
    #permission_classes = [IsAuthenticated]
    #authentication_classes = [OAuth2AuthenticationWithUser]

    @use_args(invoice_download_args, location='data')
    def post(self, request, args):
        """Query invoice info for ``application_ids``.

        Args:
            request: incoming request (currently unused because the role
                check is disabled).
            args: parsed request data; ``application_ids`` and
                ``application_entity`` are read from it.

        Returns:
            ``response2.ok`` wrapping the Java API's ``result`` value.

        Raises:
            Exception: any failure while calling the Java API is logged and
                re-raised.
        """
        application_ids = args.get('application_ids')
        application_entity = args.get('application_entity')

        # NOTE(review): hard-coded placeholder value; looks like a debugging leftover.
        # Tag corrected: it was copy-pasted as '[InvoiceExcelView]'.
        self.running_log.info('[InvoiceQueryInfoView] [user_role={0}] '.format('111222333'))

        # Role does not match the requested entity: raise an exception.
        # (Check currently disabled - see the SECURITY note on the class.)
        #token = request.META.get("HTTP_AUTHORIZATION")
        #user_role = rh.get_token(token[-11:])
        #self.running_log.info('[InvoiceQueryInfoView] [user_role={0}] '.format(user_role))
        #if user_role is None or user_role == '-1' or (user_role == '1' and application_entity == '2') or (user_role == '2' and application_entity == '1'):
        #    self.running_log.info('[InvoiceExcelView no permission] [user_role={0}] [application_entity={1}]'.format(user_role, application_entity))
        #    raise NoPermissionException('no permission')

        url = 'http://127.0.0.1:8088/napi/invoice/queryInfoOri'
        body = {
            'applicationIds': application_ids,
            'applicationEntity': application_entity
        }
        try:
            self.running_log.info("request java invoice info api, url:{0}, body:{1}".format(url, json.dumps(body)))
            headers = {
                'Content-Type': 'application/json'
            }
            resp = requests.post(url, headers=headers, json=body)
            self.running_log.info("java invoice info api finish, applicationIds:{0},{1}".format(application_ids, resp.text))
            res_json = json.loads(resp.text)
            java_result = res_json.get('result')
            return response2.ok(data=java_result)
        except Exception:
            self.running_log.error("invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
                url, json.dumps(body), traceback.format_exc()))
            # Re-raise: falling through would return None from the view and
            # hide the real failure behind a framework error.
            raise
class DocReOcrView(GenericView, DocHandler):
    """Re-queue one existing document for OCR.

    The document is looked up by id, its classification hint and archive
    flag are derived from its stored metadata, and a task string ending in
    ``'Y'`` (the re-OCR marker) is pushed onto the queue.
    """
    permission_classes = [IsAuthenticated]
    authentication_classes = [OAuth2AuthenticationWithUser]
    # required_scopes = ['write']

    # Archive suffixes matched case-sensitively, exactly as listed.
    _ARCHIVE_SUFFIXES = ('.zip', '.rar', '.ZIP', '.RAR')

    @staticmethod
    def _resolve_classify_1(doc, prefix):
        """Return the classification hint matched from the document name, or 0."""
        # E-contract: Econtract or OVP (FSM) data sources.
        if doc.data_source == consts.DATA_SOURCE_LIST[2] or doc.data_source == consts.DATA_SOURCE_LIST[3]:
            keyword_map = consts.ECONTRACT_KEYWORDS_MAP if doc.document_scheme == consts.DOC_SCHEME_LIST[1] else None
        # FSM contracts: WEP/MSI/SC/SC2.
        elif doc.data_source == consts.DATA_SOURCE_LIST[0] and doc.document_scheme == consts.DOC_SCHEME_LIST[0]:
            keyword_map = consts.FSM_ECONTRACT_KEYWORDS_MAP
        else:
            keyword_map = None

        if keyword_map is None:
            return 0
        # ``or ()`` keeps an unknown prefix from raising on iteration of None.
        for keyword, classify_1 in keyword_map.get(prefix) or ():
            if keyword in doc.document_name:
                return classify_1
        return 0

    # Re-OCR endpoint for an existing document.
    @use_args(doc_reocr_args, location='data')
    def post(self, request, args):
        """Enqueue a re-OCR task for the document identified by ``doc_id``.

        Args:
            request: incoming request (not read directly).
            args: parsed request data; ``doc_id`` and ``application_entity``
                are read from it.

        Returns:
            ``response.ok()`` once the task is enqueued, or
            ``response.error_msg`` when no document has the given id.
        """
        application_entity = args.get('application_entity')
        doc_id = args.get('doc_id')

        doc_class, prefix = self.get_doc_class(application_entity)
        doc = doc_class.objects.filter(id=doc_id).first()
        if doc is None:
            # Previously this fell through to an AttributeError on None.
            self.running_log.info('[doc reocr failed (doc not found)] [args={0}]'.format(args))
            return response.error_msg(msg='doc not found')

        # Choose which queue the task enters.
        is_priority = PriorityApplication.objects.filter(application_id=doc.application_id, on_off=True).exists()
        classify_1 = self._resolve_classify_1(doc, prefix)
        is_zip = doc.document_name.endswith(self._ARCHIVE_SUFFIXES)

        # Task strings look like 'AFC_11001_0_Y' / 'AFC_11001_0_N'; the
        # trailing Y/N marks whether the task is a re-OCR (Y) or not (N).
        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'Y'])
        enqueue_res = rh.enqueue([task], is_priority, is_zip)
        # Tag corrected from the copy-pasted '[doc upload success]'.
        self.running_log.info('[doc reocr success] [args={0}] [business_type={1}] [doc_id={2}] '
                              '[is_priority={3}] [enqueue_res={4}] [classify_1={5}]'.format(args, prefix, doc.id,
                                                                                          is_priority, enqueue_res, classify_1))
        return response.ok()
class BatchReOcrView(GenericView, DocHandler):
    """Re-queue for OCR every document created today that has ``status=2``.

    Each selected document gets a task string ending in ``'Y'`` (the re-OCR
    marker) pushed onto the queue.
    """
    permission_classes = [IsAuthenticated]
    authentication_classes = [OAuth2AuthenticationWithUser]

    # Archive suffixes matched case-sensitively, exactly as listed.
    _ARCHIVE_SUFFIXES = ('.zip', '.rar', '.ZIP', '.RAR')

    @staticmethod
    def _resolve_classify_1(doc, prefix):
        """Return the classification hint matched from the document name, or 0."""
        # E-contract: Econtract or OVP (FSM) data sources.
        if doc.data_source == consts.DATA_SOURCE_LIST[2] or doc.data_source == consts.DATA_SOURCE_LIST[3]:
            keyword_map = consts.ECONTRACT_KEYWORDS_MAP if doc.document_scheme == consts.DOC_SCHEME_LIST[1] else None
        # FSM contracts: WEP/MSI/SC/SC2.
        elif doc.data_source == consts.DATA_SOURCE_LIST[0] and doc.document_scheme == consts.DOC_SCHEME_LIST[0]:
            keyword_map = consts.FSM_ECONTRACT_KEYWORDS_MAP
        else:
            keyword_map = None

        if keyword_map is None:
            return 0
        # ``or ()`` keeps an unknown prefix from raising on iteration of None.
        for keyword, classify_1 in keyword_map.get(prefix) or ():
            if keyword in doc.document_name:
                return classify_1
        return 0

    def _enqueue_reocr(self, doc, prefix, args):
        """Build the re-OCR task string for ``doc``, enqueue it and log the outcome."""
        # Choose which queue the task enters.
        is_priority = PriorityApplication.objects.filter(application_id=doc.application_id, on_off=True).exists()
        classify_1 = self._resolve_classify_1(doc, prefix)
        is_zip = doc.document_name.endswith(self._ARCHIVE_SUFFIXES)

        # Task strings look like 'AFC_11001_0_Y' / 'AFC_11001_0_N'; the
        # trailing Y/N marks whether the task is a re-OCR (Y) or not (N).
        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'Y'])
        enqueue_res = rh.enqueue([task], is_priority, is_zip)
        # Tag corrected from the copy-pasted '[doc upload success]'.
        self.running_log.info('[doc reocr success] [args={0}] [business_type={1}] [doc_id={2}] '
                              '[is_priority={3}] [enqueue_res={4}] [classify_1={5}]'.format(args, prefix, doc.id,
                                                                                          is_priority, enqueue_res, classify_1))

    # Batch re-OCR endpoint for existing documents.
    @use_args(batch_reocr_args, location='data')
    def post(self, request, args):
        """Enqueue re-OCR tasks for today's documents of one application entity.

        Args:
            request: incoming request (not read directly).
            args: parsed request data; ``application_entity`` is read from it.

        Returns:
            ``response.ok()`` after every selected document has been enqueued.
        """
        application_entity = args.get('application_entity')

        now = timezone.now()
        # An aware ``now`` is converted to the current time zone before its
        # date is taken, so "today" agrees with the bounds that make_aware
        # builds below. Taking the date of the unconverted value picked the
        # wrong day for part of each day in non-UTC deployments.
        today = timezone.localtime(now).date() if timezone.is_aware(now) else now.date()
        start_of_day = timezone.make_aware(timezone.datetime.combine(today, timezone.datetime.min.time()))
        end_of_day = timezone.make_aware(timezone.datetime.combine(today, timezone.datetime.max.time()))

        doc_class, prefix = self.get_doc_class(application_entity)
        # NOTE(review): status=2 is a magic number - confirm which doc state it denotes.
        docs = doc_class.objects.filter(status=2, create_time__range=(start_of_day, end_of_day))

        # One timestamp tags every log line of this batch run.
        time_stamp = time.time()
        for doc in docs.iterator():
            self.running_log.info('[batch doc reocr timestamp={0}] [doc_id={1}]'.format(time_stamp, doc.id))
            self._enqueue_reocr(doc, prefix, args)
        return response.ok()
\ No newline at end of file
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!