91ff8153 by 冯轩

init reocr

1 parent e08e5c00
......@@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin):
if len(info_tuple) == 2:
business_type, doc_id_str = info_tuple
else:
business_type, doc_id_str, classify_1_str = info_tuple
business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
zip_doc = doc_class.objects.filter(id=doc_id).first()
......@@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin):
else:
self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format(
self.log_base, task_str))
return zip_doc, business_type
return zip_doc, business_type, re_ocr_flag
def get_doc_info(self, task_str, is_priority=False):
try:
......@@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin):
classify_1_str = '0'
rebuild_task_str = task_str
else:
business_type, doc_id_str, classify_1_str = info_tuple
business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str)
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
......@@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin):
else:
self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
self.log_base, task_str, is_priority))
return doc, business_type, rebuild_task_str, classify_1_str
return doc, business_type, rebuild_task_str, classify_1_str, re_ocr_flag
# def pdf_download(self, doc, pdf_path):
# if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
......@@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str))
# 2. 修改doc状态: 识别中
zip_doc, business_type = self.get_zip_doc_info(task_str)
zip_doc, business_type, re_ocr_flag = self.get_zip_doc_info(task_str)
if zip_doc is None:
time.sleep(self.sleep_time_doc_get)
continue
......@@ -1287,7 +1287,7 @@ class Command(BaseCommand, LoggerMixin):
target_pdf_path = os.path.join(pdf_doc_data_path, '{0}.pdf'.format(pdf_doc.id))
shutil.move(pdf_path, target_pdf_path)
pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0'])
pdf_task_str = consts.SPLIT_STR.join([business_type, str(pdf_doc.id), '0', re_ocr_flag])
pdf_task_str_list.append(pdf_task_str)
except Exception as e:
self.online_log.warn('{0} [zip_2_pdfs] [recreate pdf task failed] [task={1}] [pdf_path={2}]'
......@@ -1336,7 +1336,7 @@ class Command(BaseCommand, LoggerMixin):
try:
# 1. 从队列获取文件信息
doc, business_type, task_str, classify_1_str = self.get_doc_info(task_str, is_priority)
doc, business_type, task_str, classify_1_str, re_ocr_flag = self.get_doc_info(task_str, is_priority)
# 队列为空时的处理
if doc is None:
time.sleep(self.sleep_time_doc_get)
......@@ -1386,7 +1386,8 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
self.log_base, task_str, times))
start_time = time.time()
pdf_handler.extract_image(max_img_count)
max_img_count_or_none = None if re_ocr_flag == 'Y' else max_img_count
pdf_handler.extract_image(max_img_count_or_none)
end_time = time.time()
speed_time = int(end_time - start_time)
self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
......@@ -1404,7 +1405,7 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
self.log_base, task_str))
raise Exception('pdf img empty')
elif pdf_handler.img_count >= max_img_count:
elif re_ocr_flag == 'N' and pdf_handler.img_count >= max_img_count:
self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
self.log_base, task_str, pdf_handler.img_count))
......
......@@ -589,6 +589,11 @@ invoice_download_args = {
'application_ids': fields.Str(required=True),
}
# Request-body schema for DocReOcrView.post (consumed through `use_args`).
# Both values are mandatory integers:
#   doc_id             -- primary key used to look the document up
#   application_entity -- passed to `get_doc_class` to pick the doc model/prefix
doc_reocr_args = dict(
    doc_id=fields.Int(required=True),
    application_entity=fields.Int(required=True),
)
class UploadDocView(GenericView, DocHandler):
# permission_classes = []
......@@ -698,7 +703,7 @@ class UploadDocView(GenericView, DocHandler):
or document_name.endswith('.RAR'):
is_zip = True
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
enqueue_res = rh.enqueue([task], is_priority, is_zip)
self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
'[is_priority={3}] [enqueue_res={4}] [is_fsm={5} [classify_1={6}]]'.format(args, prefix, doc.id,
......@@ -1249,7 +1254,7 @@ class DocView(DocGenericView, DocHandler):
break
# tasks = ['{0}{1}{2}'.format(prefix, consts.SPLIT_STR, doc.id)]
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1)])
task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'N'])
enqueue_res = rh.enqueue([task], is_priority)
self.running_log.info('[mock doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
......@@ -2068,4 +2073,54 @@ class InvoiceQueryInfoView(GenericView):
return response2.ok(data=java_result)
except Exception as e:
self.running_log.error("invoice info request to java error, url:{0}, param:{1}, errorMsg:{2}".format(
url, json.dumps(body), traceback.format_exc()))
\ No newline at end of file
url, json.dumps(body), traceback.format_exc()))
class DocReOcrView(GenericView, DocHandler):
    """Re-queue an already stored document for recognition (re-OCR).

    Access requires an authenticated user (``IsAuthenticated``) and uses
    ``OAuth2AuthenticationWithUser`` for authentication.
    """

    permission_classes = [IsAuthenticated]
    authentication_classes = [OAuth2AuthenticationWithUser]
    # required_scopes = ['write']

    # Re-recognition endpoint for an existing document.
    @use_args(doc_reocr_args, location='data')
    def post(self, request, args):
        """Build a re-OCR task for the requested document and enqueue it.

        Args:
            request: the incoming HTTP request (not read directly here).
            args: parsed payload holding ``doc_id`` and ``application_entity``
                as declared by ``doc_reocr_args``.

        Returns:
            ``response.ok()`` once the task has been handed to ``rh.enqueue``.

        Raises:
            Http404: when no document with ``doc_id`` exists for the doc class
                selected by ``application_entity``.
        """
        # Imported locally so this block does not rely on a file-level import.
        from django.http import Http404

        application_entity = args.get('application_entity')
        doc_id = args.get('doc_id')
        doc_class, prefix = self.get_doc_class(application_entity)
        doc = doc_class.objects.filter(id=doc_id).first()
        # `.first()` yields None for an unknown id; fail with a clear 404
        # instead of an AttributeError on `doc.application_id` below.
        if doc is None:
            raise Http404('doc not found: doc_id={0}'.format(doc_id))

        # 3. Choose which queue the task goes into.
        is_priority = PriorityApplication.objects.filter(
            application_id=doc.application_id, on_off=True).exists()
        is_zip = False
        classify_1 = 0
        # E-contract: Econtract or OVP (FSM)
        if doc.data_source == consts.DATA_SOURCE_LIST[2] or doc.data_source == consts.DATA_SOURCE_LIST[3]:
            if doc.document_scheme == consts.DOC_SCHEME_LIST[1]:
                for keyword, classify_1_tmp in consts.ECONTRACT_KEYWORDS_MAP.get(prefix):
                    if keyword in doc.document_name:
                        classify_1 = classify_1_tmp
                        break
        # FSM contracts: WEP/MSI/SC/SC2
        elif doc.data_source == consts.DATA_SOURCE_LIST[0] and doc.document_scheme == consts.DOC_SCHEME_LIST[0]:
            for keyword, classify_1_tmp in consts.FSM_ECONTRACT_KEYWORDS_MAP.get(prefix):
                if keyword in doc.document_name:
                    classify_1 = classify_1_tmp
                    break

        # Same four suffixes as before (all-lower or all-upper case only).
        if doc.document_name.endswith(('.zip', '.rar', '.ZIP', '.RAR')):
            is_zip = True

        # Task examples: 'AFC_11001_0_Y', 'AFC_11001_0_N'. The trailing flag
        # says whether this is a re-OCR request: 'Y' = yes, 'N' = no.
        task = consts.SPLIT_STR.join([prefix, str(doc.id), str(classify_1), 'Y'])
        enqueue_res = rh.enqueue([task], is_priority, is_zip)
        # NOTE(review): the message still reads "doc upload success" although
        # this is the re-OCR path; left unchanged in case log tooling keys on it.
        self.running_log.info('[doc upload success] [args={0}] [business_type={1}] [doc_id={2}] '
                              '[is_priority={3}] [enqueue_res={4}] [classify_1={5}]'.format(args, prefix, doc.id,
                                                                                          is_priority, enqueue_res, classify_1))
        return response.ok()
\ No newline at end of file
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!