160ac57d by 冯轩

merge

2 parents 88f01673 d619642f
......@@ -10,4 +10,7 @@ urlpatterns = [
path(r'invoice/downloadExcel', views.InvoiceExcelView.as_view()),
path(r'invoice/queryInfo', views.InvoiceQueryInfoView.as_view()),
path(r'contract/v1', views.SEContractView.as_view()),
path(r'reocr', views.DocReOcrView.as_view()),
path(r'batch/reocr', views.BatchReOcrView.as_view()),
]
......
......@@ -100,7 +100,7 @@ class Command(BaseCommand, LoggerMixin):
if len(info_tuple) == 2:
business_type, doc_id_str = info_tuple
else:
business_type, doc_id_str, classify_1_str = info_tuple
business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
zip_doc = doc_class.objects.filter(id=doc_id).first()
......@@ -124,7 +124,7 @@ class Command(BaseCommand, LoggerMixin):
else:
self.online_log.info('{0} [zip_2_pdfs] [db save end] [task_str={1}]'.format(
self.log_base, task_str))
return zip_doc, business_type
return zip_doc, business_type, re_ocr_flag
def get_doc_info(self, task_str, is_priority=False):
try:
......@@ -135,7 +135,7 @@ class Command(BaseCommand, LoggerMixin):
classify_1_str = '0'
rebuild_task_str = task_str
else:
business_type, doc_id_str, classify_1_str = info_tuple
business_type, doc_id_str, classify_1_str, re_ocr_flag = info_tuple
rebuild_task_str = '{0}{1}{2}'.format(business_type, consts.SPLIT_STR, doc_id_str)
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
......@@ -160,7 +160,7 @@ class Command(BaseCommand, LoggerMixin):
else:
self.online_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
self.log_base, task_str, is_priority))
return doc, business_type, rebuild_task_str, classify_1_str
return doc, business_type, rebuild_task_str, classify_1_str, re_ocr_flag
# def pdf_download(self, doc, pdf_path):
# if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
......@@ -1202,7 +1202,7 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.info('{0} [zip_2_pdfs] [task={1}]'.format(self.log_base, task_str))
# 2. 修改doc状态: 识别中
zip_doc, business_type = self.get_zip_doc_info(task_str)
zip_doc, business_type, re_ocr_flag = self.get_zip_doc_info(task_str)
if zip_doc is None:
time.sleep(self.sleep_time_doc_get)
continue
......@@ -1339,7 +1339,7 @@ class Command(BaseCommand, LoggerMixin):
try:
# 1. 从队列获取文件信息
doc, business_type, task_str, classify_1_str = self.get_doc_info(task_str, is_priority)
doc, business_type, task_str, classify_1_str, re_ocr_flag = self.get_doc_info(task_str, is_priority)
# 队列为空时的处理
if doc is None:
time.sleep(self.sleep_time_doc_get)
......@@ -1389,7 +1389,8 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
self.log_base, task_str, times))
start_time = time.time()
pdf_handler.extract_image(max_img_count)
max_img_count_or_none = None if re_ocr_flag == 'Y' else max_img_count
pdf_handler.extract_image(max_img_count_or_none)
end_time = time.time()
speed_time = int(end_time - start_time)
self.online_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}] [is_new_modify={4}]'.format(
......@@ -1407,7 +1408,7 @@ class Command(BaseCommand, LoggerMixin):
self.online_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
self.log_base, task_str))
raise Exception('pdf img empty')
elif pdf_handler.img_count >= max_img_count:
elif re_ocr_flag == 'N' and pdf_handler.img_count >= max_img_count:
self.online_log.info('{0} [too many pdf image] [task={1}] [img_count={2}]'.format(
self.log_base, task_str, pdf_handler.img_count))
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!