799a0e94 by 周伟奇

Add retry for failed PDF download / PDF-to-image extraction

1 parent 85b24dac
......@@ -101,20 +101,20 @@ class Command(BaseCommand, LoggerMixin):
self.log_base, task_str, is_priority))
return doc, business_type, task_str
def pdf_download(self, doc, pdf_path):
    """Download the PDF for ``doc`` from EDMS into ``pdf_path``, with retries.

    The download is skipped when ``doc.application_id`` starts with
    ``consts.FIXED_APPLICATION_ID_PREFIX``. Otherwise ``self.edms.download``
    is attempted up to ``consts.RETRY_TIMES`` times; each failure is logged
    as a warning with the attempt index and full traceback.

    Args:
        doc: Document record; ``application_id`` and ``metadata_version_id``
            are read from it.
        pdf_path: Destination path handed to ``self.edms.download``.

    Raises:
        EDMSException: If every attempt failed (message is the text of the
            last error, which is also chained as ``__cause__``), or if no
            attempt could be made because ``consts.RETRY_TIMES`` is not
            positive.
    """
    if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
        # Bound before the loop so the for/else branch below can never hit an
        # unbound name when the loop body does not run at all.
        last_error = None
        for times in range(consts.RETRY_TIMES):
            try:
                self.edms.download(pdf_path, doc.metadata_version_id)
            except Exception as e:
                self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
                                      '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
                # ``e`` is unbound when the except clause ends, so keep the
                # exception under another name for the final raise.
                last_error = e
            else:
                break
        else:
            # Reached only when the loop finished without ``break``:
            # either every attempt failed or there were zero attempts.
            if last_error is None:
                raise EDMSException(
                    'edms download not attempted [retry_times={0}]'.format(consts.RETRY_TIMES))
            raise EDMSException(str(last_error)) from last_error
    # Logged on every non-raising path, including documents for which the
    # download was skipped.
    # NOTE(review): the original nesting of this line is not recoverable from
    # the flattened source; confirm it belongs at method level.
    self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
# def pdf_download(self, doc, pdf_path):
# if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
# for times in range(consts.RETRY_TIMES):
# try:
# self.edms.download(pdf_path, doc.metadata_version_id)
# except Exception as e:
# self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
# '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
# edms_exc = str(e)
# else:
# break
# else:
# raise EDMSException(edms_exc)
# self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx):
sheets = ocr_data.get('data', [])
......@@ -392,19 +392,35 @@ class Command(BaseCommand, LoggerMixin):
# 2. Fetch the PDF file from EDMS
doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
os.makedirs(doc_data_path, exist_ok=True)
img_save_path = os.path.join(doc_data_path, 'img')
pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
self.pdf_download(doc, pdf_path)
# 3. Extract images from the PDF file
self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
start_time = time.time()
img_save_path = os.path.join(doc_data_path, 'img')
pdf_handler = PDFHandler(pdf_path, img_save_path)
pdf_handler.extract_image()
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
self.log_base, task_str, speed_time))
for times in range(consts.RETRY_TIMES):
try:
if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
self.edms.download(pdf_path, doc.metadata_version_id)
self.cronjob_log.info('{0} [edms download success] [task={1}] [times={2}] '
'[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path))
# 3. Extract images from the PDF file
self.cronjob_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
self.log_base, task_str, times))
start_time = time.time()
pdf_handler.extract_image()
end_time = time.time()
speed_time = int(end_time - start_time)
self.cronjob_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
self.log_base, task_str, times, speed_time))
except Exception as e:
self.cronjob_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
'[error={3}]'.format(self.log_base, task_str, times,
traceback.format_exc()))
else:
break
else:
raise Exception('download or pdf to img failed')
img_count = len(pdf_handler.img_path_list)
if img_count == 0:
......@@ -419,25 +435,25 @@ class Command(BaseCommand, LoggerMixin):
self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
time.sleep(self.sleep_time_img_put)
img_queue.put(img_path)
except EDMSException as e:
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
error_list.append(1)
return
# except EDMSException as e:
# try:
# doc.status = DocStatus.PROCESS_FAILED.value
# doc.save()
# self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
# self.log_base, task_str, traceback.format_exc()))
# except Exception as e:
# self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
# self.log_base, traceback.format_exc()))
# error_list.append(1)
# return
except Exception as e:
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
'[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format(
self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
error_list.append(1)
return
......
......@@ -187,6 +187,8 @@ class PDFHandler:
self.page_to_png(page)
def extract_image(self):
self.img_path_list = []
self.xref_set = set()
os.makedirs(self.img_dir_path, exist_ok=True)
with fitz.Document(self.path) as pdf:
for pno in range(pdf.pageCount):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!