799a0e94 by 周伟奇

Add retry when PDF download or PDF-to-image extraction fails

1 parent 85b24dac
...@@ -101,20 +101,20 @@ class Command(BaseCommand, LoggerMixin): ...@@ -101,20 +101,20 @@ class Command(BaseCommand, LoggerMixin):
101 self.log_base, task_str, is_priority)) 101 self.log_base, task_str, is_priority))
102 return doc, business_type, task_str 102 return doc, business_type, task_str
103 103
104 def pdf_download(self, doc, pdf_path): 104 # def pdf_download(self, doc, pdf_path):
105 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): 105 # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
106 for times in range(consts.RETRY_TIMES): 106 # for times in range(consts.RETRY_TIMES):
107 try: 107 # try:
108 self.edms.download(pdf_path, doc.metadata_version_id) 108 # self.edms.download(pdf_path, doc.metadata_version_id)
109 except Exception as e: 109 # except Exception as e:
110 self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' 110 # self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] '
111 '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) 111 # '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc()))
112 edms_exc = str(e) 112 # edms_exc = str(e)
113 else: 113 # else:
114 break 114 # break
115 else: 115 # else:
116 raise EDMSException(edms_exc) 116 # raise EDMSException(edms_exc)
117 self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) 117 # self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path))
118 118
119 def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): 119 def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx):
120 sheets = ocr_data.get('data', []) 120 sheets = ocr_data.get('data', [])
...@@ -392,19 +392,35 @@ class Command(BaseCommand, LoggerMixin): ...@@ -392,19 +392,35 @@ class Command(BaseCommand, LoggerMixin):
392 # 2. 从EDMS获取PDF文件 392 # 2. 从EDMS获取PDF文件
393 doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) 393 doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
394 os.makedirs(doc_data_path, exist_ok=True) 394 os.makedirs(doc_data_path, exist_ok=True)
395 img_save_path = os.path.join(doc_data_path, 'img')
395 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) 396 pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
396 self.pdf_download(doc, pdf_path)
397 397
398 # 3.PDF文件提取图片
399 self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
400 start_time = time.time()
401 img_save_path = os.path.join(doc_data_path, 'img')
402 pdf_handler = PDFHandler(pdf_path, img_save_path) 398 pdf_handler = PDFHandler(pdf_path, img_save_path)
403 pdf_handler.extract_image() 399
404 end_time = time.time() 400 for times in range(consts.RETRY_TIMES):
405 speed_time = int(end_time - start_time) 401 try:
406 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( 402 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
407 self.log_base, task_str, speed_time)) 403 self.edms.download(pdf_path, doc.metadata_version_id)
404 self.cronjob_log.info('{0} [edms download success] [task={1}] [times={2}] '
405 '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path))
406
407 # 3.PDF文件提取图片
408 self.cronjob_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format(
409 self.log_base, task_str, times))
410 start_time = time.time()
411 pdf_handler.extract_image()
412 end_time = time.time()
413 speed_time = int(end_time - start_time)
414 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format(
415 self.log_base, task_str, times, speed_time))
416 except Exception as e:
417 self.cronjob_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] '
418 '[error={3}]'.format(self.log_base, task_str, times,
419 traceback.format_exc()))
420 else:
421 break
422 else:
423 raise Exception('download or pdf to img failed')
408 424
409 img_count = len(pdf_handler.img_path_list) 425 img_count = len(pdf_handler.img_path_list)
410 if img_count == 0: 426 if img_count == 0:
...@@ -419,25 +435,25 @@ class Command(BaseCommand, LoggerMixin): ...@@ -419,25 +435,25 @@ class Command(BaseCommand, LoggerMixin):
419 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) 435 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
420 time.sleep(self.sleep_time_img_put) 436 time.sleep(self.sleep_time_img_put)
421 img_queue.put(img_path) 437 img_queue.put(img_path)
422 except EDMSException as e: 438 # except EDMSException as e:
423 try: 439 # try:
424 doc.status = DocStatus.PROCESS_FAILED.value 440 # doc.status = DocStatus.PROCESS_FAILED.value
425 doc.save() 441 # doc.save()
426 self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( 442 # self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
427 self.log_base, task_str, traceback.format_exc())) 443 # self.log_base, task_str, traceback.format_exc()))
428 except Exception as e: 444 # except Exception as e:
429 self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( 445 # self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
430 self.log_base, traceback.format_exc())) 446 # self.log_base, traceback.format_exc()))
431 error_list.append(1) 447 # error_list.append(1)
432 return 448 # return
433 except Exception as e: 449 except Exception as e:
434 try: 450 try:
435 doc.status = DocStatus.PROCESS_FAILED.value 451 doc.status = DocStatus.PROCESS_FAILED.value
436 doc.save() 452 doc.save()
437 self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( 453 self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] '
438 self.log_base, task_str, traceback.format_exc())) 454 '[error={2}]'.format(self.log_base, task_str, traceback.format_exc()))
439 except Exception as e: 455 except Exception as e:
440 self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format( 456 self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
441 self.log_base, traceback.format_exc())) 457 self.log_base, traceback.format_exc()))
442 error_list.append(1) 458 error_list.append(1)
443 return 459 return
......
...@@ -187,6 +187,8 @@ class PDFHandler: ...@@ -187,6 +187,8 @@ class PDFHandler:
187 self.page_to_png(page) 187 self.page_to_png(page)
188 188
189 def extract_image(self): 189 def extract_image(self):
190 self.img_path_list = []
191 self.xref_set = set()
190 os.makedirs(self.img_dir_path, exist_ok=True) 192 os.makedirs(self.img_dir_path, exist_ok=True)
191 with fitz.Document(self.path) as pdf: 193 with fitz.Document(self.path) as pdf:
192 for pno in range(pdf.pageCount): 194 for pno in range(pdf.pageCount):
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!