Add retry when PDF download or PDF-to-image extraction fails
Showing 2 changed files with 56 additions and 38 deletions
| ... | @@ -101,20 +101,20 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -101,20 +101,20 @@ class Command(BaseCommand, LoggerMixin): |
| 101 | self.log_base, task_str, is_priority)) | 101 | self.log_base, task_str, is_priority)) |
| 102 | return doc, business_type, task_str | 102 | return doc, business_type, task_str |
| 103 | 103 | ||
| 104 | def pdf_download(self, doc, pdf_path): | 104 | # def pdf_download(self, doc, pdf_path): |
| 105 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 105 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
| 106 | for times in range(consts.RETRY_TIMES): | 106 | # for times in range(consts.RETRY_TIMES): |
| 107 | try: | 107 | # try: |
| 108 | self.edms.download(pdf_path, doc.metadata_version_id) | 108 | # self.edms.download(pdf_path, doc.metadata_version_id) |
| 109 | except Exception as e: | 109 | # except Exception as e: |
| 110 | self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' | 110 | # self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' |
| 111 | '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) | 111 | # '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) |
| 112 | edms_exc = str(e) | 112 | # edms_exc = str(e) |
| 113 | else: | 113 | # else: |
| 114 | break | 114 | # break |
| 115 | else: | 115 | # else: |
| 116 | raise EDMSException(edms_exc) | 116 | # raise EDMSException(edms_exc) |
| 117 | self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) | 117 | # self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) |
| 118 | 118 | ||
| 119 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): | 119 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): |
| 120 | sheets = ocr_data.get('data', []) | 120 | sheets = ocr_data.get('data', []) |
| ... | @@ -392,19 +392,35 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -392,19 +392,35 @@ class Command(BaseCommand, LoggerMixin): |
| 392 | # 2. 从EDMS获取PDF文件 | 392 | # 2. 从EDMS获取PDF文件 |
| 393 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | 393 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) |
| 394 | os.makedirs(doc_data_path, exist_ok=True) | 394 | os.makedirs(doc_data_path, exist_ok=True) |
| 395 | img_save_path = os.path.join(doc_data_path, 'img') | ||
| 395 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 396 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
| 396 | self.pdf_download(doc, pdf_path) | ||
| 397 | 397 | ||
| 398 | # 3.PDF文件提取图片 | ||
| 399 | self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str)) | ||
| 400 | start_time = time.time() | ||
| 401 | img_save_path = os.path.join(doc_data_path, 'img') | ||
| 402 | pdf_handler = PDFHandler(pdf_path, img_save_path) | 398 | pdf_handler = PDFHandler(pdf_path, img_save_path) |
| 403 | pdf_handler.extract_image() | 399 | |
| 404 | end_time = time.time() | 400 | for times in range(consts.RETRY_TIMES): |
| 405 | speed_time = int(end_time - start_time) | 401 | try: |
| 406 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( | 402 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
| 407 | self.log_base, task_str, speed_time)) | 403 | self.edms.download(pdf_path, doc.metadata_version_id) |
| 404 | self.cronjob_log.info('{0} [edms download success] [task={1}] [times={2}] ' | ||
| 405 | '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path)) | ||
| 406 | |||
| 407 | # 3.PDF文件提取图片 | ||
| 408 | self.cronjob_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | ||
| 409 | self.log_base, task_str, times)) | ||
| 410 | start_time = time.time() | ||
| 411 | pdf_handler.extract_image() | ||
| 412 | end_time = time.time() | ||
| 413 | speed_time = int(end_time - start_time) | ||
| 414 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( | ||
| 415 | self.log_base, task_str, times, speed_time)) | ||
| 416 | except Exception as e: | ||
| 417 | self.cronjob_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' | ||
| 418 | '[error={3}]'.format(self.log_base, task_str, times, | ||
| 419 | traceback.format_exc())) | ||
| 420 | else: | ||
| 421 | break | ||
| 422 | else: | ||
| 423 | raise Exception('download or pdf to img failed') | ||
| 408 | 424 | ||
| 409 | img_count = len(pdf_handler.img_path_list) | 425 | img_count = len(pdf_handler.img_path_list) |
| 410 | if img_count == 0: | 426 | if img_count == 0: |
| ... | @@ -419,25 +435,25 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -419,25 +435,25 @@ class Command(BaseCommand, LoggerMixin): |
| 419 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | 435 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) |
| 420 | time.sleep(self.sleep_time_img_put) | 436 | time.sleep(self.sleep_time_img_put) |
| 421 | img_queue.put(img_path) | 437 | img_queue.put(img_path) |
| 422 | except EDMSException as e: | 438 | # except EDMSException as e: |
| 423 | try: | 439 | # try: |
| 424 | doc.status = DocStatus.PROCESS_FAILED.value | 440 | # doc.status = DocStatus.PROCESS_FAILED.value |
| 425 | doc.save() | 441 | # doc.save() |
| 426 | self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | 442 | # self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( |
| 427 | self.log_base, task_str, traceback.format_exc())) | 443 | # self.log_base, task_str, traceback.format_exc())) |
| 428 | except Exception as e: | 444 | # except Exception as e: |
| 429 | self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( | 445 | # self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( |
| 430 | self.log_base, traceback.format_exc())) | 446 | # self.log_base, traceback.format_exc())) |
| 431 | error_list.append(1) | 447 | # error_list.append(1) |
| 432 | return | 448 | # return |
| 433 | except Exception as e: | 449 | except Exception as e: |
| 434 | try: | 450 | try: |
| 435 | doc.status = DocStatus.PROCESS_FAILED.value | 451 | doc.status = DocStatus.PROCESS_FAILED.value |
| 436 | doc.save() | 452 | doc.save() |
| 437 | self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( | 453 | self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] ' |
| 438 | self.log_base, task_str, traceback.format_exc())) | 454 | '[error={2}]'.format(self.log_base, task_str, traceback.format_exc())) |
| 439 | except Exception as e: | 455 | except Exception as e: |
| 440 | self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format( | 456 | self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( |
| 441 | self.log_base, traceback.format_exc())) | 457 | self.log_base, traceback.format_exc())) |
| 442 | error_list.append(1) | 458 | error_list.append(1) |
| 443 | return | 459 | return | ... | ... |
| ... | @@ -187,6 +187,8 @@ class PDFHandler: | ... | @@ -187,6 +187,8 @@ class PDFHandler: |
| 187 | self.page_to_png(page) | 187 | self.page_to_png(page) |
| 188 | 188 | ||
| 189 | def extract_image(self): | 189 | def extract_image(self): |
| 190 | self.img_path_list = [] | ||
| 191 | self.xref_set = set() | ||
| 190 | os.makedirs(self.img_dir_path, exist_ok=True) | 192 | os.makedirs(self.img_dir_path, exist_ok=True) |
| 191 | with fitz.Document(self.path) as pdf: | 193 | with fitz.Document(self.path) as pdf: |
| 192 | for pno in range(pdf.pageCount): | 194 | for pno in range(pdf.pageCount): | ... | ... |
-
Please register or sign in to post a comment