add pdf failed retry
Showing
2 changed files
with
56 additions
and
38 deletions
... | @@ -101,20 +101,20 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -101,20 +101,20 @@ class Command(BaseCommand, LoggerMixin): |
101 | self.log_base, task_str, is_priority)) | 101 | self.log_base, task_str, is_priority)) |
102 | return doc, business_type, task_str | 102 | return doc, business_type, task_str |
103 | 103 | ||
104 | def pdf_download(self, doc, pdf_path): | 104 | # def pdf_download(self, doc, pdf_path): |
105 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): | 105 | # if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
106 | for times in range(consts.RETRY_TIMES): | 106 | # for times in range(consts.RETRY_TIMES): |
107 | try: | 107 | # try: |
108 | self.edms.download(pdf_path, doc.metadata_version_id) | 108 | # self.edms.download(pdf_path, doc.metadata_version_id) |
109 | except Exception as e: | 109 | # except Exception as e: |
110 | self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' | 110 | # self.cronjob_log.warn('{0} [edms download failed] [times={1}] [pdf_path={2}] ' |
111 | '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) | 111 | # '[error={3}]'.format(self.log_base, times, pdf_path, traceback.format_exc())) |
112 | edms_exc = str(e) | 112 | # edms_exc = str(e) |
113 | else: | 113 | # else: |
114 | break | 114 | # break |
115 | else: | 115 | # else: |
116 | raise EDMSException(edms_exc) | 116 | # raise EDMSException(edms_exc) |
117 | self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) | 117 | # self.cronjob_log.info('{0} [edms download success] [pdf_path={1}]'.format(self.log_base, pdf_path)) |
118 | 118 | ||
119 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): | 119 | def bs_process(self, wb, ocr_data, bs_summary, unknown_summary, classify, res_list, pno, ino, part_idx): |
120 | sheets = ocr_data.get('data', []) | 120 | sheets = ocr_data.get('data', []) |
... | @@ -392,19 +392,35 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -392,19 +392,35 @@ class Command(BaseCommand, LoggerMixin): |
392 | # 2. 从EDMS获取PDF文件 | 392 | # 2. 从EDMS获取PDF文件 |
393 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) | 393 | doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id)) |
394 | os.makedirs(doc_data_path, exist_ok=True) | 394 | os.makedirs(doc_data_path, exist_ok=True) |
395 | img_save_path = os.path.join(doc_data_path, 'img') | ||
395 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) | 396 | pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id)) |
396 | self.pdf_download(doc, pdf_path) | ||
397 | 397 | ||
398 | # 3.PDF文件提取图片 | ||
399 | self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str)) | ||
400 | start_time = time.time() | ||
401 | img_save_path = os.path.join(doc_data_path, 'img') | ||
402 | pdf_handler = PDFHandler(pdf_path, img_save_path) | 398 | pdf_handler = PDFHandler(pdf_path, img_save_path) |
403 | pdf_handler.extract_image() | 399 | |
404 | end_time = time.time() | 400 | for times in range(consts.RETRY_TIMES): |
405 | speed_time = int(end_time - start_time) | 401 | try: |
406 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( | 402 | if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): |
407 | self.log_base, task_str, speed_time)) | 403 | self.edms.download(pdf_path, doc.metadata_version_id) |
404 | self.cronjob_log.info('{0} [edms download success] [task={1}] [times={2}] ' | ||
405 | '[pdf_path={3}]'.format(self.log_base, task_str, times, pdf_path)) | ||
406 | |||
407 | # 3.PDF文件提取图片 | ||
408 | self.cronjob_log.info('{0} [pdf to img start] [task={1}] [times={2}]'.format( | ||
409 | self.log_base, task_str, times)) | ||
410 | start_time = time.time() | ||
411 | pdf_handler.extract_image() | ||
412 | end_time = time.time() | ||
413 | speed_time = int(end_time - start_time) | ||
414 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [times={2}] [spend_time={3}]'.format( | ||
415 | self.log_base, task_str, times, speed_time)) | ||
416 | except Exception as e: | ||
417 | self.cronjob_log.warn('{0} [download or pdf to img failed] [task={1}] [times={2}] ' | ||
418 | '[error={3}]'.format(self.log_base, task_str, times, | ||
419 | traceback.format_exc())) | ||
420 | else: | ||
421 | break | ||
422 | else: | ||
423 | raise Exception('download or pdf to img failed') | ||
408 | 424 | ||
409 | img_count = len(pdf_handler.img_path_list) | 425 | img_count = len(pdf_handler.img_path_list) |
410 | if img_count == 0: | 426 | if img_count == 0: |
... | @@ -419,25 +435,25 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -419,25 +435,25 @@ class Command(BaseCommand, LoggerMixin): |
419 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | 435 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) |
420 | time.sleep(self.sleep_time_img_put) | 436 | time.sleep(self.sleep_time_img_put) |
421 | img_queue.put(img_path) | 437 | img_queue.put(img_path) |
422 | except EDMSException as e: | 438 | # except EDMSException as e: |
423 | try: | 439 | # try: |
424 | doc.status = DocStatus.PROCESS_FAILED.value | 440 | # doc.status = DocStatus.PROCESS_FAILED.value |
425 | doc.save() | 441 | # doc.save() |
426 | self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | 442 | # self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( |
427 | self.log_base, task_str, traceback.format_exc())) | 443 | # self.log_base, task_str, traceback.format_exc())) |
428 | except Exception as e: | 444 | # except Exception as e: |
429 | self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( | 445 | # self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( |
430 | self.log_base, traceback.format_exc())) | 446 | # self.log_base, traceback.format_exc())) |
431 | error_list.append(1) | 447 | # error_list.append(1) |
432 | return | 448 | # return |
433 | except Exception as e: | 449 | except Exception as e: |
434 | try: | 450 | try: |
435 | doc.status = DocStatus.PROCESS_FAILED.value | 451 | doc.status = DocStatus.PROCESS_FAILED.value |
436 | doc.save() | 452 | doc.save() |
437 | self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( | 453 | self.cronjob_log.warn('{0} [process failed (pdf_2_img_2_queue)] [task={1}] ' |
438 | self.log_base, task_str, traceback.format_exc())) | 454 | '[error={2}]'.format(self.log_base, task_str, traceback.format_exc())) |
439 | except Exception as e: | 455 | except Exception as e: |
440 | self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format( | 456 | self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( |
441 | self.log_base, traceback.format_exc())) | 457 | self.log_base, traceback.format_exc())) |
442 | error_list.append(1) | 458 | error_list.append(1) |
443 | return | 459 | return | ... | ... |
... | @@ -187,6 +187,8 @@ class PDFHandler: | ... | @@ -187,6 +187,8 @@ class PDFHandler: |
187 | self.page_to_png(page) | 187 | self.page_to_png(page) |
188 | 188 | ||
189 | def extract_image(self): | 189 | def extract_image(self): |
190 | self.img_path_list = [] | ||
191 | self.xref_set = set() | ||
190 | os.makedirs(self.img_dir_path, exist_ok=True) | 192 | os.makedirs(self.img_dir_path, exist_ok=True) |
191 | with fitz.Document(self.path) as pdf: | 193 | with fitz.Document(self.path) as pdf: |
192 | for pno in range(pdf.pageCount): | 194 | for pno in range(pdf.pageCount): | ... | ... |
-
Please register or sign in to post a comment