fix flow
Showing 1 changed file with 43 additions and 25 deletions
| ... | @@ -55,8 +55,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -55,8 +55,7 @@ class Command(BaseCommand, LoggerMixin): |
| 55 | def signal_handler(self, sig, frame): | 55 | def signal_handler(self, sig, frame): |
| 56 | self.switch = False # 停止处理文件 | 56 | self.switch = False # 停止处理文件 |
| 57 | 57 | ||
| 58 | @staticmethod | 58 | def get_doc_object(self, task_str): |
| 59 | def get_doc_object(task_str): | ||
| 60 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) | 59 | business_type, doc_id_str = task_str.split(consts.SPLIT_STR) |
| 61 | doc_id = int(doc_id_str) | 60 | doc_id = int(doc_id_str) |
| 62 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc | 61 | doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc |
| ... | @@ -71,14 +70,10 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -71,14 +70,10 @@ class Command(BaseCommand, LoggerMixin): |
| 71 | self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) | 70 | self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) |
| 72 | return None, None, None | 71 | return None, None, None |
| 73 | 72 | ||
| 74 | self.cronjob_log.info('{0} [get_doc_info success] [task={1}] [is_priority={2}]'.format(self.log_base, task_str, is_priority)) | 73 | self.cronjob_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format( |
| 74 | self.log_base, task_str, is_priority)) | ||
| 75 | try: | 75 | try: |
| 76 | doc, business_type = self.get_doc_object(task_str) | 76 | doc, business_type = self.get_doc_object(task_str) |
| 77 | except Exception as e: | ||
| 78 | rh.enqueue([task_str], is_priority) | ||
| 79 | self.cronjob_log.error('{0} [process error (get doc info in)] [error={1}]'.format(self.log_base, traceback.format_exc())) | ||
| 80 | raise e | ||
| 81 | |||
| 82 | if doc is None: | 77 | if doc is None: |
| 83 | self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( | 78 | self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( |
| 84 | self.log_base, task_str, is_priority)) | 79 | self.log_base, task_str, is_priority)) |
| ... | @@ -90,7 +85,13 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -90,7 +85,13 @@ class Command(BaseCommand, LoggerMixin): |
| 90 | doc.status = DocStatus.PROCESSING.value | 85 | doc.status = DocStatus.PROCESSING.value |
| 91 | doc.start_time = timezone.now() | 86 | doc.start_time = timezone.now() |
| 92 | doc.save() | 87 | doc.save() |
| 93 | self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format( | 88 | except Exception as e: |
| 89 | rh.enqueue([task_str], is_priority) | ||
| 90 | self.cronjob_log.error('{0} [process error (get doc info in)] [error={1}]'.format( | ||
| 91 | self.log_base, traceback.format_exc())) | ||
| 92 | raise e | ||
| 93 | else: | ||
| 94 | self.cronjob_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format( | ||
| 94 | self.log_base, task_str, is_priority)) | 95 | self.log_base, task_str, is_priority)) |
| 95 | return doc, business_type, task_str | 96 | return doc, business_type, task_str |
| 96 | 97 | ||
| ... | @@ -210,21 +211,18 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -210,21 +211,18 @@ class Command(BaseCommand, LoggerMixin): |
| 210 | else: | 211 | else: |
| 211 | res_list.append((pno, ino, part_idx, consts.RES_FAILED_2)) | 212 | res_list.append((pno, ino, part_idx, consts.RES_FAILED_2)) |
| 212 | 213 | ||
| 213 | @staticmethod | 214 | def parse_img_path(self, img_path): |
| 214 | def parse_img_path(img_path): | ||
| 215 | img_name, _ = os.path.splitext(os.path.basename(img_path)) | 215 | img_name, _ = os.path.splitext(os.path.basename(img_path)) |
| 216 | part_list = img_name.split('_') | 216 | part_list = img_name.split('_') |
| 217 | # page_7_img_11_0 | 217 | # page_7_img_11_0 |
| 218 | return int(part_list[1])+1, int(part_list[3])+1 | 218 | return int(part_list[1])+1, int(part_list[3])+1 |
| 219 | 219 | ||
| 220 | @staticmethod | 220 | def get_most(self, value_list): |
| 221 | def get_most(value_list): | ||
| 222 | if value_list: | 221 | if value_list: |
| 223 | most_common = Counter(value_list).most_common(1) | 222 | most_common = Counter(value_list).most_common(1) |
| 224 | return most_common[0][0] if most_common else None | 223 | return most_common[0][0] if most_common else None |
| 225 | 224 | ||
| 226 | @staticmethod | 225 | def date_format(self, date_str, format_str): |
| 227 | def date_format(date_str, format_str): | ||
| 228 | try: | 226 | try: |
| 229 | date_res = datetime.strptime(date_str, format_str).date() | 227 | date_res = datetime.strptime(date_str, format_str).date() |
| 230 | except Exception as e: | 228 | except Exception as e: |
| ... | @@ -402,23 +400,41 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -402,23 +400,41 @@ class Command(BaseCommand, LoggerMixin): |
| 402 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( | 400 | self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( |
| 403 | self.log_base, task_str, speed_time)) | 401 | self.log_base, task_str, speed_time)) |
| 404 | 402 | ||
| 403 | img_count = len(pdf_handler.img_path_list) | ||
| 404 | if img_count == 0: | ||
| 405 | self.cronjob_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format( | ||
| 406 | self.log_base, task_str)) | ||
| 407 | raise Exception('pdf img empty') | ||
| 408 | else: | ||
| 405 | with lock: | 409 | with lock: |
| 406 | todo_count_dict[task_str] = len(pdf_handler.img_path_list) | 410 | todo_count_dict[task_str] = img_count |
| 407 | for img_path in pdf_handler.img_path_list: | 411 | for img_path in pdf_handler.img_path_list: |
| 408 | while img_queue.full(): | 412 | while img_queue.full(): |
| 409 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) | 413 | self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) |
| 410 | time.sleep(self.sleep_time_img_put) | 414 | time.sleep(self.sleep_time_img_put) |
| 411 | img_queue.put(img_path) | 415 | img_queue.put(img_path) |
| 412 | except EDMSException as e: | 416 | except EDMSException as e: |
| 417 | try: | ||
| 413 | doc.status = DocStatus.PROCESS_FAILED.value | 418 | doc.status = DocStatus.PROCESS_FAILED.value |
| 414 | doc.save() | 419 | doc.save() |
| 415 | self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( | 420 | self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( |
| 416 | self.log_base, task_str, traceback.format_exc())) | 421 | self.log_base, task_str, traceback.format_exc())) |
| 417 | except Exception as e: | 422 | except Exception as e: |
| 423 | self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format( | ||
| 424 | self.log_base, traceback.format_exc())) | ||
| 425 | error_list.append(1) | ||
| 426 | return | ||
| 427 | except Exception as e: | ||
| 428 | try: | ||
| 418 | doc.status = DocStatus.PROCESS_FAILED.value | 429 | doc.status = DocStatus.PROCESS_FAILED.value |
| 419 | doc.save() | 430 | doc.save() |
| 420 | self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( | 431 | self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( |
| 421 | self.log_base, task_str, traceback.format_exc())) | 432 | self.log_base, task_str, traceback.format_exc())) |
| 433 | except Exception as e: | ||
| 434 | self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format( | ||
| 435 | self.log_base, traceback.format_exc())) | ||
| 436 | error_list.append(1) | ||
| 437 | return | ||
| 422 | 438 | ||
| 423 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): | 439 | def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): |
| 424 | while len(error_list) == 0 or not img_queue.empty(): | 440 | while len(error_list) == 0 or not img_queue.empty(): |
| ... | @@ -447,8 +463,9 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -447,8 +463,9 @@ class Command(BaseCommand, LoggerMixin): |
| 447 | if ocr_1_response.status_code != 200: | 463 | if ocr_1_response.status_code != 200: |
| 448 | raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) | 464 | raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) |
| 449 | except Exception as e: | 465 | except Exception as e: |
| 450 | self.cronjob_log.warn('{0} [ocr_1 failed] [times={1}] [img_path={2}] [error={3}]'.format( | 466 | self.cronjob_log.warn('{0} [ocr_1 failed] [times={1}] [url={2}] [img_path={3}] ' |
| 451 | self.log_base, times, img_path, traceback.format_exc())) | 467 | '[error={4}]'.format(self.log_base, times, url, img_path, |
| 468 | traceback.format_exc())) | ||
| 452 | else: | 469 | else: |
| 453 | ocr_1_res = ocr_1_response.json() | 470 | ocr_1_res = ocr_1_response.json() |
| 454 | end_time = time.time() | 471 | end_time = time.time() |
| ... | @@ -458,7 +475,8 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -458,7 +475,8 @@ class Command(BaseCommand, LoggerMixin): |
| 458 | break | 475 | break |
| 459 | else: | 476 | else: |
| 460 | ocr_1_res = {} | 477 | ocr_1_res = {} |
| 461 | self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path)) | 478 | self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}] [url={2}]'.format( |
| 479 | self.log_base, img_path, url)) | ||
| 462 | # continue | 480 | # continue |
| 463 | except Exception as e: | 481 | except Exception as e: |
| 464 | self.cronjob_log.error('{0} [process error (ocr fetch)] [img_path={1}] [error={2}]'.format( | 482 | self.cronjob_log.error('{0} [process error (ocr fetch)] [img_path={1}] [error={2}]'.format( |
| ... | @@ -521,14 +539,14 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -521,14 +539,14 @@ class Command(BaseCommand, LoggerMixin): |
| 521 | ocr_data_list = res.get('data', []) | 539 | ocr_data_list = res.get('data', []) |
| 522 | if not isinstance(ocr_data_list, list): | 540 | if not isinstance(ocr_data_list, list): |
| 523 | res_list.append((pno, ino, part_idx, consts.RES_FAILED_3)) | 541 | res_list.append((pno, ino, part_idx, consts.RES_FAILED_3)) |
| 524 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path)) | 542 | self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path)) |
| 525 | else: | 543 | else: |
| 526 | for part_idx, ocr_data in enumerate(ocr_data_list): | 544 | for part_idx, ocr_data in enumerate(ocr_data_list): |
| 527 | part_idx = part_idx + 1 | 545 | part_idx = part_idx + 1 |
| 528 | classify = ocr_data.get('classify') | 546 | classify = ocr_data.get('classify') |
| 529 | if classify is None: | 547 | if classify is None: |
| 530 | res_list.append((pno, ino, part_idx, consts.RES_FAILED_3)) | 548 | res_list.append((pno, ino, part_idx, consts.RES_FAILED_3)) |
| 531 | self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format( | 549 | self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format( |
| 532 | self.log_base, img_path)) | 550 | self.log_base, img_path)) |
| 533 | continue | 551 | continue |
| 534 | elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 | 552 | elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 |
| ... | @@ -624,7 +642,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -624,7 +642,7 @@ class Command(BaseCommand, LoggerMixin): |
| 624 | doc, business_type = self.get_doc_object(task_str) | 642 | doc, business_type = self.get_doc_object(task_str) |
| 625 | doc.status = DocStatus.PROCESS_FAILED.value | 643 | doc.status = DocStatus.PROCESS_FAILED.value |
| 626 | doc.save() | 644 | doc.save() |
| 627 | self.cronjob_log.error('{0} [process failed (res to wb)] [task={1}] [error={2}]'.format( | 645 | self.cronjob_log.warn('{0} [process failed (res to wb)] [task={1}] [error={2}]'.format( |
| 628 | self.log_base, task_str, traceback.format_exc())) | 646 | self.log_base, task_str, traceback.format_exc())) |
| 629 | except Exception as e: | 647 | except Exception as e: |
| 630 | self.cronjob_log.error('{0} [process error (wb end)] [task={1}] [error={2}]'.format( | 648 | self.cronjob_log.error('{0} [process error (wb end)] [task={1}] [error={2}]'.format( |
| ... | @@ -673,7 +691,7 @@ class Command(BaseCommand, LoggerMixin): | ... | @@ -673,7 +691,7 @@ class Command(BaseCommand, LoggerMixin): |
| 673 | if hasattr(doc, field): | 691 | if hasattr(doc, field): |
| 674 | setattr(doc, field, count) | 692 | setattr(doc, field, count) |
| 675 | doc.save() | 693 | doc.save() |
| 676 | self.cronjob_log.error('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format( | 694 | self.cronjob_log.warn('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format( |
| 677 | self.log_base, task_str, traceback.format_exc())) | 695 | self.log_base, task_str, traceback.format_exc())) |
| 678 | except Exception as e: | 696 | except Exception as e: |
| 679 | self.cronjob_log.error('{0} [process error (edms upload)] [task={1}] [error={2}]'.format( | 697 | self.cronjob_log.error('{0} [process error (edms upload)] [task={1}] [error={2}]'.format( | ... | ... |
-
Please register or sign in to post a comment