1f46e609 by 周伟奇

fix flow

1 parent 55ba3382
...@@ -55,8 +55,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -55,8 +55,7 @@ class Command(BaseCommand, LoggerMixin):
55 def signal_handler(self, sig, frame): 55 def signal_handler(self, sig, frame):
56 self.switch = False # 停止处理文件 56 self.switch = False # 停止处理文件
57 57
58 @staticmethod 58 def get_doc_object(self, task_str):
59 def get_doc_object(task_str):
60 business_type, doc_id_str = task_str.split(consts.SPLIT_STR) 59 business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
61 doc_id = int(doc_id_str) 60 doc_id = int(doc_id_str)
62 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc 61 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
...@@ -71,14 +70,10 @@ class Command(BaseCommand, LoggerMixin): ...@@ -71,14 +70,10 @@ class Command(BaseCommand, LoggerMixin):
71 self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) 70 self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
72 return None, None, None 71 return None, None, None
73 72
74 self.cronjob_log.info('{0} [get_doc_info success] [task={1}] [is_priority={2}]'.format(self.log_base, task_str, is_priority)) 73 self.cronjob_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format(
74 self.log_base, task_str, is_priority))
75 try: 75 try:
76 doc, business_type = self.get_doc_object(task_str) 76 doc, business_type = self.get_doc_object(task_str)
77 except Exception as e:
78 rh.enqueue([task_str], is_priority)
79 self.cronjob_log.error('{0} [process error (get doc info in)] [error={1}]'.format(self.log_base, traceback.format_exc()))
80 raise e
81
82 if doc is None: 77 if doc is None:
83 self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format( 78 self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
84 self.log_base, task_str, is_priority)) 79 self.log_base, task_str, is_priority))
...@@ -90,7 +85,13 @@ class Command(BaseCommand, LoggerMixin): ...@@ -90,7 +85,13 @@ class Command(BaseCommand, LoggerMixin):
90 doc.status = DocStatus.PROCESSING.value 85 doc.status = DocStatus.PROCESSING.value
91 doc.start_time = timezone.now() 86 doc.start_time = timezone.now()
92 doc.save() 87 doc.save()
93 self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format( 88 except Exception as e:
89 rh.enqueue([task_str], is_priority)
90 self.cronjob_log.error('{0} [process error (get doc info in)] [error={1}]'.format(
91 self.log_base, traceback.format_exc()))
92 raise e
93 else:
94 self.cronjob_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
94 self.log_base, task_str, is_priority)) 95 self.log_base, task_str, is_priority))
95 return doc, business_type, task_str 96 return doc, business_type, task_str
96 97
...@@ -210,21 +211,18 @@ class Command(BaseCommand, LoggerMixin): ...@@ -210,21 +211,18 @@ class Command(BaseCommand, LoggerMixin):
210 else: 211 else:
211 res_list.append((pno, ino, part_idx, consts.RES_FAILED_2)) 212 res_list.append((pno, ino, part_idx, consts.RES_FAILED_2))
212 213
213 @staticmethod 214 def parse_img_path(self, img_path):
214 def parse_img_path(img_path):
215 img_name, _ = os.path.splitext(os.path.basename(img_path)) 215 img_name, _ = os.path.splitext(os.path.basename(img_path))
216 part_list = img_name.split('_') 216 part_list = img_name.split('_')
217 # page_7_img_11_0 217 # page_7_img_11_0
218 return int(part_list[1])+1, int(part_list[3])+1 218 return int(part_list[1])+1, int(part_list[3])+1
219 219
220 @staticmethod 220 def get_most(self, value_list):
221 def get_most(value_list):
222 if value_list: 221 if value_list:
223 most_common = Counter(value_list).most_common(1) 222 most_common = Counter(value_list).most_common(1)
224 return most_common[0][0] if most_common else None 223 return most_common[0][0] if most_common else None
225 224
226 @staticmethod 225 def date_format(self, date_str, format_str):
227 def date_format(date_str, format_str):
228 try: 226 try:
229 date_res = datetime.strptime(date_str, format_str).date() 227 date_res = datetime.strptime(date_str, format_str).date()
230 except Exception as e: 228 except Exception as e:
...@@ -402,23 +400,41 @@ class Command(BaseCommand, LoggerMixin): ...@@ -402,23 +400,41 @@ class Command(BaseCommand, LoggerMixin):
402 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( 400 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
403 self.log_base, task_str, speed_time)) 401 self.log_base, task_str, speed_time))
404 402
403 img_count = len(pdf_handler.img_path_list)
404 if img_count == 0:
405 self.cronjob_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
406 self.log_base, task_str))
407 raise Exception('pdf img empty')
408 else:
405 with lock: 409 with lock:
406 todo_count_dict[task_str] = len(pdf_handler.img_path_list) 410 todo_count_dict[task_str] = img_count
407 for img_path in pdf_handler.img_path_list: 411 for img_path in pdf_handler.img_path_list:
408 while img_queue.full(): 412 while img_queue.full():
409 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) 413 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
410 time.sleep(self.sleep_time_img_put) 414 time.sleep(self.sleep_time_img_put)
411 img_queue.put(img_path) 415 img_queue.put(img_path)
412 except EDMSException as e: 416 except EDMSException as e:
417 try:
413 doc.status = DocStatus.PROCESS_FAILED.value 418 doc.status = DocStatus.PROCESS_FAILED.value
414 doc.save() 419 doc.save()
415 self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( 420 self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
416 self.log_base, task_str, traceback.format_exc())) 421 self.log_base, task_str, traceback.format_exc()))
417 except Exception as e: 422 except Exception as e:
423 self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
424 self.log_base, traceback.format_exc()))
425 error_list.append(1)
426 return
427 except Exception as e:
428 try:
418 doc.status = DocStatus.PROCESS_FAILED.value 429 doc.status = DocStatus.PROCESS_FAILED.value
419 doc.save() 430 doc.save()
420 self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( 431 self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
421 self.log_base, task_str, traceback.format_exc())) 432 self.log_base, task_str, traceback.format_exc()))
433 except Exception as e:
434 self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format(
435 self.log_base, traceback.format_exc()))
436 error_list.append(1)
437 return
422 438
423 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): 439 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list):
424 while len(error_list) == 0 or not img_queue.empty(): 440 while len(error_list) == 0 or not img_queue.empty():
...@@ -447,8 +463,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -447,8 +463,9 @@ class Command(BaseCommand, LoggerMixin):
447 if ocr_1_response.status_code != 200: 463 if ocr_1_response.status_code != 200:
448 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) 464 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
449 except Exception as e: 465 except Exception as e:
450 self.cronjob_log.warn('{0} [ocr_1 failed] [times={1}] [img_path={2}] [error={3}]'.format( 466 self.cronjob_log.warn('{0} [ocr_1 failed] [times={1}] [url={2}] [img_path={3}] '
451 self.log_base, times, img_path, traceback.format_exc())) 467 '[error={4}]'.format(self.log_base, times, url, img_path,
468 traceback.format_exc()))
452 else: 469 else:
453 ocr_1_res = ocr_1_response.json() 470 ocr_1_res = ocr_1_response.json()
454 end_time = time.time() 471 end_time = time.time()
...@@ -458,7 +475,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -458,7 +475,8 @@ class Command(BaseCommand, LoggerMixin):
458 break 475 break
459 else: 476 else:
460 ocr_1_res = {} 477 ocr_1_res = {}
461 self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path)) 478 self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}] [url={2}]'.format(
479 self.log_base, img_path, url))
462 # continue 480 # continue
463 except Exception as e: 481 except Exception as e:
464 self.cronjob_log.error('{0} [process error (ocr fetch)] [img_path={1}] [error={2}]'.format( 482 self.cronjob_log.error('{0} [process error (ocr fetch)] [img_path={1}] [error={2}]'.format(
...@@ -521,14 +539,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -521,14 +539,14 @@ class Command(BaseCommand, LoggerMixin):
521 ocr_data_list = res.get('data', []) 539 ocr_data_list = res.get('data', [])
522 if not isinstance(ocr_data_list, list): 540 if not isinstance(ocr_data_list, list):
523 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3)) 541 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
524 self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path)) 542 self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path))
525 else: 543 else:
526 for part_idx, ocr_data in enumerate(ocr_data_list): 544 for part_idx, ocr_data in enumerate(ocr_data_list):
527 part_idx = part_idx + 1 545 part_idx = part_idx + 1
528 classify = ocr_data.get('classify') 546 classify = ocr_data.get('classify')
529 if classify is None: 547 if classify is None:
530 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3)) 548 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
531 self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format( 549 self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(
532 self.log_base, img_path)) 550 self.log_base, img_path))
533 continue 551 continue
534 elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 552 elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
...@@ -624,7 +642,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -624,7 +642,7 @@ class Command(BaseCommand, LoggerMixin):
624 doc, business_type = self.get_doc_object(task_str) 642 doc, business_type = self.get_doc_object(task_str)
625 doc.status = DocStatus.PROCESS_FAILED.value 643 doc.status = DocStatus.PROCESS_FAILED.value
626 doc.save() 644 doc.save()
627 self.cronjob_log.error('{0} [process failed (res to wb)] [task={1}] [error={2}]'.format( 645 self.cronjob_log.warn('{0} [process failed (res to wb)] [task={1}] [error={2}]'.format(
628 self.log_base, task_str, traceback.format_exc())) 646 self.log_base, task_str, traceback.format_exc()))
629 except Exception as e: 647 except Exception as e:
630 self.cronjob_log.error('{0} [process error (wb end)] [task={1}] [error={2}]'.format( 648 self.cronjob_log.error('{0} [process error (wb end)] [task={1}] [error={2}]'.format(
...@@ -673,7 +691,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -673,7 +691,7 @@ class Command(BaseCommand, LoggerMixin):
673 if hasattr(doc, field): 691 if hasattr(doc, field):
674 setattr(doc, field, count) 692 setattr(doc, field, count)
675 doc.save() 693 doc.save()
676 self.cronjob_log.error('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format( 694 self.cronjob_log.warn('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format(
677 self.log_base, task_str, traceback.format_exc())) 695 self.log_base, task_str, traceback.format_exc()))
678 except Exception as e: 696 except Exception as e:
679 self.cronjob_log.error('{0} [process error (edms upload)] [task={1}] [error={2}]'.format( 697 self.cronjob_log.error('{0} [process error (edms upload)] [task={1}] [error={2}]'.format(
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!