1f46e609 by 周伟奇

fix flow

1 parent 55ba3382
...@@ -55,8 +55,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -55,8 +55,7 @@ class Command(BaseCommand, LoggerMixin):
55 def signal_handler(self, sig, frame): 55 def signal_handler(self, sig, frame):
56 self.switch = False # 停止处理文件 56 self.switch = False # 停止处理文件
57 57
58 @staticmethod 58 def get_doc_object(self, task_str):
59 def get_doc_object(task_str):
60 business_type, doc_id_str = task_str.split(consts.SPLIT_STR) 59 business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
61 doc_id = int(doc_id_str) 60 doc_id = int(doc_id_str)
62 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc 61 doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
...@@ -71,28 +70,30 @@ class Command(BaseCommand, LoggerMixin): ...@@ -71,28 +70,30 @@ class Command(BaseCommand, LoggerMixin):
71 self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base)) 70 self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
72 return None, None, None 71 return None, None, None
73 72
74 self.cronjob_log.info('{0} [get_doc_info success] [task={1}] [is_priority={2}]'.format(self.log_base, task_str, is_priority)) 73 self.cronjob_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format(
74 self.log_base, task_str, is_priority))
75 try: 75 try:
76 doc, business_type = self.get_doc_object(task_str) 76 doc, business_type = self.get_doc_object(task_str)
77 if doc is None:
78 self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
79 self.log_base, task_str, is_priority))
80 return None, None, None
81 elif doc.status != DocStatus.INIT.value:
82 self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
83 '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
84 return None, None, None
85 doc.status = DocStatus.PROCESSING.value
86 doc.start_time = timezone.now()
87 doc.save()
77 except Exception as e: 88 except Exception as e:
78 rh.enqueue([task_str], is_priority) 89 rh.enqueue([task_str], is_priority)
79 self.cronjob_log.error('{0} [process error (get doc info in)] [error={1}]'.format(self.log_base, traceback.format_exc())) 90 self.cronjob_log.error('{0} [process error (get doc info in)] [error={1}]'.format(
91 self.log_base, traceback.format_exc()))
80 raise e 92 raise e
81 93 else:
82 if doc is None: 94 self.cronjob_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
83 self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
84 self.log_base, task_str, is_priority)) 95 self.log_base, task_str, is_priority))
85 return None, None, None 96 return doc, business_type, task_str
86 elif doc.status != DocStatus.INIT.value:
87 self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
88 '[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
89 return None, None, None
90 doc.status = DocStatus.PROCESSING.value
91 doc.start_time = timezone.now()
92 doc.save()
93 self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
94 self.log_base, task_str, is_priority))
95 return doc, business_type, task_str
96 97
97 def pdf_download(self, doc, pdf_path): 98 def pdf_download(self, doc, pdf_path):
98 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX): 99 if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
...@@ -210,21 +211,18 @@ class Command(BaseCommand, LoggerMixin): ...@@ -210,21 +211,18 @@ class Command(BaseCommand, LoggerMixin):
210 else: 211 else:
211 res_list.append((pno, ino, part_idx, consts.RES_FAILED_2)) 212 res_list.append((pno, ino, part_idx, consts.RES_FAILED_2))
212 213
213 @staticmethod 214 def parse_img_path(self, img_path):
214 def parse_img_path(img_path):
215 img_name, _ = os.path.splitext(os.path.basename(img_path)) 215 img_name, _ = os.path.splitext(os.path.basename(img_path))
216 part_list = img_name.split('_') 216 part_list = img_name.split('_')
217 # page_7_img_11_0 217 # page_7_img_11_0
218 return int(part_list[1])+1, int(part_list[3])+1 218 return int(part_list[1])+1, int(part_list[3])+1
219 219
220 @staticmethod 220 def get_most(self, value_list):
221 def get_most(value_list):
222 if value_list: 221 if value_list:
223 most_common = Counter(value_list).most_common(1) 222 most_common = Counter(value_list).most_common(1)
224 return most_common[0][0] if most_common else None 223 return most_common[0][0] if most_common else None
225 224
226 @staticmethod 225 def date_format(self, date_str, format_str):
227 def date_format(date_str, format_str):
228 try: 226 try:
229 date_res = datetime.strptime(date_str, format_str).date() 227 date_res = datetime.strptime(date_str, format_str).date()
230 except Exception as e: 228 except Exception as e:
...@@ -402,23 +400,41 @@ class Command(BaseCommand, LoggerMixin): ...@@ -402,23 +400,41 @@ class Command(BaseCommand, LoggerMixin):
402 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format( 400 self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
403 self.log_base, task_str, speed_time)) 401 self.log_base, task_str, speed_time))
404 402
405 with lock: 403 img_count = len(pdf_handler.img_path_list)
406 todo_count_dict[task_str] = len(pdf_handler.img_path_list) 404 if img_count == 0:
407 for img_path in pdf_handler.img_path_list: 405 self.cronjob_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
408 while img_queue.full(): 406 self.log_base, task_str))
409 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base)) 407 raise Exception('pdf img empty')
410 time.sleep(self.sleep_time_img_put) 408 else:
411 img_queue.put(img_path) 409 with lock:
410 todo_count_dict[task_str] = img_count
411 for img_path in pdf_handler.img_path_list:
412 while img_queue.full():
413 self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
414 time.sleep(self.sleep_time_img_put)
415 img_queue.put(img_path)
412 except EDMSException as e: 416 except EDMSException as e:
413 doc.status = DocStatus.PROCESS_FAILED.value 417 try:
414 doc.save() 418 doc.status = DocStatus.PROCESS_FAILED.value
415 self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format( 419 doc.save()
416 self.log_base, task_str, traceback.format_exc())) 420 self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
421 self.log_base, task_str, traceback.format_exc()))
422 except Exception as e:
423 self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
424 self.log_base, traceback.format_exc()))
425 error_list.append(1)
426 return
417 except Exception as e: 427 except Exception as e:
418 doc.status = DocStatus.PROCESS_FAILED.value 428 try:
419 doc.save() 429 doc.status = DocStatus.PROCESS_FAILED.value
420 self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format( 430 doc.save()
421 self.log_base, task_str, traceback.format_exc())) 431 self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
432 self.log_base, task_str, traceback.format_exc()))
433 except Exception as e:
434 self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format(
435 self.log_base, traceback.format_exc()))
436 error_list.append(1)
437 return
422 438
423 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list): 439 def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list):
424 while len(error_list) == 0 or not img_queue.empty(): 440 while len(error_list) == 0 or not img_queue.empty():
...@@ -447,8 +463,9 @@ class Command(BaseCommand, LoggerMixin): ...@@ -447,8 +463,9 @@ class Command(BaseCommand, LoggerMixin):
447 if ocr_1_response.status_code != 200: 463 if ocr_1_response.status_code != 200:
448 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code)) 464 raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
449 except Exception as e: 465 except Exception as e:
450 self.cronjob_log.warn('{0} [ocr_1 failed] [times={1}] [img_path={2}] [error={3}]'.format( 466 self.cronjob_log.warn('{0} [ocr_1 failed] [times={1}] [url={2}] [img_path={3}] '
451 self.log_base, times, img_path, traceback.format_exc())) 467 '[error={4}]'.format(self.log_base, times, url, img_path,
468 traceback.format_exc()))
452 else: 469 else:
453 ocr_1_res = ocr_1_response.json() 470 ocr_1_res = ocr_1_response.json()
454 end_time = time.time() 471 end_time = time.time()
...@@ -458,7 +475,8 @@ class Command(BaseCommand, LoggerMixin): ...@@ -458,7 +475,8 @@ class Command(BaseCommand, LoggerMixin):
458 break 475 break
459 else: 476 else:
460 ocr_1_res = {} 477 ocr_1_res = {}
461 self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path)) 478 self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}] [url={2}]'.format(
479 self.log_base, img_path, url))
462 # continue 480 # continue
463 except Exception as e: 481 except Exception as e:
464 self.cronjob_log.error('{0} [process error (ocr fetch)] [img_path={1}] [error={2}]'.format( 482 self.cronjob_log.error('{0} [process error (ocr fetch)] [img_path={1}] [error={2}]'.format(
...@@ -521,14 +539,14 @@ class Command(BaseCommand, LoggerMixin): ...@@ -521,14 +539,14 @@ class Command(BaseCommand, LoggerMixin):
521 ocr_data_list = res.get('data', []) 539 ocr_data_list = res.get('data', [])
522 if not isinstance(ocr_data_list, list): 540 if not isinstance(ocr_data_list, list):
523 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3)) 541 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
524 self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path)) 542 self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path))
525 else: 543 else:
526 for part_idx, ocr_data in enumerate(ocr_data_list): 544 for part_idx, ocr_data in enumerate(ocr_data_list):
527 part_idx = part_idx + 1 545 part_idx = part_idx + 1
528 classify = ocr_data.get('classify') 546 classify = ocr_data.get('classify')
529 if classify is None: 547 if classify is None:
530 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3)) 548 res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
531 self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format( 549 self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(
532 self.log_base, img_path)) 550 self.log_base, img_path))
533 continue 551 continue
534 elif classify in consts.OTHER_CLASSIFY_SET: # 其他类 552 elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
...@@ -624,7 +642,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -624,7 +642,7 @@ class Command(BaseCommand, LoggerMixin):
624 doc, business_type = self.get_doc_object(task_str) 642 doc, business_type = self.get_doc_object(task_str)
625 doc.status = DocStatus.PROCESS_FAILED.value 643 doc.status = DocStatus.PROCESS_FAILED.value
626 doc.save() 644 doc.save()
627 self.cronjob_log.error('{0} [process failed (res to wb)] [task={1}] [error={2}]'.format( 645 self.cronjob_log.warn('{0} [process failed (res to wb)] [task={1}] [error={2}]'.format(
628 self.log_base, task_str, traceback.format_exc())) 646 self.log_base, task_str, traceback.format_exc()))
629 except Exception as e: 647 except Exception as e:
630 self.cronjob_log.error('{0} [process error (wb end)] [task={1}] [error={2}]'.format( 648 self.cronjob_log.error('{0} [process error (wb end)] [task={1}] [error={2}]'.format(
...@@ -673,7 +691,7 @@ class Command(BaseCommand, LoggerMixin): ...@@ -673,7 +691,7 @@ class Command(BaseCommand, LoggerMixin):
673 if hasattr(doc, field): 691 if hasattr(doc, field):
674 setattr(doc, field, count) 692 setattr(doc, field, count)
675 doc.save() 693 doc.save()
676 self.cronjob_log.error('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format( 694 self.cronjob_log.warn('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format(
677 self.log_base, task_str, traceback.format_exc())) 695 self.log_base, task_str, traceback.format_exc()))
678 except Exception as e: 696 except Exception as e:
679 self.cronjob_log.error('{0} [process error (edms upload)] [task={1}] [error={2}]'.format( 697 self.cronjob_log.error('{0} [process error (edms upload)] [task={1}] [error={2}]'.format(
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!