1f46e609 by 周伟奇

fix flow

1 parent 55ba3382
......@@ -55,8 +55,7 @@ class Command(BaseCommand, LoggerMixin):
# Callback taking (sig, frame): sets self.switch to False.
# Neither sig nor frame is read in the body shown here.
def signal_handler(self, sig, frame):
self.switch = False # stop processing files
@staticmethod
def get_doc_object(task_str):
def get_doc_object(self, task_str):
business_type, doc_id_str = task_str.split(consts.SPLIT_STR)
doc_id = int(doc_id_str)
doc_class = HILDoc if business_type == consts.HIL_PREFIX else AFCDoc
......@@ -71,28 +70,30 @@ class Command(BaseCommand, LoggerMixin):
self.cronjob_log.info('{0} [get_doc_info] [queue empty]'.format(self.log_base))
return None, None, None
self.cronjob_log.info('{0} [get_doc_info success] [task={1}] [is_priority={2}]'.format(self.log_base, task_str, is_priority))
self.cronjob_log.info('{0} [get_doc_info] [task={1}] [is_priority={2}]'.format(
self.log_base, task_str, is_priority))
try:
doc, business_type = self.get_doc_object(task_str)
if doc is None:
self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
self.log_base, task_str, is_priority))
return None, None, None
elif doc.status != DocStatus.INIT.value:
self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
'[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
return None, None, None
doc.status = DocStatus.PROCESSING.value
doc.start_time = timezone.now()
doc.save()
except Exception as e:
rh.enqueue([task_str], is_priority)
self.cronjob_log.error('{0} [process error (get doc info in)] [error={1}]'.format(self.log_base, traceback.format_exc()))
self.cronjob_log.error('{0} [process error (get doc info in)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
raise e
if doc is None:
self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
else:
self.cronjob_log.info('{0} [get_doc_info] [db save end] [task_str={1}] [is_priority={2}]'.format(
self.log_base, task_str, is_priority))
return None, None, None
elif doc.status != DocStatus.INIT.value:
self.cronjob_log.warn('{0} [get_doc_info] [doc status error] [task_str={1}] [is_priority={2}] '
'[doc_status={3}]'.format(self.log_base, task_str, is_priority, doc.status))
return None, None, None
doc.status = DocStatus.PROCESSING.value
doc.start_time = timezone.now()
doc.save()
self.cronjob_log.info('{0} [get_doc_info] [success] [task_str={1}] [is_priority={2}]'.format(
self.log_base, task_str, is_priority))
return doc, business_type, task_str
return doc, business_type, task_str
def pdf_download(self, doc, pdf_path):
if not doc.application_id.startswith(consts.FIXED_APPLICATION_ID_PREFIX):
......@@ -210,21 +211,18 @@ class Command(BaseCommand, LoggerMixin):
else:
res_list.append((pno, ino, part_idx, consts.RES_FAILED_2))
# NOTE(review): this diff excerpt lists two signatures back to back. Judging by the
# hunk header line counts, the @staticmethod form is the removed one and the form
# taking self is the added one -- confirm against the repository.
@staticmethod
def parse_img_path(img_path):
def parse_img_path(self, img_path):
# Strip the directory and the extension from img_path, then split the bare name on '_'.
img_name, _ = os.path.splitext(os.path.basename(img_path))
part_list = img_name.split('_')
# For a name like page_7_img_11_0 the fields at index 1 and 3 are '7' and '11'.
# Each is converted to int and incremented by one, so that name yields (8, 12).
return int(part_list[1])+1, int(part_list[3])+1
# NOTE(review): this diff excerpt lists two signatures back to back. Judging by the
# hunk header line counts, the @staticmethod form is the removed one and the form
# taking self is the added one -- confirm against the repository.
@staticmethod
def get_most(value_list):
def get_most(self, value_list):
# Return the most frequent element of value_list.
# When value_list is falsy no return statement runs, so the result is None.
if value_list:
# most_common(1) gives a list holding at most one (element, count) pair.
most_common = Counter(value_list).most_common(1)
return most_common[0][0] if most_common else None
@staticmethod
def date_format(date_str, format_str):
def date_format(self, date_str, format_str):
try:
date_res = datetime.strptime(date_str, format_str).date()
except Exception as e:
......@@ -402,23 +400,41 @@ class Command(BaseCommand, LoggerMixin):
self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
self.log_base, task_str, speed_time))
with lock:
todo_count_dict[task_str] = len(pdf_handler.img_path_list)
for img_path in pdf_handler.img_path_list:
while img_queue.full():
self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
time.sleep(self.sleep_time_img_put)
img_queue.put(img_path)
img_count = len(pdf_handler.img_path_list)
if img_count == 0:
self.cronjob_log.warn('{0} [pdf to img failed (pdf img empty)] [task={1}]'.format(
self.log_base, task_str))
raise Exception('pdf img empty')
else:
with lock:
todo_count_dict[task_str] = img_count
for img_path in pdf_handler.img_path_list:
while img_queue.full():
self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
time.sleep(self.sleep_time_img_put)
img_queue.put(img_path)
except EDMSException as e:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.warn('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (db save 1)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
error_list.append(1)
return
except Exception as e:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
try:
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.warn('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (db save 2)] [error={1}]'.format(
self.log_base, traceback.format_exc()))
error_list.append(1)
return
def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list):
while len(error_list) == 0 or not img_queue.empty():
......@@ -447,8 +463,9 @@ class Command(BaseCommand, LoggerMixin):
if ocr_1_response.status_code != 200:
raise OCR1Exception('ocr_1 status code: {0}'.format(ocr_1_response.status_code))
except Exception as e:
self.cronjob_log.warn('{0} [ocr_1 failed] [times={1}] [img_path={2}] [error={3}]'.format(
self.log_base, times, img_path, traceback.format_exc()))
self.cronjob_log.warn('{0} [ocr_1 failed] [times={1}] [url={2}] [img_path={3}] '
'[error={4}]'.format(self.log_base, times, url, img_path,
traceback.format_exc()))
else:
ocr_1_res = ocr_1_response.json()
end_time = time.time()
......@@ -458,7 +475,8 @@ class Command(BaseCommand, LoggerMixin):
break
else:
ocr_1_res = {}
self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path))
self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}] [url={2}]'.format(
self.log_base, img_path, url))
# continue
except Exception as e:
self.cronjob_log.error('{0} [process error (ocr fetch)] [img_path={1}] [error={2}]'.format(
......@@ -521,14 +539,14 @@ class Command(BaseCommand, LoggerMixin):
ocr_data_list = res.get('data', [])
if not isinstance(ocr_data_list, list):
res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path))
self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(self.log_base, img_path))
else:
for part_idx, ocr_data in enumerate(ocr_data_list):
part_idx = part_idx + 1
classify = ocr_data.get('classify')
if classify is None:
res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
self.cronjob_log.info('{0} [ocr_1 res error] [img={1}]'.format(
self.cronjob_log.warn('{0} [ocr_1 res error] [img={1}]'.format(
self.log_base, img_path))
continue
elif classify in consts.OTHER_CLASSIFY_SET: # 其他类
......@@ -624,7 +642,7 @@ class Command(BaseCommand, LoggerMixin):
doc, business_type = self.get_doc_object(task_str)
doc.status = DocStatus.PROCESS_FAILED.value
doc.save()
self.cronjob_log.error('{0} [process failed (res to wb)] [task={1}] [error={2}]'.format(
self.cronjob_log.warn('{0} [process failed (res to wb)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (wb end)] [task={1}] [error={2}]'.format(
......@@ -673,7 +691,7 @@ class Command(BaseCommand, LoggerMixin):
if hasattr(doc, field):
setattr(doc, field, count)
doc.save()
self.cronjob_log.error('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format(
self.cronjob_log.warn('{0} [process failed (edms upload)] [task={1}] [error={2}]'.format(
self.log_base, task_str, traceback.format_exc()))
except Exception as e:
self.cronjob_log.error('{0} [process error (edms upload)] [task={1}] [error={2}]'.format(
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!