6934c592 by 周伟奇

fix bug

1 parent b34cd942
......@@ -6,6 +6,7 @@ import base64
import signal
import requests
import traceback
from datetime import datetime
from django.core.management import BaseCommand
from multiprocessing import Process
......@@ -57,7 +58,7 @@ class Command(BaseCommand, LoggerMixin):
@staticmethod
def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
time_stamp = int(time.time())
time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
new_name = '{0}_{1}'.format(time_stamp, name)
img_save_path = os.path.join(img_output_dir, new_name)
pdf_save_path = os.path.join(pdf_output_dir, new_name)
......
......@@ -365,48 +365,52 @@ class Command(BaseCommand, LoggerMixin):
def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock):
    """Fetch documents, split each PDF into images and enqueue the image paths.

    Loops for as long as ``self.switch`` is truthy. Each iteration:

    1. asks ``self.get_doc_info()`` for the next document,
    2. downloads its PDF with ``self.pdf_download``,
    3. extracts the page images with ``PDFHandler``,
    4. records the number of extracted images in ``todo_count_dict`` and
       puts every image path on ``img_queue``.

    A failure in steps 2-4 marks the document as ``PROCESS_FAILED``, saves
    it and logs the traceback; the loop then moves on to the next document.

    :param img_queue: queue receiving one image path per extracted image;
        the method waits while ``img_queue.full()`` is true.
    :param todo_count_dict: mapping ``task_str -> image count``; it is
        written while holding ``lock``.
    :param lock: context manager guarding ``todo_count_dict``.
    """
    while self.switch:
        # 1. Fetch the next document description from the task queue.
        try:
            doc, business_type, task_str = self.get_doc_info()
        except Exception:
            self.cronjob_log.error('{0} [process failed (get doc into)] [error={1}]'.format(
                self.log_base, traceback.format_exc()))
            # Back off before retrying: without a pause a persistent failure
            # here would busy-spin and flood the log.
            time.sleep(self.sleep_time_doc_get)
            continue

        # Nothing to process right now: wait and poll again.
        if doc is None:
            time.sleep(self.sleep_time_doc_get)
            continue

        try:
            # 2. Download the PDF file from EDMS.
            doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
            os.makedirs(doc_data_path, exist_ok=True)
            pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
            self.pdf_download(doc, pdf_path)

            # 3. Extract the images from the PDF file.
            self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
            start_time = time.time()
            img_save_path = os.path.join(doc_data_path, 'img')
            pdf_handler = PDFHandler(pdf_path, img_save_path)
            pdf_handler.extract_image()
            spend_time = int(time.time() - start_time)
            self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
                self.log_base, task_str, spend_time))

            # Publish the image count for this task before enqueueing the
            # image paths themselves.
            with lock:
                todo_count_dict[task_str] = len(pdf_handler.img_path_list)

            for img_path in pdf_handler.img_path_list:
                # Poll instead of blocking in put() so that the wait is logged.
                while img_queue.full():
                    self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
                    time.sleep(self.sleep_time_img_put)
                img_queue.put(img_path)
        except EDMSException:
            # Raised by the EDMS download step (see the log tag below).
            doc.status = DocStatus.PROCESS_FAILED.value
            doc.save()
            self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
                self.log_base, task_str, traceback.format_exc()))
        except Exception:
            # Any other failure while preparing, extracting or enqueueing.
            doc.status = DocStatus.PROCESS_FAILED.value
            doc.save()
            self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
                self.log_base, task_str, traceback.format_exc()))
def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url):
while True:
......@@ -418,16 +422,17 @@ class Command(BaseCommand, LoggerMixin):
continue
else:
self.cronjob_log.info('{0} [img_2_ocr_1] [get img] [img_path={1}]'.format(self.log_base, img_path))
with open(img_path, 'rb') as f:
base64_data = base64.b64encode(f.read())
# 获取解码后的base64值
file_data = base64_data.decode()
json_data_1 = {
"file": file_data
}
for times in range(consts.RETRY_TIMES):
try:
with open(img_path, 'rb') as f:
base64_data = base64.b64encode(f.read())
# 获取解码后的base64值
file_data = base64_data.decode()
json_data_1 = {
"file": file_data
}
start_time = time.time()
ocr_1_response = requests.post(url, json=json_data_1)
if ocr_1_response.status_code != 200:
......@@ -447,22 +452,26 @@ class Command(BaseCommand, LoggerMixin):
self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path))
# continue
del json_data_1
# /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg
# AFC_2
path_split = img_path.split('/')
task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3]))
with lock:
doc_res_dict = res_dict.setdefault(task_str, {})
doc_res_dict[img_path] = ocr_1_res
res_dict[task_str] = doc_res_dict
todo_count = todo_count_dict.get(task_str)
if todo_count == 1:
finish_queue.put(task_str)
del todo_count_dict[task_str]
else:
todo_count_dict[task_str] = todo_count - 1
try:
del json_data_1
# /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg
# AFC_2
path_split = img_path.split('/')
task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3]))
with lock:
doc_res_dict = res_dict.setdefault(task_str, {})
doc_res_dict[img_path] = ocr_1_res
res_dict[task_str] = doc_res_dict
todo_count = todo_count_dict.get(task_str)
if todo_count == 1:
finish_queue.put(task_str)
del todo_count_dict[task_str]
else:
todo_count_dict[task_str] = todo_count - 1
except Exception as e:
self.cronjob_log.error('{0} [process failed (store ocr res)] [img_path={1}] [error={2}]'.format(
self.log_base, img_path, traceback.format_exc()))
def res_2_wb(self, res_dict, finish_queue, lock):
while True:
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!