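fix bug: guard OCR worker loops against unhandled exceptions (doc fetch, image read, result store); use a readable timestamp in output file names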

周伟奇
Showing 2 changed files with 74 additions and 64 deletions
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/management/commands/ocr_process.py
--- a/src/apps/doc/management/commands/folder_ocr_process.py
View file @6934c59
+++ b/src/apps/doc/management/commands/folder_ocr_process.py
View file @6934c59
@@ -6,6 +6,7 @@ import base64
 import signal
 import requests
 import traceback
+from datetime import datetime
 from django.core.management import BaseCommand
 from multiprocessing import Process
@@ -57,7 +58,7 @@ class Command(BaseCommand, LoggerMixin):
    @staticmethod
    def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
-        time_stamp = int(time.time())
+        time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
        new_name = '{0}_{1}'.format(time_stamp, name)
        img_save_path = os.path.join(img_output_dir, new_name)
        pdf_save_path = os.path.join(pdf_output_dir, new_name)
--- a/src/apps/doc/management/commands/ocr_process.py
View file @6934c59
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @6934c59
@@ -365,48 +365,52 @@ class Command(BaseCommand, LoggerMixin):
    def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock):
        while self.switch:
-            # 1. 从队列获取文件信息
-            doc, business_type, task_str = self.get_doc_info()
-            # 队列为空时的处理
-            if doc is None:
-                time.sleep(self.sleep_time_doc_get)
-                continue
            try:
-                # 2. 从EDMS获取PDF文件
+                # 1. 从队列获取文件信息
-                doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
+                doc, business_type, task_str = self.get_doc_info()
-                os.makedirs(doc_data_path, exist_ok=True)
+                # 队列为空时的处理
-                pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
+                if doc is None:
-                self.pdf_download(doc, pdf_path)
+                    time.sleep(self.sleep_time_doc_get)
+                    continue
-                # 3.PDF文件提取图片
-                self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
-                start_time = time.time()
-                img_save_path = os.path.join(doc_data_path, 'img')
-                pdf_handler = PDFHandler(pdf_path, img_save_path)
-                pdf_handler.extract_image()
-                end_time = time.time()
-                speed_time = int(end_time - start_time)
-                self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
-                    self.log_base, task_str, speed_time))
-                with lock:
-                    todo_count_dict[task_str] = len(pdf_handler.img_path_list)
-                for img_path in pdf_handler.img_path_list:
-                    while img_queue.full():
-                        self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
-                        time.sleep(self.sleep_time_img_put)
-                    img_queue.put(img_path)
-            except EDMSException as e:
-                doc.status = DocStatus.PROCESS_FAILED.value
-                doc.save()
-                self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
-                    self.log_base, task_str, traceback.format_exc()))
            except Exception as e:
-                doc.status = DocStatus.PROCESS_FAILED.value
+                self.cronjob_log.error('{0} [process failed (get doc into)] [error={1}]'.format(
-                doc.save()
+                    self.log_base, traceback.format_exc()))
-                self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
+            else:
-                    self.log_base, task_str, traceback.format_exc()))
+                try:
+                    # 2. 从EDMS获取PDF文件
+                    doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
+                    os.makedirs(doc_data_path, exist_ok=True)
+                    pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
+                    self.pdf_download(doc, pdf_path)
+                    # 3.PDF文件提取图片
+                    self.cronjob_log.info('{0} [pdf to img start] [task={1}]'.format(self.log_base, task_str))
+                    start_time = time.time()
+                    img_save_path = os.path.join(doc_data_path, 'img')
+                    pdf_handler = PDFHandler(pdf_path, img_save_path)
+                    pdf_handler.extract_image()
+                    end_time = time.time()
+                    speed_time = int(end_time - start_time)
+                    self.cronjob_log.info('{0} [pdf to img end] [task={1}] [spend_time={2}]'.format(
+                        self.log_base, task_str, speed_time))
+                    with lock:
+                        todo_count_dict[task_str] = len(pdf_handler.img_path_list)
+                    for img_path in pdf_handler.img_path_list:
+                        while img_queue.full():
+                            self.cronjob_log.info('{0} [pdf_2_img_2_queue] [img queue full]'.format(self.log_base))
+                            time.sleep(self.sleep_time_img_put)
+                        img_queue.put(img_path)
+                except EDMSException as e:
+                    doc.status = DocStatus.PROCESS_FAILED.value
+                    doc.save()
+                    self.cronjob_log.error('{0} [process failed (edms download)] [task={1}] [error={2}]'.format(
+                        self.log_base, task_str, traceback.format_exc()))
+                except Exception as e:
+                    doc.status = DocStatus.PROCESS_FAILED.value
+                    doc.save()
+                    self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
+                        self.log_base, task_str, traceback.format_exc()))
    def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url):
        while True:
@@ -418,16 +422,17 @@ class Command(BaseCommand, LoggerMixin):
                continue
            else:
                self.cronjob_log.info('{0} [img_2_ocr_1] [get img] [img_path={1}]'.format(self.log_base, img_path))
-                with open(img_path, 'rb') as f:
-                    base64_data = base64.b64encode(f.read())
-                    # 获取解码后的base64值
-                    file_data = base64_data.decode()
-                json_data_1 = {
-                    "file": file_data
-                }
                for times in range(consts.RETRY_TIMES):
                    try:
+                        with open(img_path, 'rb') as f:
+                            base64_data = base64.b64encode(f.read())
+                            # 获取解码后的base64值
+                            file_data = base64_data.decode()
+                        json_data_1 = {
+                            "file": file_data
+                        }
                        start_time = time.time()
                        ocr_1_response = requests.post(url, json=json_data_1)
                        if ocr_1_response.status_code != 200:
@@ -447,22 +452,26 @@ class Command(BaseCommand, LoggerMixin):
                    self.cronjob_log.warn('{0} [ocr_1 failed] [img_path={1}]'.format(self.log_base, img_path))
                    # continue
-                del json_data_1
+                try:
-                # /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg
+                    del json_data_1
-                # AFC_2
+                    # /data/bmw-ocr-data/AFC/tmp/6/img/page_0_img_0.jpeg
-                path_split = img_path.split('/')
+                    # AFC_2
-                task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3]))
+                    path_split = img_path.split('/')
+                    task_str = consts.SPLIT_STR.join((path_split[-5], path_split[-3]))
-                with lock:
-                    doc_res_dict = res_dict.setdefault(task_str, {})
+                    with lock:
-                    doc_res_dict[img_path] = ocr_1_res
+                        doc_res_dict = res_dict.setdefault(task_str, {})
-                    res_dict[task_str] = doc_res_dict
+                        doc_res_dict[img_path] = ocr_1_res
-                    todo_count = todo_count_dict.get(task_str)
+                        res_dict[task_str] = doc_res_dict
-                    if todo_count == 1:
+                        todo_count = todo_count_dict.get(task_str)
-                        finish_queue.put(task_str)
+                        if todo_count == 1:
-                        del todo_count_dict[task_str]
+                            finish_queue.put(task_str)
-                    else:
+                            del todo_count_dict[task_str]
-                        todo_count_dict[task_str] = todo_count - 1
+                        else:
+                            todo_count_dict[task_str] = todo_count - 1
+                except Exception as e:
+                    self.cronjob_log.error('{0} [process failed (store ocr res)] [img_path={1}] [error={2}]'.format(
+                        self.log_base, img_path, traceback.format_exc()))
    def res_2_wb(self, res_dict, finish_queue, lock):
        while True: