PROD Version

周伟奇
Showing 2 changed files with 25 additions and 17 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/ocr_process.py
--- a/src/apps/doc/consts.py
View file @ec638e4
+++ b/src/apps/doc/consts.py
View file @ec638e4
@@ -8,7 +8,7 @@ PAGE_SIZE_DEFAULT = 10

 FIXED_APPLICATION_ID_PREFIX = 'CH-S'

-DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACT MANAGEMENT']
+DOC_SCHEME_LIST = ['ACCEPTANCE', 'SETTLEMENT', 'CONTRACTMANAGEMENT']
 DATA_SOURCE_LIST = ['POS', 'EAPP', 'ECONTRACT']

 HIL_PREFIX = 'HIL'
@@ -31,7 +31,7 @@ DOWNLOAD_ACTION_TYPE = 'Downloaded'
 DOC_SCHEMA_ID_FILL = {
    'ACCEPTANCE': (1, 'DFE-AutoFilingScript'),
    'SETTLEMENT': (20, 'DFE-AutoFilingScript'),
-    'CONTRACT MANAGEMENT': (86, 'Schema-Based')
+    'CONTRACTMANAGEMENT': (86, 'Schema-Based')
 }
 BUSINESS_TYPE_DICT = {
    HIL_PREFIX: 'CO00002',
--- a/src/apps/doc/management/commands/ocr_process.py
View file @ec638e4
+++ b/src/apps/doc/management/commands/ocr_process.py
View file @ec638e4
@@ -72,7 +72,12 @@ class Command(BaseCommand, LoggerMixin):
            return None, None, None

        self.cronjob_log.info('{0} [get_doc_info success] [task={1}] [is_priority={2}]'.format(self.log_base, task_str, is_priority))
-        doc, business_type = self.get_doc_object(task_str)
+        try:
+            doc, business_type = self.get_doc_object(task_str)
+        except Exception as e:
+            rh.enqueue([task_str], is_priority)
+            self.cronjob_log.error('{0} [process error (get doc info in)] [error={1}]'.format(self.log_base, traceback.format_exc()))
+            raise e

        if doc is None:
            self.cronjob_log.warn('{0} [get_doc_info] [doc not exist] [task_str={1}] [is_priority={2}]'.format(
@@ -364,7 +369,7 @@ class Command(BaseCommand, LoggerMixin):
            # summary['confidence'] = max(summary['confidence'])
        return merged_bs_summary

-    def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock):
+    def pdf_2_img_2_queue(self, img_queue, todo_count_dict, lock, error_list):
        while self.switch:
            try:
                # 1. 从队列获取文件信息
@@ -374,8 +379,10 @@ class Command(BaseCommand, LoggerMixin):
                    time.sleep(self.sleep_time_doc_get)
                    continue
            except Exception as e:
-                self.cronjob_log.error('{0} [process failed (get doc into)] [error={1}]'.format(
+                self.cronjob_log.error('{0} [process error (get doc info out)] [error={1}]'.format(
                    self.log_base, traceback.format_exc()))
+                error_list.append(1)
+                return
            else:
                try:
                    # 2. 从EDMS获取PDF文件
@@ -413,8 +420,8 @@ class Command(BaseCommand, LoggerMixin):
                    self.cronjob_log.error('{0} [process failed (pdf to img)] [task={1}] [error={2}]'.format(
                        self.log_base, task_str, traceback.format_exc()))

-    def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url):
-        while True:
+    def img_2_ocr_1(self, img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list):
+        while len(error_list) == 0 or not img_queue.empty():
            try:
                img_path = img_queue.get(block=False)
            except Exception as e:
@@ -478,8 +485,8 @@ class Command(BaseCommand, LoggerMixin):
                    self.cronjob_log.error('{0} [process error (store ocr res)] [img_path={1}] [error={2}]'.format(
                        self.log_base, img_path, traceback.format_exc()))

-    def res_2_wb(self, res_dict, finish_queue, lock):
-        while True:
+    def res_2_wb(self, res_dict, img_queue, finish_queue, lock, error_list):
+        while len(error_list) == 0 or not img_queue.empty() or not finish_queue.empty():
            try:
                task_str = finish_queue.get(block=False)
            except Exception as e:
@@ -605,8 +612,8 @@ class Command(BaseCommand, LoggerMixin):
                    doc, business_type = self.get_doc_object(task_str)
                    doc_data_path = os.path.join(self.data_dir, business_type, consts.TMP_DIR_NAME, str(doc.id))
                    excel_path = os.path.join(doc_data_path, '{0}.xlsx'.format(doc.id))
-                    src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
-                    wb.save(src_excel_path)
+                    # src_excel_path = os.path.join(doc_data_path, 'src.xlsx')
+                    # wb.save(src_excel_path)
                    count_list = wb.rebuild(merged_bs_summary, license_summary, res_list, doc.document_scheme)
                    wb.save(excel_path)
                except Exception as e:
@@ -637,8 +644,8 @@ class Command(BaseCommand, LoggerMixin):
                        img_save_path = os.path.join(doc_data_path, 'img')
                        write_zip_file(img_save_path, os.path.join(doc_data_path, '{0}_img.zip'.format(doc.id)))
                        shutil.rmtree(img_save_path, ignore_errors=True)
-                        # pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
-                        # os.remove(pdf_path)
+                        pdf_path = os.path.join(doc_data_path, '{0}.pdf'.format(doc.id))
+                        os.remove(pdf_path)
                        # os.remove(src_excel_path)
                    except Exception as e:
                        self.cronjob_log.error('{0} [process error (file remove 2)] [task={1}] [error={2}]'.format(
@@ -681,7 +688,7 @@ class Command(BaseCommand, LoggerMixin):
                                    setattr(doc, field, count)
                            doc.save()
                            self.cronjob_log.info('{0} [process complete] [task={1}]'.format(self.log_base, task_str))
-                            # os.remove(excel_path)
+                            os.remove(excel_path)
                        except Exception as e:
                            self.cronjob_log.error('{0} [process error (completed)] [task={1}] [error={2}]'.format(
                                self.log_base, task_str, traceback.format_exc()))
@@ -695,21 +702,22 @@ class Command(BaseCommand, LoggerMixin):
    def handle(self, *args, **kwargs):
        lock = Lock()
        with Manager() as manager:
+            error_list = manager.list()
            todo_count_dict = manager.dict()
            res_dict = manager.dict()
            img_queue = Queue(self.img_queue_size)
            finish_queue = Queue()

            process_list = []
-            pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock))
+            pdf_process = Process(target=self.pdf_2_img_2_queue, args=(img_queue, todo_count_dict, lock, error_list))
            process_list.append(pdf_process)

            for url in self.ocr_1_urls.values():
                ocr_1_process = Process(target=self.img_2_ocr_1, args=(
-                    img_queue, todo_count_dict, res_dict, finish_queue, lock, url))
+                    img_queue, todo_count_dict, res_dict, finish_queue, lock, url, error_list))
                process_list.append(ocr_1_process)

-            wb_process = Process(target=self.res_2_wb, args=(res_dict, finish_queue, lock))
+            wb_process = Process(target=self.res_2_wb, args=(res_dict, img_queue, finish_queue, lock, error_list))
            process_list.append(wb_process)

            for p in process_list: