Merge branch 'feature/ltgt' into feature/0611

周伟奇
Showing 5 changed files with 57 additions and 31 deletions
src/apps/doc/exceptions.py
src/apps/doc/management/commands/folder_ltgt_process.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/exceptions.py
View file @423427c
+++ b/src/apps/doc/exceptions.py
View file @423427c
@@ -13,6 +13,9 @@ class OCR2Exception(Exception):
 class OCR4Exception(Exception):
    pass

+class LTGTException(Exception):  # Marker exception with no added state or behavior (presumably raised by the LTGT processing path; confirm at call sites).
+    pass
+

 class GCAPException(Exception):
    pass
--- a/src/apps/doc/management/commands/folder_ltgt_process.py 0 → 100644
View file @423427c
+++ b/src/apps/doc/management/commands/folder_ltgt_process.py 0 → 100644
View file @423427c
--- a/src/apps/doc/management/commands/folder_ocr_process.py
View file @423427c
+++ b/src/apps/doc/management/commands/folder_ocr_process.py
View file @423427c
@@ -61,13 +61,11 @@ class Command(BaseCommand, LoggerMixin):
    def signal_handler(self, sig, frame):
        self.switch = False  # 停止处理文件

-    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx, img_path):
+    def license1_process(self, ocr_data, license_summary, classify, img_path):
        # 类别：'0'身份证， '1'居住证
        license_data = ocr_data.get('data', [])
        if not license_data:
-            res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
            return
-        res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
        if classify == consts.MVC_CLASSIFY:  # 车辆登记证 3/4页结果整合
            for mvc_dict in license_data:
                try:
@@ -154,29 +152,21 @@ class Command(BaseCommand, LoggerMixin):
    def res_process(self, all_res, classify, excel_path):
        try:
            license_summary = {}
-            res_list = []

            if not all_res:
                return
            else:
                for img_path, ocr_res in all_res.items():
-                    img_name, pno, ino = self.parse_img_path(img_path)
-                    part_idx = 1
+                    # img_name, pno, ino = self.parse_img_path(img_path)
+                    # part_idx = 1

                    if isinstance(ocr_res, dict):
                        if ocr_res.get('code') == 1:
                            data_list = ocr_res.get('data', [])
                            if isinstance(data_list, list):
-                                for part_idx, ocr_data in enumerate(data_list):
-                                    part_idx = part_idx + 1
-                                    self.license1_process(ocr_data, license_summary, classify,
-                                                          res_list, pno, ino, part_idx, img_path)
-                            else:
-                                res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
-                        else:
-                            res_list.append((pno, ino, part_idx, consts.RES_FAILED))
-                    else:
-                        res_list.append((pno, ino, part_idx, consts.RES_FAILED))
+                                for ocr_data in data_list:
+                                    # part_idx = part_idx + 1
+                                    self.license1_process(ocr_data, license_summary, classify, img_path)

                wb = BSWorkbook(set(), set(), set(), set(), set())
                wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
@@ -216,6 +206,13 @@ class Command(BaseCommand, LoggerMixin):
                    return ocr_res
            else:
                self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
+                
+    def images_process(self, img_path_list, classify, excel_path):  # Run ocr_process on every image path, then pass all results to res_process in a single call.
+        all_res = {}  # maps img_path -> whatever ocr_process returned for it
+        for img_path in img_path_list:
+            ocr_res = self.ocr_process(img_path, classify)
+            all_res[img_path] = ocr_res  # NOTE(review): ocr_process appears to fall through without a return after logging '[ocr failed]', so this may be None; confirm
+        self.res_process(all_res, classify, excel_path)  # classify and excel_path are forwarded unchanged

    def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
        if os.path.exists(path):
@@ -230,11 +227,7 @@ class Command(BaseCommand, LoggerMixin):
                    self.log_base, path, traceback.format_exc()))
                raise e
            else:
-                all_res = {}
-                for img_path in pdf_handler.img_path_list:
-                    ocr_res = self.ocr_process(img_path, classify)
-                    all_res[img_path] = ocr_res
-                self.res_process(all_res, classify, excel_path)
+                self.images_process(pdf_handler.img_path_list, classify, excel_path)
                shutil.move(path, pdf_save_path)

    def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
@@ -250,23 +243,18 @@ class Command(BaseCommand, LoggerMixin):
                    self.log_base, path, traceback.format_exc()))
                raise e
            else:
-                all_res = {}
-                for img_path in tiff_handler.img_path_list:
-                    ocr_res = self.ocr_process(img_path, classify)
-                    all_res[img_path] = ocr_res
-                self.res_process(all_res, classify, excel_path)
+                self.images_process(tiff_handler.img_path_list, classify, excel_path)
                shutil.move(path, tiff_save_path)

    def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):  # Handle one image file: resolve output paths, OCR it, process the result, then move the file.
-        ocr_res = self.ocr_process(path, classify)
-        all_res = {path: ocr_res}
-
        try:
            img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)  # third returned value is unused here
        except Exception as e:  # failure is logged only; nothing in this branch re-raises or moves the file
            self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
        else:
+            ocr_res = self.ocr_process(path, classify)  # now reached only after get_path succeeded
+            all_res = {path: ocr_res}  # single-entry mapping in the shape res_process iterates over
            self.res_process(all_res, classify, excel_path)
            shutil.move(path, img_save_path)  # move the source file after res_process returns

@@ -312,9 +300,9 @@ class Command(BaseCommand, LoggerMixin):
                try:
                    if os.path.isfile(path):
                        self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
-                        if name.endswith('.pdf'):
+                        if name.endswith('.pdf') or name.endswith('.PDF'):
                            self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
-                        elif name.endswith('.tif'):
+                        elif name.endswith('.tif') or name.endswith('.TIF'):
                            self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir)
                        else:
                            self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
--- a/src/apps/doc/ocr/wb.py
View file @423427c
+++ b/src/apps/doc/ocr/wb.py
View file @423427c
@@ -702,6 +702,31 @@ class BSWorkbook(Workbook):
            if field_str is not None:
                count_list.append((field_str, count))

+    def ltgt_build(self, label, result_dict):  # Append result_dict to a new sheet named label as two-column rows and return the same pairs as a flat dict.
+        ws = self.create_sheet(label)
+        rebuild_res = {}  # flat mirror of every row appended to ws: row label -> row value
+        for key, value in result_dict.items():
+            if isinstance(value, list):  # list value: assumes every item is a dict (it is read with .get); TODO confirm
+                value_list = [dict_item.get('words') for dict_item in value]  # NOTE(review): an item without 'words' yields None, and str.join raises TypeError on None; confirm the schema
+                ws.append((key, '、'.join(value_list)))  # all 'words' of the list collapsed into one cell
+                rebuild_res[key] = '、'.join(value_list)
+            elif isinstance(value, dict):
+                if 'words' in value:  # dict carrying its own 'words': a single row for this key
+                    ws.append((key, value['words']))
+                    rebuild_res[key] = value['words']
+                else:  # otherwise one row per sub-key, labelled 'key: sub_key'
+                    for sub_key, sub_value in value.items():
+                        if isinstance(sub_value, dict):  # nested dict: take its 'words', defaulting to ''
+                            ws.append(('{0}: {1}'.format(key, sub_key), sub_value.get('words', '')))
+                            rebuild_res['{0}: {1}'.format(key, sub_key)] = sub_value.get('words', '')
+                        else:  # non-dict sub-value is written as-is
+                            ws.append(('{0}: {1}'.format(key, sub_key), sub_value))
+                            rebuild_res['{0}: {1}'.format(key, sub_key)] = sub_value
+            else:  # any other value type is written unchanged
+                ws.append((key, value))
+                rebuild_res[key] = value
+        return rebuild_res
+
    def simple_license_rebuild(self, license_summary, document_scheme):
        # for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
        #     if ic_license_dict.get('类别') == '1':
--- a/src/common/tools/pdf_to_img.py
View file @423427c
+++ b/src/common/tools/pdf_to_img.py
View file @423427c
@@ -225,3 +225,13 @@ class PDFHandler:
                else:
                    self.merge_il(pdf, pno, il)
        self.img_count = len(self.img_path_list)
+
+    def extract_page_image(self):  # Call page_to_png on every page of the PDF at self.path, then recompute img_count.
+        self.img_path_list = []  # reset so img_count below reflects this run only
+        self.xref_set = set()  # reset alongside img_path_list; not read in this method
+        os.makedirs(self.img_dir_path, exist_ok=True)  # exist_ok=True: no error if the directory already exists
+        with fitz.Document(self.path) as pdf:
+            for pno in range(pdf.pageCount):  # NOTE(review): pageCount/loadPage look like the legacy camelCase PyMuPDF names; confirm the pinned version still provides them
+                page = pdf.loadPage(pno)
+                self.page_to_png(page)  # presumably appends the written image path to self.img_path_list (not visible here); confirm
+        self.img_count = len(self.img_path_list)