part 2

周伟奇
Showing 2 changed files with 144 additions and 111 deletions
src/apps/doc/consts.py
src/apps/doc/management/commands/folder_ltgt_process.py
--- a/src/apps/doc/consts.py
View file @d213eb7
+++ b/src/apps/doc/consts.py
View file @d213eb7
@@ -1449,4 +1449,16 @@ SE_SECOND_ID_FIELD_MAPPING = {

 HEAD_LIST = ['Info', 'Index', 'License', 'Field', 'Input', 'OCR', 'Result', 'Position', 'Image', 'errorType']

+# ----------------litigation------------------------
+# Field-order tables for the litigation folder command. Each entry is a
+# (search_field, write_field) pair: res_process in folder_ltgt_process.py
+# looks up search_field in an OCR result and writes write_field as the label.
+#
+# ID-card fields used by default for consts.IC_CLASSIFY results.
+IC_FIELD_ORDER_2 = (('姓名', '姓名'),
+                    ('公民身份号码', '公民身份号码'),
+                    ('出生年月', '出生年月'),
+                    ('住址', '住址'),
+                    ('性别', '性别'),
+                    ('民族', '民族'),)
+# ID-card fields used instead when a result contains the '有效期限' key
+# (see the field_map entry for consts.IC_CLASSIFY).
+IC_FIELD_ORDER_3 = (('有效期限', '有效期限'), ('签发机关', '签发机关'),)
+
+# Bank-card fields: OCR result key on the left, Chinese label on the right.
+BC_FIELD_ORDER_2 = (('BankName', '发卡行名称'),
+                    ('CardNum', '银行卡号'),
+                    ('CardType', '银行卡类型'),)

--- a/src/apps/doc/management/commands/folder_ltgt_process.py
View file @d213eb7
+++ b/src/apps/doc/management/commands/folder_ltgt_process.py
View file @d213eb7
 import os
 import re
 import time
+import json
 import shutil
 import base64
 import signal
@@ -16,7 +17,7 @@ from settings import conf
 from common.mixins import LoggerMixin
 from common.tools.pdf_to_img import PDFHandler
 from apps.doc import consts
-from apps.doc.exceptions import OCR1Exception, OCR4Exception, LTGTException
+from apps.doc.exceptions import OCR1Exception, OCR2Exception, LTGTException
 from apps.doc.ocr.wb import BSWorkbook


@@ -69,8 +70,19 @@ class Command(BaseCommand, LoggerMixin):
        self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
        # input folder
        self.input_dirs = conf.get_namespace('LTGT_DIR_')
+        # seperate folder name
+        self.seperate_map = {
+            consts.IC_CLASSIFY: 'IDCard',
+            consts.BC_CLASSIFY: 'BankCard'
+        }
+        self.field_map = {
+            consts.VAT_CLASSIFY: (consts.VAT_CN_NAME, None, None, consts.VATS_FIELD_ORDER),
+            consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2),
+            consts.BC_CLASSIFY: (consts.BC_CN_NAME, None, None, consts.BC_FIELD_ORDER_2)
+        }
        # ocr相关
-        # self.ocr_url = conf.OCR_URL_FOLDER
+        self.ocr_url = conf.OCR_URL_FOLDER
+        self.ocr_url_2 = conf.OCR2_URL_FOLDER
        # self.ocr_url_4 = conf.IC_URL
        self.ltgt_ocr_url = conf.LTGT_URL
        # 优雅退出信号：15
@@ -79,73 +91,60 @@ class Command(BaseCommand, LoggerMixin):
    def signal_handler(self, sig, frame):
+        """Clear ``self.switch`` so loops guarded by it stop taking new files.
+
+        ``sig`` and ``frame`` are accepted but not used.
+        """
        self.switch = False  # stop processing files

-    def license1_process(self, ocr_data, license_summary, classify, img_path):
+    def license1_process(self, ocr_data, all_res, classify):
+        """Collect the licence entries of one OCR result into ``all_res``.
+
+        Returns without touching ``all_res`` when ``ocr_data['data']`` is
+        missing or empty. For ``consts.IC_CLASSIFY`` the ``base64_img`` key is
+        dropped from every entry first.
+
+        :param ocr_data: one item of the OCR response ``data`` list
+        :param all_res: flat list of per-licence result dicts, extended in place
+        :param classify: classify code of the folder being processed
+        """
        # category: '0' ID card, '1' residence permit
        license_data = ocr_data.get('data', [])
        if not license_data:
            return
-        if classify == consts.MVC_CLASSIFY:  # 车辆登记证 3/4页结果整合
-            for mvc_dict in license_data:
-                try:
-                    mvc_page = mvc_dict.pop('page')
-                except Exception as e:
-                    pass
-                else:
-                    if mvc_page == 'VehicleRegArea':
-                        mvc_res = mvc_dict.pop('results', {})
-                        mvc_dict['机动车登记证书编号'] = mvc_res.get('register_no', {}).get('words', '')
-                        for register_info in mvc_res.get('register_info', []):
-                            for detail_dict in register_info.get('details', {}).values():
-                                mvc_dict.setdefault(detail_dict.get('chinese_key', '未知'), []).append(
-                                    detail_dict.get('words', ''))
-                        del mvc_res
        if classify == consts.IC_CLASSIFY:
            for id_card_dict in license_data:
                try:
-                    base64_img = id_card_dict.pop('base64_img')
+                    id_card_dict.pop('base64_img')
                except Exception as e:
                    continue
-                else:
-                    card_type = -1
-                    json_data_4 = {
-                        'mode': 1,
-                        'user_info': {
-                            'image_content': base64_img,
-                        },
-                        'options': {
-                            'distinguish_type': 1,
-                            'auto_rotate': True,
-                        },
-                    }
-                    for times in range(consts.RETRY_TIMES):
-                        try:
-                            start_time = time.time()
-                            ocr_4_response = requests.post(self.ocr_url_4, json=json_data_4)
-                            if ocr_4_response.status_code != 200:
-                                raise OCR4Exception('ocr_4 status code: {0}'.format(ocr_4_response.status_code))
-                        except Exception as e:
-                            self.folder_log.warn(
-                                '{0} [ocr_4 failed] [times={1}] [img_path={2}] [error={3}]'.format(
-                                    self.log_base, times, img_path, traceback.format_exc()))
-                        else:
-                            ocr_4_res = ocr_4_response.json()
-                            end_time = time.time()
-                            speed_time = int(end_time - start_time)
-
-                            if ocr_4_res.get('code') == 0 and ocr_4_res.get('result', {}).get('rtn') == 0:
-                                card_type = ocr_4_res.get('result', {}).get(
-                                    'idcard_distinguish_result', {}).get('result', -1)
-
-                            self.folder_log.info(
-                                '{0} [ocr_4 success] [img_path={1}] [speed_time={2}]'.format(
-                                    self.log_base, img_path, speed_time))
-                            break
-                    else:
-                        self.folder_log.warn(
-                            '{0} [ocr_4 failed] [img_path={1}]'.format(self.log_base, img_path))
+        # Extend, not append: res_process calls .get() on every element of
+        # all_res, so each element must be a result dict, not a nested list.
+        all_res.extend(license_data)
+
+    def license2_process(self, ocr_data, all_res, classify, img_path):
+        """Send one image to the OCR2 service and collect its results.
+
+        The payload is ``ocr_data['section_img']`` when present, otherwise the
+        base64-encoded content of ``img_path``. The request is attempted up to
+        ``consts.RETRY_TIMES`` times and stops at the first HTTP 200 response
+        whose body parses as JSON, so results are appended at most once.
+
+        :param ocr_data: one item of the first-stage OCR ``data`` list
+        :param all_res: list of result dicts, appended to in place
+        :param classify: key into ``consts.LICENSE_CLASSIFY_MAPPING``
+        :param img_path: image path, used as fallback payload and in log lines
+        """
+        mapping = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
+        if mapping is None:
+            # Unknown classify: unpacking None would raise, so skip explicitly.
+            self.folder_log.warn(
+                '{0} [ocr_2 skipped] [img_path={1}] [classify={2}]'.format(
+                    self.log_base, img_path, classify))
+            return
+        pid = mapping[0]
+        file_data = ocr_data.get('section_img')
+        if file_data is None:
+            # NOTE(review): ocr_process may already have moved img_path before
+            # this call -- confirm the fallback file still exists here.
+            with open(img_path, 'rb') as f:
+                # base64 text of the raw image bytes
+                file_data = base64.b64encode(f.read()).decode()
+        json_data_2 = {
+            "pid": str(pid),
+            "filedata": file_data
+        }
+
+        for times in range(consts.RETRY_TIMES):
+            try:
+                start_time = time.time()
+                ocr_2_response = requests.post(self.ocr_url_2, data=json_data_2)
+                if ocr_2_response.status_code != 200:
+                    raise OCR2Exception('ocr_2 status code: {0}'.format(ocr_2_response.status_code))
+                # Parsed inside the try so a malformed body is logged and retried.
+                ocr_res_2 = json.loads(ocr_2_response.text)
+            except Exception:
+                self.folder_log.warn(
+                    '{0} [ocr_2 failed] [times={1}] [img_path={2}] [error={3}]'.format(
+                        self.log_base, times, img_path, traceback.format_exc()))
+            else:
+                end_time = time.time()
+                speed_time = int(end_time - start_time)
+                self.folder_log.info(
+                    '{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
+                        self.log_base, img_path, speed_time))

-                    id_card_dict[consts.IC_TURE_OR_FALSE] = consts.IC_RES_MAPPING.get(card_type)
-        license_summary.setdefault(classify, []).extend(license_data)
+                if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
+                    if pid == consts.BC_PID:
+                        all_res.append(ocr_res_2)
+                    else:
+                        # business licence and similar documents
+                        for result_dict in ocr_res_2.get('ResultList', []):
+                            res_dict = {}
+                            for field_dict in result_dict.get('FieldList', []):
+                                res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
+                            all_res.append(res_dict)
+                # The service answered: stop retrying, otherwise the same image
+                # is re-sent and its results are appended again on every pass.
+                break
+        else:
+            # Every attempt failed.
+            self.folder_log.warn(
+                '{0} [ocr_2 failed] [img_path={1}]'.format(self.log_base, img_path))

    @staticmethod
    def parse_img_path(img_path):
@@ -158,43 +157,38 @@ class Command(BaseCommand, LoggerMixin):
            return img_name, 1, 1

    @staticmethod
-    def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir):
+    def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
+        """Build the timestamped output paths for one input file.
+
+        ``name`` is prefixed with ``datetime.now()`` formatted as
+        ``%Y-%m-%d_%H:%M:%S`` and joined onto each output directory; the Excel
+        path swaps the extension of the new name for ``.xlsx``.
+
+        :param name: file name of the input (with extension)
+        :param img_output_dir: directory for the image path
+        :param wb_output_dir: directory for the Excel path
+        :param pdf_output_dir: directory for the third returned path
+        :param seperate_dir: directory for the fourth path, or ``None``
+        :return: ``(img_save_path, excel_path, pdf_save_path, seperate_path)``;
+            ``seperate_path`` is ``None`` when ``seperate_dir`` is ``None``
+        """
        time_stamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
        new_name = '{0}_{1}'.format(time_stamp, name)
        img_save_path = os.path.join(img_output_dir, new_name)
        pdf_save_path = os.path.join(pdf_output_dir, new_name)
        excel_name = '{0}.xlsx'.format(os.path.splitext(new_name)[0])
        excel_path = os.path.join(wb_output_dir, excel_name)
-        return img_save_path, excel_path, pdf_save_path
+        seperate_path = None if seperate_dir is None else os.path.join(seperate_dir, new_name)
+        return img_save_path, excel_path, pdf_save_path, seperate_path

-    def res_process(self, all_res, classify, excel_path):
+    def res_process(self, all_res, excel_path, classify):
+        """Write the collected OCR results into a workbook at ``excel_path``.
+
+        One sheet is created, named by ``self.field_map[classify]``. Each
+        result contributes one row per configured field (label followed by
+        the value, or by every item when the value is a list), then a
+        separator row. Any exception is logged and swallowed.
+
+        :param all_res: iterable of result dicts
+        :param excel_path: destination path of the workbook
+        :param classify: key into ``self.field_map``
+        """
        try:
-            license_summary = {}
-
-            if not all_res:
-                return
-            else:
-                for img_path, ocr_res in all_res.items():
-                    # img_name, pno, ino = self.parse_img_path(img_path)
-                    # part_idx = 1
-
-                    if isinstance(ocr_res, dict):
-                        if ocr_res.get('code') == 1:
-                            data_list = ocr_res.get('data', [])
-                            if isinstance(data_list, list):
-                                for ocr_data in data_list:
-                                    # part_idx = part_idx + 1
-                                    self.license1_process(ocr_data, license_summary, classify, img_path)
-
-                wb = BSWorkbook(set(), set(), set(), set(), set())
-                wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
-                wb.remove_base_sheet()
-                wb.save(excel_path)
+            wb = BSWorkbook(set(), set(), set(), set(), set())
+            # Index instead of .get(): an unknown classify is then logged as a
+            # KeyError naming the key rather than a None-unpacking TypeError.
+            sheet_name, key_field, side_field_order, field_order = self.field_map[classify]
+            ws = wb.create_sheet(sheet_name)
+            for res in all_res:
+                # Pick the order per result. Rebinding field_order here would
+                # leak the side order into every later result.
+                if key_field is not None and key_field in res:
+                    current_order = side_field_order
+                else:
+                    current_order = field_order
+                for search_field, write_field in current_order:
+                    field_value = res.get(search_field, '')
+                    if isinstance(field_value, list):
+                        ws.append((write_field, *field_value))
+                    else:
+                        ws.append((write_field, field_value))
+                # separator row between two results
+                ws.append((None,))
+            wb.remove_base_sheet()
+            wb.save(excel_path)
        except Exception as e:
            self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
                self.log_base, excel_path, traceback.format_exc()))

-    def ocr_process(self, img_path, classify):
+    def ocr_process(self, img_path, classify, all_res, seperate_dir):
        if os.path.exists(img_path):
            # TODO 图片验证
            with open(img_path, 'rb') as f:
@@ -203,8 +197,9 @@ class Command(BaseCommand, LoggerMixin):
                file_data = base64_data.decode()
            json_data = {
                "file": file_data,
-                "classify": classify
            }
+            if seperate_dir is None:
+                json_data["classify"] = classify

            for times in range(consts.RETRY_TIMES):
                try:
@@ -221,7 +216,20 @@ class Command(BaseCommand, LoggerMixin):
                    speed_time = int(end_time - start_time)
                    self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format(
                        self.log_base, img_path, ocr_res, speed_time))
-                    return ocr_res
+
+                    if isinstance(ocr_res, dict):
+                        if ocr_res.get('code') == 1:
+                            data_list = ocr_res.get('data', [])
+                            if isinstance(data_list, list):
+                                for ocr_data in data_list:
+                                    if ocr_data.get('classify') == classify:
+                                        if seperate_dir is not None:
+                                            os.makedirs(seperate_dir, exist_ok=True)
+                                            shutil.move(img_path, seperate_dir)
+                                        if classify in consts.LICENSE_CLASSIFY_SET_1:
+                                            self.license1_process(ocr_data, all_res, classify)
+                                        elif classify in consts.LICENSE_CLASSIFY_SET_2:
+                                            self.license2_process(ocr_data, all_res, classify, img_path)
            else:
                self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
                
@@ -280,18 +288,20 @@ class Command(BaseCommand, LoggerMixin):
        rebuild_res = self.ltgt_res_process(ocr_res, label, excel_path)
        return rebuild_res
        
-    def images_process(self, img_path_list, classify, excel_path):
-        all_res = {}
+    def images_process(self, img_path_list, classify, excel_path, seperate_dir):
+        """Run OCR over every image and write the combined results to Excel.
+
+        ``ocr_process`` is called once per path and fills ``all_res`` in
+        place. ``res_process`` is only called when at least one result was
+        collected.
+
+        :param img_path_list: image paths to process, in order
+        :param classify: classify code forwarded to the OCR and Excel steps
+        :param excel_path: destination path of the workbook
+        :param seperate_dir: forwarded unchanged to ``ocr_process``
+        :return: the non-empty result list, or ``None`` (implicitly) when
+            nothing was collected
+        """
+        all_res = []
        for img_path in img_path_list:
-            ocr_res = self.ocr_process(img_path, classify)
-            all_res[img_path] = ocr_res
-        self.res_process(all_res, classify, excel_path)
+            self.ocr_process(img_path, classify, all_res, seperate_dir)
+        if len(all_res) > 0:
+            self.res_process(all_res, excel_path, classify)
+            return all_res

-    def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
+    def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
        if os.path.exists(path):
            rebuild_res = None
            try:
-                img_save_path, excel_path, pdf_save_path = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
+                img_save_path, excel_path, pdf_save_path, seperate_path = self.get_path(
+                    name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
                self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
                pdf_handler = PDFHandler(path, img_save_path)
                if classify in self.ltgt_classify_mapping:
@@ -308,15 +318,16 @@ class Command(BaseCommand, LoggerMixin):
                    rebuild_res = self.ltgt_process(pdf_handler.img_path_list, self.ltgt_classify_mapping[classify],
                                                    excel_path, path)
                else:
-                    self.images_process(pdf_handler.img_path_list, classify, excel_path)
+                    rebuild_res = self.images_process(pdf_handler.img_path_list, classify, excel_path, seperate_path)
                shutil.move(path, pdf_save_path)
                return rebuild_res

-    def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
+    def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir):
        if os.path.exists(path):
            rebuild_res = None
            try:
-                img_save_path, excel_path, tiff_save_path = self.get_path(name, img_output_dir, wb_output_dir, tiff_output_dir)
+                img_save_path, excel_path, tiff_save_path, seperate_path = self.get_path(
+                    name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir)
                self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
                tiff_handler = TIFFHandler(path, img_save_path)
                tiff_handler.extract_image()
@@ -330,14 +341,15 @@ class Command(BaseCommand, LoggerMixin):
                    rebuild_res = self.ltgt_process(tiff_handler.img_path_list, self.ltgt_classify_mapping[classify],
                                                    excel_path, path)
                else:
-                    self.images_process(tiff_handler.img_path_list, classify, excel_path)
+                    rebuild_res = self.images_process(tiff_handler.img_path_list, classify, excel_path, seperate_path)
                shutil.move(path, tiff_save_path)
                return rebuild_res

-    def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
+    def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir, seperate_dir):
        rebuild_res = None
        try:
-            img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
+            img_save_path, excel_path, _, seperate_path = self.get_path(
+                name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
        except Exception as e:
            self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
@@ -345,9 +357,7 @@ class Command(BaseCommand, LoggerMixin):
            if classify in self.ltgt_classify_mapping:
                rebuild_res = self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
            else:
-                ocr_res = self.ocr_process(path, classify)
-                all_res = {path: ocr_res}
-                self.res_process(all_res, classify, excel_path)
+                rebuild_res = self.images_process([img_save_path], classify, excel_path, seperate_path)
            shutil.move(path, img_save_path)
            return rebuild_res

@@ -380,7 +390,14 @@ class Command(BaseCommand, LoggerMixin):
                    wb = Workbook()
                for result in result_list:
                    try:
-                        sheet_name, head_fields = self.sheet_content[result[self.CLASSIFY_KEY]]
+                        if result[self.CLASSIFY_KEY] in self.sheet_content:
+                            sheet_name, head_fields = self.sheet_content[result[self.CLASSIFY_KEY]]
+                        else:
+                            sheet_name, key_field, side_field_order, field_order = self.field_map[result[self.CLASSIFY_KEY]]
+                            if key_field is not None and key_field in result[self.RESULT_KEY]:
+                                head_fields = [b for _, b in side_field_order]
+                            else:
+                                head_fields = [b for _, b in field_order]
                        row = []
                        for field in head_fields:
                            row.append(result[self.RESULT_KEY].get(field))
@@ -395,7 +412,7 @@ class Command(BaseCommand, LoggerMixin):
                            self.log_base, result, traceback.format_exc()))
                wb.save(wb_path)

-    def folder_process(self, input_dir, classify, result_queue):
+    def folder_process(self, input_dir, classify, is_combined, result_queue):
        while not os.path.isdir(input_dir):
            self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
            if self.switch:
@@ -404,6 +421,7 @@ class Command(BaseCommand, LoggerMixin):
            else:
                return
        output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
+        seperate_dir = os.path.join(output_dir, self.seperate_map.get(classify, 'Unknown')) if is_combined else None
        img_output_dir = os.path.join(output_dir, 'image')
        wb_output_dir = os.path.join(output_dir, 'excel')
        pdf_output_dir = os.path.join(output_dir, 'pdf')
@@ -415,6 +433,8 @@ class Command(BaseCommand, LoggerMixin):
        os.makedirs(pdf_output_dir, exist_ok=True)
        os.makedirs(tiff_output_dir, exist_ok=True)
        os.makedirs(failed_output_dir, exist_ok=True)
+        if seperate_dir is not None:
+            os.makedirs(seperate_dir, exist_ok=True)
        os_error_filename_set = set()
        while self.switch:
            # if not os.path.isdir(input_dir):
@@ -438,14 +458,14 @@ class Command(BaseCommand, LoggerMixin):
                    if os.path.isfile(path):
                        self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
                        if name.endswith('.pdf') or name.endswith('.PDF'):
-                            result = self.pdf_process(name, path, classify, img_output_dir,
-                                                           wb_output_dir, pdf_output_dir)
+                            result = self.pdf_process(name, path, classify, img_output_dir, wb_output_dir,
+                                                      pdf_output_dir, seperate_dir)
                        elif name.endswith('.tif') or name.endswith('.TIF'):
-                            result = self.tif_process(name, path, classify, img_output_dir,
-                                                           wb_output_dir, tiff_output_dir)
+                            result = self.tif_process(name, path, classify, img_output_dir, wb_output_dir,
+                                                      tiff_output_dir, seperate_dir)
                        else:
-                            result = self.img_process(name, path, classify, wb_output_dir,
-                                                           img_output_dir, pdf_output_dir)
+                            result = self.img_process(name, path, classify, wb_output_dir, img_output_dir,
+                                                      pdf_output_dir, seperate_dir)
                        self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
                    else:
                        result = None
@@ -497,7 +517,8 @@ class Command(BaseCommand, LoggerMixin):
            if one_input_dir is None:
                one_input_dir = input_dir
            classify = int(classify_idx.split('_')[0])
-            process = Process(target=self.folder_process, args=(input_dir, classify, result_queue))
+            is_combined = True if int(classify_idx.split('_')[2]) == 1 else False
+            process = Process(target=self.folder_process, args=(input_dir, classify, is_combined, result_queue))
            process_list.append(process)

        wb_dir = os.path.dirname(os.path.dirname(one_input_dir))