Merge branch 'feature/ltgt' into feature/0611

周伟奇
Showing 6 changed files with 613 additions and 31 deletions
src/apps/doc/consts.py
src/apps/doc/exceptions.py
src/apps/doc/management/commands/folder_ltgt_process.py
src/apps/doc/management/commands/folder_ocr_process.py
src/apps/doc/ocr/wb.py
src/common/tools/pdf_to_img.py
--- a/src/apps/doc/consts.py
View file @b8745dc
+++ b/src/apps/doc/consts.py
View file @b8745dc
@@ -1448,4 +1448,16 @@ SE_SECOND_ID_FIELD_MAPPING = {
 HEAD_LIST = ['Info', 'Index', 'License', 'Field', 'Input', 'OCR', 'Result', 'Position', 'Image', 'errorType']
+# ----------------litigation------------------------
+# Each entry is a (key looked up in the OCR result, label written to the workbook) pair.
+# ID card fields used when the result does not contain the '有效期限' key
+# (presumably the portrait side of the card - confirm with the OCR service).
+IC_FIELD_ORDER_2 = (('姓名', '姓名'),
+                    ('公民身份号码', '公民身份号码'),
+                    ('出生年月', '出生年月'),
+                    ('住址', '住址'),
+                    ('性别', '性别'),
+                    ('民族', '民族'),)
+# ID card fields used when the result contains the '有效期限' (validity period) key.
+IC_FIELD_ORDER_3 = (('有效期限', '有效期限'), ('签发机关', '签发机关'),)
+# Bank card fields: OCR response key -> Chinese label.
+BC_FIELD_ORDER_2 = (('BankName', '发卡行名称'),
+                    ('CardNum', '银行卡号'),
+                    ('CardType', '银行卡类型'),)
--- a/src/apps/doc/exceptions.py
View file @b8745dc
+++ b/src/apps/doc/exceptions.py
View file @b8745dc
@@ -13,6 +13,9 @@ class OCR2Exception(Exception):
 class OCR4Exception(Exception):
    pass
+class LTGTException(Exception):
+    """Raised when the litigation (LTGT) OCR service answers with a non-200 HTTP status."""
+    pass
 class GCAPException(Exception):
    pass
--- a/src/apps/doc/management/commands/folder_ltgt_process.py 0 → 100644
View file @b8745dc
+++ b/src/apps/doc/management/commands/folder_ltgt_process.py 0 → 100644
View file @b8745dc
+import os
+import re
+import time
+import json
+import shutil
+import base64
+import signal
+import requests
+import traceback
+from PIL import Image
+from datetime import datetime
+from django.core.management import BaseCommand
+from multiprocessing import Process, Queue
+from openpyxl import load_workbook, Workbook
+from settings import conf
+from common.mixins import LoggerMixin
+from common.tools.pdf_to_img import PDFHandler
+from apps.doc import consts
+from apps.doc.exceptions import OCR1Exception, OCR2Exception, LTGTException
+from apps.doc.ocr.wb import BSWorkbook
+class TIFFHandler:
+    """Split a (possibly multi-frame) TIFF file into one JPEG image per frame.
+
+    After ``extract_image()`` the paths of the written images are available in
+    ``img_path_list`` in frame order.
+    """
+    # Modes Pillow's JPEG writer can encode directly; any other mode (RGBA, P,
+    # LA, I;16, ...) makes ``save`` raise "cannot write mode ... as JPEG".
+    _JPEG_MODES = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
+    def __init__(self, path, img_save_path):
+        """Remember the source TIFF ``path`` and the folder the frames are written to."""
+        self.path = path
+        self.img_save_path = img_save_path
+        self.img_path_list = []
+    def extract_image(self):
+        """Write every frame of the TIFF to ``img_save_path`` as ``page_<n>.jpeg``.
+
+        Stops silently at the first frame that cannot be read (``EOFError``), keeping
+        the frames extracted so far. Other errors propagate to the caller.
+        """
+        os.makedirs(self.img_save_path, exist_ok=True)
+        # The context manager guarantees the file handle is released even on error.
+        with Image.open(self.path) as tiff:
+            for i in range(getattr(tiff, 'n_frames', 1)):
+                try:
+                    save_path = os.path.join(self.img_save_path, 'page_{0}.jpeg'.format(i))
+                    tiff.seek(i)
+                    # Convert frames JPEG cannot store instead of failing the whole file.
+                    frame = tiff if tiff.mode in self._JPEG_MODES else tiff.convert('RGB')
+                    frame.save(save_path)
+                    self.img_path_list.append(save_path)
+                except EOFError:
+                    break
+class Command(BaseCommand, LoggerMixin):
+    def __init__(self):
+        """Load the folder/OCR configuration and register the SIGTERM handler."""
+        super().__init__()
+        self.log_base = '[folder ltgt process]'
+        # Processing switch: the worker loops keep running while this is True
+        self.switch = True
+        # classify id -> label passed to the litigation OCR service
+        self.ltgt_classify_mapping = {
+            128: '执行裁定书',
+            129: '民事判决书',
+            130: '民事调解书'
+        }
+        # classify id -> (sheet name, header fields) used for the daily workbook
+        self.sheet_content = {
+            128: ('执行裁定书', ('承办法院', '案号/标号', '被执行人', '债权金额', '诉讼时间')),
+            129: ('民事判决书', ('承办法院', '案号/标号', '被告', '判决结果: 贷款本金', '判决结果: 罚息', '判决结果: 律师费', '判决结果: 案件受理费', '诉讼时间')),
+            130: ('民事调解书', ('承办法院', '案号/标号', '被告', '协议内容: 支付金额', '协议内容: 案件受理费', '诉讼时间')),
+        }
+        # Keys of the records exchanged through the result queue
+        self.DATE_KEY = 'date'
+        self.CLASSIFY_KEY = 'classify'
+        self.RESULT_KEY = 'result'
+        # File name template of the daily workbook; {0} is the date string
+        self.daily_wb_name = 'Output_{0}.xlsx'
+        # wb_process: wait after an empty queue read / after a round with no results
+        self.short_sleep_time = 10
+        self.long_sleep_time = 3600
+        # Sleep time used while polling an input folder
+        self.sleep_time = float(conf.SLEEP_SECOND_FOLDER)
+        # input folder
+        self.input_dirs = conf.get_namespace('LTGT_DIR_')
+        # classify id -> name of the sub-folder (under Output) matched images are separated into
+        self.seperate_map = {
+            consts.IC_CLASSIFY: 'IDCard',
+            consts.BC_CLASSIFY: 'BankCard'
+        }
+        # classify id -> (sheet name, key field, field order when the key field is present,
+        #                 field order otherwise)
+        self.field_map = {
+            consts.VAT_CLASSIFY: (consts.VAT_CN_NAME, None, None, consts.VATS_FIELD_ORDER),
+            consts.IC_CLASSIFY: (consts.IC_CN_NAME, '有效期限', consts.IC_FIELD_ORDER_3, consts.IC_FIELD_ORDER_2),
+            consts.BC_CLASSIFY: (consts.BC_CN_NAME, None, None, consts.BC_FIELD_ORDER_2)
+        }
+        # OCR service endpoints
+        self.ocr_url = conf.OCR_URL_FOLDER
+        self.ocr_url_2 = conf.OCR2_URL_FOLDER
+        # self.ocr_url_4 = conf.IC_URL
+        self.ltgt_ocr_url = conf.LTGT_URL
+        # Graceful shutdown on SIGTERM (signal 15)
+        signal.signal(signal.SIGTERM, self.signal_handler)
+    def signal_handler(self, sig, frame):
+        """Signal callback: clear the processing switch so the worker loops exit."""
+        self.switch = False  # stop processing files
+    def license1_process(self, ocr_data, all_res, classify):
+        """Append the license records of one OCR entry to ``all_res``.
+
+        Args:
+            ocr_data: one entry of the OCR response; its ``data`` list holds the records.
+            all_res: list that is extended in place with the records.
+            classify: classify id of the folder being processed.
+        """
+        # Legacy note: category '0' ID card, '1' residence permit (not used by this code)
+        license_data = ocr_data.get('data', [])
+        if not license_data:
+            return
+        if classify == consts.IC_CLASSIFY:
+            for id_card_dict in license_data:
+                # Strip the base64 image payload before the record is stored. Records
+                # without the key (or that are not dicts) are left untouched, which is
+                # what the former blanket ``except Exception: continue`` amounted to.
+                if isinstance(id_card_dict, dict):
+                    id_card_dict.pop('base64_img', None)
+        all_res.extend(license_data)
+    def license2_process(self, ocr_data, all_res, classify, img_path):
+        """Send one image (or its cropped section) to the second OCR service.
+
+        The recognised records are appended to ``all_res``. The request is retried up
+        to ``consts.RETRY_TIMES`` times and stops at the first successful response.
+
+        Args:
+            ocr_data: entry of the first OCR response; ``section_img`` (base64) is used
+                as the payload when present.
+            all_res: list that receives the recognised records.
+            classify: classify id, used to look up the service ``pid``.
+            img_path: image file used as payload when ``section_img`` is missing.
+        """
+        pid, _, _, _, _, _ = consts.LICENSE_CLASSIFY_MAPPING.get(classify)
+        file_data = ocr_data.get('section_img')
+        if file_data is None:
+            with open(img_path, 'rb') as f:
+                # base64 text of the whole image
+                file_data = base64.b64encode(f.read()).decode()
+        json_data_2 = {
+            "pid": str(pid),
+            "filedata": file_data
+        }
+        for times in range(consts.RETRY_TIMES):
+            try:
+                start_time = time.time()
+                ocr_2_response = requests.post(self.ocr_url_2, data=json_data_2)
+                if ocr_2_response.status_code != 200:
+                    raise OCR2Exception('ocr_2 status code: {0}'.format(ocr_2_response.status_code))
+            except Exception:
+                self.folder_log.warn(
+                    '{0} [ocr_2 failed] [times={1}] [img_path={2}] [error={3}]'.format(
+                        self.log_base, times, img_path, traceback.format_exc()))
+            else:
+                ocr_res_2 = json.loads(ocr_2_response.text)
+                end_time = time.time()
+                speed_time = int(end_time - start_time)
+                self.folder_log.info(
+                    '{0} [ocr_2 success] [img={1}] [speed_time={2}]'.format(
+                        self.log_base, img_path, speed_time))
+                if ocr_res_2.get('ErrorCode') in consts.SUCCESS_CODE_SET:
+                    if pid == consts.BC_PID:
+                        all_res.append(ocr_res_2)
+                    else:
+                        # business license and similar documents
+                        for result_dict in ocr_res_2.get('ResultList', []):
+                            res_dict = {}
+                            for field_dict in result_dict.get('FieldList', []):
+                                res_dict[field_dict.get('chn_key', '')] = field_dict.get('value', '')
+                            all_res.append(res_dict)
+                # A response was received: stop retrying, otherwise the request is
+                # repeated and the same records are appended several times.
+                break
+        else:
+            self.folder_log.warn('{0} [ocr_2 failed] [img_path={1}]'.format(self.log_base, img_path))
+    @staticmethod
+    def parse_img_path(img_path):
+        """Return ``(image name, page number, image number)`` parsed from an image path.
+
+        Names shaped like ``page_<p>_img_<i>`` (zero-based) yield one-based numbers;
+        any other name yields ``(name, 1, 1)``.
+        """
+        stem = os.path.splitext(os.path.basename(img_path))[0]
+        if re.match(r'page_\d+_img_\d+', stem) is None:
+            return stem, 1, 1
+        tokens = stem.split('_')
+        page_no = int(tokens[1]) + 1
+        img_no = int(tokens[3]) + 1
+        return stem, page_no, img_no
+    @staticmethod
+    def get_path(name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
+        """Build the timestamped output paths for the input file ``name``.
+
+        Returns:
+            ``(img_save_path, excel_path, pdf_save_path, seperate_path)``;
+            ``seperate_path`` is None when ``seperate_dir`` is None.
+        """
+        stamped_name = '{0}_{1}'.format(datetime.now().strftime('%Y-%m-%d_%H:%M:%S'), name)
+        stem = os.path.splitext(stamped_name)[0]
+        excel_path = os.path.join(wb_output_dir, '{0}.xlsx'.format(stem))
+        if seperate_dir is None:
+            seperate_path = None
+        else:
+            seperate_path = os.path.join(seperate_dir, stamped_name)
+        return (
+            os.path.join(img_output_dir, stamped_name),
+            excel_path,
+            os.path.join(pdf_output_dir, stamped_name),
+            seperate_path,
+        )
+    def res_process(self, all_res, excel_path, classify):
+        """Write the collected OCR records to a workbook saved at ``excel_path``.
+
+        One row per field (label followed by its value or values) and an empty row
+        after each record. Any failure is logged and swallowed.
+        """
+        try:
+            wb = BSWorkbook(set(), set(), set(), set(), set())
+            sheet_name, key_field, side_field_order, src_field_order = self.field_map.get(classify)
+            ws = wb.create_sheet(sheet_name)
+            for record in all_res:
+                # Records carrying the key field use the alternative field order
+                use_side = key_field is not None and key_field in record
+                field_order = side_field_order if use_side else src_field_order
+                for search_field, write_field in field_order:
+                    value = record.get(search_field, '')
+                    cells = value if isinstance(value, list) else [value]
+                    ws.append((write_field, *cells))
+                ws.append((None,))
+            wb.remove_base_sheet()
+            wb.save(excel_path)
+        except Exception:
+            self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
+                self.log_base, excel_path, traceback.format_exc()))
+    def basename(self, path):
+        """Return the last component of ``path``, ignoring any trailing separators.
+
+        Unlike ``os.path.basename`` this also yields the folder name for paths
+        that end with a slash.
+        """
+        trailing = os.path.sep
+        if os.path.altsep:
+            trailing += os.path.altsep
+        return os.path.basename(path.rstrip(trailing))
+    def ocr_process(self, img_path, classify, all_res, seperate_dir):
+        """Run the first OCR service on one image and collect the matching records.
+
+        Args:
+            img_path: image file to recognise; nothing happens if it does not exist.
+            classify: classify id of the folder being processed. It is sent to the
+                service only when ``seperate_dir`` is None.
+            all_res: list that receives the recognised records.
+            seperate_dir: folder the image is moved into when an entry of the
+                response matches ``classify``; None disables the move.
+        """
+        if not os.path.exists(img_path):
+            return
+        # TODO image validation
+        with open(img_path, 'rb') as f:
+            # base64 text of the whole image
+            file_data = base64.b64encode(f.read()).decode()
+        json_data = {
+            "file": file_data,
+        }
+        if seperate_dir is None:
+            json_data["classify"] = classify
+        for times in range(consts.RETRY_TIMES):
+            try:
+                start_time = time.time()
+                ocr_response = requests.post(self.ocr_url, json=json_data)
+                if ocr_response.status_code != 200:
+                    raise OCR1Exception('{0} ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
+            except Exception:
+                self.folder_log.warn('{0} [ocr failed] [times={1}] [img_path={2}] [error={3}]'.format(
+                    self.log_base, times, img_path, traceback.format_exc()))
+            else:
+                ocr_res = ocr_response.json()
+                end_time = time.time()
+                speed_time = int(end_time - start_time)
+                self.folder_log.info('{0} [ocr success] [img={1}] [res={2}] [speed_time={3}]'.format(
+                    self.log_base, img_path, ocr_res, speed_time))
+                self._dispatch_ocr_result(ocr_res, img_path, classify, all_res, seperate_dir)
+                break
+        else:
+            self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
+    def _dispatch_ocr_result(self, ocr_res, img_path, classify, all_res, seperate_dir):
+        """Hand the entries of one OCR response that match ``classify`` to the license handlers."""
+        if not isinstance(ocr_res, dict) or ocr_res.get('code') != 1:
+            return
+        data_list = ocr_res.get('data', [])
+        if not isinstance(data_list, list):
+            return
+        # Where the image currently lives; updated when it is moved below so that
+        # license2_process can still open the file.
+        current_path = img_path
+        for ocr_data in data_list:
+            if ocr_data.get('classify') != classify:
+                continue
+            if seperate_dir is not None:
+                os.makedirs(seperate_dir, exist_ok=True)
+                real_dst = os.path.join(seperate_dir, self.basename(img_path))
+                if not os.path.exists(real_dst):
+                    shutil.move(img_path, seperate_dir)
+                    current_path = real_dst
+            if classify in consts.LICENSE_CLASSIFY_SET_1:
+                self.license1_process(ocr_data, all_res, classify)
+            elif classify in consts.LICENSE_CLASSIFY_SET_2:
+                self.license2_process(ocr_data, all_res, classify, current_path)
+    def ltgt_ocr_process(self, img_path_list, label, path):
+        """Post all page images of one document to the litigation OCR service.
+
+        Images that no longer exist are skipped. Returns the decoded JSON response
+        of the first successful request, or None after ``consts.RETRY_TIMES``
+        failed attempts. ``path`` is only used in log messages.
+        """
+        img_data_list = []
+        for img_path in img_path_list:
+            if not os.path.exists(img_path):
+                continue
+            with open(img_path, 'rb') as f:
+                # base64 text of the whole image
+                img_data_list.append(base64.b64encode(f.read()).decode())
+        payload = {
+            "label": label,
+            "img_data_list": img_data_list
+        }
+        for attempt in range(consts.RETRY_TIMES):
+            try:
+                start_time = time.time()
+                ocr_response = requests.post(self.ltgt_ocr_url, json=payload)
+                if ocr_response.status_code != 200:
+                    raise LTGTException('{0} ltgt ocr status code: {1}'.format(self.log_base, ocr_response.status_code))
+            except Exception:
+                self.folder_log.warn('{0} [ltgt ocr failed] [times={1}] [path={2}] [error={3}]'.format(
+                    self.log_base, attempt, path, traceback.format_exc()))
+                continue
+            ocr_res = ocr_response.json()
+            speed_time = int(time.time() - start_time)
+            self.folder_log.info('{0} [ltgt ocr success] [path={1}] [res={2}] [speed_time={3}]'.format(
+                self.log_base, path, ocr_res, speed_time))
+            return ocr_res
+        # every attempt failed
+        self.folder_log.warn('{0} [ltgt ocr failed] [path={1}]'.format(self.log_base, path))
+    def ltgt_res_process(self, ocr_res, label, excel_path):
+        """Build and save the workbook for one litigation OCR response.
+
+        Returns what ``BSWorkbook.ltgt_build`` returns, or None when the response is
+        not a dict with ``code == 1`` or when building/saving fails (logged).
+        """
+        try:
+            if not isinstance(ocr_res, dict) or ocr_res.get('code') != 1:
+                return None
+            result_dict = ocr_res.get('data', {})
+            workbook = BSWorkbook(set(), set(), set(), set(), set())
+            rebuild_res = workbook.ltgt_build(label, result_dict)
+            workbook.remove_base_sheet()
+            workbook.save(excel_path)
+            return rebuild_res
+        except Exception:
+            self.folder_log.error('{0} [wb build error] [path={1}] [error={2}]'.format(
+                self.log_base, excel_path, traceback.format_exc()))
+            return None
+    def ltgt_process(self, img_path_list, label, excel_path, path):
+        """OCR a litigation document and write its workbook; return the rebuilt result."""
+        return self.ltgt_res_process(
+            self.ltgt_ocr_process(img_path_list, label, path), label, excel_path)
+    def images_process(self, img_path_list, classify, excel_path, seperate_dir):
+        """OCR every image, write the workbook and return the collected records.
+
+        The workbook is written even when no record was recognised.
+        """
+        collected = []
+        for one_img in img_path_list:
+            self.ocr_process(one_img, classify, collected, seperate_dir)
+        self.res_process(collected, excel_path, classify)
+        return collected
+    def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir):
+        """Convert a PDF to images, run the OCR pipeline on them and archive the PDF.
+
+        Returns the pipeline result, or None when ``path`` does not exist.
+        Conversion errors are logged and re-raised.
+        """
+        if not os.path.exists(path):
+            return None
+        try:
+            img_save_path, excel_path, pdf_save_path, seperate_path = self.get_path(
+                name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
+            self.folder_log.info('{0} [pdf to img start] [path={1}]'.format(self.log_base, path))
+            pdf_handler = PDFHandler(path, img_save_path)
+            # Litigation documents are rendered page by page; other PDFs have
+            # their embedded images extracted.
+            is_ltgt = classify in self.ltgt_classify_mapping
+            if is_ltgt:
+                pdf_handler.extract_page_image()
+            else:
+                pdf_handler.extract_image()
+            self.folder_log.info('{0} [pdf to img end] [path={1}]'.format(self.log_base, path))
+        except Exception:
+            self.folder_log.error('{0} [pdf to img error] [path={1}] [error={2}]'.format(
+                self.log_base, path, traceback.format_exc()))
+            raise
+        images = pdf_handler.img_path_list
+        if is_ltgt:
+            rebuild_res = self.ltgt_process(images, self.ltgt_classify_mapping[classify], excel_path, path)
+        else:
+            rebuild_res = self.images_process(images, classify, excel_path, seperate_path)
+        shutil.move(path, pdf_save_path)
+        return rebuild_res
+    def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir):
+        """Split a TIFF into images, run the OCR pipeline on them and archive the TIFF.
+
+        Returns the pipeline result, or None when ``path`` does not exist.
+        Conversion errors are logged and re-raised.
+        """
+        if not os.path.exists(path):
+            return None
+        try:
+            img_save_path, excel_path, tiff_save_path, seperate_path = self.get_path(
+                name, img_output_dir, wb_output_dir, tiff_output_dir, seperate_dir)
+            self.folder_log.info('{0} [tiff to img start] [path={1}]'.format(self.log_base, path))
+            tiff_handler = TIFFHandler(path, img_save_path)
+            tiff_handler.extract_image()
+            self.folder_log.info('{0} [tiff to img end] [path={1}]'.format(self.log_base, path))
+        except Exception:
+            self.folder_log.error('{0} [tiff to img error] [path={1}] [error={2}]'.format(
+                self.log_base, path, traceback.format_exc()))
+            raise
+        images = tiff_handler.img_path_list
+        if classify in self.ltgt_classify_mapping:
+            rebuild_res = self.ltgt_process(images, self.ltgt_classify_mapping[classify], excel_path, path)
+        else:
+            rebuild_res = self.images_process(images, classify, excel_path, seperate_path)
+        shutil.move(path, tiff_save_path)
+        return rebuild_res
+    def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir, seperate_dir):
+        """Run the OCR pipeline on a single image file and archive the image.
+
+        Returns the pipeline result, or None when the output paths cannot be built
+        (the error is logged).
+        """
+        try:
+            img_save_path, excel_path, _, seperate_path = self.get_path(
+                name, img_output_dir, wb_output_dir, pdf_output_dir, seperate_dir)
+        except Exception:
+            self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
+                self.log_base, path, traceback.format_exc()))
+            return None
+        if classify in self.ltgt_classify_mapping:
+            rebuild_res = self.ltgt_process([path], self.ltgt_classify_mapping[classify], excel_path, path)
+        else:
+            rebuild_res = self.images_process([path], classify, excel_path, seperate_path)
+        # ocr_process may already have moved the image into the separate folder;
+        # moving a missing file would raise and discard the result.
+        if os.path.exists(path):
+            shutil.move(path, img_save_path)
+        return rebuild_res
+    def wb_process(self, wb_dir, result_queue):
+        """Collect results from ``result_queue`` and append them to the daily workbook.
+
+        Each round gathers up to 100 queue reads that share one date and writes them
+        to ``Output_<date>.xlsx`` in ``wb_dir``. A record belonging to another date
+        ends the round and is carried over to the next one instead of being dropped.
+        """
+        pending = None  # record of a different date, taken from the queue but not yet written
+        while self.switch:
+            result_list = []
+            date_str = None
+            if pending is not None:
+                date_str = pending[self.DATE_KEY]
+                result_list.append(pending)
+                pending = None
+            for _ in range(100):
+                try:
+                    result = result_queue.get(block=False)
+                except Exception:
+                    # queue empty (or unreadable): wait before the next read
+                    time.sleep(self.short_sleep_time)
+                    continue
+                if date_str is None:
+                    date_str = result[self.DATE_KEY]
+                if result[self.DATE_KEY] == date_str:
+                    result_list.append(result)
+                else:
+                    pending = result
+                    break
+            if date_str is None:
+                time.sleep(self.long_sleep_time)
+                continue
+            self._write_daily_results(wb_dir, date_str, result_list)
+        # Do not lose a carried-over record when the loop is stopped.
+        if pending is not None:
+            self._write_daily_results(wb_dir, pending[self.DATE_KEY], [pending])
+    def _write_daily_results(self, wb_dir, date_str, result_list):
+        """Append ``result_list`` to the workbook of ``date_str``, creating it if needed."""
+        wb_path = os.path.join(wb_dir, self.daily_wb_name.format(date_str))
+        wb = load_workbook(wb_path) if os.path.isfile(wb_path) else Workbook()
+        for result in result_list:
+            try:
+                classify = result[self.CLASSIFY_KEY]
+                record = result[self.RESULT_KEY]
+                if classify in self.sheet_content:
+                    sheet_name, head_fields = self.sheet_content[classify]
+                else:
+                    sheet_name, key_field, side_field_order, field_order = self.field_map[classify]
+                    if key_field is not None and key_field in record:
+                        head_fields = [a for a, _ in side_field_order]
+                    else:
+                        head_fields = [a for a, _ in field_order]
+                row = [record.get(field) for field in head_fields]
+                if sheet_name in wb.sheetnames:
+                    # wb[name] replaces the deprecated get_sheet_by_name()
+                    ws = wb[sheet_name]
+                else:
+                    ws = wb.create_sheet(sheet_name)
+                    ws.append(head_fields)
+                ws.append(row)
+            except Exception:
+                self.folder_log.info('{0} [daily wb failed] [result={1}] [error={2}]'.format(
+                    self.log_base, result, traceback.format_exc()))
+        wb.save(wb_path)
+    def folder_process(self, input_dir, classify, is_combined, result_queue):
+        """Watch ``input_dir`` and run the OCR pipeline on every file that appears.
+
+        Waits until ``input_dir`` exists, creates the ``Output`` folder tree next to
+        it, then loops until the processing switch is cleared. Results are put on
+        ``result_queue`` for the daily workbook. Files that fail are moved to
+        ``Output/failed``; files that hit an OS error are remembered and retried
+        once nothing else is left to process.
+
+        Args:
+            input_dir: folder to watch.
+            classify: classify id of the documents in this folder.
+            is_combined: when True, matching images are separated into a
+                per-classify sub-folder of ``Output``.
+            result_queue: queue shared with ``wb_process``.
+        """
+        while not os.path.isdir(input_dir):
+            self.folder_log.info('{0} [input dir is not dir] [input_dir={1}]'.format(self.log_base, input_dir))
+            if not self.switch:
+                return
+            time.sleep(self.sleep_time)
+        output_dir = os.path.join(os.path.dirname(input_dir), 'Output')
+        seperate_dir = os.path.join(output_dir, self.seperate_map.get(classify, 'Unknown')) if is_combined else None
+        img_output_dir = os.path.join(output_dir, 'image')
+        wb_output_dir = os.path.join(output_dir, 'excel')
+        pdf_output_dir = os.path.join(output_dir, 'pdf')
+        tiff_output_dir = os.path.join(output_dir, 'tiff')
+        failed_output_dir = os.path.join(output_dir, 'failed')
+        for directory in (output_dir, img_output_dir, wb_output_dir, pdf_output_dir,
+                          tiff_output_dir, failed_output_dir):
+            os.makedirs(directory, exist_ok=True)
+        if seperate_dir is not None:
+            os.makedirs(seperate_dir, exist_ok=True)
+        os_error_filename_set = set()
+        while self.switch:
+            # 1. fetch the pdf / image files from the input dir
+            all_file_set = set(os.listdir(input_dir))
+            # Forget errored names that have left the folder; retrying them could
+            # only fail again and would spin this loop without ever sleeping.
+            os_error_filename_set &= all_file_set
+            if not all_file_set:
+                self.folder_log.info('{0} [input dir empty] [input_dir={1}]'.format(self.log_base, input_dir))
+                time.sleep(self.sleep_time)
+                continue
+            true_file_set = all_file_set - os_error_filename_set
+            if not true_file_set:
+                # Only previously failed files remain: wait, then retry one of them.
+                time.sleep(self.sleep_time)
+                true_file_set.add(os_error_filename_set.pop())
+            for name in true_file_set:
+                path = os.path.join(input_dir, name)
+                try:
+                    if os.path.isfile(path):
+                        self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
+                        lower_name = name.lower()  # accept .pdf/.PDF/.Pdf etc.
+                        if lower_name.endswith('.pdf'):
+                            result = self.pdf_process(name, path, classify, img_output_dir, wb_output_dir,
+                                                      pdf_output_dir, seperate_dir)
+                        elif lower_name.endswith('.tif'):
+                            result = self.tif_process(name, path, classify, img_output_dir, wb_output_dir,
+                                                      tiff_output_dir, seperate_dir)
+                        else:
+                            result = self.img_process(name, path, classify, wb_output_dir, img_output_dir,
+                                                      pdf_output_dir, seperate_dir)
+                        self.folder_log.info('{0} [file end] [path={1}]'.format(self.log_base, path))
+                    else:
+                        result = None
+                        self.folder_log.info('{0} [path is dir] [path={1}]'.format(self.log_base, path))
+                        failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
+                        shutil.move(path, failed_path)
+                except OSError:
+                    os_error_filename_set.add(name)
+                    self.folder_log.error('{0} [os error] [path={1}] [error={2}]'.format(
+                        self.log_base, path, traceback.format_exc()))
+                except Exception:
+                    try:
+                        self.folder_log.error('{0} [file error] [path={1}] [error={2}]'.format(self.log_base, path,
+                                                                                               traceback.format_exc()))
+                        failed_path = os.path.join(failed_output_dir, '{0}_{1}'.format(time.time(), name))
+                        shutil.move(path, failed_path)
+                    except Exception:
+                        os_error_filename_set.add(name)
+                        self.folder_log.error('{0} [file move error] [path={1}] [error={2}]'.format(
+                            self.log_base, path, traceback.format_exc()))
+                else:
+                    self._enqueue_result(result_queue, classify, result)
+    def _enqueue_result(self, result_queue, classify, result):
+        """Put a non-empty dict result, or every item of a list result, on the queue."""
+        if isinstance(result, dict):
+            records = [result] if result else []
+        elif isinstance(result, list):
+            records = result
+        else:
+            records = []
+        if not records:
+            return
+        date_str = time.strftime("%Y-%m-%d")
+        for res in records:
+            result_queue.put(
+                {
+                    self.CLASSIFY_KEY: classify,
+                    self.RESULT_KEY: res,
+                    self.DATE_KEY: date_str
+                }
+            )
+    def handle(self, *args, **kwargs):
+        """Start one worker process per configured input folder plus the workbook writer.
+
+        Configuration keys are parsed as ``<classify>_<...>_<combined flag>``. The
+        daily workbook is written two levels above the first input folder. Blocks
+        until every process has finished.
+        """
+        if len(self.input_dirs) == 0:
+            return
+        result_queue = Queue()
+        workers = []
+        one_input_dir = None
+        for classify_idx, input_dir in self.input_dirs.items():
+            if one_input_dir is None:
+                one_input_dir = input_dir
+            idx_parts = classify_idx.split('_')
+            classify = int(idx_parts[0])
+            is_combined = int(idx_parts[2]) == 1
+            workers.append(
+                Process(target=self.folder_process, args=(input_dir, classify, is_combined, result_queue)))
+        wb_dir = os.path.dirname(os.path.dirname(one_input_dir))
+        workers.append(Process(target=self.wb_process, args=(wb_dir, result_queue, )))
+        for worker in workers:
+            worker.start()
+        for worker in workers:
+            worker.join()
+        self.folder_log.info('{0} [stop safely]'.format(self.log_base))
--- a/src/apps/doc/management/commands/folder_ocr_process.py
View file @b8745dc
+++ b/src/apps/doc/management/commands/folder_ocr_process.py
View file @b8745dc
@@ -61,13 +61,11 @@ class Command(BaseCommand, LoggerMixin):
    def signal_handler(self, sig, frame):
        self.switch = False  # 停止处理文件
-    def license1_process(self, ocr_data, license_summary, classify, res_list, pno, ino, part_idx, img_path):
+    def license1_process(self, ocr_data, license_summary, classify, img_path):
        # 类别：'0'身份证， '1'居住证
        license_data = ocr_data.get('data', [])
        if not license_data:
-            res_list.append((pno, ino, part_idx, consts.RES_SUCCESS_EMPTY))
            return
-        res_list.append((pno, ino, part_idx, consts.RES_SUCCESS))
        if classify == consts.MVC_CLASSIFY:  # 车辆登记证 3/4页结果整合
            for mvc_dict in license_data:
                try:
@@ -154,29 +152,21 @@ class Command(BaseCommand, LoggerMixin):
    def res_process(self, all_res, classify, excel_path):
        try:
            license_summary = {}
-            res_list = []
            if not all_res:
                return
            else:
                for img_path, ocr_res in all_res.items():
-                    img_name, pno, ino = self.parse_img_path(img_path)
+                    # img_name, pno, ino = self.parse_img_path(img_path)
-                    part_idx = 1
+                    # part_idx = 1
                    if isinstance(ocr_res, dict):
                        if ocr_res.get('code') == 1:
                            data_list = ocr_res.get('data', [])
                            if isinstance(data_list, list):
-                                for part_idx, ocr_data in enumerate(data_list):
+                                for ocr_data in data_list:
-                                    part_idx = part_idx + 1
+                                    # part_idx = part_idx + 1
-                                    self.license1_process(ocr_data, license_summary, classify,
+                                    self.license1_process(ocr_data, license_summary, classify, img_path)
-                                                          res_list, pno, ino, part_idx, img_path)
-                            else:
-                                res_list.append((pno, ino, part_idx, consts.RES_FAILED_3))
-                        else:
-                            res_list.append((pno, ino, part_idx, consts.RES_FAILED))
-                    else:
-                        res_list.append((pno, ino, part_idx, consts.RES_FAILED))
                wb = BSWorkbook(set(), set(), set(), set(), set())
                wb.simple_license_rebuild(license_summary, consts.DOC_SCHEME_LIST[0])
@@ -216,6 +206,13 @@ class Command(BaseCommand, LoggerMixin):
                    return ocr_res
            else:
                self.folder_log.warn('{0} [ocr failed] [img_path={1}]'.format(self.log_base, img_path))
+    def images_process(self, img_path_list, classify, excel_path):
+        """Run OCR on every image and build the workbook from the per-image results."""
+        ocr_by_path = {
+            img_path: self.ocr_process(img_path, classify)
+            for img_path in img_path_list
+        }
+        self.res_process(ocr_by_path, classify, excel_path)
    def pdf_process(self, name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir):
        if os.path.exists(path):
@@ -230,11 +227,7 @@ class Command(BaseCommand, LoggerMixin):
                    self.log_base, path, traceback.format_exc()))
                raise e
            else:
-                all_res = {}
+                self.images_process(pdf_handler.img_path_list, classify, excel_path)
-                for img_path in pdf_handler.img_path_list:
-                    ocr_res = self.ocr_process(img_path, classify)
-                    all_res[img_path] = ocr_res
-                self.res_process(all_res, classify, excel_path)
                shutil.move(path, pdf_save_path)
    def tif_process(self, name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir):
@@ -250,23 +243,18 @@ class Command(BaseCommand, LoggerMixin):
                    self.log_base, path, traceback.format_exc()))
                raise e
            else:
-                all_res = {}
+                self.images_process(tiff_handler.img_path_list, classify, excel_path)
-                for img_path in tiff_handler.img_path_list:
-                    ocr_res = self.ocr_process(img_path, classify)
-                    all_res[img_path] = ocr_res
-                self.res_process(all_res, classify, excel_path)
                shutil.move(path, tiff_save_path)
    def img_process(self, name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir):
+        """Process a single image file: resolve output paths, OCR it, store the result.
+
+        The output paths are resolved first; OCR runs only when that succeeds.
+        If ``get_path`` raises, the error is logged and nothing else happens
+        (no OCR call, and the file is not moved).
+
+        NOTE(review): pdf_process and tif_process re-raise in their except
+        branch, while this method only logs -- confirm that is intentional.
+        """
-        ocr_res = self.ocr_process(path, classify)
-        all_res = {path: ocr_res}
        try:
+            # The third element returned by get_path is not used here.
            img_save_path, excel_path, _ = self.get_path(name, img_output_dir, wb_output_dir, pdf_output_dir)
        except Exception as e:
            self.folder_log.error('{0} [get path error] [path={1}] [error={2}]'.format(
                self.log_base, path, traceback.format_exc()))
        else:
+            # Results are keyed by the image path, the same shape images_process builds.
+            ocr_res = self.ocr_process(path, classify)
+            all_res = {path: ocr_res}
            self.res_process(all_res, classify, excel_path)
+            # Move the processed input file to its save location.
            shutil.move(path, img_save_path)
@@ -312,9 +300,9 @@ class Command(BaseCommand, LoggerMixin):
                try:
                    if os.path.isfile(path):
                        self.folder_log.info('{0} [file start] [path={1}]'.format(self.log_base, path))
-                        if name.endswith('.pdf'):
+                        if name.endswith('.pdf') or name.endswith('.PDF'):
                            self.pdf_process(name, path, classify, img_output_dir, wb_output_dir, pdf_output_dir)
-                        elif name.endswith('.tif'):
+                        elif name.endswith('.tif') or name.endswith('.TIF'):
                            self.tif_process(name, path, classify, img_output_dir, wb_output_dir, tiff_output_dir)
                        else:
                            self.img_process(name, path, classify, wb_output_dir, img_output_dir, pdf_output_dir)
--- a/src/apps/doc/ocr/wb.py
View file @b8745dc
+++ b/src/apps/doc/ocr/wb.py
View file @b8745dc
@@ -702,6 +702,31 @@ class BSWorkbook(Workbook):
            if field_str is not None:
                count_list.append((field_str, count))
+    def ltgt_build(self, label, result_dict):
+        """Flatten ``result_dict`` into a new sheet and return the flattened mapping.
+
+        A sheet titled ``label`` is created and one ``(row_label, value)`` row is
+        appended per flattened entry:
+
+        * list value: the ``words`` of every item, joined with '、'
+        * dict value containing ``words``: that ``words`` value
+        * any other dict value: one row per sub-key, labelled ``'key: sub_key'``
+          (a dict sub-value contributes its ``words``, defaulting to ``''``)
+        * anything else: the value itself
+
+        :param label: title passed to ``create_sheet``.
+        :param result_dict: mapping of field name to a value of one of the
+            shapes listed above.
+        :return: dict mapping every written row label to the value written.
+        """
+        ws = self.create_sheet(label)
+        rebuild_res = {}
+
+        def add_row(row_label, row_value):
+            # Keep the worksheet and the returned dict in sync.
+            ws.append((row_label, row_value))
+            rebuild_res[row_label] = row_value
+
+        for key, value in result_dict.items():
+            if isinstance(value, list):
+                value_list = []
+                for item in value:
+                    words = item.get('words') if isinstance(item, dict) else item
+                    # str.join() raises TypeError on None or non-str items, so a
+                    # missing ``words`` becomes '' (the same default the nested
+                    # dict branch uses) and anything else is coerced to str.
+                    value_list.append('' if words is None else str(words))
+                add_row(key, '、'.join(value_list))
+            elif isinstance(value, dict):
+                if 'words' in value:
+                    add_row(key, value['words'])
+                else:
+                    for sub_key, sub_value in value.items():
+                        sub_label = '{0}: {1}'.format(key, sub_key)
+                        if isinstance(sub_value, dict):
+                            add_row(sub_label, sub_value.get('words', ''))
+                        else:
+                            add_row(sub_label, sub_value)
+            else:
+                add_row(key, value)
+        return rebuild_res
    def simple_license_rebuild(self, license_summary, document_scheme):
        # for ic_license_dict in license_summary.get(consts.IC_CLASSIFY, []):
        #     if ic_license_dict.get('类别') == '1':
--- a/src/common/tools/pdf_to_img.py
View file @b8745dc
+++ b/src/common/tools/pdf_to_img.py
View file @b8745dc
@@ -225,3 +225,13 @@ class PDFHandler:
                else:
                    self.merge_il(pdf, pno, il)
        self.img_count = len(self.img_path_list)
+    def extract_page_image(self):
+        """Pass every page of the PDF at ``self.path`` to ``page_to_png``.
+
+        Resets ``img_path_list`` and ``xref_set``, makes sure ``img_dir_path``
+        exists, calls ``page_to_png`` once per page in page order, and finally
+        stores ``len(self.img_path_list)`` in ``img_count``.
+        """
+        # Start from a clean state.
+        self.img_path_list = []
+        self.xref_set = set()
+        os.makedirs(self.img_dir_path, exist_ok=True)
+        with fitz.Document(self.path) as pdf:
+            for page_no in range(pdf.pageCount):
+                # presumably page_to_png records the written image path in
+                # img_path_list -- verify against its definition
+                self.page_to_png(pdf.loadPage(page_no))
+        self.img_count = len(self.img_path_list)