# hil_contract_ocr.py 2.86 KB
# -*- coding: utf-8 -*-
# @Author        : lk
# @Email         : 9428.al@gmail.com
# @Created Date  : 2021-06-29 17:43:46
# @Last Modified : 2021-11-03 16:07:36
# @Description   :

from .get_char import Finder
from .get_char_fsm import Finder as FSMFinder


# Text markers used to recognise which contract a page belongs to.
_LEASEBACK_MARKER = '售后回租合同_'        # sale-and-leaseback contract
_DISPOSAL_MARKER = '售后回租合同附件一'    # vehicle disposal agreement (appendix 1)

# Number of marked pages that identifies an embedded vehicle disposal agreement.
_DISPOSAL_PAGE_COUNT = 4


def _page_contains(page_info, marker):
    """Return True if any text span on the page contains ``marker``.

    Only blocks whose ``type`` is 0 are inspected; other blocks are skipped.
    NOTE(review): the layout looks like PyMuPDF's ``get_text('dict')`` output,
    where type 0 is a text block -- confirm against the caller.
    """
    for block in page_info['blocks']:
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                if marker in span['text']:
                    return True
    return False


def predict(pdf_info, file_cls, is_fsm=False):
    """Extract contract fields from parsed PDF pages.

    Args:
        pdf_info (dict): Mapping of page key -> page dict. Each page dict has
            a ``'blocks'`` list; blocks carry ``'type'`` and ``'lines'``,
            lines carry ``'spans'``, and spans carry ``'text'``.
        file_cls (int): Which document to extract:
            0 = sale-and-leaseback contract,
            1 = vehicle disposal agreement,
            2 = vehicle lease mortgage contract.
        is_fsm (bool): When true, use ``FSMFinder`` instead of ``Finder``.

    Returns:
        dict: The finder's result mapping. Every entry whose ``'page'`` is not
        ``None`` has it rewritten in place from a 0-based page index to the
        string ``'page_<index + 1>'``.

    Raises:
        ValueError: If ``file_cls`` is not 0, 1 or 2.
    """
    # Fail fast with a clear message; previously an unknown class fell through
    # every branch and crashed later with UnboundLocalError on ``results``.
    if file_cls not in (0, 1, 2):
        raise ValueError(
            f'unsupported file_cls: {file_cls!r} (expected 0, 1 or 2)')

    if file_cls == 1:
        # Each page is collected at most once, even if the marker occurs in
        # several spans, so the page count below really counts pages.
        disposal_pages = [
            pdf_info[pno] for pno in pdf_info
            if _page_contains(pdf_info[pno], _DISPOSAL_MARKER)
        ]
        has_leaseback = any(
            _page_contains(pdf_info[pno], _LEASEBACK_MARKER)
            for pno in pdf_info
        )
        # Exactly four disposal pages alongside leaseback pages means the
        # input bundles the vehicle disposal agreement with the main
        # contract: keep only those pages, re-keyed '0'..'3'.
        if len(disposal_pages) == _DISPOSAL_PAGE_COUNT and has_leaseback:
            pdf_info = {
                str(idx): page for idx, page in enumerate(disposal_pages)
            }

    finder = FSMFinder(pdf_info) if is_fsm else Finder(pdf_info)

    if file_cls == 0:
        results = finder.get_info()
    elif file_cls == 1:
        # Vehicle disposal agreement.
        results = finder.get_info_1()
    else:
        # Vehicle lease mortgage contract.
        results = finder.get_info_2()

    # Convert 0-based page indices into 1-based 'page_N' labels.
    for field in results.values():
        if field['page'] is not None:
            field['page'] = 'page_' + str(int(field['page']) + 1)
    return results